In [2]:
import os
import pandas as pd

# Input locations for the raw training split: the `.txt` documents live under
# `text`, the `.csv` annotation files under `label`.
root_path = 'data/original/train'
data_path, label_path = (
    os.path.join(root_path, subdir) for subdir in ('text', 'label')
)

## 将数据合并到一个csv中

In [28]:
# Collect every `<numeric id>.txt` document under `data_path` into one frame
# with an `ID` column (the file name without `.txt`) and a `Text` column (the
# file contents), ordered by numeric ID.
#
# The extension filter runs *before* the sort: the numeric sort key raises
# ValueError on any entry whose stem is not an integer (for example a
# `.ipynb_checkpoints` directory), so such entries must be dropped first.
text_filenames = sorted(
    (name for name in os.listdir(data_path) if name.endswith('.txt')),
    key=lambda name: int(name.split('.')[0]),
)

records = []
for filename in text_filenames:
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used. Confirm the corpus encoding and pin it (e.g. encoding='utf-8') so
    # the notebook reads the same text on every machine.
    with open(os.path.join(data_path, filename), 'r') as file:
        records.append({'ID': filename[:-4], 'Text': file.read()})

# Passing `columns` keeps the schema stable even when no file was found.
data = pd.DataFrame(records, columns=['ID', 'Text']).astype({
    'ID': str,
    'Text': str
})

In [29]:
# Confirm the column dtypes after the explicit `astype` cast to str.
data.dtypes

ID      object
Text    object
dtype: object

In [30]:
# Number of documents loaded, followed by a preview of the last five rows.
print(len(data))
data.tail(5)

3956


Unnamed: 0,ID,Text
3951,3951,来自土耳其的加拉塔萨雷上上赛季主场输给埃因霍温，随后战胜利物浦。本场看好3/0打出。
3952,3952,何以如此？据笔者了解，自从中国银行业协会提出“银行受理客户7折房贷利率优惠申请不得搭售其他金...
3953,3953,台湾《大魔竞》总冠军，湖南卫视金牌魔术团金牌魔术师，央视《时尚中国》时尚金牌魔术师＜我要上春...
3954,3954,小爽是一名基层女民警，与男同志一样参加值班执勤，管理社区，巡逻破案与治安防范，快乐工作幸福生...
3955,3955,上赛季德甲主队3胜1平。博天堂平赔高达3.30，两队战和可能最小。


In [3]:
# Read every `<numeric id>.csv` annotation file under `label_path` and stack
# them into a single frame, ordered by numeric file name.
#
# The extension filter runs *before* the sort: the numeric sort key raises
# ValueError on any entry whose stem is not an integer, so such entries must
# be dropped first.
label_filenames = sorted(
    (name for name in os.listdir(label_path) if name.endswith('.csv')),
    key=lambda name: int(name.split('.')[0]),
)

# Explicit dtypes keep `ID` a string (so it compares equal to the `ID` column
# built from the text file names) and the span offsets integers.
label_dtypes = {
    'ID': str,
    'Category': str,
    'Pos_b': int,
    'Pos_e': int,
    'Privacy': str
}

# The default concat index is kept, so each row retains its row number within
# its source file. `pd.concat` raises ValueError if no CSV file was found.
labels = pd.concat([
    pd.read_csv(os.path.join(label_path, name), dtype=label_dtypes)
    for name in label_filenames
])

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'data/original/train\\label'

In [10]:
# Confirm the annotation columns were parsed with the requested dtypes.
labels.dtypes

ID          object
Category    object
Pos_b        int64
Pos_e        int64
Privacy     object
dtype: object

In [11]:
# Total number of annotation rows, number of distinct IDs that carry at least
# one annotation, and a preview of the last five annotation rows.
print(len(labels))
print(labels['ID'].nunique())
labels.tail(5)

20503
2515


Unnamed: 0,ID,Category,Pos_b,Pos_e,Privacy
4,2513,address,16,19,馬來西亞
5,2513,address,5,6,中國
6,2513,address,33,36,马来西亚
0,2514,QQ,6,14,229215757
1,2514,organization,91,95,平均律乐队


## 获取所有类别(Category)

In [12]:
# Build the BIO tag vocabulary: 'O' first, then a B-/I- pair for each entity
# category, in the order the categories first appear in `labels`.
entity_types = labels['Category'].unique()
categories = ['O']
for entity_type in entity_types:
    categories.extend((f"B-{entity_type}", f"I-{entity_type}"))
categories

['O',
 'B-position',
 'I-position',
 'B-name',
 'I-name',
 'B-movie',
 'I-movie',
 'B-organization',
 'I-organization',
 'B-company',
 'I-company',
 'B-book',
 'I-book',
 'B-address',
 'I-address',
 'B-scene',
 'I-scene',
 'B-mobile',
 'I-mobile',
 'B-email',
 'I-email',
 'B-game',
 'I-game',
 'B-government',
 'I-government',
 'B-QQ',
 'I-QQ',
 'B-vx',
 'I-vx']

In [13]:
# Map every tag to its position in `categories` ('O' -> 0, then B-/I- pairs).
category2idx = dict(zip(categories, range(len(categories))))
category2idx

{'O': 0,
 'B-position': 1,
 'I-position': 2,
 'B-name': 3,
 'I-name': 4,
 'B-movie': 5,
 'I-movie': 6,
 'B-organization': 7,
 'I-organization': 8,
 'B-company': 9,
 'I-company': 10,
 'B-book': 11,
 'I-book': 12,
 'B-address': 13,
 'I-address': 14,
 'B-scene': 15,
 'I-scene': 16,
 'B-mobile': 17,
 'I-mobile': 18,
 'B-email': 19,
 'I-email': 20,
 'B-game': 21,
 'I-game': 22,
 'B-government': 23,
 'I-government': 24,
 'B-QQ': 25,
 'I-QQ': 26,
 'B-vx': 27,
 'I-vx': 28}

## 对齐Text和Label

In [14]:
# Convert the span annotations into one BIO tag per character of each text.
#
# The annotations are grouped by ID once up front instead of re-filtering the
# whole `labels` frame for every document. Row order inside each group is
# preserved, so when spans overlap the later annotation still wins.
labels_by_id = {doc_id: group for doc_id, group in labels.groupby('ID', sort=False)}

transformed_labels = []
for doc_id, text in zip(data['ID'], data['Text']):
    # Every character starts as 'O' (outside any entity).
    transformed_label = ['O'] * len(text)
    labels_for_text = labels_by_id.get(doc_id)
    if labels_for_text is not None:
        spans = zip(
            labels_for_text['Category'],
            labels_for_text['Pos_b'],
            labels_for_text['Pos_e'],
            labels_for_text['Privacy'],
        )
        for category, start, end, privacy in spans:
            # `Pos_e` is inclusive, hence the `end + 1` in the slice and range.
            assert text[start : end + 1] == privacy, '标签位置和文本不匹配'
            transformed_label[start] = 'B-' + category
            for i in range(start + 1, end + 1):
                transformed_label[i] = 'I-' + category
    transformed_labels.append(transformed_label)
data['Label'] = transformed_labels

In [15]:
# Preview the last five rows now that the per-character `Label` column exists.
data.tail(5)

Unnamed: 0,ID,Text,Label
2510,2510,“这起案件之所以近半年才暴露，完全是因李云飞钻了信用卡使用过程中的空子。”民警告诉记者，sk...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2511,2511,《前线任务》系列是1995年发源于超级任天堂的机器人战略游戏，游戏由THQ旗下的KAOSST...,"[B-game, I-game, I-game, I-game, I-game, I-gam..."
2512,2512,还能在著名的雅拉河中坐游船感受南半球最大的港口风光哦。,"[O, O, O, O, O, O, B-scene, I-scene, I-scene, ..."
2513,2513,馬來西亞、中國演員、藝人.香港，馬來西亞及内地工作請私信本人作品：马来西亚-情牵南苑（玉兰，...,"[B-address, I-address, I-address, I-address, O..."
2514,2514,QQ交流群：229215757人人网：http://www.renren.com/27114...,"[O, O, O, O, O, O, B-QQ, I-QQ, I-QQ, I-QQ, I-Q..."


## Tokenize Text

In [31]:
# Character-level tokenisation: each text becomes a list of its characters so
# that it lines up one-to-one with the per-character `Label` list.
data['Text'] = data['Text'].map(list)

In [17]:
# Sanity check: evaluates to True if any row has a different number of tokens
# than labels, so the expected result is False.
data.apply(lambda row: len(row.Text) != len(row.Label), axis=1).any()

False

In [18]:
# Preview the last five rows after `Text` was split into character lists.
data.tail(5)

Unnamed: 0,ID,Text,Label
2510,2510,"[“, 这, 起, 案, 件, 之, 所, 以, 近, 半, 年, 才, 暴, 露, ，, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2511,2511,"[《, 前, 线, 任, 务, 》, 系, 列, 是, 1, 9, 9, 5, 年, 发, ...","[B-game, I-game, I-game, I-game, I-game, I-gam..."
2512,2512,"[还, 能, 在, 著, 名, 的, 雅, 拉, 河, 中, 坐, 游, 船, 感, 受, ...","[O, O, O, O, O, O, B-scene, I-scene, I-scene, ..."
2513,2513,"[馬, 來, 西, 亞, 、, 中, 國, 演, 員, 、, 藝, 人, ., 香, 港, ...","[B-address, I-address, I-address, I-address, O..."
2514,2514,"[Q, Q, 交, 流, 群, ：, 2, 2, 9, 2, 1, 5, 7, 5, 7, ...","[O, O, O, O, O, O, B-QQ, I-QQ, I-QQ, I-QQ, I-Q..."


## 将Label转换为Category index

In [19]:
def _encode_tags(tags):
    """Return the integer index from `category2idx` for each BIO tag in `tags`."""
    return [category2idx[tag] for tag in tags]


# Replace every tag string with its vocabulary index, row by row.
data['Label'] = data['Label'].map(_encode_tags)

In [20]:
# Preview the last five rows after the tags were replaced by integer indices.
data.tail(5)

Unnamed: 0,ID,Text,Label
2510,2510,"[“, 这, 起, 案, 件, 之, 所, 以, 近, 半, 年, 才, 暴, 露, ，, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2511,2511,"[《, 前, 线, 任, 务, 》, 系, 列, 是, 1, 9, 9, 5, 年, 发, ...","[21, 22, 22, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, ..."
2512,2512,"[还, 能, 在, 著, 名, 的, 雅, 拉, 河, 中, 坐, 游, 船, 感, 受, ...","[0, 0, 0, 0, 0, 0, 15, 16, 16, 0, 0, 0, 0, 0, ..."
2513,2513,"[馬, 來, 西, 亞, 、, 中, 國, 演, 員, 、, 藝, 人, ., 香, 港, ...","[13, 14, 14, 14, 0, 13, 14, 0, 0, 0, 0, 0, 0, ..."
2514,2514,"[Q, Q, 交, 流, 群, ：, 2, 2, 9, 2, 1, 5, 7, 5, 7, ...","[0, 0, 0, 0, 0, 0, 25, 26, 26, 26, 26, 26, 26,..."


## 保存处理后数据到磁盘

In [21]:
# Rename the columns to the names used in the saved dataset; rebinding `data`
# rather than mutating in place keeps the step easy to chain.
column_renames = {'ID': 'id', 'Text': 'tokens', 'Label': 'ner_tags'}
data = data.rename(columns=column_renames)

In [22]:
# Preview the last five rows under the new column names.
data.tail(5)

Unnamed: 0,id,tokens,ner_tags
2510,2510,"[“, 这, 起, 案, 件, 之, 所, 以, 近, 半, 年, 才, 暴, 露, ，, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2511,2511,"[《, 前, 线, 任, 务, 》, 系, 列, 是, 1, 9, 9, 5, 年, 发, ...","[21, 22, 22, 22, 22, 22, 0, 0, 0, 0, 0, 0, 0, ..."
2512,2512,"[还, 能, 在, 著, 名, 的, 雅, 拉, 河, 中, 坐, 游, 船, 感, 受, ...","[0, 0, 0, 0, 0, 0, 15, 16, 16, 0, 0, 0, 0, 0, ..."
2513,2513,"[馬, 來, 西, 亞, 、, 中, 國, 演, 員, 、, 藝, 人, ., 香, 港, ...","[13, 14, 14, 14, 0, 13, 14, 0, 0, 0, 0, 0, 0, ..."
2514,2514,"[Q, Q, 交, 流, 群, ：, 2, 2, 9, 2, 1, 5, 7, 5, 7, ...","[0, 0, 0, 0, 0, 0, 25, 26, 26, 26, 26, 26, 26,..."


In [26]:
# Persist the processed frame. The output directory is created first so the
# write does not fail when `data/processed` does not exist yet.
# NOTE(review): `tokens` and `ner_tags` hold Python lists, which CSV stores as
# their text representation; whoever reads this file must parse them back.
processed_dir = os.path.join('data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
data.to_csv(os.path.join(processed_dir, 'data.csv'), index=False)

In [None]:
# Two-column lookup table (tag name, integer index) built from `category2idx`.
categories_df = (
    pd.Series(category2idx)
    .rename_axis('category')
    .reset_index(name='index')
)
categories_df

Unnamed: 0,category,index
0,O,0
1,B-position,1
2,I-position,2
3,B-name,3
4,I-name,4
5,B-movie,5
6,I-movie,6
7,B-organization,7
8,I-organization,8
9,B-company,9


In [None]:
# Persist the tag vocabulary. The output directory is created first so the
# write does not fail when `data/processed` does not exist yet.
processed_dir = os.path.join('data', 'processed')
os.makedirs(processed_dir, exist_ok=True)
categories_df.to_csv(os.path.join(processed_dir, 'categories.csv'), index=False)