In [30]:
import pandas as pd
from tqdm.notebook import tqdm
import os
from sklearn.model_selection import KFold

In [31]:
# Load the discourse annotations (one row per labelled span of an essay).
# Plain string literal: the path has no placeholders, so no f-prefix is needed.
df_train = pd.read_csv('../datasets/train.csv')
df_train.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [32]:
def agg_essays(train_flg):
    """Read every essay file of one dataset split into a DataFrame.

    Args:
        train_flg: If truthy, files are read from ``../datasets/train``;
            otherwise from ``../datasets/test``.

    Returns:
        pandas.DataFrame with one row per directory entry and the columns
        ``id`` (file name with ``.txt`` removed), ``text`` (full file
        contents) and ``text_split`` (``text`` split on whitespace).
        Rows are ordered by file name.
    """
    folder = 'train' if train_flg else 'test'
    names, texts = [], []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order (and anything derived from it later) reproducible across machines.
    for fname in tqdm(sorted(os.listdir(f'../datasets/{folder}'))):
        names.append(fname.replace('.txt', ''))  # id
        # The context manager closes each handle right away instead of
        # leaving one open file per essay for the garbage collector.
        # NOTE(review): relies on the platform default text encoding —
        # confirm the essays' encoding and pass it explicitly if needed.
        with open(f'../datasets/{folder}/' + fname, 'r') as fh:
            texts.append(fh.read())  # text

    df_texts = pd.DataFrame({'id': names, 'text': texts})
    df_texts['text_split'] = df_texts.text.str.split()  # split text on whitespace
    print('Completed tokenizing texts.')
    return df_texts


def ner(df_texts, df_train):
    """Attach a per-token BIO discourse label list to every essay.

    Args:
        df_texts: DataFrame with columns ``id`` and ``text_split`` (list of
            tokens). It is modified in place: an ``entities`` column is added.
        df_train: Annotation DataFrame with columns ``id``,
            ``discourse_type`` and ``predictionstring``; the latter is a
            whitespace-separated string of integer positions into the
            essay's ``text_split``.

    Returns:
        The same ``df_texts`` object, now with an ``entities`` column holding
        one label per token: ``B-<type>`` for the first token of a span,
        ``I-<type>`` for the remaining ones, ``'O'`` for unlabelled tokens.

    Raises:
        IndexError: If a position in ``predictionstring`` is outside the
            essay's token list.
    """
    # Build the id -> spans lookup once. Filtering df_train inside the essay
    # loop would rescan the whole annotation frame for every single essay.
    spans_by_id = {}
    for essay_id, discourse, pred in zip(
        df_train['id'], df_train['discourse_type'], df_train['predictionstring']
    ):
        spans_by_id.setdefault(essay_id, []).append((discourse, pred))

    all_entities = []
    for essay_id, tokens in tqdm(
        zip(df_texts['id'], df_texts['text_split']), total=len(df_texts)
    ):
        entities = ['O'] * len(tokens)

        # Spans are applied in df_train row order, so a later span overwrites
        # an earlier one where they overlap.
        for discourse, pred in spans_by_id.get(essay_id, ()):
            # split() (no argument) tolerates repeated or trailing spaces.
            list_ix = [int(x) for x in pred.split()]
            if not list_ix:
                continue  # span without any token position: nothing to label
            entities[list_ix[0]] = f'B-{discourse}'  # entity label (beginning)
            for k in list_ix[1:]:
                entities[k] = f'I-{discourse}'  # entity label (inside or end)
        all_entities.append(entities)

    df_texts['entities'] = all_entities
    print('Completed mapping discourse to each token.')
    return df_texts


def preprocess(df_train=None):
    """Load essays and, when annotations are supplied, label their tokens.

    Args:
        df_train: Annotation DataFrame, or ``None``. When ``None`` the essays
            are loaded with ``agg_essays(False)`` and returned as-is;
            otherwise they are loaded with ``agg_essays(True)`` and passed
            through ``ner`` together with ``df_train``.

    Returns:
        The DataFrame produced by ``agg_essays``, or the result of ``ner``
        on it when ``df_train`` is given.
    """
    has_labels = df_train is not None
    essays = agg_essays(has_labels)
    return ner(essays, df_train) if has_labels else essays


def split_fold(df_train):
    """Assign a cross-validation fold number to every row, keyed by ``id``.

    The unique ids are split with a shuffled 5-fold ``KFold``
    (``random_state=42``); every row then receives the fold in which its id
    is part of the validation split.

    Args:
        df_train: DataFrame with an ``id`` column. It is modified in place:
            a ``fold`` column is added (or overwritten).

    Returns:
        The same ``df_train`` object with a float ``fold`` column (0.0-4.0).
    """
    ids = df_train['id'].unique()
    kf = KFold(n_splits=5, shuffle=True, random_state=42)  # 5-fold cross-validation
    fold_of_id = {}
    for i_fold, (_, valid_index) in enumerate(kf.split(ids)):
        # valid_index holds positions into `ids`, not index labels of
        # df_train, so translate them to ids before touching the frame.
        # Using them directly with .loc is only correct for a default
        # RangeIndex with exactly one row per id.
        for essay_id in ids[valid_index]:
            fold_of_id[essay_id] = i_fold
    # Cast to float so the column dtype matches what the .loc-based
    # assignment used to produce (values such as 0.0, 4.0).
    df_train['fold'] = df_train['id'].map(fold_of_id).astype(float)
    return df_train

In [33]:
# Load and label the training essays, then assign each one a CV fold.
# Composing the two calls avoids rebinding the name to its own result.
alltrain_texts = split_fold(preprocess(df_train))
alltrain_texts

  0%|          | 0/15594 [00:00<?, ?it/s]

Completed tokenizing texts.


  0%|          | 0/15594 [00:00<?, ?it/s]

Completed mapping discourse to each token.


Unnamed: 0,id,text,text_split,entities,fold
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[Some, people, belive, that, the, so, called, ...","[B-Position, I-Position, I-Position, I-Positio...",0.0
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[Driverless, cars, are, exaclty, what, you, wo...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",4.0
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[Dear:, Principal, I, am, arguing, against, th...","[O, O, B-Position, I-Position, I-Position, I-P...",3.0
3,001552828BD0,Would you be able to give your car up? Having ...,"[Would, you, be, able, to, give, your, car, up...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",0.0
4,0016926B079C,I think that students would benefit from learn...,"[I, think, that, students, would, benefit, fro...","[B-Position, I-Position, I-Position, I-Positio...",4.0
...,...,...,...,...,...
15589,FFF1442D6698,"Every student looks forward to summer break, i...","[Every, student, looks, forward, to, summer, b...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",2.0
15590,FFF1ED4F8544,Many citizens argue that the Electoral college...,"[Many, citizens, argue, that, the, Electoral, ...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",2.0
15591,FFF868E06176,"Every summer break, students are given project...","[Every, summer, break,, students, are, given, ...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea...",4.0
15592,FFFD0AF13501,"In the article ""A Cowboy Who Rode the Waves"" L...","[In, the, article, ""A, Cowboy, Who, Rode, the,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",0.0


In [34]:
# No annotation frame is passed, so the test essays are only loaded and
# tokenized; no entity labels are attached.
test_texts = preprocess(df_train=None)
test_texts

  0%|          | 0/5 [00:00<?, ?it/s]

Completed tokenizing texts.


Unnamed: 0,id,text,text_split
0,0FB0700DAF44,"During a group project, have you ever asked a ...","[During, a, group, project,, have, you, ever, ..."
1,18409261F5C2,80% of Americans believe seeking multiple opin...,"[80%, of, Americans, believe, seeking, multipl..."
2,D46BCB48440A,"When people ask for advice,they sometimes talk...","[When, people, ask, for, advice,they, sometime..."
3,D72CB1C11673,Making choices in life can be very difficult. ...,"[Making, choices, in, life, can, be, very, dif..."
4,DF920E0A7337,Have you ever asked more than one person for h...,"[Have, you, ever, asked, more, than, one, pers..."


In [35]:
# Persist both frames as pickle files in the current working directory.
pd.to_pickle(alltrain_texts, 'alltrain_texts.pkl')
pd.to_pickle(test_texts, 'test_texts.pkl')