In [35]:
! pip3 install autocorrect -qq 
! pip install glove_python_binary | grep satistfied

In [42]:
from autocorrect import Speller
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from glove import Glove
import random
import re

In [43]:
BASE_DIR = "../input/feedback-prize-2021/"
TRAIN_DIR = BASE_DIR + 'train'
SAVE_DIR = BASE_DIR + 'train_oversamples'

if not os.path.exists(SAVE_DIR):
    os.mkdir(SAVE_DIR)


In [44]:
DISCOURSE_TYPES = ['Rebuttal', 'Counterclaim', 'Lead', 'Concluding Statement', 'Claim', 'Position', 'Evidence']
WEAK_DISCOURSE_TYPES = ['Rebuttal', 'Counterclaim']
FULL_DISCOURSE_TYPES = ['Lead', 'Concluding Statement', 'Claim', 'Position', 'Evidence']

glove = Glove.load('glove.model')

df = pd.read_csv(BASE_DIR + "corrected_train.csv")      

df.drop(['discourse_start', 'discourse_end', 'discourse_text', 'predictionstring'], axis='columns', inplace=True)
df.rename(columns = {'new_start':'discourse_start', 'new_end':'discourse_end', 'new_discourse_text':'discourse_text', 'new_predictionstring':'predictionstring'}, inplace=True)

df.head()

Unnamed: 0,id,discourse_id,discourse_type,discourse_type_num,discourse_start,discourse_end,discourse_text,predictionstring
0,423A1CA112E2,1622628000000.0,Lead,Lead 1,8,229,Modern humans today are always on their phone....,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,Position,Position 1,230,312,They are some really bad consequences when stu...,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,Evidence,Evidence 1,313,400,Some certain areas in the United States ban ph...,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,Evidence,Evidence 2,402,757,"When people have phones, they know about certa...",76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,Claim,Claim 1,759,886,Driving is one of the way how to get around. P...,139 140 141 142 143 144 145 146 147 148 149 15...


In [45]:
def open_txt():
    train_names, train_texts = [], []
    for f in tqdm(list(os.listdir(TRAIN_DIR))):
        train_names.append(f.replace('.txt', ''))
        train_texts.append(open('../input/feedback-prize-2021/train/' + f, 'r', encoding='utf-8').read())
    train_text_df = pd.DataFrame({'id': train_names, 'text': train_texts})

    return train_text_df

train_text_df = open_txt()
train_text_df.head()

100%|██████████| 15594/15594 [00:00<00:00, 120661.21it/s]


Unnamed: 0,id,text
0,B7E936A82959,The advantages of limiting car usages is that ...
1,7D59699BB70F,One of the advantages of getting rid of cars i...
2,6163745AA739,Have you ever been driving a car and wish that...
3,6770F94889E2,Ever thought of a car that drives itself? Seem...
4,F8173F146C1B,I've never had to complete a summer project be...


In [46]:
X_ids = list(df['id'].unique())
id2types = df.groupby('id')['discourse_type'].unique().to_dict()

def train_ids2discourse_type_counts(X_ids):
    
    no_num = re.compile('[^0-9]')
    type_count = dict([(dt, 0) for dt  in DISCOURSE_TYPES])
    
    for _id in X_ids:
        for dt in id2types[_id]:
            dt_name = "".join(no_num.findall(dt)).rstrip(" ")
            type_count[dt_name] += 1
            
    return pd.Series(type_count).sort_values()

type_count = train_ids2discourse_type_counts(X_ids)
type_count


Rebuttal                 3598
Counterclaim             4576
Lead                     9301
Concluding Statement    13418
Claim                   14927
Position                15366
Evidence                15550
dtype: int64

In [47]:
FILL_TO = max(type_count)
add_ids = []
# Oversample to Maximum Sample Count
for dt in tqdm(WEAK_DISCOURSE_TYPES):
    print(dt)
    # Get current Discourse Type Count
    type_count = train_ids2discourse_type_counts(X_ids)
    dt_sample_count = type_count[dt]
    if dt_sample_count < FILL_TO:
        while dt_sample_count < FILL_TO:
            # Take Random ID
            random_id = str(np.random.choice(X_ids, 1).squeeze())
            if dt in id2types[random_id] :
                X_ids.append(random_id)
                add_ids.append(random_id)
                dt_sample_count += 1

  0%|          | 0/2 [00:00<?, ?it/s]

Rebuttal


100%|██████████| 2/2 [00:37<00:00, 18.96s/it]

Counterclaim





In [48]:
print("sampling id count :", len(add_ids))
print("MAX count :", FILL_TO)
train_ids2discourse_type_counts(X_ids)

sampling id count : 11952
MAX count : 15550


Rebuttal                15550
Counterclaim            16527
Lead                    17473
Concluding Statement    24285
Claim                   26563
Position                27310
Evidence                27464
dtype: int64

In [49]:
def correct_misspelling(train_text):
    correct_count = 0
    spell = Speller(lang='en')
    for idx, word in enumerate(train_text):
        cor_word = spell(word)
        if word != cor_word:
            train_text[idx] = cor_word
            correct_count += 1
    return train_text, correct_count

# txt = 'mismatch! so cannot oversampling this discorse tpye'
# train_text = txt.split()
# correct_misspelling(train_text)

In [52]:
def do_change(example, txt):
    discourse_text = example['discourse_text'].split()
    predictionstring = example['predictionstring'].split()
    # train_text = example['text'].split()
    train_text = txt.split()
    
    if len(discourse_text) != len(predictionstring):
        print("mismatch! so cannot oversampling this discorse type")
        return 
    
    # correct misspelling
    train_text, correction_count = correct_misspelling(train_text)
    
    # synonym replacement
    is_replaced = False
    while not is_replaced:
        list_idx = int(np.random.choice(len(predictionstring), 1))
        txt_idx = int(predictionstring[list_idx])
        origin_word = discourse_text[list_idx]
        
        try:
            replace_word = glove.most_similar(origin_word)[0][0]
        except:
            print(origin_word)
            continue
        
        train_text[txt_idx] = replace_word
        discourse_text[list_idx] = replace_word
        is_replaced = True
            
    example['predictionstring'] = example['predictionstring']
    example['discourse_text'] = " ".join(discourse_text)
    txt = " ".join(train_text)
    
    return example, txt, correction_count

In [55]:
def oversamplig(df):
    new_df = pd.DataFrame(columns=df.columns)
    for id in add_ids:
        count = 0
        new_id = "{0}_S".format(id)
        
        examples = df[df.id == id]          # id의 모든 discourse type 행을 추출
        text = train_text_df[train_text_df.id == id]['text'].values[0]  # id에 맞는 원본 text를 로드
        
        for i, example in examples.iterrows(): 
            # id내 annotation된 discourse type을 살피면서 워드를 바꾼다.
            # Rebuttal, Counterclaim인 경우, 무조건 바꿈
            # 그외 타입인 경우, 확률적으로 바꿈.
            #
            # discourse type이 변경된 경우, df에 계속해서 추가한다.
            # text는 모든 변경사항을 누적시키며, for문이 끝난 후 새로운 파일로 저장함.
            new_example = None
            if example['discourse_type'] in WEAK_DISCOURSE_TYPES:
                new_example, text, correction_count = do_change(example.copy(), text)
                count += 1
            else:
                if random.random() > 0.8:
                    new_example, text, correction_count = do_change(example.copy(), text)
                    count += 1
            if new_example is not None:
                new_example['id'] = new_id
                new_df = new_df.append(new_example)
            
    
        # save txt 
        with open(SAVE_DIR+ '/{0}.txt'.format(new_id), 'w') as f:
            f.write(text)
            
        print("new_id: {0} \t Add {1} / {2} discourse types. correction count : {3}".format(new_id, count, len(examples), correction_count))     
    print("complete oversampling --  total new rows : ", len(new_df))         
    return new_df.drop(['text'], axis=1)
                
        

In [None]:
new_df = df.copy()
print(len(new_df))
new_df = oversamplig(new_df)

In [None]:
new_df.to_csv("sample.csv", index=False)

Unnamed: 0,id,discourse_id,discourse_type,discourse_type_num,discourse_start,discourse_end,discourse_text,predictionstring
105905,71D3F6E61C14_S,1620241000000.0,Evidence,Evidence 1,212,389,Especially teens they would love the fact that...,36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 5...
105907,71D3F6E61C14_S,1620241000000.0,Evidence,Evidence 2,543,621,they'll wait until a free time to call their t...,95 96 97 98 99 100 101 102 103 104 105 106 107...
105911,71D3F6E61C14_S,1620241000000.0,Evidence,Evidence 4,877,980,Instead of having their team worried sick abou...,157 158 159 160 161 162 163 164 165 166 167 16...
105912,71D3F6E61C14_S,1620241000000.0,Counterclaim,Counterclaim 1,982,1102,If you use policy 2 students will bring them a...,176 177 178 179 180 181 182 183 184 185 186 18...
105913,71D3F6E61C14_S,1620241000000.0,Rebuttal,Rebuttal 1,1104,1145,So let learn bring there phones to school.,200 201 202 203 204 205 206 207
