In [1]:
! pip3 install autocorrect -qq 
! pip install glove_python_binary | grep satistfied

In [141]:
from autocorrect import Speller
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import random
import re
from numpy import dot
from numpy.linalg import norm
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/scijspirit/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [142]:
BASE_DIR = "../input/feedback-prize-2021/"
TRAIN_DIR = BASE_DIR + 'train'
SAVE_DIR = BASE_DIR + 'train_oversamples'

if not os.path.exists(SAVE_DIR):
    os.mkdir(SAVE_DIR)

In [143]:
DISCOURSE_TYPES = ['Rebuttal', 'Counterclaim', 'Lead', 'Concluding Statement', 'Claim', 'Position', 'Evidence']
WEAK_DISCOURSE_TYPES = ['Rebuttal', 'Counterclaim']
FULL_DISCOURSE_TYPES = ['Lead', 'Concluding Statement', 'Claim', 'Position', 'Evidence']


df = pd.read_csv(BASE_DIR + "train_corrected.csv")      

df.drop(['discourse_start', 'discourse_end', 'discourse_text', 'predictionstring'], axis='columns', inplace=True)
df.rename(columns = {'new_start':'discourse_start', 'new_end':'discourse_end', 'new_discourse_text':'discourse_text', 'new_predictionstring':'predictionstring'}, inplace=True)

df.head()

Unnamed: 0,id,discourse_id,discourse_type,discourse_type_num,discourse_start,discourse_end,discourse_text,predictionstring
0,423A1CA112E2,1622628000000.0,Lead,Lead 1,8,229,Modern humans today are always on their phone....,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,Position,Position 1,230,312,They are some really bad consequences when stu...,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,Evidence,Evidence 1,313,400,Some certain areas in the United States ban ph...,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,Evidence,Evidence 2,402,757,"When people have phones, they know about certa...",76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,Claim,Claim 1,759,886,Driving is one of the way how to get around. P...,139 140 141 142 143 144 145 146 147 148 149 15...


In [144]:
def open_txt():
    train_names, train_texts = [], []
    for f in tqdm(list(os.listdir(TRAIN_DIR))):
        train_names.append(f.replace('.txt', ''))
        train_texts.append(open('../input/feedback-prize-2021/train/' + f, 'r', encoding='utf-8').read())
    train_text_df = pd.DataFrame({'id': train_names, 'text': train_texts})

    return train_text_df

train_text_df = open_txt()
train_text_df.head()

100%|██████████| 15594/15594 [00:00<00:00, 112407.28it/s]


Unnamed: 0,id,text
0,B7E936A82959,The advantages of limiting car usages is that ...
1,7D59699BB70F,One of the advantages of getting rid of cars i...
2,6163745AA739,Have you ever been driving a car and wish that...
3,6770F94889E2,Ever thought of a car that drives itself? Seem...
4,F8173F146C1B,I've never had to complete a summer project be...


In [145]:
X_ids = list(df['id'].unique())
id2types = df.groupby('id')['discourse_type'].unique().to_dict()

def train_ids2discourse_type_counts(X_ids):
    
    no_num = re.compile('[^0-9]')
    type_count = dict([(dt, 0) for dt  in DISCOURSE_TYPES])
    
    for _id in X_ids:
        for dt in id2types[_id]:
            dt_name = "".join(no_num.findall(dt)).rstrip(" ")
            type_count[dt_name] += 1
            
    return pd.Series(type_count).sort_values()

type_count = train_ids2discourse_type_counts(X_ids)
type_count


Rebuttal                 3598
Counterclaim             4576
Lead                     9301
Concluding Statement    13418
Claim                   14927
Position                15366
Evidence                15550
dtype: int64

In [146]:
FILL_TO = max(type_count)
add_ids = []
# Oversample to Maximum Sample Count
for dt in tqdm(WEAK_DISCOURSE_TYPES):
    print(dt)
    # Get current Discourse Type Count
    type_count = train_ids2discourse_type_counts(X_ids)
    dt_sample_count = type_count[dt]
    if dt_sample_count < FILL_TO:
        while dt_sample_count < FILL_TO:
            # Take Random ID
            random_id = str(np.random.choice(X_ids, 1).squeeze())
            if dt in id2types[random_id] :
                X_ids.append(random_id)
                add_ids.append(random_id)
                dt_sample_count += 1

  0%|          | 0/2 [00:00<?, ?it/s]

Rebuttal


100%|██████████| 2/2 [00:37<00:00, 18.58s/it]

Counterclaim





In [147]:
print("sampling id count :", len(add_ids))
print("MAX count :", FILL_TO)
train_ids2discourse_type_counts(X_ids)

sampling id count : 11952
MAX count : 15550


Rebuttal                15550
Counterclaim            16521
Lead                    17580
Concluding Statement    24310
Claim                   26560
Position                27299
Evidence                27475
dtype: int64

In [148]:
def correct_misspelling(train_text):
    correct_count = 0
    spell = Speller(lang='en')
    for idx, word in enumerate(train_text):
        cor_word = spell(word)
        if word != cor_word:
            train_text[idx] = cor_word
            correct_count += 1
    return train_text, correct_count

# txt = 'mismatch! so cannot oversampling this discorse tpye'
# train_text = txt.split()
# correct_misspelling(train_text)

In [149]:
GLOVE_PATH = '../glove/glove_6B/glove.6B.50d.txt'
with open(GLOVE_PATH, 'r') as f:
    lines = f.readlines()
glove_model = {}
for line in tqdm(lines):
    split_line = line.split()
    word = split_line[0]
    embedding = np.array(split_line[1:], dtype=np.float64)
    glove_model[word] = embedding
print(f"{len(glove_model)} words loaded!")

100%|██████████| 400000/400000 [00:02<00:00, 187511.18it/s]

400000 words loaded!





In [150]:
def most_similar(word):
    a = glove_model[word]
    best = (None, 0.0)
    for w, b in glove_model.items():
        value = dot(a, b)/(norm(a)*norm(b))
        if w != word and value > best[1]:
            best = (w, value)
    return best[0]

In [152]:
def do_change(example, txt):
    PUNCTUATION = set([".",",",";"])
    ALLOW_POS_TAGS = ['JJ', 'JJR', 'JJS', 'NN', 'NNS', 'NNPS', 'RBR', 'RBS', 'VB', 'VBD', 'VBG']
    
    discourse_text = example['discourse_text'].split()
    predictionstring = example['predictionstring'].split()
    # train_text = example['text'].split()
    train_text = txt.split()
    
    if len(discourse_text) != len(predictionstring):
        print("mismatch! so cannot oversampling this discorse type")
        return 
    
    # correct misspelling
    train_text, correction_count = correct_misspelling(train_text)
    
    # synonym replacement
    
    is_replaced = False
    while not is_replaced:
        list_idx = int(np.random.choice(len(predictionstring), 1))
        txt_idx = int(predictionstring[list_idx])
        origin_word = discourse_text[list_idx]
        
        pos_tag = nltk.pos_tag([origin_word])[0][1]
        if PUNCTUATION & set(list(origin_word)) or (origin_word not in glove_model.keys()) or (pos_tag not in ALLOW_POS_TAGS):
            continue
        
        replace_word = most_similar(origin_word)
        
        train_text[txt_idx] = replace_word
        discourse_text[list_idx] = replace_word
        is_replaced = True
            
    example['predictionstring'] = example['predictionstring']
    example['discourse_text'] = " ".join(discourse_text)
    txt = " ".join(train_text)
    
    return example, txt, correction_count

In [153]:
def oversamplig(df):
    new_df = pd.DataFrame(columns=df.columns)
    for id in tqdm(add_ids):
        count = 0
        new_id = "{0}_S".format(id)
        
        examples = df[df.id == id]          # id의 모든 discourse type 행을 추출
        text = train_text_df[train_text_df.id == id]['text'].values[0]  # id에 맞는 원본 text를 로드
        
        for i, example in examples.iterrows(): 
            # id내 annotation된 discourse type을 살피면서 워드를 바꾼다.
            # Rebuttal, Counterclaim인 경우, 무조건 바꿈
            # 그외 타입인 경우, 확률적으로 바꿈.
            #
            # discourse type이 변경된 경우, df에 계속해서 추가한다.
            # text는 모든 변경사항을 누적시키며, for문이 끝난 후 새로운 파일로 저장함.
            new_example = example.copy()
            if example['discourse_type'] in WEAK_DISCOURSE_TYPES:
                new_example, text, correction_count = do_change(new_example, text)
                count += 1
                ## for debug
                # print(example['discourse_type_num'])
                # print(example['discourse_text'])
                # print(new_example['discourse_text'])
                # print(train_text_df[train_text_df.id == id].text.values[0])
                # print(text)
            else:
                if random.random() > 0.8:
                    new_example, text, correction_count = do_change(new_example, text)
                    count += 1
                    ## for debug
                    # print(example['discourse_type_num'])
                    # print(example['discourse_text'])
                    # print(new_example['discourse_text'])
                    # print(train_text_df[train_text_df.id == id].text.values[0])
                    # print(text)
                
            new_example['id'] = new_id
            new_df = new_df.append(new_example)
            
        # save txt 
        with open(SAVE_DIR+ '/{0}.txt'.format(new_id), 'w') as f:
            f.write(text)
            
        print("new_id: {0} \t Add {2} discourse types. (# of synonym raplacement: {3}, # of correction misspelling: {4}".format(new_id, len(new_df), len(examples), count, correction_count))     
    print("complete oversampling --  total new rows : ", len(new_df))         
    return new_df
                
        

In [None]:
new_df = df.copy()
print(len(new_df))
new_df = oversamplig(new_df)

In [None]:
len(new_df)

31

In [None]:
new_df.to_csv('train_oversampled.csv", index=False)