In [1]:
import gensim, os, re
import nlpaug.augmenter.word as naw
import pandas as pd
from nltk.corpus import stopwords

Objectives

Test / Train split : 10 / 90

Input : tf-idf / word2vec

Experiments

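1. n (augment 횟수 — number of augmentations generated per sentence) (Fixed-value 할수도 있다 — may be kept at a fixed value.)
2. N (training-set size)
3. Ratio R (original / word2vec): the fraction of the n augmentations generated by the word2vec augmenter; the remaining augmentations come from the TF-IDF base augmenter.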

In [3]:
# Directory layout, relative to the notebook's working directory.
model_path = "models/"
data_path = "phrasebank/"
output_path = "augmented/"

# De-duplicated English stop-word list; handed to the word2vec augmenter below
# through its `stopwords` argument.
stop_words = list(set(stopwords.words('english')))

# Settings shared by both augmenters.
aug_max = 4
aug_p = 0.1

# Word-embedding augmenter backed by the word2vec binary under `model_path`.
aug_word2vec = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=model_path + "word2vec.bin",
    aug_max=aug_max,
    aug_p=aug_p,
    stopwords=stop_words,
)
# Disabled alternative (WordNet synonym augmenter), kept for reference:
#aug_syn = naw.SynonymAug(aug_src='wordnet', aug_max=None, aug_p=0.1)

# TF-IDF augmenter used as the baseline; it loads its model from `model_path`.
aug_base = naw.TfIdfAug(
    model_path=model_path,
    aug_max=aug_max,
    aug_p=aug_p,
)

# Pre-split phrasebank data.
test_df = pd.read_csv(data_path + "phr_test.csv", sep=",")
train_df = pd.read_csv(data_path + "phr_train.csv", sep=",")

In [4]:
def _as_sentence_list(result):
    """Normalise an augmenter's return value to a list of sentences."""
    if result is None:
        return []
    if isinstance(result, str):
        # A single augmentation returned as a bare string rather than a list.
        return [result]
    return list(result)


def run_aug(n, N, R, train_df, aug_word2vec, aug_base, output_fname, path='', random_state=None) :
    """Sample N training rows, augment each one, and write the result to CSV.

    Each sampled sentence is kept and followed by up to ``int(n * R)``
    augmentations from ``aug_word2vec`` and up to ``n - int(n * R)``
    augmentations from ``aug_base``. Every augmented sentence inherits the
    label of the sentence it was derived from.

    Parameters
    ----------
    n : int
        Number of augmentations requested per sampled sentence.
    N : int
        Number of rows to sample from ``train_df``.
    R : float
        Fraction of the ``n`` augmentations requested from ``aug_word2vec``;
        the remainder is requested from ``aug_base``.
    train_df : pandas.DataFrame
        Must contain the columns ``'sentence'`` and ``'label'``.
    aug_word2vec, aug_base : object
        Augmenters exposing ``augment(text, n=...)``.
    output_fname : str
        File name of the augmented CSV. The sample file name is derived by
        stripping the last four characters (assumed to be ``".csv"``).
    path : str, optional
        Prefix concatenated (as a plain string) in front of both file names.
    random_state : int or None, optional
        Forwarded to ``DataFrame.sample``; pass a value for a reproducible
        sample. ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    str
        ``path + output_fname``, the location of the augmented CSV.
    """
    train_df_sample = train_df.sample(n=N, ignore_index=True, random_state=random_state)
    # Save the sample itself so the un-augmented subset can be inspected later.
    train_df_sample.to_csv(path + output_fname[:-4] + "sample_{N}_.csv".format(N=N), sep=",")

    n_word2vec = int(n*R)
    n_base = n - n_word2vec

    augmented_data_sentences = []
    augmented_data_labels = []

    for sentence, label in zip(train_df_sample['sentence'], train_df_sample['label']) :
        augmented_word2vec = (
            _as_sentence_list(aug_word2vec.augment(sentence, n=n_word2vec))
            if n_word2vec > 0 else []
        )
        augmented_base = (
            _as_sentence_list(aug_base.augment(sentence, n=n_base))
            if n_base > 0 else []
        )

        # Label-preserving: emit exactly one label per sentence actually
        # produced. An augmenter may hand back fewer than the requested number
        # of sentences; assuming a fixed n + 1 per row would shift every
        # following label onto the wrong sentence.
        new_sentences = [sentence] + augmented_word2vec + augmented_base
        augmented_data_sentences.extend(new_sentences)
        augmented_data_labels.extend([label] * len(new_sentences))

    # Save the new dataset to a CSV file.
    output_df = pd.DataFrame(
        {'sentence': augmented_data_sentences, 'label': augmented_data_labels},
        columns=['sentence', 'label'],
    )
    output_df.to_csv(path + output_fname, sep=",")

    return path + output_fname

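Augmentation settings: aug_min=1 (default), aug_max=None, aug_p=0.1, n=8. (Note: the configuration cell above currently sets aug_max=4, not None — confirm which value was used for the reported runs.)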


1. N : from 500 to 4000 in steps of 500 / R = 1 (8 runs)
2. R : from 1.0 down to 0.0 in steps of 0.25 / N = best from 1. (5 runs)

In [14]:
# Output file names encode the run settings as "<exp>_aug_tf_<N>_<R*100>_.csv".
# Exp 1 sweeps the training-set size N with R fixed at 100 (%).
fnames_exp1 = [
    "1_aug_tf_{size}_100_.csv".format(size=size)
    for size in range(500, 4001, 500)
]
# Exp 2 fixes N at 4000 and sweeps R from 100 (%) down to 0 in steps of 25.
fnames_exp2 = [
    "2_aug_tf_{size}_{ratio}_.csv".format(size=4000, ratio=ratio)
    for ratio in range(100, -1, -25)
]

In [None]:
# Experiment 1: one augmentation run per file name in fnames_exp1.
for fname in fnames_exp1 :
    # Recover the run settings from the file name: after splitting on "_",
    # field 3 is N and field 4 is R expressed in percent.
    tokens = fname.split("_")
    N = int(tokens[3])
    R = int(tokens[4]) / 100
    print("Exp1 - N : {N} / R : {R}".format(N=N, R=R))
    run_aug(
        n=8,
        N=N,
        R=R,
        train_df=train_df,
        aug_word2vec=aug_word2vec,
        aug_base=aug_base,
        output_fname=fname,
        path=output_path,
    )


In [16]:
# Experiment 2: one augmentation run per file name in fnames_exp2.
for fname in fnames_exp2 :
    # Recover the run settings from the file name: after splitting on "_",
    # field 3 is N and field 4 is R expressed in percent.
    tokens = fname.split("_")
    N = int(tokens[3])
    R = int(tokens[4]) / 100
    print("Exp2 - N : {N} / R : {R}".format(N=N, R=R))
    run_aug(
        n=8,
        N=N,
        R=R,
        train_df=train_df,
        aug_word2vec=aug_word2vec,
        aug_base=aug_base,
        output_fname=fname,
        path=output_path,
    )

Exp2 - N : 4000 / R : 1.0
Exp2 - N : 4000 / R : 0.75
Exp2 - N : 4000 / R : 0.5
Exp2 - N : 4000 / R : 0.25
Exp2 - N : 4000 / R : 0.0


In [7]:
# Exp-3 : 1000 samples / preserved label ratios
# The 1000-row training subset is read from its own CSV file.
train_df_exp3 = pd.read_csv(data_path + "phr_train_only1000.csv", sep=",")
# N is fixed at 1000; R goes from 100 (%) down to 0 in steps of 25.
fnames_exp3 = [
    "3_aug_tf_{N}_{i}_.csv".format(N=1000, i=i)
    for i in range(100, -1, -25)
]

for fname in fnames_exp3 :
    # Recover the run settings from the file name: after splitting on "_",
    # field 3 is N and field 4 is R expressed in percent.
    tokens = fname.split("_")
    N = int(tokens[3])
    R = int(tokens[4]) / 100
    print("Exp3 - N : {N} / R : {R}".format(N=N, R=R))
    run_aug(
        n=8,
        N=N,
        R=R,
        train_df=train_df_exp3,
        aug_word2vec=aug_word2vec,
        aug_base=aug_base,
        output_fname=fname,
        path=output_path,
    )

Exp3 - N : 1000 / R : 1.0
Exp3 - N : 1000 / R : 0.75
Exp3 - N : 1000 / R : 0.5
Exp3 - N : 1000 / R : 0.25
Exp3 - N : 1000 / R : 0.0
