-
Notifications
You must be signed in to change notification settings - Fork 0
/
glove_aug
39 lines (32 loc) · 1.23 KB
/
glove_aug
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as naw
from tqdm import tqdm
# Read the source dataset into a DataFrame
df = pd.read_csv(filepath_or_buffer='path/data.csv')
# Configure the GloVe word-embedding augmenter in substitution mode
glove = naw.WordEmbsAug(
    action='substitute',
    model_path='path/glove.6B.300d.txt',
    model_type='glove',
)
# Augment Function
def augmentMyData(df, selectedclass, repetitions=1, samples=200):
    """Build GloVe-augmented rows for a single class of ``df``.

    Rows whose ``label`` equals ``selectedclass`` are drawn at random with
    replacement, and each drawn ``description`` is passed to the module-level
    ``glove`` augmenter ``repetitions`` times.

    Args:
        df: DataFrame with ``description`` and ``label`` columns.
        selectedclass: Label value selecting the rows to augment.
        repetitions: Number of augmentations generated per drawn row.
        samples: Number of rows drawn (with replacement) from the class.

    Returns:
        DataFrame with a ``description`` column holding the raw return values
        of ``glove.augment`` and a ``label`` column set to ``selectedclass``.
        The frame is empty when no row of ``df`` has that label.
    """
    # Select only the descriptions of the class to augment.
    class_texts = df.loc[df['label'] == selectedclass, 'description']
    if class_texts.empty:
        # np.random.randint(0, 0, ...) would raise ValueError; there is
        # nothing to augment, so hand back an empty frame with the same columns.
        return pd.DataFrame(columns=['description', 'label'])

    # Positions are drawn independently, so a row can be picked more than once.
    sampled_positions = np.random.randint(0, len(class_texts), samples)

    augmented_texts = []
    for position in tqdm(sampled_positions):
        text = class_texts.iloc[position]
        # Generate `repetitions` augmented variants of this sampled text.
        for _ in range(repetitions):
            augmented_texts.append(glove.augment(text))

    return pd.DataFrame({
        'description': augmented_texts,
        'label': selectedclass,
    })
# Run Augment
# DataFrame.append was removed in pandas 2.0, so the per-class frames are
# collected in a list and combined with pd.concat instead.
aug_frames = [augmentMyData(df, selectedclass="selectedclass", samples=5)]
aug_data = pd.concat(aug_frames)
# Convert augmented data to string type: unwrap the first element when the
# augmenter returned a list, and leave plain strings untouched (indexing a
# string with [0] would keep only its first character).
aug_data['description'] = aug_data['description'].map(
    lambda text: text[0] if isinstance(text, list) and text else text
)
# Bare expression: displays the frame in a notebook, no effect in a script.
aug_data
# Append augmented dataset to original dataset. Row labels are kept as-is,
# matching the default behaviour of the former DataFrame.append call.
df = pd.concat([df, aug_data])