-
Notifications
You must be signed in to change notification settings - Fork 0
/
wordnet_aug
37 lines (30 loc) · 1.13 KB
/
wordnet_aug
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
import numpy as np
import nlpaug.augmenter.word as naw
from tqdm import tqdm
# Load Data
# The CSV at `csv_path` becomes the working DataFrame `df` used by the rest of the script.
csv_path = 'path/data.csv'
df = pd.read_csv(csv_path)
# Augment Argument
# Synonym-substitution augmenter configured to use WordNet as its synonym source.
wordnet = naw.SynonymAug(aug_src='wordnet')
# Augment Function
def augmentMyData(df, selectedclass, repetitions=1, samples=200):
    """Build augmented copies of randomly drawn rows belonging to one class.

    Rows of ``df`` whose ``'label'`` equals ``selectedclass`` are drawn at
    random (with replacement) ``samples`` times. Each drawn ``'description'``
    is passed to the module-level ``wordnet`` augmenter ``repetitions`` times.

    Args:
        df: DataFrame providing ``'description'`` and ``'label'`` columns.
        selectedclass: Label value selecting the rows to augment.
        repetitions: Number of augmenter calls made per drawn row.
        samples: Number of rows drawn from the selected class.

    Returns:
        A DataFrame with a ``'description'`` column holding the value returned
        by each ``wordnet.augment`` call and a ``'label'`` column filled with
        ``selectedclass``. It has ``samples * repetitions`` rows, or no rows
        when ``df`` contains no row labelled ``selectedclass``.
    """
    # Keep only the class to augment, renumbered 0..n-1.
    filter_df = df[df['label'] == selectedclass].reset_index(drop=True)

    if filter_df.empty:
        # Nothing to draw from: np.random.randint(0, 0, ...) would fail with
        # an opaque "low >= high" error, so hand back an empty result with the
        # same columns (object dtype keeps the .str accessor usable).
        return pd.DataFrame(columns=['description', 'label'], dtype=object)

    descriptions = filter_df['description']
    # Positions are drawn independently, so the same row may be picked twice.
    drawn_positions = np.random.randint(0, len(filter_df), samples)

    augmented_texts = [
        wordnet.augment(descriptions.iloc[position])
        for position in tqdm(drawn_positions)
        for _ in range(repetitions)
    ]

    # The scalar label is broadcast across every augmented row.
    return pd.DataFrame({
        'description': augmented_texts,
        'label': selectedclass,
    })
# Run Augment
# `samples` has to exist before it is passed below; 200 mirrors the default
# declared by augmentMyData.
samples = 200
aug_data = augmentMyData(df, selectedclass="selectedclass", samples=samples)
# Convert augmented data to string type
# Keeps element 0 of every entry; presumably each entry is the list returned
# by the augmenter (verify against the installed nlpaug version).
aug_data['description'] = aug_data['description'].str[0]
# DataFrame.append no longer exists in pandas 2.x; pd.concat with its default
# arguments keeps each frame's existing index, as append did.
df = pd.concat([df, aug_data])