<a href="https://colab.research.google.com/github/satyajitghana/TSAI-DeepNLP-END2.0/blob/main/07_Seq2Seq/SST_Redo/SST_Dataset_Augmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stanford Sentiment TreeBank Dataset

In [None]:
! pip install pytorch-lightning --quiet
! pip install nlpaug --quiet
! pip install google-trans-new --quiet
! pip install swifter --quiet

[K     |████████████████████████████████| 634kB 7.8MB/s 
[K     |████████████████████████████████| 296kB 12.8MB/s 
[K     |████████████████████████████████| 583kB 25.3MB/s 
[K     |████████████████████████████████| 48.5MB 78kB/s 
[K     |████████████████████████████████| 17.2MB 256kB/s 
[K     |████████████████████████████████| 71kB 8.3MB/s 
[K     |████████████████████████████████| 204kB 57.6MB/s 
[K     |████████████████████████████████| 133kB 51.7MB/s 
[K     |████████████████████████████████| 81kB 9.0MB/s 
[K     |████████████████████████████████| 81kB 8.1MB/s 
[K     |████████████████████████████████| 3.1MB 26.2MB/s 
[K     |████████████████████████████████| 92kB 8.6MB/s 
[?25h  Building wheel for swifter (setup.py) ... [?25l[?25hdone
  Building wheel for gpustat (setup.py) ... [?25l[?25hdone
[31mERROR: modin 0.9.1 has requirement pandas==1.2.3, but you'll have pandas 1.1.5 which is incompatible.[0m


## Get to know RAW Dataset

In [None]:
! wget http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip

--2021-06-03 14:54:32--  http://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip [following]
--2021-06-03 14:54:32--  https://nlp.stanford.edu/~socherr/stanfordSentimentTreebank.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6372817 (6.1M) [application/zip]
Saving to: ‘stanfordSentimentTreebank.zip’


2021-06-03 14:54:32 (25.2 MB/s) - ‘stanfordSentimentTreebank.zip’ saved [6372817/6372817]



In [None]:
! unzip stanfordSentimentTreebank.zip

Archive:  stanfordSentimentTreebank.zip
   creating: stanfordSentimentTreebank/
  inflating: stanfordSentimentTreebank/datasetSentences.txt  
   creating: __MACOSX/
   creating: __MACOSX/stanfordSentimentTreebank/
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSentences.txt  
  inflating: stanfordSentimentTreebank/datasetSplit.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._datasetSplit.txt  
  inflating: stanfordSentimentTreebank/dictionary.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._dictionary.txt  
  inflating: stanfordSentimentTreebank/original_rt_snippets.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._original_rt_snippets.txt  
  inflating: stanfordSentimentTreebank/README.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._README.txt  
  inflating: stanfordSentimentTreebank/sentiment_labels.txt  
  inflating: __MACOSX/stanfordSentimentTreebank/._sentiment_labels.txt  
  inflating: stanfordSentimentTreebank/SOStr.txt  
  inflating: stanfo

In [None]:
import os

import pandas as pd
from tqdm.auto import tqdm
import swifter

tqdm.pandas()

In [None]:
sst_dir = 'stanfordSentimentTreebank'

In [None]:
# Phrase id -> sentiment score table. header=0 consumes the file's own header
# row so the columns can be renamed through `names`.
sentiment_labels = pd.read_csv(
    os.path.join(sst_dir, "sentiment_labels.txt"),
    sep="|",
    header=0,
    names=['phrase_ids', 'sentiment_values'],
)

In [None]:
sentiment_labels.head()

Unnamed: 0,phrase_ids,sentiment_values
0,0,0.5
1,1,0.5
2,2,0.44444
3,3,0.5
4,4,0.42708


In [None]:
def discretize_label(label):
    """Map a sentiment score to one of five ordinal classes (0-4).

    Buckets are closed on the right: a score up to 0.2 maps to 0, up to 0.4
    to 1, up to 0.6 to 2, up to 0.8 to 3, and anything else to 4.
    """
    upper_bounds = (0.2, 0.4, 0.6, 0.8)
    for bucket, bound in enumerate(upper_bounds):
        if label <= bound:
            return bucket
    # Fell through every bound: highest class.
    return len(upper_bounds)

In [None]:
# Bucket the raw scores into the five classes, but only while the column still
# holds the raw float scores: re-running this cell on the already-discretized
# integer labels would silently collapse classes 1-4 into class 4.
if pd.api.types.is_float_dtype(sentiment_labels['sentiment_values']):
    sentiment_labels['sentiment_values'] = sentiment_labels['sentiment_values'].apply(discretize_label)

In [None]:
sentiment_labels.head()

Unnamed: 0,phrase_ids,sentiment_values
0,0,2
1,1,2
2,2,2
3,3,2
4,4,2


In [None]:
# Tab-separated table of sentence_index -> sentence text (header row in file).
sentence_ids = pd.read_csv(
    os.path.join(sst_dir, "datasetSentences.txt"),
    sep="\t",
)

In [None]:
sentence_ids.head()

Unnamed: 0,sentence_index,sentence
0,1,The Rock is destined to be the 21st Century 's...
1,2,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha..."


In [None]:
# Pipe-separated phrase -> phrase id lookup; column names are supplied here
# via `names`.
dictionary = pd.read_csv(
    os.path.join(sst_dir, "dictionary.txt"),
    names=['phrase', 'phrase_ids'],
    sep="|",
)

In [None]:
dictionary.head()

Unnamed: 0,phrase,phrase_ids
0,!,0
1,! ',22935
2,! '',18235
3,! Alas,179257
4,! Brilliant,22936


In [None]:
# Comma-separated sentence_index -> splitset_label assignment.
train_test_split = pd.read_csv(
    os.path.join(sst_dir, "datasetSplit.txt"),
)

In [None]:
train_test_split.head()

Unnamed: 0,sentence_index,splitset_label
0,1,1
1,2,1
2,3,2
3,4,2
4,5,2


In [None]:
# Chain of inner joins: sentence text -> phrase id, then sentence_index ->
# split label, then phrase id -> sentiment class.
# NOTE(review): the recorded describe() output shows 11286 rows while
# sentence_index goes up to 11855, so sentences without an exact-match phrase
# appear to be dropped by the first join - confirm this is intended.
sentence_phrase_merge = sentence_ids.merge(dictionary, left_on='sentence', right_on='phrase')
sentence_phrase_split = sentence_phrase_merge.merge(train_test_split, on='sentence_index')
dataset = sentence_phrase_split.merge(sentiment_labels, on='phrase_ids')

In [None]:
# Re-attach tokenized contractions/possessives to the preceding word
# ("Century 's" -> "Century's") by dropping the whitespace before the suffix.
# regex=True is spelled out because newer pandas versions default to literal
# matching; the backreference keeps only the captured suffix.
dataset['phrase_cleaned'] = dataset['sentence'].str.replace(
    r"\s('s|'d|'re|'ll|'m|'ve|n't)\b", r"\1", regex=True
)

In [None]:
dataset.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase_ids,splitset_label,sentiment_values,phrase_cleaned
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166,1,3,The Rock is destined to be the 21st Century's ...
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300,1,4,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,2,2,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123,2,3,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999,2,4,"Emerges as something rare , an issue movie tha..."


In [None]:
dataset.iloc[100]

sentence_index                                                    101
sentence            If nothing else , this movie introduces a prom...
phrase              If nothing else , this movie introduces a prom...
phrase_ids                                                      14114
splitset_label                                                      2
sentiment_values                                                    3
phrase_cleaned      If nothing else , this movie introduces a prom...
Name: 100, dtype: object

In [None]:
dataset.describe()

Unnamed: 0,sentence_index,phrase_ids,splitset_label,sentiment_values
count,11286.0,11286.0,11286.0,11286.0
mean,5910.961102,132003.589846,1.373294,2.059986
std,3422.455572,68214.62643,0.647295,1.287835
min,1.0,3467.0,1.0,0.0
25%,2951.25,67402.0,1.0,1.0
50%,5904.5,144063.5,1.0,2.0
75%,8865.75,188139.75,2.0,3.0
max,11855.0,238977.0,3.0,4.0


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11286 entries, 0 to 11285
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   sentence_index    11286 non-null  int64 
 1   sentence          11286 non-null  object
 2   phrase            11286 non-null  object
 3   phrase_ids        11286 non-null  int64 
 4   splitset_label    11286 non-null  int64 
 5   sentiment_values  11286 non-null  int64 
 6   phrase_cleaned    11286 non-null  object
dtypes: int64(4), object(3)
memory usage: 705.4+ KB


In [None]:
dataset.to_csv('sst_dataset_cleaned.csv')

## Augmentation Time

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm

import swifter

tqdm.pandas()

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

In [None]:
! gdown https://drive.google.com/uc?id=1p-e89cyFD2_U1Wx8r9iFmjqSCTnLUt7y

In [None]:
dataset = pd.read_csv('sst_dataset_cleaned.csv', index_col=0)

## BackTranslate Module

In [None]:
from nlpaug.augmenter.word import WordAugmenter

import google_trans_new
from google_trans_new import google_translator  

import random

class BackTranslateAug(WordAugmenter):
    """Augmenter that paraphrases text by back-translation.

    The input is translated from English into a randomly chosen pivot language
    and then back into English using `google_translator`. The whole input is
    translated in one go; no per-word substitution is performed.

    The constructor arguments are forwarded unchanged to `WordAugmenter`.
    `aug_p` is also used by `substitute` as the probability threshold for
    performing the round trip.
    """

    def __init__(self, name='BackTranslateAug', aug_min=1, aug_max=10, 
                 aug_p=0.3, stopwords=None, tokenizer=None, reverse_tokenizer=None, 
                 device='cpu', verbose=0, stopwords_regex=None):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, aug_min=aug_min, aug_max=aug_max,
            aug_p=aug_p, stopwords=stopwords, tokenizer=tokenizer,
            reverse_tokenizer=reverse_tokenizer, device=device,
            # Forward the caller's verbosity (previously hard-coded to 0).
            verbose=verbose, stopwords_regex=stopwords_regex)

        self.translator = google_translator()

    def substitute(self, data):
        """Return a back-translated version of `data`, or `data` unchanged.

        Empty input is returned as-is. Otherwise the round trip happens only
        when `self.prob()` falls below `self.aug_p`.
        """
        if not data:
            return data

        if self.prob() >= self.aug_p:
            return data

        # English as the pivot would translate en -> en -> en and leave the
        # text unchanged, so it is excluded from the candidates.
        pivot_langs = [code for code in google_trans_new.LANGUAGES if code != 'en']
        trans_lang = random.choice(pivot_langs)

        trans_text = self.translator.translate(data, lang_src='en', lang_tgt=trans_lang)
        en_text = self.translator.translate(trans_text, lang_src=trans_lang, lang_tgt='en')

        return en_text

Random Deletion

In [None]:
text = dataset['sentence'].iloc[0]
text

"The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal ."

In [None]:
# Augment the sample sentence and show it next to the original.
aug = naw.RandomWordAug(aug_max=3)
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
Augmented Text:
The Rock is destined to be the 21st ' s new ` ` Conan ' ' that ' s going to make a splash even greater than Arnold Schwarzenegger, Jean - Claud Van Damme or Steven Segal.


Random Swap

In [None]:
# Same demo with the "swap" action.
aug = naw.RandomWordAug(action="swap", aug_max=3)
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
Augmented Text:
The Rock is destined to be the 21st Century ' s new ` ` Conan ' ' and he that ' s going to make a splash even greater than Arnold, Schwarzenegger Jean Claud - Van Damme or Steven Segal.


Back Translator

In [None]:
# aug_p=1 is the largest probability threshold, so the demo is meant to
# back-translate the sentence rather than return it unchanged.
aug = BackTranslateAug(aug_max=3, aug_p=1)
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
Augmented Text:
The Rock is intended to be the 21st century new `` Conan 'and that he will do a splash even larger than Arnold Schwarzenegger, Jean-Claud Van Damme or Steven Segal. 


In [None]:
# Synonym replacement demo using WordNet as the synonym source.
aug = naw.SynonymAug(aug_src='wordnet')
augmented_text = aug.augment(text)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The Rock is destined to be the 21st Century 's new `` Conan '' and that he 's going to make a splash even greater than Arnold Schwarzenegger , Jean-Claud Van Damme or Steven Segal .
Augmented Text:
The Rock is destined to follow the 21st Hundred ' s new ` ` Conan ' ' and that helium ' s going to make a splash yet swell than Arnold Schwarzenegger, Blue jean - Claud Van Damme or Steven Segal.


Use the back-translation model to augment the entire dataset, and store the result in a `backtranslated` column of the dataset.

In [None]:
dataset_aug = dataset.copy()

## Synonym Augmentor

In [None]:
# Seed the global RNGs before the stochastic augmentation so that a fresh
# Restart & Run All yields the same synonym column (the recorded outputs in
# this notebook differ between runs).
# NOTE(review): assumes nlpaug draws from Python's `random` and/or NumPy's
# global generator - confirm.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

aug = naw.SynonymAug(aug_src='wordnet')
synonym_sentences = dataset_aug['sentence'].progress_apply(aug.augment)

HBox(children=(FloatProgress(value=0.0, max=11286.0), HTML(value='')))




In [None]:
synonym_sentences.head()

0    The Rock is destined to be the 21st C ' s nove...
1    The gorgeously elaborated continuance of ` ` T...
2                Effective but likewise - tepid biopic
3    If you sometimes same to blend to the movies t...
4    Emerges as something rare, an issue movie that...
Name: sentence, dtype: object

In [None]:
dataset_aug['synonym_sentences'] = synonym_sentences

In [None]:
dataset_aug.to_csv('sst_dataset_synonym.csv')

## Back Translate

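The back-translation was not run in this notebook; the pre-computed result is downloaded below. See the published spreadsheet at https://docs.google.com/spreadsheets/d/e/2PACX-1vQ5G4wKHEXkseaSy_8khXdmUqfx2jVUK4T-ITSeq8AMB1QWJoyZrpzelCf8Sb70mhs0knjqCEdZguWT/pubhtml for how it was done.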

In [None]:
! gdown https://drive.google.com/uc?id=1eD_yJb4avApTCET1Q-eNco89FT-QF46g

Downloading...
From: https://drive.google.com/uc?id=1eD_yJb4avApTCET1Q-eNco89FT-QF46g
To: /content/sst_dataset_translated.csv
6.50MB [00:00, 20.1MB/s]


In [None]:
translated_dset = pd.read_csv('/content/sst_dataset_translated.csv', index_col=0)

In [None]:
translated_dset.head()

Unnamed: 0,sentence_index,sentence,lang,temp,sentence_trans_aug,phrase,phrase_ids,splitset_label,sentiment_values,phrase_cleaned
0,1,The Rock is destined to be the 21st Century 's...,et,Rock on määratud olema 21. sajandi 's uus `` C...,Rock is set to be the 21st century's new `` Co...,The Rock is destined to be the 21st Century 's...,226166,1,3,The Rock is destined to be the 21st Century's ...
1,2,The gorgeously elaborate continuation of `` Th...,gd,Tha gorgeously saoithreachail leantainn air ``...,The gorgeously elaborate continue to `` The Lo...,The gorgeously elaborate continuation of `` Th...,226300,1,4,The gorgeously elaborate continuation of `` Th...
2,3,Effective but too-tepid biopic,en,Effective but too-tepid biopic,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,2,2,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...,haw,"Inā 'oe i kekahi manawa, makemake e hele i nā ...",If you sometimes want to go to the movies to p...,If you sometimes like to go to the movies to h...,14123,2,3,If you sometimes like to go to the movies to h...
4,5,"Emerges as something rare , an issue movie tha...",ta,அரிதான ஒன்று 'எனவே நேர்மையான மற்றும் முனைப்போ...,One of the rare 'and therefore does not feel h...,"Emerges as something rare , an issue movie tha...",13999,2,4,"Emerges as something rare , an issue movie tha..."


In [None]:
# Attach the pre-computed back-translations as a new column. Column assignment
# aligns on the index, so each row receives the value stored under the same
# index label in translated_dset.
# NOTE(review): assumes translated_dset has the same index/row order as
# dataset_aug (both expose a sentence_index column that could be compared) -
# confirm.
dataset_aug['backtranslated'] = translated_dset['sentence_trans_aug'].copy()

## Final Augmented Dataset

In [None]:
dataset_aug.head()

Unnamed: 0,sentence_index,sentence,phrase,phrase_ids,splitset_label,sentiment_values,phrase_cleaned,synonym_sentences,backtranslated
0,1,The Rock is destined to be the 21st Century 's...,The Rock is destined to be the 21st Century 's...,226166,1,3,The Rock is destined to be the 21st Century's ...,The Rock is destine to be the twenty first Cen...,Rock is set to be the 21st century's new `` Co...
1,2,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of `` Th...,226300,1,4,The gorgeously elaborate continuation of `` Th...,The gorgeously elaborate continuation of ` ` T...,The gorgeously elaborate continue to `` The Lo...
2,3,Effective but too-tepid biopic,Effective but too-tepid biopic,13995,2,2,Effective but too-tepid biopic,Effective but too - lukewarm biopic,Effective but too-tepid biopic
3,4,If you sometimes like to go to the movies to h...,If you sometimes like to go to the movies to h...,14123,2,3,If you sometimes like to go to the movies to h...,If you sometimes like to go to the motion pict...,If you sometimes want to go to the movies to p...
4,5,"Emerges as something rare , an issue movie tha...","Emerges as something rare , an issue movie tha...",13999,2,4,"Emerges as something rare , an issue movie tha...","Emerges as something rare, an effect movie tha...",One of the rare 'and therefore does not feel h...


In [None]:
dataset_aug.to_csv('sst_dataset_augmented.csv')