In [None]:
! pip install transformers==4.10.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.10.1
  Using cached transformers-4.10.1-py3-none-any.whl (2.8 MB)
Collecting huggingface-hub>=0.0.12
  Using cached huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
Collecting tokenizers<0.11,>=0.10.1
  Using cached tokenizers-0.10.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
Collecting sacremoses
  Using cached sacremoses-0.0.53-py3-none-any.whl
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 sacremoses-0.0.53 tokenizers-0.10.3 transformers-4.10.1


# **Data**

# WASSA dataset

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset CSVs read below live under that path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# Locations of the WASSA train and dev splits on the mounted Drive.
wassa_train_csv_path = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/WASSA_train_all.csv'
wassa_dev_csv_path = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/WASSA_dev_all.csv'

# Read both splits in one pass; each CSV becomes its own DataFrame.
wassa_train, wassa_dev = (
    pd.read_csv(csv_path)
    for csv_path in (wassa_train_csv_path, wassa_dev_csv_path)
)


In [None]:
# Report the (rows, columns) shape of the training frame, then preview the first ten essays.
print('wassa_train shape:', wassa_train.shape)
wassa_train['essay'].head(10)

wassa_train shape: (1860, 2)


0    it is really diheartening to read about these ...
1    the phone lines from the suicide prevention li...
2    no matter what your heritage, you should be ab...
3    it is frightening to learn about all these sha...
4    the eldest generation of russians aren't being...
5    middle east is fucked up, I've honestly never ...
6    well first of all whoever wrote this article d...
7    well well well, look at what we have well, the...
8    just another fucked up mental sickness of amer...
9    it seems a horny male college student has fina...
Name: essay, dtype: object

In [None]:
# Preview the integer emotion label of the first ten training rows.
wassa_train['emotion'].head(10)

0    0
1    0
2    6
3    2
4    0
5    3
6    3
7    4
8    3
9    4
Name: emotion, dtype: int64

In [None]:
# Features are the raw essay texts; targets are the integer emotion ids.
# The dev split serves as the held-out test set for every model below.
X_train, y_train = wassa_train['essay'], wassa_train['emotion']
X_test, y_test = wassa_dev['essay'], wassa_dev['emotion']

In [None]:
# Sanity check: texts and labels must have the same length within each split.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1860,)
(1860,)
(270,)
(270,)


In [None]:
# Label ids: 'sadness': 0, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
# NOTE(review): this comment previously listed 'neutral': 6, which collides with 'joy': 6
# and leaves id 1 unnamed. 'neutral' is presumably 1 — confirm against the label encoding
# used to build the CSV.
# Class distribution of the training split (most frequent first).
y_train.value_counts()

0    647
3    349
6    275
2    194
5    164
4    149
1     82
Name: emotion, dtype: int64

In [None]:
# Class distribution of the dev split, for comparison with the training distribution.
y_test.value_counts()

0    98
3    76
2    31
6    25
1    14
5    14
4    12
Name: emotion, dtype: int64

# **Model**


In [None]:
# Baseline features: TF-IDF with default parameters, fitted on the training essays only.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Learn vocabulary and IDF weights from the training essays, then densify the result.
train_matrix = vectorizer.fit_transform(X_train)
X_train_tfidf = train_matrix.toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

# Encode the dev essays with the already-fitted vocabulary (no refitting).
test_matrix = vectorizer.transform(X_test)
X_test_tfidf = test_matrix.toarray()
print('tfidf test:', X_test_tfidf.shape)


tfidf train shape: (1860, 9688)
tfidf train type: float64
tfidf test: (270, 9688)


In [None]:
# Baseline classifier: linear SVM trained on the original (non-augmented) TF-IDF features.
from sklearn import metrics
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)
predicted = clf.predict(X_test_tfidf)

# Overall accuracy on the dev split, printed as a percentage.
acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

# Per-class view: confusion matrix first, then precision/recall/F1 for each label id.
print(confusion_matrix(y_test,predicted))
print('\n')
print(classification_report(y_test,predicted))

accuracy is:  47.40740740740741
[[76  1  6  7  1  2  5]
 [ 6  0  1  3  1  1  2]
 [10  1 16  0  1  1  2]
 [35  0  0 27  7  2  5]
 [ 5  0  0  4  1  0  2]
 [ 6  0  0  4  1  2  1]
 [ 8  1  1  7  2  0  6]]


              precision    recall  f1-score   support

           0       0.52      0.78      0.62        98
           1       0.00      0.00      0.00        14
           2       0.67      0.52      0.58        31
           3       0.52      0.36      0.42        76
           4       0.07      0.08      0.08        12
           5       0.25      0.14      0.18        14
           6       0.26      0.24      0.25        25

    accuracy                           0.47       270
   macro avg       0.33      0.30      0.31       270
weighted avg       0.45      0.47      0.45       270



# Augmentation using BertAug

In [None]:
! pip install nlpaug==1.1.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug==1.1.7
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.1/405.1 KB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.7


In [None]:
# Build the BERT-based augmentation pipeline. This cell only constructs the augmenter;
# applying it to the training data and saving the result happen in later cells.
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf
import torch

from nlpaug.util import Action

# Use the GPU when one is attached; otherwise fall back to CPU instead of failing
# on a hard-coded 'cuda' device.
aug_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Two stages run in sequence, each wrapped in naf.Sometimes:
#   1. contextual word insertion with bert-base-uncased
#   2. contextual word substitution with bert-base-uncased
aug_bert = naf.Sequential([
    naf.Sometimes([
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device=aug_device),
    ]),
    naf.Sometimes([
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device=aug_device),
    ]),
])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import nltk
# Download three NLTK resources: the averaged-perceptron POS tagger, WordNet,
# and Open Multilingual WordNet 1.4.
# NOTE(review): presumably needed by nlpaug's word-level augmenters — confirm whether the
# BERT-only pipeline used in this notebook actually requires them.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from tqdm.notebook import tqdm

# Register `progress_apply` on pandas objects so the augmentation step can show a progress bar.
tqdm.pandas()

# Pair every training essay with its label in a single frame (columns: 'text', 'label').
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame(data)
base_train.shape

(1860, 2)

In [None]:
rep = 5  # number of augmented variants requested per training essay

# For every essay, ask the augmenter for `rep` variants and store them in 'paraphrase';
# then unroll that column so each variant sits on its own row (text and label are
# repeated) and renumber the index from 0.
bert_train = (
    base_train
    .assign(paraphrase=base_train['text'].progress_apply(lambda essay: aug_bert.augment(essay, rep)))
    .explode('paraphrase')
    .reset_index(drop=True)
)


  0%|          | 0/1860 [00:00<?, ?it/s]

In [None]:
# Inspect the first rows: original text, its label, and one augmented variant per row.
bert_train.head(20)

Unnamed: 0,text,label,paraphrase
0,it is really diheartening to read about these ...,0,it is really diheartening to read about titani...
1,it is really diheartening to read about these ...,0,it is really diheartening to so read about the...
2,it is really diheartening to read about these ...,0,it is really diheartening to read about these ...
3,it is really diheartening to read about these ...,0,it next its really powerful diheartening to re...
4,it is really diheartening to read about these ...,0,it is really diheartening to read about these ...
5,the phone lines from the suicide prevention li...,0,the protection lines underneath the suicide ch...
6,the phone lines from the suicide prevention li...,0,the phone lines from the suicide prevention li...
7,the phone lines from the suicide prevention li...,0,the phone lines from the suicide prevention ag...
8,the phone lines from the suicide prevention li...,0,lately the phone lines from the suicide preven...
9,the phone lines from the suicide prevention li...,0,the phone lines from the suicide prevention li...


# Model with augmented train data (paraphrased data)

In [None]:
# Fit TF-IDF (default parameters) on the original training essays plus their
# BERT-augmented variants, then encode the dev essays with the same vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

# Original training texts followed by the paraphrases. pd.concat is used because
# Series.append is deprecated since pandas 1.4 and removed in pandas 2.0.
new_data = pd.concat([X_train, bert_train['paraphrase']])

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(new_data).toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

# The dev split is only transformed, never used for fitting.
X_test_tfidf = vectorizer.transform(X_test).toarray()
print('tfidf test:', X_test_tfidf.shape)


tfidf train shape: (11160, 14084)
tfidf train type: float64
tfidf test: (270, 14084)


In [None]:
# Train a linear SVM on the augmented TF-IDF features and evaluate it on the dev split.
from sklearn.svm import LinearSVC

# Labels in the same order as the texts used to fit the vectorizer: original training
# labels first, then the label of each paraphrase. pd.concat is used because
# Series.append is deprecated since pandas 1.4 and removed in pandas 2.0.
new_labels = pd.concat([y_train, bert_train['label']])

# Guard against a feature/label mismatch (e.g. the TF-IDF cell was run on other data).
assert len(new_labels) == X_train_tfidf.shape[0], 'feature rows and labels are misaligned'

clf = LinearSVC().fit(X_train_tfidf, new_labels)

from sklearn import metrics
predicted = clf.predict(X_test_tfidf)

# Overall accuracy on the dev split, printed as a percentage.
acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report
# Per-class view: confusion matrix first, then precision/recall/F1 for each label id.
print(confusion_matrix(y_test,predicted))
print('\n')
print(classification_report(y_test,predicted))

accuracy is:  47.03703703703704
[[71  1  9  9  1  1  6]
 [ 6  0  1  2  2  1  2]
 [12  1 13  0  1  1  3]
 [25  0  1 32  9  2  7]
 [ 4  0  0  3  3  0  2]
 [ 5  0  0  4  1  2  2]
 [ 9  1  1  6  2  0  6]]


              precision    recall  f1-score   support

           0       0.54      0.72      0.62        98
           1       0.00      0.00      0.00        14
           2       0.52      0.42      0.46        31
           3       0.57      0.42      0.48        76
           4       0.16      0.25      0.19        12
           5       0.29      0.14      0.19        14
           6       0.21      0.24      0.23        25

    accuracy                           0.47       270
   macro avg       0.33      0.31      0.31       270
weighted avg       0.46      0.47      0.45       270



# Create CSV file from augmented dataset

In [None]:
# Persist the augmented frame (columns: text, label, paraphrase) to Drive as UTF-8 CSV.
# The former `!cd ...` line was removed: `!` commands run in a throwaway subshell, so the
# directory change never reached the kernel, and the absolute path below does not need it.
bert_train.to_csv('/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/wassa_all_BertAug_5.csv', encoding='utf-8', index=False, sep=',')