In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.1-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.2/199.2 KB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.1 tokenizers-0.13.2 transformers-4.26.1


# **Data**

# Covid Worry dataset

In [None]:
# Make Google Drive visible to this Colab runtime; the dataset paths used in
# this notebook all live under the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# Folder on the mounted Drive that holds the Covid Worry CSV splits.
COVIDWORRY_DIR = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/covidworry'

covidworry_train_csv_path = COVIDWORRY_DIR + '/covidworry_train.csv'
# Note: the "dev" names point at the covidworry_test.csv file.
covidworry_dev_csv_path = COVIDWORRY_DIR + '/covidworry_test.csv'

# Read both splits into DataFrames, train first, then dev/test.
covidworry_train, covidworry_dev = (
    pd.read_csv(csv_path)
    for csv_path in (covidworry_train_csv_path, covidworry_dev_csv_path)
)


In [None]:
# Report the (rows, columns) of the training frame, then display the first ten essays.
print(f'covidworry_train shape: {covidworry_train.shape}')
covidworry_train['essay'].iloc[:10]

covidworry_train shape: (1685, 2)


0    I feel stressed because of self isolation and ...
1    At this moment, I am feeling incredibly bored ...
2    Im feeling a combination of anxiety and fear f...
3    Extremely anxious at the Unknown’s in the whol...
4    I feel worried for my friends and family and s...
5    I am worried about family and friends being af...
6    I am scared for my family and friends. I do no...
7    It's a slightly worrying situation especially ...
8    I am quite anxious at the moment as I have a p...
9    Very worried I might get it if go out for supp...
Name: essay, dtype: object

In [None]:
# Display the first ten values of the 'emotion' column (the integer class labels).
covidworry_train['emotion'].head(10)

0    3
1    0
2    3
3    3
4    1
5    0
6    0
7    1
8    3
9    4
Name: emotion, dtype: int64

In [None]:
# Features are the raw essay texts; targets are the integer emotion labels.
X_train, y_train = covidworry_train['essay'], covidworry_train['emotion']

# Same columns from the held-out frame (loaded from covidworry_test.csv).
X_test, y_test = covidworry_dev['essay'], covidworry_dev['emotion']

In [None]:
# Sanity check: print the shape of each split, features before labels, train before test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1685,)
(1685,)
(723,)
(723,)


In [None]:
# Class distribution of the training labels.
# NOTE(review): the legend originally written here was
#   'sadness': 0, 'neutral': 6, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
# It maps two emotions to 6, has no entry for label 1, and lists seven emotions,
# whereas the recorded counts for this column only contain labels 0-4.
# TODO: confirm the real label-id -> emotion mapping for this dataset.
y_train.value_counts()

3    966
4    250
1    233
0    161
2     75
Name: emotion, dtype: int64

In [None]:
# Class distribution of the held-out labels, for comparison with the training split.
y_test.value_counts()

3    415
4    107
1    100
0     69
2     32
Name: emotion, dtype: int64

# **Model**


In [None]:
# Baseline features: TF-IDF with default parameters, vocabulary learned on X_train only.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Fit on the training essays; the test essays are only projected onto that vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = vectorizer.transform(X_test).toarray()

print(f'tfidf train shape: {X_train_tfidf.shape}')
print(f'tfidf train type: {X_train_tfidf.dtype}')
print(f'tfidf test: {X_test_tfidf.shape}')


tfidf train shape: (1685, 7142)
tfidf train type: float64
tfidf test: (723, 7142)


In [None]:
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report

# Baseline classifier: linear SVM on the TF-IDF features of the original training essays.
# random_state is fixed so the fit (and the numbers printed below) is repeatable across runs.
clf = LinearSVC(random_state=42).fit(X_train_tfidf, y_train)

predicted = clf.predict(X_test_tfidf)

acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y_test,predicted))
print('\n')
# zero_division=0 keeps precision/F1 at 0.0 for a class that is never predicted
# (class 2 in the recorded run) without emitting an undefined-metric warning per average.
print(classification_report(y_test,predicted, zero_division=0))

accuracy is:  56.984785615491006
[[  6   5   0  51   7]
 [  2  26   0  64   8]
 [  1   6   0  20   5]
 [ 10  21   0 368  16]
 [  1   8   0  86  12]]


              precision    recall  f1-score   support

           0       0.30      0.09      0.13        69
           1       0.39      0.26      0.31       100
           2       0.00      0.00      0.00        32
           3       0.62      0.89      0.73       415
           4       0.25      0.11      0.15       107

    accuracy                           0.57       723
   macro avg       0.31      0.27      0.27       723
weighted avg       0.48      0.57      0.50       723



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Augmentation using BertAug

In [None]:
! pip install nlpaug==1.1.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting nlpaug==1.1.7
  Downloading nlpaug-1.1.7-py3-none-any.whl (405 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m405.1/405.1 KB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.7


In [None]:
# Build the BERT-based augmentation pipeline. It is only constructed here;
# it is applied to the training essays in a later cell.
import torch
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

# Use the GPU when the runtime has one; otherwise fall back to CPU instead of
# failing on a hard-coded 'cuda' device.
AUG_DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# Two contextual-embedding augmenters run in sequence: word insertion, then word
# substitution, both driven by bert-base-uncased. Each one is wrapped in its own
# naf.Sometimes flow, so it is applied at random rather than on every call.
aug_bert = naf.Sequential([
    naf.Sometimes([
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert", device=AUG_DEVICE)
    ]),
    naf.Sometimes([
        naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", device=AUG_DEVICE)
    ]),
])

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
import nltk
# Fetch the NLTK resources used in this notebook session: the averaged-perceptron
# POS tagger, WordNet, and the Open Multilingual WordNet 1.4 data.
# The value of the last call is what the cell displays as its output.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
from tqdm.notebook import tqdm

# Register pandas' progress_apply so the augmentation pass can show a progress bar.
tqdm.pandas()

# Two-column working frame: essay text and its emotion label, before augmentation.
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame(data)
base_train.shape

(1685, 2)

In [None]:
rep = 5  # number of augmented variants requested for every essay


def _augment_essay(essay):
    """Return the augmented variants that aug_bert produces for one essay."""
    return aug_bert.augment(essay, rep)


# Attach the variants to a copy of the base frame, then explode them so each
# variant gets its own row (text and label are repeated alongside it).
bert_train = (
    base_train
    .assign(paraphrase=base_train['text'].progress_apply(_augment_essay))
    .explode('paraphrase')
    .reset_index(drop=True)
)


  0%|          | 0/1685 [00:00<?, ?it/s]



In [None]:
# Inspect the first rows of the augmented frame: original text, label, and one paraphrase per row.
bert_train.head(20)

Unnamed: 0,text,label,paraphrase
0,I feel stressed because of self isolation and ...,3,i feel stressed because of self alone mainly i...
1,I feel stressed because of self isolation and ...,3,i feel uncertainty because of self isolation a...
2,I feel stressed because of self isolation and ...,3,i feel immediately stressed because of possibl...
3,I feel stressed because of self isolation and ...,3,i feel stressed like fighting its self isolati...
4,I feel stressed because of self isolation and ...,3,i feel stressed because of self isolation and ...
5,"At this moment, I am feeling incredibly bored ...",0,"at i moment, i am feeling incredibly bored by ..."
6,"At this moment, I am feeling incredibly bored ...",0,"at this moment, i am feeling incredibly awkwar..."
7,"At this moment, I am feeling incredibly bored ...",0,"at one this moment, mainly i am feeling incred..."
8,"At this moment, I am feeling incredibly bored ...",0,"at this moment, i wonder feeling positively bo..."
9,"At this moment, I am feeling incredibly bored ...",0,"at this moment, i dread feeling incredibly bor..."


# Model with augmented train data (paraphrased data)

In [None]:
# Fit and transform the augmented training corpus using Tfidf Vectorizer with default parameters
from sklearn.feature_extraction.text import TfidfVectorizer

# Training corpus = original essays followed by their BERT paraphrases.
# The label cell must stack its labels in this same order (originals first).
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in 2.0.
new_data = pd.concat([X_train, bert_train['paraphrase']])

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(new_data).toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

# The test essays are only projected onto the vocabulary learned from the training corpus.
X_test_tfidf = vectorizer.transform(X_test).toarray()
print('tfidf test:', X_test_tfidf.shape)


tfidf train shape: (10110, 11052)
tfidf train type: float64
tfidf test: (723, 11052)


In [None]:
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report

# Labels for the augmented corpus: original labels followed by the paraphrase labels,
# in the same order the texts were stacked before vectorizing.
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed in 2.0.
new_labels = pd.concat([y_train, bert_train['label']])

# Same linear SVM as the baseline, now trained on original + paraphrased essays.
# random_state is fixed so the fit (and the numbers printed below) is repeatable across runs.
clf = LinearSVC(random_state=42).fit(X_train_tfidf, new_labels)

predicted = clf.predict(X_test_tfidf)

acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

# Rows of the confusion matrix are true labels, columns are predicted labels.
print(confusion_matrix(y_test,predicted))
print('\n')
# zero_division=0 reports 0.0 (without a warning) for any class that is never predicted.
print(classification_report(y_test,predicted, zero_division=0))

accuracy is:  55.46334716459198
[[  7   6   1  44  11]
 [  6  37   0  49   8]
 [  3   3   1  18   7]
 [ 21  29   1 335  29]
 [  6   6   0  74  21]]


              precision    recall  f1-score   support

           0       0.16      0.10      0.12        69
           1       0.46      0.37      0.41       100
           2       0.33      0.03      0.06        32
           3       0.64      0.81      0.72       415
           4       0.28      0.20      0.23       107

    accuracy                           0.55       723
   macro avg       0.37      0.30      0.31       723
weighted avg       0.50      0.55      0.52       723



# Create CSV file from augmented dataset

In [None]:
# Save the augmented frame (columns: text, label, paraphrase) to Drive as a UTF-8, comma-separated CSV.
# A leading `!cd <dir>` line was removed: `!` commands run in a temporary subshell, so it never
# changed the notebook's working directory, and the absolute path below does not depend on it.
bert_train.to_csv('/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/covidworry_bertaug_5.csv', encoding='utf-8', index=False, sep=',')