# **Data**

# WASSA all dataset

In [1]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

# Locations of the WASSA train/dev splits on the mounted Google Drive.
wassa_train_csv_path = '/content/drive/My Drive/WASSA_train_all.csv'
wassa_dev_csv_path = '/content/drive/My Drive/WASSA_dev_all.csv'

# Read both splits in one pass; the order matches the unpacking targets.
wassa_train, wassa_dev = (
    pd.read_csv(csv_path)
    for csv_path in (wassa_train_csv_path, wassa_dev_csv_path)
)


In [3]:
# Sanity check: report the training frame's dimensions, then preview the first ten essays.
print('wassa_train shape:', wassa_train.shape)
wassa_train['essay'].head(10)

wassa_train shape: (1860, 2)


0    it is really diheartening to read about these ...
1    the phone lines from the suicide prevention li...
2    no matter what your heritage, you should be ab...
3    it is frightening to learn about all these sha...
4    the eldest generation of russians aren't being...
5    middle east is fucked up, I've honestly never ...
6    well first of all whoever wrote this article d...
7    well well well, look at what we have well, the...
8    just another fucked up mental sickness of amer...
9    it seems a horny male college student has fina...
Name: essay, dtype: object

In [4]:
# Preview the emotion labels that belong to the ten essays shown above.
wassa_train['emotion'].head(10)

0    0
1    0
2    6
3    2
4    0
5    3
6    3
7    4
8    3
9    4
Name: emotion, dtype: int64

In [5]:
# Essays are the model inputs; the emotion column is the classification target.
X_train, y_train = wassa_train['essay'], wassa_train['emotion']

# The dev split is used as the held-out evaluation set.
X_test, y_test = wassa_dev['essay'], wassa_dev['emotion']

In [6]:
# Print the shape of every split, in train-X, train-y, test-X, test-y order.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(1860,)
(1860,)
(270,)
(270,)


In [7]:
# label: 'sadness': 0, 'neutral': 1, 'fear': 2, 'anger': 3, 'disgust': 4, 'surprise': 5, 'joy': 6
# NOTE(review): 'neutral': 1 is inferred by elimination (an earlier version of this comment
# gave 6 for both 'neutral' and 'joy' and named no class for id 1) — confirm against the
# label encoding used to build the CSV.
# Count how many training essays carry each label id.
y_train.value_counts()

0    647
3    349
6    275
2    194
5    164
4    149
1     82
Name: emotion, dtype: int64

In [8]:
# Count how many dev (evaluation) essays carry each label id.
y_test.value_counts()

0    98
3    76
2    31
6    25
1    14
5    14
4    12
Name: emotion, dtype: int64

# **Model**


In [9]:
# Baseline features: TF-IDF with library-default parameters, fitted on the training essays.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

# Learn the vocabulary / IDF weights from the training split, then densify the matrix.
train_sparse = vectorizer.fit_transform(X_train)
X_train_tfidf = train_sparse.toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

# Encode the dev essays with the already-fitted vectorizer (no refitting on dev data).
test_sparse = vectorizer.transform(X_test)
X_test_tfidf = test_sparse.toarray()
print('tfidf test:', X_test_tfidf.shape)



tfidf train shape: (1860, 9688)
tfidf train type: float64
tfidf test: (270, 9688)


In [10]:
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report

# Baseline classifier: linear SVM (default settings) on the original training essays.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

# Score the dev split.
predicted = clf.predict(X_test_tfidf)
acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

# Per-class breakdown: confusion matrix first, then precision/recall/F1.
dev_confusion = confusion_matrix(y_test, predicted)
dev_report = classification_report(y_test, predicted)
print(dev_confusion)
print('\n')
print(dev_report)

accuracy is:  47.40740740740741
[[76  1  6  7  1  2  5]
 [ 6  0  1  3  1  1  2]
 [10  1 16  0  1  1  2]
 [35  0  0 27  7  2  5]
 [ 5  0  0  4  1  0  2]
 [ 6  0  0  4  1  2  1]
 [ 8  1  1  7  2  0  6]]


              precision    recall  f1-score   support

           0       0.52      0.78      0.62        98
           1       0.00      0.00      0.00        14
           2       0.67      0.52      0.58        31
           3       0.52      0.36      0.42        76
           4       0.07      0.08      0.08        12
           5       0.25      0.14      0.18        14
           6       0.26      0.24      0.25        25

    accuracy                           0.47       270
   macro avg       0.33      0.30      0.31       270
weighted avg       0.45      0.47      0.45       270



In [11]:
from tqdm.notebook import tqdm

# Hook tqdm into pandas so Series.progress_apply is available in later cells.
tqdm.pandas()

# Pair each training essay with its label in a two-column frame (text, label).
data = dict(text=X_train, label=y_train)
base_train = pd.DataFrame.from_dict(data)
base_train.shape

(1860, 2)

In [12]:
# Re-display the (rows, columns) of base_train; repeats the check that ends the previous cell.
base_train.shape

(1860, 2)

# ChatGPT API Augmentation test

In [13]:
!pip install --upgrade setuptools


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [14]:
!pip install openai


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m41.4 MB/s[0m eta [36m0:00:00[0m
Collecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[

In [15]:
# NOTE(review): `rep` is presumably the intended number of paraphrases per essay, but no
# cell in view reads it — confirm whether it should be wired into the augmentation call.
rep = 5
# Start the augmented frame as a copy so base_train itself is left unmodified.
chatgpt_train = base_train.copy()

In [16]:
import openai
import pandas as pd
import time

# SECURITY: never store an API key in the notebook. Any key that was ever committed
# here must be treated as leaked and revoked in the OpenAI dashboard.
# The key is read from the OPENAI_API_KEY environment variable; if that is unset,
# the user is prompted for it (the input is not echoed and is not saved in the notebook).
import os
from getpass import getpass

openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

def augment_text(text, n=5, engine="text-davinci-002", max_tokens=60,
                 temperature=0.7, pause=3, max_retries=3):
    """Request ``n`` paraphrases of ``text`` from the OpenAI Completions endpoint.

    Args:
        text: Text to paraphrase; it is interpolated verbatim into the prompt.
        n: Number of completions to request per call (default 5, as before).
        engine: Name of the completion engine to query.
        max_tokens: Upper bound on tokens per completion.
            NOTE(review): 60 tokens may be shorter than a full essay, in which
            case paraphrases would be cut off — confirm this limit is intended.
        temperature: Sampling temperature passed to the API.
        pause: Seconds to sleep after a successful call (simple rate limiting);
            also the base delay of the exponential back-off between retries.
        max_retries: Number of extra attempts on transient API errors.

    Returns:
        A list with the whitespace-stripped text of every choice returned by
        the API (``n`` items when the API honours the request).

    Raises:
        openai.error.OpenAIError: Non-transient errors are raised immediately;
            transient ones are re-raised once ``max_retries`` is exhausted.
    """
    prompt = (f"Paraphrase this sentence: '{text}'. "
          "Your paraphrased version should be grammatically correct and semantically equivalent.")

    # Only errors that can succeed on a later attempt are retried; bad requests
    # or authentication failures propagate straight away.
    transient_errors = (
        openai.error.RateLimitError,
        openai.error.APIConnectionError,
        openai.error.ServiceUnavailableError,
        openai.error.Timeout,
    )

    for attempt in range(max_retries + 1):
        try:
            response = openai.Completion.create(
                engine=engine,
                prompt=prompt,
                max_tokens=max_tokens,
                n=n,
                stop=None,
                temperature=temperature,
            )
            break
        except transient_errors:
            if attempt == max_retries:
                raise
            # Exponential back-off: pause, 2*pause, 4*pause, ...
            time.sleep(pause * 2 ** attempt)

    # Iterate over what was actually returned instead of a second hard-coded
    # count, so the loop can never drift out of sync with the requested `n`.
    responses = [choice.text.strip() for choice in response.choices]
    time.sleep(pause)
    return responses



In [17]:
# Build the augmented frame from base_train rather than from chatgpt_train itself, so
# re-running this cell does not re-paraphrase (and re-bill) an already exploded frame.
# Each essay gets a list of paraphrases; explode() then yields one row per paraphrase,
# repeating the essay's text and label.
chatgpt_train = (
    base_train
    .assign(paraphrase=base_train['text'].progress_apply(augment_text))
    .explode('paraphrase')
    .reset_index(drop=True)
)



  0%|          | 0/1860 [00:00<?, ?it/s]

In [18]:
# Inspect the first rows of the augmented frame (columns: text, label, paraphrase).
chatgpt_train.head(50)

Unnamed: 0,text,label,paraphrase
0,it is really diheartening to read about these ...,0,It is really upsetting to read about these imm...
1,it is really diheartening to read about these ...,0,It's very upsetting to read about these immigr...
2,it is really diheartening to read about these ...,0,It is very upsetting to read about these immig...
3,it is really diheartening to read about these ...,0,It is really upsetting to read about these imm...
4,it is really diheartening to read about these ...,0,I was really upset to read about the immigrant...
5,the phone lines from the suicide prevention li...,0,The suicide prevention hotline saw an increase...
6,the phone lines from the suicide prevention li...,0,"After the election, the suicide prevention lin..."
7,the phone lines from the suicide prevention li...,0,"After the election, the number of phone calls ..."
8,the phone lines from the suicide prevention li...,0,The suicide prevention line received more call...
9,the phone lines from the suicide prevention li...,0,"After the election, the number of phone calls ..."


# Model with augmented train data (paraphrased data)

In [19]:
# Fit a default-parameter TF-IDF vectorizer on the augmented training corpus.
from sklearn.feature_extraction.text import TfidfVectorizer

# Training corpus = original essays followed by their paraphrases.
# pd.concat replaces Series.append, which is deprecated (it emits a FutureWarning) and
# removed in pandas 2.0. Index labels are kept as append kept them; only row order
# matters to the vectorizer.
new_data = pd.concat([X_train, chatgpt_train['paraphrase']])

vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(new_data).toarray()
print('tfidf train shape:', X_train_tfidf.shape)
print('tfidf train type:', X_train_tfidf.dtype)

# Encode the dev essays with the vocabulary learned from the augmented corpus.
X_test_tfidf = vectorizer.transform(X_test).toarray()
print('tfidf test:', X_test_tfidf.shape)

  new_data = X_train.append(chatgpt_train['paraphrase'])


tfidf train shape: (11160, 10833)
tfidf train type: float64
tfidf test: (270, 10833)


In [20]:
from sklearn.svm import LinearSVC

# Labels in the same order as the augmented corpus: original labels first, then the
# label of each paraphrase. pd.concat replaces the deprecated Series.append
# (removed in pandas 2.0).
new_labels = pd.concat([y_train, chatgpt_train['label']])

# Guard against silent row/label misalignment between the two concatenations.
assert len(new_labels) == X_train_tfidf.shape[0], "features and labels are misaligned"

clf = LinearSVC().fit(X_train_tfidf, new_labels)

from sklearn import metrics
predicted = clf.predict(X_test_tfidf)

# Evaluate on the untouched dev split so the numbers are comparable with the baseline.
acc = metrics.accuracy_score(y_test, predicted)
print('accuracy is: ', acc*100)

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix,classification_report
print(confusion_matrix(y_test,predicted))
print('\n')
print(classification_report(y_test,predicted))

  new_labels = y_train.append(chatgpt_train['label'])


accuracy is:  48.51851851851852
[[74  1  4  8  3  4  4]
 [ 6  2  0  5  0  0  1]
 [ 9  0 18  3  0  1  0]
 [35  0  2 27  5  1  6]
 [ 3  0  0  5  2  0  2]
 [ 5  0  1  5  0  2  1]
 [ 7  1  0  7  3  1  6]]


              precision    recall  f1-score   support

           0       0.53      0.76      0.62        98
           1       0.50      0.14      0.22        14
           2       0.72      0.58      0.64        31
           3       0.45      0.36      0.40        76
           4       0.15      0.17      0.16        12
           5       0.22      0.14      0.17        14
           6       0.30      0.24      0.27        25

    accuracy                           0.49       270
   macro avg       0.41      0.34      0.36       270
weighted avg       0.47      0.49      0.46       270



In [21]:
# Persist the augmented training set to Google Drive.
# The former `!cd ...` line was dropped: each `!` command runs in its own subshell, so
# it never changed the kernel's working directory, and the absolute path below does
# not need it. The target folder is created first so to_csv cannot fail on a missing
# directory.
import os

output_dir = '/content/drive/My Drive/Colab Notebooks/NLP Augmentation/Datasets/'
os.makedirs(output_dir, exist_ok=True)
chatgpt_train.to_csv(os.path.join(output_dir, 'wassa_all_chatgptaug_5.csv'), encoding='utf-8', index=False, sep=',')