In [60]:
import os

if not os.path.exists("train.csv"):
   from datasets import *
   dset = load_dataset("dair-ai/emotion")
   # code from https://stackoverflow.com/a/76218276
   train_testvalid = dset['train'].train_test_split(test_size=0.2)
   # Split the 10% test + valid in half test, half valid
   test_valid = train_testvalid['test'].train_test_split(test_size=0.5)
   # gather everyone if you want to have a single DatasetDict
   dset = DatasetDict({
      'train': train_testvalid['train'],
      'test': test_valid['test'],
      'valid': test_valid['train']})
   dset   
else:
    print("file exists, skipping")
    

file exists, skipping


## Cleaning Function to clean the dataset text

In [61]:
import spacy
import pandas as pd
import numpy as np
import re
if not os.path.exists("test.csv"):
    # Check if the spaCy model is loaded, otherwise install it
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading the 'en_core_web_sm' model")
        !python -m spacy download en_core_web_sm
        nlp = spacy.load("en_core_web_sm")

    train_df = pd.DataFrame(dset["train"])
    test_df = pd.DataFrame(dset['test'])
    val_df = pd.DataFrame(dset['validation'])
    i = 0
    def cleaning(text):
        doc = nlp(text.lower().strip())
        cleaned_tokens = []
        global i
        print(f"Cleaning: {i}")
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.is_space:
                lemma = re.sub(r'\W', '', token.lemma_)
                if lemma:
                    cleaned_tokens.append(lemma)
        i += 1
        return ' '.join(cleaned_tokens)

    # Apply the cleaning function to the text column
    train_df["cleaned_text"] = train_df["text"].apply(cleaning)
    test_df["cleaned_text"] = test_df["text"].apply(cleaning)
    val_df["cleaned_text"] = val_df["text"].apply(cleaning)

    print(train_df[['text', 'cleaned_text']].head())  # Display the original and cleaned text for verification

    train_df.to_csv("train.csv")
    test_df.to_csv("test.csv")
    val_df.to_csv("val.csv")

else: 
    print("file exists, skipping")
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    val_df = pd.read_csv("val.csv")    
    

file exists, skipping


In [62]:
# This is to remove the 5 from the dfs
train_df = train_df[train_df['label'] != 5]
test_df = test_df[test_df['label'] != 5]
val_df = val_df[val_df['label'] != 5]

train_df = train_df.dropna(subset=["cleaned_text"])
test_df = test_df.dropna(subset=["cleaned_text"])
val_df = val_df.dropna(subset=["cleaned_text"])

train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)
val_df.reset_index(drop=True, inplace=True)

#### 4 Different types of embeddings

In [73]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(train_df['cleaned_text'])
X_test_bow = vectorizer.transform(test_df['cleaned_text'])
print("Bag of Words features:", X_bow.shape)

Bag of Words features: (15428, 11592)


In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(train_df['cleaned_text'])
X_test_tfidf = vectorizer.transform(test_df['cleaned_text'])
print("TF-IDF features:", X_tfidf.shape)


TF-IDF features: (15428, 11592)


In [65]:
from gensim.models import Word2Vec
import numpy as np

def create_word2vec_embeddings(dataframe):
    sentences = [text.split() for text in dataframe['cleaned_text']]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    word_vectors = model.wv

    # Average Word Vectors for each text
    def document_vector(doc):
        return np.mean([word_vectors[w] for w in doc if w in word_vectors], axis=0)

    X_w2v = np.array([document_vector(text) for text in sentences if document_vector(text).shape != ()])
    return X_w2v


X_w2v = create_word2vec_embeddings(train_df)
X_test_w2v = create_word2vec_embeddings(test_df)
print("Word2Vec features shape:", X_w2v.shape)

Word2Vec features shape: (15428, 100)


In [66]:
from sentence_transformers import SentenceTransformer
#if not os.path.exists('X_bert.npy'):
def create_bert_embeddings(dataframe):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    X_bert = model.encode(dataframe['cleaned_text'], show_progress_bar=True)
    return X_bert

X_bert = create_bert_embeddings(train_df)
X_test_bert = create_bert_embeddings(test_df)
np.save('X_bert.npy', X_bert)
np.save('X_test_bert.npy', X_test_bert)

#else:    
#    print("file exists, skipping")
#    X_bert = np.load('X_bert.npy')
#    X_test_bert = np.load('X_test_bert.npy') 

print("BERT Embeddings shape:", X_bert.shape)

Batches:   0%|          | 0/483 [00:00<?, ?it/s]

Batches:   0%|          | 0/61 [00:00<?, ?it/s]

BERT Embeddings shape: (15428, 384)


In [67]:
train_df.loc[7122]

Unnamed: 0                                                   7419
text            i was feeling a bit like the internet is repla...
label                                                           1
cleaned_text    feel bit like internet replace valuable face f...
Name: 7122, dtype: object

In [68]:
y_train = (train_df['label'])
y_test = (test_df['label'])
y_val = val_df['label']

In [69]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## W2V Encoding


In [87]:
clf = RandomForestClassifier(n_estimators=100, random_state=82)
clf.fit(X_bert, y_train)
y_pred_test = clf.predict(X_test_bert)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.59      0.74      0.65       581
           1       0.60      0.87      0.71       695
           2       0.61      0.07      0.12       159
           3       0.77      0.25      0.37       275
           4       0.85      0.31      0.46       224

    accuracy                           0.61      1934
   macro avg       0.68      0.45      0.46      1934
weighted avg       0.65      0.61      0.57      1934



In [88]:
clf2 = LogisticRegression(penalty=None, max_iter=5000)
clf2.fit(X_bert, y_train)
y_pred_test2 = clf2.predict(X_test_bert)
print(classification_report(y_test, y_pred_test2))

              precision    recall  f1-score   support

           0       0.73      0.74      0.73       581
           1       0.75      0.79      0.77       695
           2       0.53      0.45      0.49       159
           3       0.64      0.57      0.60       275
           4       0.66      0.69      0.68       224

    accuracy                           0.70      1934
   macro avg       0.66      0.65      0.65      1934
weighted avg       0.70      0.70      0.70      1934



BoW Encoding for RFC & Log. Reg.



In [77]:
clf3 = RandomForestClassifier(n_estimators=100, random_state=82)
clf3.fit(X_bow, y_train)
y_pred_test = clf3.predict(X_test_bow)
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       581
           1       0.90      0.90      0.90       695
           2       0.71      0.72      0.72       159
           3       0.86      0.83      0.85       275
           4       0.89      0.89      0.89       224

    accuracy                           0.88      1934
   macro avg       0.85      0.85      0.85      1934
weighted avg       0.88      0.88      0.88      1934



In [75]:
clf2 = LogisticRegression(penalty=None, max_iter=5000)
clf2.fit(X_bow, y_train)
y_pred_test2 = clf2.predict(X_test_bow)
print(classification_report(y_test, y_pred_test2))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88       581
           1       0.90      0.89      0.89       695
           2       0.64      0.69      0.67       159
           3       0.83      0.80      0.82       275
           4       0.82      0.87      0.84       224

    accuracy                           0.85      1934
   macro avg       0.82      0.82      0.82      1934
weighted avg       0.85      0.85      0.85      1934



## TF-IDF Embedding

In [83]:
clf = RandomForestClassifier(n_estimators=100, random_state=82)
clf.fit(X_tfidf, y_train)
y_pred_test = clf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))


              precision    recall  f1-score   support

           0       0.92      0.91      0.91       581
           1       0.87      0.93      0.90       695
           2       0.80      0.66      0.72       159
           3       0.89      0.84      0.86       275
           4       0.90      0.88      0.89       224

    accuracy                           0.88      1934
   macro avg       0.87      0.84      0.86      1934
weighted avg       0.88      0.88      0.88      1934



In [84]:
clf2 = LogisticRegression(penalty=None, max_iter=5000)
clf2.fit(X_tfidf, y_train)
y_pred_test2 = clf2.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test2))

              precision    recall  f1-score   support

           0       0.90      0.90      0.90       581
           1       0.90      0.90      0.90       695
           2       0.67      0.70      0.69       159
           3       0.86      0.83      0.84       275
           4       0.85      0.85      0.85       224

    accuracy                           0.87      1934
   macro avg       0.83      0.83      0.83      1934
weighted avg       0.87      0.87      0.87      1934



## BERT Encoding for RFC & Log. Reg.

In [85]:
clf = RandomForestClassifier(n_estimators=100, random_state=82)
clf.fit(X_w2v, y_train)
y_pred_test = clf.predict(X_test_w2v)
print(classification_report(y_test, y_pred_test))


              precision    recall  f1-score   support

           0       0.34      0.19      0.24       581
           1       0.27      0.02      0.04       695
           2       0.00      0.00      0.00       159
           3       0.15      0.30      0.20       275
           4       0.11      0.50      0.18       224

    accuracy                           0.16      1934
   macro avg       0.17      0.20      0.13      1934
weighted avg       0.23      0.16      0.14      1934



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [91]:
clf2 = LogisticRegression(penalty=None, max_iter=5000)
clf2.fit(X_w2v, y_train)
y_pred_test2 = clf2.predict(X_test_w2v)
print(classification_report(y_test, y_pred_test2))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       581
           1       1.00      0.00      0.00       695
           2       0.00      0.00      0.00       159
           3       0.14      1.00      0.25       275
           4       0.00      0.00      0.00       224

    accuracy                           0.14      1934
   macro avg       0.23      0.20      0.05      1934
weighted avg       0.38      0.14      0.04      1934



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
