<a href="https://colab.research.google.com/github/francisrocket/nlp_final_project/blob/main/Notebook_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q datasets gensim sentence_transformers

In [None]:
import os

# CSV caches written by the cleaning cell. The raw dataset is only needed when
# at least one of them is missing, so check all of them rather than just one.
RAW_SPLIT_CACHE_FILES = ("train.csv", "test.csv", "val.csv")
# Fixed seed so the train/test/valid partition is identical on every run.
SPLIT_SEED = 42

if not all(os.path.exists(cache_file) for cache_file in RAW_SPLIT_CACHE_FILES):
    # Import only the names that are used instead of `from datasets import *`.
    from datasets import DatasetDict, load_dataset

    dset = load_dataset("dair-ai/emotion")
    # code from https://stackoverflow.com/a/76218276
    # Hold out 20% of the original train split for evaluation.
    train_testvalid = dset['train'].train_test_split(test_size=0.2, seed=SPLIT_SEED)
    # Split the 20% hold-out in half: 10% test, 10% valid.
    test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=SPLIT_SEED)
    # Gather the three partitions into a single DatasetDict.
    dset = DatasetDict({
        'train': train_testvalid['train'],
        'test': test_valid['test'],
        'valid': test_valid['train']})
    print(dset)
else:
    print("file exists, skipping")


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 12800
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
    valid: Dataset({
        features: ['text', 'label'],
        num_rows: 1600
    })
})


## Text cleaning: lemmatize and strip stop words and punctuation from the dataset text

In [None]:
import spacy
import pandas as pd
import numpy as np
import re
if not os.path.exists("test.csv"):
    # Check if the spaCy model is loaded, otherwise install it
    try:
        nlp = spacy.load("en_core_web_sm")
    except OSError:
        print("Downloading the 'en_core_web_sm' model")
        !python -m spacy download en_core_web_sm
        nlp = spacy.load("en_core_web_sm")

    train_df = pd.DataFrame(dset["train"])
    test_df = pd.DataFrame(dset['test'])
    val_df = pd.DataFrame(dset['valid'])
    i = 0
    def cleaning(text):
        doc = nlp(text.lower().strip())
        cleaned_tokens = []
        global i
        print(f"Cleaning: {i}")
        for token in doc:
            if not token.is_stop and not token.is_punct and not token.is_space:
                lemma = re.sub(r'\W', '', token.lemma_)
                if lemma:
                    cleaned_tokens.append(lemma)
        i += 1
        return ' '.join(cleaned_tokens)

    # Apply the cleaning function to the text column
    train_df["cleaned_text"] = train_df["text"].apply(cleaning)
    test_df["cleaned_text"] = test_df["text"].apply(cleaning)
    val_df["cleaned_text"] = val_df["text"].apply(cleaning)

    print(train_df[['text', 'cleaned_text']].head())  # Display the original and cleaned text for verification

    train_df.to_csv("train.csv")
    test_df.to_csv("test.csv")
    val_df.to_csv("val.csv")

else:
    print("file exists, skipping")
    train_df = pd.read_csv("train.csv")
    test_df = pd.read_csv("test.csv")
    val_df = pd.read_csv("val.csv")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Cleaning: 11013
Cleaning: 11014
Cleaning: 11015
Cleaning: 11016
Cleaning: 11017
Cleaning: 11018
Cleaning: 11019
Cleaning: 11020
Cleaning: 11021
Cleaning: 11022
Cleaning: 11023
Cleaning: 11024
Cleaning: 11025
Cleaning: 11026
Cleaning: 11027
Cleaning: 11028
Cleaning: 11029
Cleaning: 11030
Cleaning: 11031
Cleaning: 11032
Cleaning: 11033
Cleaning: 11034
Cleaning: 11035
Cleaning: 11036
Cleaning: 11037
Cleaning: 11038
Cleaning: 11039
Cleaning: 11040
Cleaning: 11041
Cleaning: 11042
Cleaning: 11043
Cleaning: 11044
Cleaning: 11045
Cleaning: 11046
Cleaning: 11047
Cleaning: 11048
Cleaning: 11049
Cleaning: 11050
Cleaning: 11051
Cleaning: 11052
Cleaning: 11053
Cleaning: 11054
Cleaning: 11055
Cleaning: 11056
Cleaning: 11057
Cleaning: 11058
Cleaning: 11059
Cleaning: 11060
Cleaning: 11061
Cleaning: 11062
Cleaning: 11063
Cleaning: 11064
Cleaning: 11065
Cleaning: 11066
Cleaning: 11067
Cleaning: 11068
Cleaning: 11069
Cleaning: 11070
Cleanin

In [None]:
def _drop_unusable_rows(df):
    """Return a re-indexed copy of `df` without class 5 and without empty texts.

    Rows are removed when `label` equals 5 or when `cleaned_text` is missing
    or blank. Blank strings are treated like NaN on purpose: a freshly cleaned
    frame holds "" where a frame reloaded from CSV holds NaN, and both must
    yield the same rows.
    """
    has_text = df["cleaned_text"].fillna("").astype(str).str.strip().ne("")
    keep = (df["label"] != 5) & has_text
    return df[keep].reset_index(drop=True)


# This is to remove label 5 (and rows with no cleaned text) from the dfs
train_df = _drop_unusable_rows(train_df)
test_df = _drop_unusable_rows(test_df)
val_df = _drop_unusable_rows(val_df)

#### 4 Different types of embeddings

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts: the vocabulary is learned from the training texts and
# then reused unchanged to encode the test texts.
vectorizer = CountVectorizer()

X_bow = vectorizer.fit_transform(raw_documents=train_df['cleaned_text'])
X_test_bow = vectorizer.transform(raw_documents=test_df['cleaned_text'])
print(f"Bag of Words features: {X_bow.shape}")

Bag of Words features: (12360, 10345)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weights: fitted on the training texts, then applied to the test texts.
# Note that this rebinds `vectorizer`, replacing the bag-of-words vectorizer.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(raw_documents=train_df['cleaned_text'])
X_test_tfidf = vectorizer.transform(raw_documents=test_df['cleaned_text'])
print(f"TF-IDF features: {X_tfidf.shape}")


TF-IDF features: (12360, 10345)


In [None]:
from gensim.models import Word2Vec
import numpy as np

def fit_word2vec(dataframe):
    """Train Word2Vec on the whitespace-tokenised `cleaned_text` column.

    Returns the trained model's word vectors (`model.wv`).
    """
    sentences = [text.split() for text in dataframe['cleaned_text']]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)
    return model.wv


def create_word2vec_embeddings(dataframe, word_vectors=None):
    """Embed each row of `dataframe` as the mean of its word vectors.

    Parameters:
        dataframe: frame with a `cleaned_text` column of space-separated tokens.
        word_vectors: trained word vectors to look tokens up in. When None, a
            new Word2Vec model is trained on `dataframe` itself (the original
            behaviour). Pass the vectors fitted on the training data when
            embedding any other split, otherwise the splits end up in
            unrelated vector spaces.

    Returns:
        Array with exactly one row per row of `dataframe`, so it stays aligned
        with the label column. Documents with no known token get a zero vector
        instead of being dropped.
    """
    if word_vectors is None:
        word_vectors = fit_word2vec(dataframe)
    sentences = [text.split() for text in dataframe['cleaned_text']]

    # Average Word Vectors for each text
    def document_vector(doc):
        known = [word_vectors[w] for w in doc if w in word_vectors]
        if not known:
            # np.mean of an empty list would be NaN; keep the row with zeros.
            return np.zeros(word_vectors.vector_size, dtype=np.float32)
        return np.mean(known, axis=0)

    X_w2v = np.array([document_vector(doc) for doc in sentences])
    return X_w2v


# Fit the embedding space once, on the training texts only, and use that same
# space for both splits so a classifier trained on one can score the other.
w2v_vectors = fit_word2vec(train_df)
X_w2v = create_word2vec_embeddings(train_df, w2v_vectors)
X_test_w2v = create_word2vec_embeddings(test_df, w2v_vectors)
print("Word2Vec features shape:", X_w2v.shape)

Word2Vec features shape: (12360, 100)


In [None]:
from sentence_transformers import SentenceTransformer
def create_bert_embeddings(dataframe, model=None):
    """Encode the `cleaned_text` column with a SentenceTransformer.

    Parameters:
        dataframe: frame with a `cleaned_text` column.
        model: an already loaded SentenceTransformer to reuse. When None, the
            'all-MiniLM-L6-v2' model is loaded for this call.

    Returns:
        Whatever `model.encode` returns for the list of texts, in row order.
    """
    if model is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    # Pass a plain list so encoding does not depend on the frame's index labels.
    texts = dataframe['cleaned_text'].tolist()
    X_bert = model.encode(texts, show_progress_bar=True)
    return X_bert


# Load the model once and share it between the two splits.
bert_model = SentenceTransformer('all-MiniLM-L6-v2')
X_bert = create_bert_embeddings(train_df, bert_model)
X_test_bert = create_bert_embeddings(test_df, bert_model)

# The embeddings are saved for reuse, but this cell always recomputes them;
# nothing here reads 'X_bert.npy' / 'X_test_bert.npy' back.
np.save('X_bert.npy', X_bert)
np.save('X_test_bert.npy', X_test_bert)

print("BERT Embeddings shape:", X_bert.shape)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/387 [00:00<?, ?it/s]

Batches:   0%|          | 0/49 [00:00<?, ?it/s]

BERT Embeddings shape: (12360, 384)


In [None]:
# Target vectors: the `label` column of each split, in train / test / valid order.
y_train, y_test, y_val = (
    split_df['label'] for split_df in (train_df, test_df, val_df)
)

In [None]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## BoW Encoding for RFC & Log. Reg. (latest)



In [None]:
# Random forest (100 trees, fixed seed) on the bag-of-words counts,
# scored on the held-out test split.
clf3 = RandomForestClassifier(random_state=82, n_estimators=100).fit(X_bow, y_train)
y_pred_test = clf3.predict(X_test_bow)
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.91      0.91       469
           1       0.92      0.88      0.90       543
           2       0.71      0.80      0.75       133
           3       0.85      0.89      0.87       210
           4       0.89      0.85      0.87       182

    accuracy                           0.88      1537
   macro avg       0.85      0.86      0.86      1537
weighted avg       0.88      0.88      0.88      1537



In [None]:
# Unregularised logistic regression on the bag-of-words counts,
# scored on the held-out test split.
clf2 = LogisticRegression(max_iter=5000, penalty=None).fit(X_bow, y_train)
y_pred_test2 = clf2.predict(X_test_bow)
print(classification_report(y_true=y_test, y_pred=y_pred_test2))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87       469
           1       0.87      0.85      0.86       543
           2       0.62      0.76      0.68       133
           3       0.88      0.78      0.83       210
           4       0.86      0.84      0.85       182

    accuracy                           0.84      1537
   macro avg       0.82      0.82      0.82      1537
weighted avg       0.85      0.84      0.84      1537



## TF-IDF Embedding (latest)

In [None]:
# Random forest (100 trees, fixed seed) on the TF-IDF features,
# scored on the held-out test split.
clf = RandomForestClassifier(random_state=82, n_estimators=100).fit(X_tfidf, y_train)
y_pred_test = clf.predict(X_test_tfidf)
print(classification_report(y_true=y_test, y_pred=y_pred_test))


              precision    recall  f1-score   support

           0       0.93      0.90      0.91       469
           1       0.85      0.93      0.89       543
           2       0.82      0.71      0.76       133
           3       0.87      0.82      0.84       210
           4       0.88      0.86      0.87       182

    accuracy                           0.88      1537
   macro avg       0.87      0.84      0.86      1537
weighted avg       0.88      0.88      0.88      1537



In [None]:
# Unregularised logistic regression on the TF-IDF features.
# NOTE(review): the recorded run of this cell stopped at the solver's
# iteration limit (convergence warning in the output) - treat scores with care.
clf2 = LogisticRegression(max_iter=5000, penalty=None).fit(X_tfidf, y_train)
y_pred_test2 = clf2.predict(X_test_tfidf)
print(classification_report(y_true=y_test, y_pred=y_pred_test2))

              precision    recall  f1-score   support

           0       0.92      0.81      0.86       469
           1       0.86      0.82      0.84       543
           2       0.39      0.82      0.53       133
           3       0.90      0.68      0.77       210
           4       0.82      0.77      0.80       182

    accuracy                           0.79      1537
   macro avg       0.78      0.78      0.76      1537
weighted avg       0.84      0.79      0.80      1537



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## W2V Encoding (latest) — please ignore these results

In [None]:
# Random forest (100 trees, fixed seed) on the averaged Word2Vec vectors,
# scored on the held-out test split.
clf = RandomForestClassifier(random_state=82, n_estimators=100).fit(X_w2v, y_train)
y_pred_test = clf.predict(X_test_w2v)
print(classification_report(y_true=y_test, y_pred=y_pred_test))


              precision    recall  f1-score   support

           0       0.36      0.05      0.08       469
           1       0.00      0.00      0.00       543
           2       0.00      0.00      0.00       133
           3       0.00      0.00      0.00       210
           4       0.12      0.96      0.21       182

    accuracy                           0.13      1537
   macro avg       0.10      0.20      0.06      1537
weighted avg       0.12      0.13      0.05      1537



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Unregularised logistic regression on the averaged Word2Vec vectors.
# NOTE(review): the recorded run of this cell stopped at the solver's
# iteration limit (convergence warning in the output) - treat scores with care.
clf2 = LogisticRegression(max_iter=5000, penalty=None).fit(X_w2v, y_train)
y_pred_test2 = clf2.predict(X_test_w2v)
print(classification_report(y_true=y_test, y_pred=y_pred_test2))

              precision    recall  f1-score   support

           0       0.30      0.50      0.38       469
           1       0.25      0.02      0.04       543
           2       0.08      0.45      0.14       133
           3       0.00      0.00      0.00       210
           4       0.00      0.00      0.00       182

    accuracy                           0.20      1537
   macro avg       0.13      0.19      0.11      1537
weighted avg       0.19      0.20      0.14      1537



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## BERT Encoding for RFC & Log. Reg. (latest)

In [None]:
# Random forest (100 trees, fixed seed) on the sentence-transformer embeddings,
# scored on the held-out test split.
clf = RandomForestClassifier(random_state=82, n_estimators=100).fit(X_bert, y_train)
y_pred_test = clf.predict(X_test_bert)
print(classification_report(y_true=y_test, y_pred=y_pred_test))

              precision    recall  f1-score   support

           0       0.58      0.77      0.66       469
           1       0.60      0.86      0.71       543
           2       0.58      0.05      0.10       133
           3       0.93      0.26      0.40       210
           4       0.84      0.27      0.41       182

    accuracy                           0.61      1537
   macro avg       0.71      0.44      0.45      1537
weighted avg       0.67      0.61      0.56      1537



In [None]:
# Unregularised logistic regression on the sentence-transformer embeddings,
# scored on the held-out test split.
clf2 = LogisticRegression(max_iter=5000, penalty=None).fit(X_bert, y_train)
y_pred_test2 = clf2.predict(X_test_bert)
print(classification_report(y_true=y_test, y_pred=y_pred_test2))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75       469
           1       0.77      0.79      0.78       543
           2       0.61      0.52      0.56       133
           3       0.67      0.63      0.65       210
           4       0.71      0.66      0.68       182

    accuracy                           0.73      1537
   macro avg       0.70      0.68      0.69      1537
weighted avg       0.72      0.73      0.72      1537

