In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
from gensim.models import Word2Vec

In [4]:
# Download required NLTK data
nltk.download('stopwords')

# Load the dataset
data = pd.read_csv('emotion_dataset.csv')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# Data Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Stemming
    ps = PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

In [6]:
data['processed_text'] = data['Text'].apply(preprocess_text)

# Split the data
X = data['processed_text']
y = data['Emotion']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [7]:
sentences = [text.split() for text in X_train]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [8]:
def text_to_vector(text, model):
    words = text.split()
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

In [9]:
X_train_w2v = np.array([text_to_vector(text, w2v_model) for text in X_train])
X_test_w2v = np.array([text_to_vector(text, w2v_model) for text in X_test])

In [10]:
# Define models
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': MultinomialNB(),
    'ANN': MLPClassifier(hidden_layer_sizes=(10,), max_iter=5, random_state=42)
}

# Train and evaluate models
results = {}

In [11]:
for name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    results[name] = {
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
from sklearn.preprocessing import MinMaxScaler

# Create a MinMaxScaler object
scaler = MinMaxScaler()

# Fit the scaler to the training data and transform it
X_train_w2v_scaled = scaler.fit_transform(X_train_w2v)

# Transform the testing data using the fitted scaler
X_test_w2v_scaled = scaler.transform(X_test_w2v)



In [13]:
results_w2v = {}
for name, model in models.items():
    if name == 'Naive Bayes':
        model.fit(X_train_w2v_scaled, y_train)
        y_pred = model.predict(X_test_w2v_scaled)
    else:
        model.fit(X_train_w2v, y_train)
        y_pred = model.predict(X_test_w2v)

    results_w2v[name] = {
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:

# Print results
for model, metrics in results.items():
    print(f"\nResults for TFIDF{model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Results for TFIDFDecision Tree:
Precision: 0.5287
Recall: 0.5340
Accuracy: 0.5340
F1 Score: 0.5285

Results for TFIDFNaive Bayes:
Precision: 0.6035
Recall: 0.5422
Accuracy: 0.5422
F1 Score: 0.5068

Results for TFIDFANN:
Precision: 0.6068
Recall: 0.5327
Accuracy: 0.5327
F1 Score: 0.4934


In [15]:
# Print results
for model, metrics in results_w2v.items():
    print(f"\nResults for W2V{model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Results for W2VDecision Tree:
Precision: 0.3704
Recall: 0.3724
Accuracy: 0.3724
F1 Score: 0.3711

Results for W2VNaive Bayes:
Precision: 0.2015
Recall: 0.3309
Accuracy: 0.3309
F1 Score: 0.2012

Results for W2VANN:
Precision: 0.2362
Recall: 0.3269
Accuracy: 0.3269
F1 Score: 0.1747


In [None]:
#hyperparameter tuning

from sklearn.model_selection import GridSearchCV

param_grids = {
    'Decision Tree': {
        'max_depth': [5, 10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Naive Bayes': {
        'alpha': [0.1, 0.5, 1.0]
    },
    'ANN': {
        'hidden_layer_sizes': [(50,), (100,), (100, 50)],
        'activation': ['tanh', 'relu'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001],
        'learning_rate': ['constant', 'adaptive']
    }
}

def train_and_evaluate_tuned(X_train_embedded, X_test_embedded, models, param_grids):
    results = {}
    for name, model in models.items():
        print(f"Tuning {name}...")
        grid_search = GridSearchCV(model, param_grids[name], cv=3, scoring='accuracy', verbose=1)
        grid_search.fit(X_train_embedded, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")
        y_pred = best_model.predict(X_test_embedded)
        results[name] = {
            'Precision': precision_score(y_test, y_pred, average='weighted'),
            'Recall': recall_score(y_test, y_pred, average='weighted'),
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred, average='weighted')
        }
    return results

print("\n--- Tuning and Training with TF-IDF Features ---")
results_tfidf_tuned = train_and_evaluate_tuned(X_train_tfidf, X_test_tfidf, models, param_grids)

print("\n--- Tuning and Training with Word2Vec Features ---")
results_w2v_tuned = train_and_evaluate_tuned(X_train_w2v, X_test_w2v, models, param_grids)

print("\nResults for tuned TF-IDF-based models:")
for model_name, metrics in results_tfidf_tuned.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")

print("\nResults for tuned Word2Vec-based models:")
for model_name, metrics in results_w2v_tuned.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"{metric_name}: {value:.4f}")


In [16]:
from gensim.models import KeyedVectors
from transformers import BertTokenizer, BertModel
import torch

In [17]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip

--2025-03-03 16:03:19--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-03 16:03:19--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-03 16:03:19--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [18]:
# Load pre-trained GloVe embeddings
glove_model = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False, no_header=True)


In [19]:
# Load pre-trained BERT model and tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [20]:
def text_to_glove_vector(text, model):
    words = text.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

def text_to_bert_vector(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

In [21]:
# Create GloVe embeddings
X_train_glove = np.array([text_to_glove_vector(text, glove_model) for text in X_train])
X_test_glove = np.array([text_to_glove_vector(text, glove_model) for text in X_test])

# Create BERT embeddings
X_train_bert = np.array([text_to_bert_vector(text, bert_tokenizer, bert_model) for text in X_train])
X_test_bert = np.array([text_to_bert_vector(text, bert_tokenizer, bert_model) for text in X_test])

In [22]:
# ('BERT', X_train_bert, X_test_bert)
# Train and evaluate models
results_glove_bert = {}

for embedding_name, X_train_emb, X_test_emb in [('GloVe', X_train_glove, X_test_glove), ('BERT', X_train_bert, X_test_bert)]:
    for name, model in models.items():
        if name == 'Naive Bayes':
            continue
        model.fit(X_train_emb, y_train)
        y_pred = model.predict(X_test_emb)

        results_glove_bert[f"{name} ({embedding_name})"] = {
            'Precision': precision_score(y_test, y_pred, average='weighted'),
            'Recall': recall_score(y_test, y_pred, average='weighted'),
            'Accuracy': accuracy_score(y_test, y_pred),
            'F1 Score': f1_score(y_test, y_pred, average='weighted')
        }


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [23]:
# Print results
for model, metrics in results_glove_bert.items():
    print(f"\nResults for W2V{model}:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")



Results for W2VDecision Tree (GloVe):
Precision: 0.3775
Recall: 0.3708
Accuracy: 0.3708
F1 Score: 0.3735

Results for W2VANN (GloVe):
Precision: 0.3875
Recall: 0.3995
Accuracy: 0.3995
F1 Score: 0.3326

Results for W2VDecision Tree (BERT):
Precision: 0.3725
Recall: 0.3688
Accuracy: 0.3688
F1 Score: 0.3703

Results for W2VANN (BERT):
Precision: 0.4923
Recall: 0.5074
Accuracy: 0.5074
F1 Score: 0.4842
