In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.decomposition import PCA

import torch 
from torch import nn
from torch import optim

import numpy as np


import matplotlib.pyplot as plt
# Word2Vec
import nltk
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec




# Load the training data (one row per text, one 0/1 column per emotion).
# NOTE: a bare `train.head()` in the middle of a cell renders nothing; put it
# as the last expression of its own cell to preview the frame.
train = pd.read_csv('data/public_data/train/track_a/eng.csv')


# Emotion name <-> column index mappings; `emotions` fixes the label column order.
emotion2label = {'Anger': 0, 'Fear': 1, 'Joy': 2, 'Sadness': 3, 'Surprise': 4}
label2emotion = {v: k for k, v in emotion2label.items()}
emotions = list(emotion2label.keys())


# Clean the text. `regex=True` is required: since pandas 2.0 `str.replace`
# treats the pattern as a literal string by default, which would turn every
# step below into a silent no-op.
train_X = (
    train["text"]
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)  # remove punctuation
    .str.replace(r'\d+', '', regex=True)      # remove numbers
    .str.replace(r'\t', '', regex=True)       # remove tabs
    .str.replace(r'\s+', ' ', regex=True)     # collapse runs of whitespace
)


# Label matrix built from the emotion columns, in the order given by `emotions`.
train_Y = train[emotions].values

ModuleNotFoundError: No module named 'nltk'

## Generate Embeddings
### Word2Vec

In [None]:
# Word2Vec hyper-parameters.
embedding_dim = 1024
window_size = 200
min_word_count = 0
workers = 8

# Tokenizer data required by `word_tokenize`.
nltk.download('punkt_tab')

# One token list per sentence. Word2Vec learns from the words surrounding each
# token, so every training "sentence" must contain the whole sentence; feeding
# it one word at a time leaves the model with no context to learn from.
tokens = [word_tokenize(sentence) for sentence in train_X]
print(tokens[:3])  # small preview instead of dumping the entire corpus


# Train the Word2Vec model on the tokenized sentences.
word2vec_model = Word2Vec(sentences=tokens, vector_size=embedding_dim, window=window_size,
                          min_count=min_word_count, workers=workers)
print("Word2Vec model training complete.")

# Per-sentence list of word vectors (ragged: one vector per in-vocabulary token).
train_X_emb = [[word2vec_model.wv[word] for word in sentence_tokens if word in word2vec_model.wv]
               for sentence_tokens in tokens]

# Words of the first 10 sentences, de-duplicated in first-seen order and
# restricted to the vocabulary so the vector lookup below cannot raise KeyError.
words_to_visualize = list(dict.fromkeys(
    word
    for sentence_tokens in tokens[:10]
    for word in sentence_tokens
    if word in word2vec_model.wv
))
print(words_to_visualize)
word_vectors = [word2vec_model.wv[word] for word in words_to_visualize]

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(word_vectors)

# Plot the word vectors
fig, ax = plt.subplots(figsize=(10, 8))
for word, (pc1, pc2) in zip(words_to_visualize, reduced_vectors):
    ax.scatter(pc1, pc2)
    ax.annotate(word, xy=(pc1, pc2), fontsize=12)
ax.set_title("2D Visualization of Word Embeddings")
ax.set_xlabel("Principal Component 1")
ax.set_ylabel("Principal Component 2")
plt.show()

### Bag of Words

In [None]:
# Bag-of-words features: counts of unigrams and bigrams for every text.
vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorizer.fit(train_X)  # learn the n-gram vocabulary from the training texts
# Densify the sparse count matrix so it can be converted to a tensor later.
train_X_emb = vectorizer.transform(train_X).toarray()

### Cast to Tensor

In [None]:
# Convert features and labels to float32 tensors. `torch.tensor` with an
# explicit dtype replaces the legacy `torch.Tensor(...)` constructor, whose
# result type follows the global default tensor type.
train_X_t = torch.tensor(train_X_emb, dtype=torch.float32)
train_Y_t = torch.tensor(train_Y, dtype=torch.float32)

print(train_X_t.shape, train_Y_t.shape)

# Label distribution: number of positive rows per emotion and its share of the
# dataset. A plain loop is used instead of a throw-away list comprehension so
# that `_` (IPython's last-output variable) is not overwritten.
n_samples = len(train_Y_t)
for emotion, count in zip(emotions, train_Y_t.sum(dim=0)):
    share = (100 * count / n_samples).round()
    print(f' - {emotion}: {count} ({share}%)')