In [None]:

from afinn import Afinn
from collections import defaultdict

# Word -> score lookup seeded from AFINN's word list; any word that is not
# in the list falls back to a score of 0 instead of raising KeyError.
sent_lexicon = defaultdict(int, Afinn()._dict)

# Function to calculate sentiment score for a single text
def calculate_sentiment(text, lexicon=None):
    """Return the average lexicon score per token of ``text``.

    Parameters
    ----------
    text : str or iterable of str or None
        Either a raw string (split on whitespace into tokens) or an
        already tokenized sequence of words. ``None`` is treated as empty.
    lexicon : mapping of str to number, optional
        Word -> score lookup. Defaults to the module-level ``sent_lexicon``.
        Words missing from the mapping contribute a score of 0.

    Returns
    -------
    float
        Sum of the token scores divided by the number of tokens, or ``0.0``
        when there are no tokens.
    """
    if lexicon is None:
        lexicon = sent_lexicon

    if text is None:
        return 0.0

    # A bare string would otherwise be iterated character by character,
    # scoring letters instead of words and normalising by character count.
    tokens = text.split() if isinstance(text, str) else list(text)
    if not tokens:
        return 0.0

    # .get() instead of indexing: indexing a defaultdict inserts every unknown
    # word into it, which would silently grow the shared lexicon on each call.
    total_score = sum(lexicon.get(token, 0) for token in tokens)

    # Normalise by token count so long and short texts are comparable
    return total_score / len(tokens)

In [3]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
import numpy as np
import multiprocessing

In [5]:
# Read the preprocessed comments table from disk
df_combined = pd.read_csv("data/combined_preprocessed_comments.csv")

# Split every non-null comment on whitespace, giving one token list per comment
sentences = [
    comment.split()
    for comment in df_combined['comment_no_stopwords'].dropna()
]

In [6]:
# Configure the Word2Vec model. The corpus is deliberately NOT passed to the
# constructor: supplying `sentences=` there makes gensim build the vocabulary
# and run a complete training pass right away, which the explicit
# build_vocab()/train() calls below would then throw away and repeat.
w2v_model = Word2Vec(
    vector_size=100,
    window=4,
    min_count=3,
    sample=1e-5,
    alpha=0.03,
    min_alpha=0.0007,
    negative=20,
    # Leave one core free, but never request fewer than one worker thread
    # (cpu_count() - 1 is 0 on a single-core machine).
    workers=max(1, multiprocessing.cpu_count() - 1),
    sg=0,
)

# Build the vocabulary once, then train once for 20 epochs.
# train() is left as the last expression so the cell still displays its result.
w2v_model.build_vocab(sentences)
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=20)

(1507473, 5030000)

In [7]:
# Keyed vectors held by the trained model
word_vectors = w2v_model.wv

# Vocabulary words, in the model's own index order
vocab = [*word_vectors.index_to_key]

# One vector per vocabulary word, stacked in the same order as `vocab`
embeddings = np.array([word_vectors.get_vector(token) for token in vocab])