In [29]:
import pandas as pd
import opendatasets as od
import numpy as np
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Sajid
[nltk_data]     Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
import os

# KAGGLE_CONFIG_DIR must name the *directory* that holds kaggle.json, not the
# file itself: the Kaggle client appends "kaggle.json" to this value.
# NOTE(review): whether opendatasets honours this variable is not visible
# here -- confirm, or place kaggle.json where opendatasets looks for it.
os.environ['KAGGLE_CONFIG_DIR'] = '..'

In [21]:
# Fetch the Twitter entity-sentiment dataset from Kaggle.
od.download("https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Dataset URL: https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis
Downloading twitter-entity-sentiment-analysis.zip to .\twitter-entity-sentiment-analysis


100%|██████████| 1.99M/1.99M [00:00<00:00, 66.3MB/s]







In [26]:
# The training CSV is read without a header row (header=None), so the four
# columns are labelled explicitly right after loading.
df = (
    pd.read_csv('./twitter-entity-sentiment-analysis/twitter_training.csv', header=None)
    .set_axis(['ID', 'Entity', 'Sentiment', 'Tweet'], axis=1)
)
df.head()

Unnamed: 0,ID,Entity,Sentiment,Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [34]:
# Tokenise every tweet with gensim's simple_preprocess (accents stripped via
# deacc=True, tokens shorter than 3 characters dropped via min_len=3).
# Missing tweets are replaced by '' first: str(NaN) is the text 'nan', which is
# 3 characters long and would otherwise survive as a spurious token.
df['tokens'] = (
    df['Tweet']
    .fillna('')
    .astype(str)
    .apply(lambda text: simple_preprocess(text, deacc=True, min_len=3))
)
df.head()

Unnamed: 0,ID,Entity,Sentiment,Tweet,tokens
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,"[getting, borderlands, and, will, murder, you,..."
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[coming, the, borders, and, will, kill, you, all]"
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,"[getting, borderlands, and, will, kill, you, all]"
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[coming, borderlands, and, will, murder, you, ..."
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[getting, borderlands, and, will, murder, you,..."


In [35]:
# Word2Vec takes an iterable of token lists; materialise the column as a plain list.
sentences = list(df['tokens'])

In [36]:
# Train two Word2Vec models that differ only in the `sg` flag:
#   sg=0 -> CBOW model, sg=1 -> Skip-Gram model.
# The generator is consumed left to right, so CBOW is trained first and
# Skip-Gram second, each with identical remaining hyperparameters.
model_cbow, model_sg = (
    Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=sg_flag, epochs=100)
    for sg_flag in (0, 1)
)

In [37]:
def avg_word2vec(tokens, model):
    """Return the mean embedding of the tokens that `model` knows.

    Args:
        tokens: iterable of token strings.
        model: object exposing `wv` (supports `in` and item lookup by token)
            and `vector_size`.

    Returns:
        The element-wise mean of the vectors of all in-vocabulary tokens, or a
        zero vector of length `model.vector_size` when none of the tokens is
        in the vocabulary (including when `tokens` is empty).
    """
    vectors = [model.wv[tok] for tok in tokens if tok in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # Nothing to average: fall back to an all-zero embedding.
    return np.zeros(model.vector_size)

In [39]:
# Build one feature matrix per embedding model: each row is the averaged
# word vector of the corresponding tweet's tokens (CBOW first, then Skip-Gram).
X_cbow, X_sg = (
    np.array([avg_word2vec(tweet_tokens, w2v) for tweet_tokens in df['tokens']])
    for w2v in (model_cbow, model_sg)
)
# Target labels shared by both feature sets.
y = df['Sentiment']

In [40]:
# Split both feature matrices and the labels in a single call: one shuffle
# (random_state=42, 30% test) is applied to all three arrays, which yields the
# same partition as splitting each matrix separately with the same seed.
# NOTE(review): the preview of the data shows several near-identical tweets
# sharing one ID; a purely random split may put such variants on both sides --
# consider a group-aware split on 'ID'.
(Xc_train, Xc_test,
 Xs_train, Xs_test,
 yc_train, yc_test) = train_test_split(X_cbow, X_sg, y, test_size=0.3, random_state=42)
ys_train, ys_test = yc_train, yc_test

In [41]:
# One logistic-regression classifier per embedding type; `fit` returns the
# estimator itself, so construction and training are chained.

# CBOW embeddings
clf_cbow = LogisticRegression(max_iter=1000).fit(Xc_train, yc_train)
y_pred_cbow = clf_cbow.predict(Xc_test)

# Skip-Gram embeddings
clf_sg = LogisticRegression(max_iter=1000).fit(Xs_train, ys_train)
y_pred_sg = clf_sg.predict(Xs_test)

In [42]:
# Print the per-class report and overall accuracy for each model in turn.
for _heading, _y_true, _y_pred in (
    ("=== CBOW Model Performance ===", yc_test, y_pred_cbow),
    ("\n=== Skip-Gram Model Performance ===", ys_test, y_pred_sg),
):
    print(_heading)
    print(classification_report(_y_true, _y_pred))
    print("Accuracy:", accuracy_score(_y_true, _y_pred))

=== CBOW Model Performance ===
              precision    recall  f1-score   support

  Irrelevant       0.41      0.20      0.27      3870
    Negative       0.58      0.71      0.64      6739
     Neutral       0.50      0.50      0.50      5424
    Positive       0.57      0.61      0.59      6372

    accuracy                           0.54     22405
   macro avg       0.52      0.51      0.50     22405
weighted avg       0.53      0.54      0.53     22405

Accuracy: 0.5438518187904485

=== Skip-Gram Model Performance ===
              precision    recall  f1-score   support

  Irrelevant       0.43      0.21      0.28      3870
    Negative       0.57      0.73      0.64      6739
     Neutral       0.53      0.44      0.48      5424
    Positive       0.54      0.63      0.58      6372

    accuracy                           0.54     22405
   macro avg       0.52      0.50      0.50     22405
weighted avg       0.53      0.54      0.52     22405

Accuracy: 0.5409060477571971


In [49]:
# Sanity-check both classifiers on two hand-written sentences.
new_sentences = [
    "loved the new season on netflix it was totally worth the wait",
    "twitter keeps crashing every time i try to post something so annoying",
]

# Tokenise with the same simple_preprocess settings used for the training
# tweets, so the classifiers see features built the same way they were trained
# on. (The previous `word_tokenize` call was never imported in this notebook
# and raised NameError on a fresh kernel.)
tokens_new = [simple_preprocess(s, deacc=True, min_len=3) for s in new_sentences]

# Convert to embeddings
X_new_cbow = np.array([avg_word2vec(t, model_cbow) for t in tokens_new])
X_new_sg = np.array([avg_word2vec(t, model_sg) for t in tokens_new])

# Predict
print("CBOW Predictions:", clf_cbow.predict(X_new_cbow))
print("Skip-Gram Predictions:", clf_sg.predict(X_new_sg))

CBOW Predictions: ['Positive' 'Negative']
Skip-Gram Predictions: ['Positive' 'Negative']


In [47]:
# Five nearest neighbours of 'murder' in the Skip-Gram embedding space.
model_sg.wv.most_similar(positive=['murder'], topn=5)


[('preacher', 0.6309787631034851),
 ('manor', 0.621044933795929),
 ('premeditated', 0.6008966565132141),
 ('precede', 0.5980029106140137),
 ('felony', 0.5890827178955078)]