In [1]:
import numpy as np
import pandas as pd


In [2]:
train_df = pd.read_csv('train_data.csv')

In [3]:
test_df = pd.read_csv('test_data.csv')

In [4]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

In [5]:
# Tokenize sentences into words
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in train_df['tweet'].astype(str)]

# Train Word2Vec model
w2v_model = Word2Vec(sentences=tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save or load the model
w2v_model.save("word2vec_model.model")
# w2v_model = Word2Vec.load("word2vec_model.model")

# Get the word vectors
word_vectors = w2v_model.wv

In [6]:
def vectorize(sentence):
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if len(words_vecs) == 0:
        return np.zeros(100)
    words_vecs = np.array(words_vecs)
    return words_vecs.mean(axis=0)

X_train = np.array([vectorize(str(sentence)) for sentence in train_df['tweet']])
X_test = np.array([vectorize(str(sentence)) for sentence in test_df['tweet']])

In [7]:
Y_train = train_df['decision']

In [8]:
Y_test = test_df['decision']

In [9]:
from sklearn import svm

#Create a svm Classifier
clf = svm.SVC(kernel='rbf', C=1.0, gamma='scale', decision_function_shape='ovr') # Linear Kernel

#Train the model using the training sets
clf.fit(X_train, Y_train)

#Predict the response for test dataset
Y_pred = clf.predict(X_test)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print('Accuracy:', accuracy_score(Y_test, Y_pred))

Accuracy: 0.7404637588636311


In [11]:
precision = precision_score(Y_test, Y_pred, average='weighted')
f1 = f1_score(Y_test, Y_pred, average='weighted')
recall = recall_score(Y_test, Y_pred, average='weighted')
print(precision, f1, recall)

0.7429559171197867 0.7183387672051681 0.7404637588636311


In [12]:
from sklearn.metrics import classification_report

In [13]:
report = classification_report(Y_test, Y_pred, target_names=['negative','neutral','positive'])

In [14]:
print(report)

              precision    recall  f1-score   support

    negative       0.75      0.29      0.41      8038
     neutral       0.76      0.75      0.76     13637
    positive       0.73      0.90      0.80     22184

    accuracy                           0.74     43859
   macro avg       0.75      0.65      0.66     43859
weighted avg       0.74      0.74      0.72     43859

