In [1]:
import sys
sys.path.append('../scripts')
from helpers import *
from feature_extraction import *
from modeling import *
from evaluation import *

In [2]:
base_path = '../data/processed_data/'
df_train = read_file(base_path + 'preprocessed_training_tweets.csv')
df_test = read_file(base_path + 'preprocessed_test_tweets.csv')
df_validation = read_file(base_path + 'preprocessed_validation_tweets.csv')
df_train.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,Preprocessed Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder


## Drop nan values (after preprocessing)

In [3]:
df_train = df_train.dropna(subset=['Preprocessed Tweet content'])
df_test = df_test.dropna(subset=['Preprocessed Tweet content'])
df_validation = df_validation.dropna(subset=['Preprocessed Tweet content'])

## Text Vectorization
- Bag of Words
- TF-IDF
- Word2Vec
- GloVe
- FastText

### 1- Bag of Words

In [14]:
X_train_bow, X_val_bow, X_test_bow = vectorize_bow(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_bow

<72310x33350 sparse matrix of type '<class 'numpy.int64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 2- TF-IDF

In [15]:
X_train_tfidf, X_val_tfidf, X_test_tfidf = vectorize_tfidf(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_tfidf

<72310x33350 sparse matrix of type '<class 'numpy.float64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 3- Word2Vec

In [16]:
X_train_word2vec, X_val_word2vec, X_test_word2vec = vectorize_word2vec_data(df_train['Tweet content'], df_validation['Tweet content'], df_test['Tweet content'])
X_train_word2vec

NameError: name 'Word2Vec' is not defined

### 4- GloVe

In [10]:
glove_model = load_glove_model('../models/glove.6B.50d.txt')
X_train_glove, X_val_glove, X_test_glove = vectorize_glove(df_train['Tweet content'], df_validation['Tweet content'], df_test['Tweet content'], glove_model)
X_train_glove

array([[ 0.16185583,  0.07815364,  0.12150591, ..., -0.23421918,
        -0.03753255,  0.14226   ],
       [ 0.35839301,  0.08509   ,  0.13379246, ..., -0.22806492,
        -0.10831277,  0.21318831],
       [ 0.21978219,  0.00331909,  0.28728227, ..., -0.24701373,
        -0.07912255,  0.17095545],
       ...,
       [ 0.2251264 ,  0.04810692,  0.2021436 , ..., -0.19074525,
         0.05156472, -0.08938948],
       [ 0.20418338,  0.03497481,  0.26873603, ..., -0.21134004,
         0.02856181, -0.05868534],
       [ 0.28219232,  0.1075846 ,  0.13965788, ..., -0.21725121,
         0.02088072, -0.0263792 ]])

## Train SVM model

-using Bag of Words

In [17]:
svm_model_bow = train_svm_model(X_train_bow, df_train['sentiment'])

[LibLinear]



In [18]:
predict_test_bow=svm_model_bow.predict(X_test_bow)
print("SVM using Bag Of Words")
evaluate_model(svm_model_bow, df_test['sentiment'],predict_test_bow)

SVM using Bag Of Words


{'accuracy': 0.9218436873747495,
 'confusion_matrix': array([[ 84,   3,   0,   5],
        [  3, 115,   3,   4],
        [  0,   3, 126,   5],
        [  6,   4,   3, 135]], dtype=int64),
 'precision': 0.9218436873747495,
 'recall': 0.9218436873747495,
 'f1_score': 0.9218436873747495}

-using tf-idf

In [19]:
svm_model_tf_idf = train_svm_model(X_train_tfidf, df_train['sentiment'])

[LibLinear]

In [20]:
predict_test_tf_idf=svm_model_tf_idf.predict(X_test_bow)
print("SVM using tf-idf")
evaluate_model(svm_model_tf_idf, df_test['sentiment'],predict_test_tf_idf)

SVM using tf-idf


{'accuracy': 0.8957915831663327,
 'confusion_matrix': array([[ 83,   4,   0,   5],
        [  3, 113,   4,   5],
        [  3,   8, 116,   7],
        [  7,   3,   3, 135]], dtype=int64),
 'precision': 0.8957915831663327,
 'recall': 0.8957915831663327,
 'f1_score': 0.8957915831663327}

In [12]:
svm_model_glove = train_svm_model(X_train_glove, df_train['sentiment'])

[LibLinear]

In [13]:
predict_test_glove=svm_model_glove.predict(X_test_glove)
print("SVM using glove")
evaluate_model(svm_model_glove, df_test['sentiment'],predict_test_glove)

SVM using glove


{'accuracy': 0.45691382765531063,
 'confusion_matrix': array([[ 3, 28, 23, 38],
        [ 0, 91, 20, 14],
        [ 2, 47, 48, 37],
        [ 2, 36, 24, 86]], dtype=int64),
 'precision': 0.45691382765531063,
 'recall': 0.45691382765531063,
 'f1_score': 0.45691382765531063}