In [1]:
import sys
sys.path.append('../scripts')
from helpers import *
from feature_extraction import *
from modeling import *
from evaluation import *

In [2]:
base_path = '../data/processed_data/'
df_train = read_file(base_path + 'preprocessed_training_tweets.csv')
df_test = read_file(base_path + 'preprocessed_test_tweets.csv')
df_validation = read_file(base_path + 'preprocessed_validation_tweets.csv')
df_train.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,Preprocessed Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder


## Drop nan values (after preprocessing)

In [3]:
df_train = df_train.dropna(subset=['Preprocessed Tweet content'])
df_test = df_test.dropna(subset=['Preprocessed Tweet content'])
df_validation = df_validation.dropna(subset=['Preprocessed Tweet content'])

## Text Vectorization
- Bag of Words
- TF-IDF
- Word2Vec
- GloVe
- FastText

### 1- Bag of Words

In [4]:
X_train_bow, X_val_bow, X_test_bow = vectorize_bow(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_bow

<72310x33350 sparse matrix of type '<class 'numpy.int64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 2- TF-IDF

In [5]:
X_train_tfidf, X_val_tfidf, X_test_tfidf = vectorize_tfidf(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_tfidf

<72310x33350 sparse matrix of type '<class 'numpy.float64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 3- Word2Vec

In [6]:
# X_train_word2vec, X_val_word2vec, X_test_word2vec = vectorize_word2vec_data(df_train['Tweet content'], df_validation['Tweet content'], df_test['Tweet content'])
# X_train_word2vec

### 4- GloVe

In [7]:
glove_model = load_glove_model('../models/glove.6B.50d.txt')
X_train_glove, X_val_glove, X_test_glove = vectorize_glove(df_train['Tweet content'], df_validation['Tweet content'], df_test['Tweet content'], glove_model)
X_train_glove

array([[ 0.16185583,  0.07815364,  0.12150591, ..., -0.23421918,
        -0.03753255,  0.14226   ],
       [ 0.35839301,  0.08509   ,  0.13379246, ..., -0.22806492,
        -0.10831277,  0.21318831],
       [ 0.21978219,  0.00331909,  0.28728227, ..., -0.24701373,
        -0.07912255,  0.17095545],
       ...,
       [ 0.2251264 ,  0.04810692,  0.2021436 , ..., -0.19074525,
         0.05156472, -0.08938948],
       [ 0.20418338,  0.03497481,  0.26873603, ..., -0.21134004,
         0.02856181, -0.05868534],
       [ 0.28219232,  0.1075846 ,  0.13965788, ..., -0.21725121,
         0.02088072, -0.0263792 ]])

## Train SVM model

-using Bag of Words

In [8]:
svm_model_bow = train_svm_model(X_train_bow, df_train['sentiment'])

[LibLinear]



In [9]:
predict_test_bow=svm_model_bow.predict(X_test_bow)
print("SVM using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

SVM using Bag Of Words
Accuracy: 0.9218436873747495
Confusion Matrix:
 [[ 84   3   0   5]
 [  3 115   3   4]
 [  0   3 126   5]
 [  6   4   3 135]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.90      0.91      0.91        92
    Negative       0.92      0.92      0.92       125
     Neutral       0.95      0.94      0.95       134
    Positive       0.91      0.91      0.91       148

    accuracy                           0.92       499
   macro avg       0.92      0.92      0.92       499
weighted avg       0.92      0.92      0.92       499



-using tf-idf

In [12]:
svm_model_tf_idf = train_svm_model(X_train_tfidf, df_train['sentiment'])

[LibLinear]

In [13]:
predict_test_tf_idf=svm_model_tf_idf.predict(X_test_bow)
print("SVM using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

SVM using tf-idf
Accuracy: 0.8957915831663327
Confusion Matrix:
 [[ 83   4   0   5]
 [  3 113   4   5]
 [  3   8 116   7]
 [  7   3   3 135]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.86      0.90      0.88        92
    Negative       0.88      0.90      0.89       125
     Neutral       0.94      0.87      0.90       134
    Positive       0.89      0.91      0.90       148

    accuracy                           0.90       499
   macro avg       0.89      0.90      0.89       499
weighted avg       0.90      0.90      0.90       499



-using glove

In [14]:
svm_model_glove = train_svm_model(X_train_glove, df_train['sentiment'])

[LibLinear]

In [15]:
predict_test_glove=svm_model_glove.predict(X_test_glove)
print("SVM using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

SVM using glove
Accuracy: 0.45691382765531063
Confusion Matrix:
 [[ 3 28 23 38]
 [ 0 91 20 14]
 [ 2 47 48 37]
 [ 2 36 24 86]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.43      0.03      0.06        92
    Negative       0.45      0.73      0.56       125
     Neutral       0.42      0.36      0.39       134
    Positive       0.49      0.58      0.53       148

    accuracy                           0.46       499
   macro avg       0.45      0.42      0.38       499
weighted avg       0.45      0.46      0.41       499



## Train naive bayes model

-using Bag of Words

In [16]:
naive_bayes_model_bow = train_naive_bayes(X_train_bow, df_train['sentiment'])

In [17]:
predict_test_bow=naive_bayes_model_bow.predict(X_test_bow)
print("naive bayes using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)


naive bayes using Bag Of Words
Accuracy: 0.7715430861723447
Confusion Matrix:
 [[ 65  14   2  11]
 [  2 107   6  10]
 [  9  20  92  13]
 [  7  13   7 121]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.78      0.71      0.74        92
    Negative       0.69      0.86      0.77       125
     Neutral       0.86      0.69      0.76       134
    Positive       0.78      0.82      0.80       148

    accuracy                           0.77       499
   macro avg       0.78      0.77      0.77       499
weighted avg       0.78      0.77      0.77       499



-using tf-idf

In [18]:
naive_bayes_model_tf_idf = train_naive_bayes(X_train_tfidf, df_train['sentiment'])

In [19]:
predict_test_tf_idf=naive_bayes_model_tf_idf.predict(X_test_tfidf)
print("naive bayes using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

naive bayes using tf-idf
Accuracy: 0.7675350701402806
Confusion Matrix:
 [[ 54  21   2  15]
 [  0 115   2   8]
 [  2  29  86  17]
 [  4  12   4 128]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.90      0.59      0.71        92
    Negative       0.65      0.92      0.76       125
     Neutral       0.91      0.64      0.75       134
    Positive       0.76      0.86      0.81       148

    accuracy                           0.77       499
   macro avg       0.81      0.75      0.76       499
weighted avg       0.80      0.77      0.76       499



-using glove

In [23]:
X_train_scaled_glove, X_test_scaled_glove= scaler_data(X_train_glove, X_test_glove)
naive_bayes_model_glove = train_naive_bayes(X_train_scaled_glove, df_train['sentiment'])

In [24]:
predict_test_glove=naive_bayes_model_glove.predict(X_test_scaled_glove)
print("naive bayes using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

naive bayes using glove
Accuracy: 0.250501002004008
Confusion Matrix:
 [[  0  92   0   0]
 [  0 125   0   0]
 [  0 134   0   0]
 [  0 148   0   0]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00        92
    Negative       0.25      1.00      0.40       125
     Neutral       0.00      0.00      0.00       134
    Positive       0.00      0.00      0.00       148

    accuracy                           0.25       499
   macro avg       0.06      0.25      0.10       499
weighted avg       0.06      0.25      0.10       499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train logistic regression

-using bag of words

In [25]:
logistic_regression_model_bow = train_logistic_regression(X_train_bow, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
predict_test_bow=logistic_regression_model_bow.predict(X_test_bow)
print("logistic regression using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

logistic regression using Bag Of Words
Accuracy: 0.8997995991983968
Confusion Matrix:
 [[ 80   3   1   8]
 [  5 113   4   3]
 [  0   5 123   6]
 [  7   5   3 133]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.87      0.87      0.87        92
    Negative       0.90      0.90      0.90       125
     Neutral       0.94      0.92      0.93       134
    Positive       0.89      0.90      0.89       148

    accuracy                           0.90       499
   macro avg       0.90      0.90      0.90       499
weighted avg       0.90      0.90      0.90       499



-using tf-idf

In [27]:
logistic_regression_model_tf_idf = train_logistic_regression(X_train_tfidf, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
predict_test_tf_idf=logistic_regression_model_tf_idf.predict(X_test_tfidf)
print("logistic regression using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

logistic regression using tf-idf
Accuracy: 0.8777555110220441
Confusion Matrix:
 [[ 77   7   2   6]
 [  4 112   5   4]
 [  7   6 117   4]
 [  8   3   5 132]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.80      0.84      0.82        92
    Negative       0.88      0.90      0.89       125
     Neutral       0.91      0.87      0.89       134
    Positive       0.90      0.89      0.90       148

    accuracy                           0.88       499
   macro avg       0.87      0.87      0.87       499
weighted avg       0.88      0.88      0.88       499



-using glove

In [29]:
logistic_regression_model_glove = train_logistic_regression(X_train_glove, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
predict_test_glove=logistic_regression_model_glove.predict(X_test_glove)
print("logistic regression using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

logistic regression using glove
Accuracy: 0.46292585170340683
Confusion Matrix:
 [[ 8 25 21 38]
 [ 1 86 21 17]
 [ 4 41 54 35]
 [ 5 32 28 83]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.44      0.09      0.15        92
    Negative       0.47      0.69      0.56       125
     Neutral       0.44      0.40      0.42       134
    Positive       0.48      0.56      0.52       148

    accuracy                           0.46       499
   macro avg       0.46      0.43      0.41       499
weighted avg       0.46      0.46      0.43       499



## Train KNN

-using bag of words

In [31]:
knn_model_bow = train_knn(X_train_bow, df_train['sentiment'])

In [32]:
predict_test_bow=knn_model_bow.predict(X_test_bow)
print("knn using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

knn using Bag Of Words
Accuracy: 0.9719438877755511
Confusion Matrix:
 [[ 88   1   0   3]
 [  0 124   0   1]
 [  3   1 129   1]
 [  2   1   1 144]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.95      0.96      0.95        92
    Negative       0.98      0.99      0.98       125
     Neutral       0.99      0.96      0.98       134
    Positive       0.97      0.97      0.97       148

    accuracy                           0.97       499
   macro avg       0.97      0.97      0.97       499
weighted avg       0.97      0.97      0.97       499



-using tf-idf

In [33]:
knn_model_tf_idf = train_knn(X_train_tfidf, df_train['sentiment'])

In [34]:
predict_test_tf_idf=knn_model_tf_idf.predict(X_test_tfidf)
print("knn using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

knn using tf-idf
Accuracy: 0.9458917835671342
Confusion Matrix:
 [[ 90   1   0   1]
 [  1 123   0   1]
 [  7   1 126   0]
 [ 13   2   0 133]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.81      0.98      0.89        92
    Negative       0.97      0.98      0.98       125
     Neutral       1.00      0.94      0.97       134
    Positive       0.99      0.90      0.94       148

    accuracy                           0.95       499
   macro avg       0.94      0.95      0.94       499
weighted avg       0.95      0.95      0.95       499



-using glove

In [35]:
knn_model_glove = train_knn(X_train_glove, df_train['sentiment'])

In [36]:
predict_test_glove=knn_model_glove.predict(X_test_glove)
print("knn using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

knn using glove
Accuracy: 0.8657314629258517
Confusion Matrix:
 [[ 74   4   8   6]
 [  5 114   5   1]
 [  6   3 116   9]
 [  8   5   7 128]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.80      0.80      0.80        92
    Negative       0.90      0.91      0.91       125
     Neutral       0.85      0.87      0.86       134
    Positive       0.89      0.86      0.88       148

    accuracy                           0.87       499
   macro avg       0.86      0.86      0.86       499
weighted avg       0.87      0.87      0.87       499

