In [2]:
import sys
sys.path.append('../scripts')
from helpers import *
from feature_extraction import *
from modeling import *
from evaluation import *

In [3]:
base_path = '../data/processed_data/'
df_train = read_file(base_path + 'preprocessed_training_tweets.csv')
df_test = read_file(base_path + 'preprocessed_test_tweets.csv')
df_validation = read_file(base_path + 'preprocessed_validation_tweets.csv')
df_train.head()

Unnamed: 0,Tweet ID,entity,sentiment,Tweet content,Preprocessed Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,im get borderland murder
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,come border kill
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,im get borderland kill
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,im come borderland murder
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,im get borderland murder


## Drop nan values (after preprocessing)

In [4]:
df_train = df_train.dropna(subset=['Preprocessed Tweet content'])
df_test = df_test.dropna(subset=['Preprocessed Tweet content'])
df_validation = df_validation.dropna(subset=['Preprocessed Tweet content'])

## Text Vectorization
- Bag of Words
- TF-IDF
- Word2Vec
- GloVe
- FastText

### 1- Bag of Words

In [5]:
X_train_bow, X_val_bow, X_test_bow = vectorize_bow(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_bow

<72310x33350 sparse matrix of type '<class 'numpy.int64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 2- TF-IDF

In [6]:
X_train_tfidf, X_val_tfidf, X_test_tfidf = vectorize_tfidf(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'])
X_train_tfidf

<72310x33350 sparse matrix of type '<class 'numpy.float64'>'
	with 747231 stored elements in Compressed Sparse Row format>

### 3- Word2Vec

In [10]:
X_train_word2vec= vectorize_word2vec_data(df_train['Preprocessed Tweet content'])
X_val_word2vec= vectorize_word2vec_data(df_validation['Preprocessed Tweet content'])
X_test_word2vec= vectorize_word2vec_data(df_test['Preprocessed Tweet content'])
X_train_word2vec

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.014036,0.430571,0.539577,-0.639534,0.169186,-0.674001,-0.196263,1.268145,-0.705653,-0.638094,...,0.139400,-0.132278,0.427559,0.751168,0.187882,0.026096,-0.011868,-0.320428,0.818503,-0.103725
1,0.003110,0.609597,0.509721,0.255386,0.184877,-1.175817,-0.059922,0.602043,-0.376292,-0.663746,...,0.174276,0.272716,0.337077,0.474314,0.456278,0.718474,-0.375157,0.233253,0.383640,0.570185
2,0.046008,0.544611,0.573684,-0.694496,0.111231,-0.978861,-0.098933,1.243091,-0.972191,-0.676963,...,0.349270,-0.135414,0.413543,0.611177,0.286102,0.157505,-0.100442,-0.230821,0.858332,-0.009552
3,0.017449,0.575258,0.526059,-0.458798,0.180068,-0.963977,-0.360158,1.062833,-0.214202,-0.654413,...,-0.238597,0.028587,0.389649,0.991787,0.037014,0.211681,0.037916,-0.027362,0.736385,0.433018
4,0.014036,0.430571,0.539577,-0.639534,0.169186,-0.674001,-0.196263,1.268145,-0.705653,-0.638094,...,0.139400,-0.132278,0.427559,0.751168,0.187882,0.026096,-0.011868,-0.320428,0.818503,-0.103725
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72305,0.380933,0.731974,0.282214,0.290113,0.624013,-0.441692,-0.174329,1.056800,-0.558692,-0.436377,...,0.408003,0.355734,0.633011,0.062165,0.758302,0.086660,-0.263412,0.112832,0.422880,-0.171908
72306,0.426381,0.711457,0.193117,0.227430,0.649964,-0.512596,-0.282753,1.186269,-0.530241,-0.503277,...,0.327798,0.343191,0.633612,0.078843,0.734912,0.057234,-0.266654,-0.009306,0.439189,-0.238179
72307,0.379662,0.683923,0.215349,0.269714,0.643403,-0.451711,-0.241157,1.128227,-0.520607,-0.494656,...,0.373588,0.345824,0.601870,0.074338,0.722378,0.072142,-0.288318,-0.039966,0.404217,-0.202219
72308,0.343226,0.999473,0.388349,0.234204,0.605790,-0.542341,-0.158048,1.087535,-0.608297,-0.535299,...,0.457798,0.336251,0.458279,0.131583,0.748805,-0.155759,-0.166874,0.125870,0.468222,-0.189956


### 4- GloVe

In [11]:
glove_model = load_glove_model('../models/glove.6B.50d.txt')
X_train_glove, X_val_glove, X_test_glove = vectorize_glove(df_train['Preprocessed Tweet content'], df_validation['Preprocessed Tweet content'], df_test['Preprocessed Tweet content'], glove_model)
X_train_glove

array([[ 0.098708  ,  0.024415  ,  0.2143875 , ..., -0.27250675,
         0.18058433,  0.10823   ],
       [ 0.64638   , -0.31335667,  0.49528967, ..., -0.071779  ,
        -0.14538   , -0.32737267],
       [ 0.2580055 , -0.18138   ,  0.6702725 , ..., -0.30769175,
         0.06621182,  0.1871425 ],
       ...,
       [ 0.02429973, -0.05667118,  0.47029909, ..., -0.12513364,
         0.368412  , -0.10008964],
       [ 0.00721407, -0.08955614,  0.44675443, ..., -0.21374929,
         0.32166086, -0.09875436],
       [ 0.12131609,  0.04995064,  0.25476455, ..., -0.28956455,
         0.26706018, -0.04971727]])

## Train SVM model

-using Bag of Words

In [12]:
svm_model_bow = train_svm_model(X_train_bow, df_train['sentiment'])

[LibLinear]



In [16]:
predict_test_bow=svm_model_bow.predict(X_test_bow)
print("SVM using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

SVM using Bag Of Words
Accuracy: 0.9218436873747495
Confusion Matrix:
 [[ 84   3   0   5]
 [  3 115   3   4]
 [  0   3 126   5]
 [  6   4   3 135]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.90      0.91      0.91        92
    Negative       0.92      0.92      0.92       125
     Neutral       0.95      0.94      0.95       134
    Positive       0.91      0.91      0.91       148

    accuracy                           0.92       499
   macro avg       0.92      0.92      0.92       499
weighted avg       0.92      0.92      0.92       499



-using tf-idf

In [17]:
svm_model_tf_idf = train_svm_model(X_train_tfidf, df_train['sentiment'])

[LibLinear]

In [21]:
predict_test_tf_idf=svm_model_tf_idf.predict(X_test_tfidf)
print("SVM using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

SVM using tf-idf
Accuracy: 0.9138276553106213
Confusion Matrix:
 [[ 83   3   1   5]
 [  4 113   4   4]
 [  0   3 128   3]
 [  8   5   3 132]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.87      0.90      0.89        92
    Negative       0.91      0.90      0.91       125
     Neutral       0.94      0.96      0.95       134
    Positive       0.92      0.89      0.90       148

    accuracy                           0.91       499
   macro avg       0.91      0.91      0.91       499
weighted avg       0.91      0.91      0.91       499



-using word2vec

In [23]:
svm_model_word2vec = train_svm_model(X_train_word2vec, df_train['sentiment'])

[LibLinear]

In [24]:
predict_test_word2vec=svm_model_word2vec.predict(X_test_word2vec)
print("SVM using word2vec")
evaluate_model(df_test['sentiment'],predict_test_word2vec)

SVM using word2vec
Accuracy: 0.2685370741482966
Confusion Matrix:
 [[  0   0  92   0]
 [  0   0 125   0]
 [  0   0 134   0]
 [  0   0 148   0]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00        92
    Negative       0.00      0.00      0.00       125
     Neutral       0.27      1.00      0.42       134
    Positive       0.00      0.00      0.00       148

    accuracy                           0.27       499
   macro avg       0.07      0.25      0.11       499
weighted avg       0.07      0.27      0.11       499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-using glove

In [25]:
svm_model_glove = train_svm_model(X_train_glove, df_train['sentiment'])

[LibLinear]

In [26]:
predict_test_glove=svm_model_glove.predict(X_test_glove)
print("SVM using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

SVM using glove
Accuracy: 0.40080160320641284
Confusion Matrix:
 [[ 3 37 22 30]
 [ 1 77 22 25]
 [ 2 45 51 36]
 [ 0 46 33 69]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.50      0.03      0.06        92
    Negative       0.38      0.62      0.47       125
     Neutral       0.40      0.38      0.39       134
    Positive       0.43      0.47      0.45       148

    accuracy                           0.40       499
   macro avg       0.43      0.37      0.34       499
weighted avg       0.42      0.40      0.37       499



## Train naive bayes model

-using Bag of Words

In [27]:
naive_bayes_model_bow = train_naive_bayes(X_train_bow, df_train['sentiment'])

In [28]:
predict_test_bow=naive_bayes_model_bow.predict(X_test_bow)
print("naive bayes using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

naive bayes using Bag Of Words
Accuracy: 0.7715430861723447
Confusion Matrix:
 [[ 65  14   2  11]
 [  2 107   6  10]
 [  9  20  92  13]
 [  7  13   7 121]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.78      0.71      0.74        92
    Negative       0.69      0.86      0.77       125
     Neutral       0.86      0.69      0.76       134
    Positive       0.78      0.82      0.80       148

    accuracy                           0.77       499
   macro avg       0.78      0.77      0.77       499
weighted avg       0.78      0.77      0.77       499



-using tf-idf

In [29]:
naive_bayes_model_tf_idf = train_naive_bayes(X_train_tfidf, df_train['sentiment'])

In [30]:
predict_test_tf_idf=naive_bayes_model_tf_idf.predict(X_test_tfidf)
print("naive bayes using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

naive bayes using tf-idf
Accuracy: 0.7675350701402806
Confusion Matrix:
 [[ 54  21   2  15]
 [  0 115   2   8]
 [  2  29  86  17]
 [  4  12   4 128]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.90      0.59      0.71        92
    Negative       0.65      0.92      0.76       125
     Neutral       0.91      0.64      0.75       134
    Positive       0.76      0.86      0.81       148

    accuracy                           0.77       499
   macro avg       0.81      0.75      0.76       499
weighted avg       0.80      0.77      0.76       499



-using word2vec

In [47]:
X_train_scaled_word2vec, X_test_scaled_word2vec= scaler_data(X_train_word2vec, X_test_word2vec)
naive_bayes_model_word2vec = train_naive_bayes(X_train_scaled_word2vec, df_train['sentiment'])

In [48]:
predict_test_word2vec=naive_bayes_model_word2vec.predict(X_test_scaled_word2vec)
print("naive bayes using word2vec")
evaluate_model(df_test['sentiment'],predict_test_word2vec)

naive bayes using word2vec
Accuracy: 0.250501002004008
Confusion Matrix:
 [[  0  92   0   0]
 [  0 125   0   0]
 [  0 134   0   0]
 [  0 148   0   0]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00        92
    Negative       0.25      1.00      0.40       125
     Neutral       0.00      0.00      0.00       134
    Positive       0.00      0.00      0.00       148

    accuracy                           0.25       499
   macro avg       0.06      0.25      0.10       499
weighted avg       0.06      0.25      0.10       499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-using glove

In [49]:
X_train_scaled_glove, X_test_scaled_glove= scaler_data(X_train_glove, X_test_glove)
naive_bayes_model_glove = train_naive_bayes(X_train_scaled_glove, df_train['sentiment'])

In [50]:
predict_test_glove=naive_bayes_model_glove.predict(X_test_scaled_glove)
print("naive bayes using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

naive bayes using glove
Accuracy: 0.2545090180360721
Confusion Matrix:
 [[  0  92   0   0]
 [  0 125   0   0]
 [  0 134   0   0]
 [  0 146   0   2]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00        92
    Negative       0.25      1.00      0.40       125
     Neutral       0.00      0.00      0.00       134
    Positive       1.00      0.01      0.03       148

    accuracy                           0.25       499
   macro avg       0.31      0.25      0.11       499
weighted avg       0.36      0.25      0.11       499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Train logistic regression

-using bag of words

In [51]:
logistic_regression_model_bow = train_logistic_regression(X_train_bow, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [52]:
predict_test_bow=logistic_regression_model_bow.predict(X_test_bow)
print("logistic regression using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

logistic regression using Bag Of Words
Accuracy: 0.8997995991983968
Confusion Matrix:
 [[ 80   3   1   8]
 [  5 113   4   3]
 [  0   5 123   6]
 [  7   5   3 133]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.87      0.87      0.87        92
    Negative       0.90      0.90      0.90       125
     Neutral       0.94      0.92      0.93       134
    Positive       0.89      0.90      0.89       148

    accuracy                           0.90       499
   macro avg       0.90      0.90      0.90       499
weighted avg       0.90      0.90      0.90       499



-using tf-idf

In [53]:
logistic_regression_model_tf_idf = train_logistic_regression(X_train_tfidf, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [54]:
predict_test_tf_idf=logistic_regression_model_tf_idf.predict(X_test_tfidf)
print("logistic regression using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

logistic regression using tf-idf
Accuracy: 0.8777555110220441
Confusion Matrix:
 [[ 77   7   2   6]
 [  4 112   5   4]
 [  7   6 117   4]
 [  8   3   5 132]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.80      0.84      0.82        92
    Negative       0.88      0.90      0.89       125
     Neutral       0.91      0.87      0.89       134
    Positive       0.90      0.89      0.90       148

    accuracy                           0.88       499
   macro avg       0.87      0.87      0.87       499
weighted avg       0.88      0.88      0.88       499



-using word2vec

In [55]:
logistic_regression_model_word2vec = train_logistic_regression(X_train_word2vec, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [56]:
predict_test_word2vec=logistic_regression_model_word2vec.predict(X_test_word2vec)
print("logistic regression using word2vec")
evaluate_model(df_test['sentiment'],predict_test_word2vec)

logistic regression using word2vec
Accuracy: 0.2685370741482966
Confusion Matrix:
 [[  0   0  92   0]
 [  0   0 125   0]
 [  0   0 134   0]
 [  0   0 148   0]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.00      0.00      0.00        92
    Negative       0.00      0.00      0.00       125
     Neutral       0.27      1.00      0.42       134
    Positive       0.00      0.00      0.00       148

    accuracy                           0.27       499
   macro avg       0.07      0.25      0.11       499
weighted avg       0.07      0.27      0.11       499



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


-using glove

In [57]:
logistic_regression_model_glove = train_logistic_regression(X_train_glove, df_train['sentiment'])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [58]:
predict_test_glove=logistic_regression_model_glove.predict(X_test_glove)
print("logistic regression using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

logistic regression using glove
Accuracy: 0.40480961923847697
Confusion Matrix:
 [[ 8 31 26 27]
 [ 3 74 22 26]
 [ 3 43 53 35]
 [ 2 45 34 67]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.50      0.09      0.15        92
    Negative       0.38      0.59      0.47       125
     Neutral       0.39      0.40      0.39       134
    Positive       0.43      0.45      0.44       148

    accuracy                           0.40       499
   macro avg       0.43      0.38      0.36       499
weighted avg       0.42      0.40      0.38       499



## Train KNN

-using bag of words

In [59]:
knn_model_bow = train_knn(X_train_bow, df_train['sentiment'])

In [60]:
predict_test_bow=knn_model_bow.predict(X_test_bow)
print("knn using Bag Of Words")
evaluate_model(df_test['sentiment'],predict_test_bow)

knn using Bag Of Words
Accuracy: 0.9719438877755511
Confusion Matrix:
 [[ 88   1   0   3]
 [  0 124   0   1]
 [  3   1 129   1]
 [  2   1   1 144]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.95      0.96      0.95        92
    Negative       0.98      0.99      0.98       125
     Neutral       0.99      0.96      0.98       134
    Positive       0.97      0.97      0.97       148

    accuracy                           0.97       499
   macro avg       0.97      0.97      0.97       499
weighted avg       0.97      0.97      0.97       499



-using tf-idf

In [61]:
knn_model_tf_idf = train_knn(X_train_tfidf, df_train['sentiment'])

In [62]:
predict_test_tf_idf=knn_model_tf_idf.predict(X_test_tfidf)
print("knn using tf-idf")
evaluate_model(df_test['sentiment'],predict_test_tf_idf)

knn using tf-idf
Accuracy: 0.9458917835671342
Confusion Matrix:
 [[ 90   1   0   1]
 [  1 123   0   1]
 [  7   1 126   0]
 [ 13   2   0 133]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.81      0.98      0.89        92
    Negative       0.97      0.98      0.98       125
     Neutral       1.00      0.94      0.97       134
    Positive       0.99      0.90      0.94       148

    accuracy                           0.95       499
   macro avg       0.94      0.95      0.94       499
weighted avg       0.95      0.95      0.95       499



-using word2vec

In [65]:
knn_model_word2vec = train_knn(X_train_word2vec, df_train['sentiment'])

In [66]:
predict_test_word2vec=knn_model_word2vec.predict(X_test_word2vec)
print("knn using word2vec")
evaluate_model(df_test['sentiment'],predict_test_word2vec)

knn using word2vec
Accuracy: 0.2665330661322645
Confusion Matrix:
 [[ 6 61 13 12]
 [10 68 18 29]
 [10 81 22 21]
 [16 72 23 37]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.14      0.07      0.09        92
    Negative       0.24      0.54      0.33       125
     Neutral       0.29      0.16      0.21       134
    Positive       0.37      0.25      0.30       148

    accuracy                           0.27       499
   macro avg       0.26      0.26      0.23       499
weighted avg       0.28      0.27      0.25       499



-using glove

In [67]:
knn_model_glove = train_knn(X_train_glove, df_train['sentiment'])

In [68]:
predict_test_glove=knn_model_glove.predict(X_test_glove)
print("knn using glove")
evaluate_model(df_test['sentiment'],predict_test_glove)

knn using glove
Accuracy: 0.8637274549098196
Confusion Matrix:
 [[ 83   1   5   3]
 [  5 116   3   1]
 [  5  12 111   6]
 [ 11  13   3 121]]
Classification Report:
               precision    recall  f1-score   support

  Irrelevant       0.80      0.90      0.85        92
    Negative       0.82      0.93      0.87       125
     Neutral       0.91      0.83      0.87       134
    Positive       0.92      0.82      0.87       148

    accuracy                           0.86       499
   macro avg       0.86      0.87      0.86       499
weighted avg       0.87      0.86      0.86       499



## Using trained model for prediction on validation dataset

In [105]:
path = '../data/processed_data/'
# df_validation.drop(columns=['Tweet content','Preprocessed Tweet content'],inplace=True)
predict_val_bow=svm_model_bow.predict(X_val_bow)
df_validation['predicted sentiment'] = predict_val_bow

save_to_csv(df_validation, filename= path + 'validation_bow_svm.csv')

equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(svm + bow): {equal_count}")

Number of correct predictions(svm + bow): 464


In [106]:
predict_val_tf_idf=svm_model_tf_idf.predict(X_val_tfidf)
df_validation['predicted sentiment'] = predict_val_tf_idf
save_to_csv(df_validation, filename= path + 'validation_tf_idf_svm.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(svm + tf-idf): {equal_count}")

Number of correct predictions(svm + tf-idf): 466


In [107]:
predict_val_glove=svm_model_glove.predict(X_val_glove)
df_validation['predicted sentiment'] = predict_val_glove
save_to_csv(df_validation, filename= path + 'validation_glove_svm.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(svm + glove): {equal_count}")

Number of correct predictions(svm + glove): 215


In [108]:
predict_val_word2vec=svm_model_word2vec.predict(X_val_word2vec)
df_validation['predicted sentiment'] = predict_val_word2vec
save_to_csv(df_validation, filename= path + 'validation_word2vec_svm.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions (svm + word2vec): {equal_count}")

Number of correct predictions (svm + word2vec): 151


In [109]:
predict_val_bow=logistic_regression_model_bow.predict(X_val_bow)
df_validation['predicted sentiment'] = predict_val_bow
save_to_csv(df_validation, filename= path + 'validation_bow_lr.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(logistic_regression + bow): {equal_count}")

Number of correct predictions(logistic_regression + bow): 459


In [110]:
predict_val_tf_idf=logistic_regression_model_tf_idf.predict(X_val_tfidf)
df_validation['predicted sentiment'] = predict_val_tf_idf
save_to_csv(df_validation, filename= path + 'validation_tf_idf_lr.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(logistic_regression + tf-idf): {equal_count}")

Number of correct predictions(logistic_regression + tf-idf): 440


In [111]:
predict_val_glove=logistic_regression_model_glove.predict(X_val_glove)
df_validation['predicted sentiment'] = predict_val_glove
save_to_csv(df_validation, filename= path + 'validation_glove_lr.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(logistic_regression + glove): {equal_count}")

Number of correct predictions(logistic_regression + glove): 220


In [112]:
predict_val_word2vec=logistic_regression_model_word2vec.predict(X_val_word2vec)
df_validation['predicted sentiment'] = predict_val_word2vec
save_to_csv(df_validation, filename= path + 'validation_word2vec_lr.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions: {equal_count}")

Number of correct predictions: 151


In [114]:
predict_val_bow=naive_bayes_model_bow.predict(X_val_bow)
df_validation['predicted sentiment'] = predict_val_bow
save_to_csv(df_validation, filename= path + 'validation_bow_NB.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(naive_bayes + bow): {equal_count}")

Number of correct predictions(naive_bayes + bow): 401


In [115]:
predict_val_tf_idf=naive_bayes_model_tf_idf.predict(X_val_tfidf)
df_validation['predicted sentiment'] = predict_val_tf_idf
save_to_csv(df_validation, filename= path + 'validation_tf_idf_NB.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(naive_bayes + tf-idf): {equal_count}")

Number of correct predictions(naive_bayes + tf-idf): 395


In [116]:
predict_val_glove=naive_bayes_model_glove.predict(X_val_glove)
df_validation['predicted sentiment'] = predict_val_glove
save_to_csv(df_validation, filename= path + 'validation_glove_NB.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(naive_bayes + glove): {equal_count}")

Number of correct predictions(naive_bayes + glove): 164


In [117]:
predict_val_word2vec=naive_bayes_model_word2vec.predict(X_val_word2vec)
df_validation['predicted sentiment'] = predict_val_word2vec
save_to_csv(df_validation, filename= path + 'validation_word2vec_NB.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions (naive_bayes + word2vec): {equal_count}")

Number of correct predictions (naive_bayes + word2vec): 140


In [118]:
predict_val_bow=knn_model_bow.predict(X_val_bow)
df_validation['predicted sentiment'] = predict_val_bow
save_to_csv(df_validation, filename= path + 'validation_bow_KNN.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions (knn + bow): {equal_count}")

Number of correct predictions (knn + bow): 478


In [119]:
predict_val_tf_idf=knn_model_tf_idf.predict(X_val_tfidf)
df_validation['predicted sentiment'] = predict_val_tf_idf
save_to_csv(df_validation, filename= path + 'validation_tf_idf_KNN.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(knn + tf-idf): {equal_count}")

Number of correct predictions(knn + tf-idf): 476


In [120]:
predict_val_glove=knn_model_glove.predict(X_val_glove)
df_validation['predicted sentiment'] = predict_val_glove
save_to_csv(df_validation, filename= path + 'validation_glove_KNN.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions (knn + glove): {equal_count}")

Number of correct predictions (knn + glove): 436


In [121]:
predict_val_word2vec=knn_model_word2vec.predict(X_val_word2vec)
df_validation['predicted sentiment'] = predict_val_word2vec
save_to_csv(df_validation, filename= path + 'validation_word2vec_KNN.csv')
equal_count = (df_validation['predicted sentiment'] == df_validation['sentiment']).sum()
print(f"Number of correct predictions(knn + word2vec): {equal_count}")

Number of correct predictions(knn + word2vec): 128
