# Import libraries and helper functions

In [1]:
# Mount Google Drive at /content/drive (Colab-only import).
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the project folder on Drive. Later cells
# write model files with relative paths, so those files end up in this folder.
# NOTE(review): the `utils` package imported below presumably lives here too — confirm.
import os
os.chdir("/content/drive/My Drive/nlp/")

Mounted at /content/drive


In [2]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.0.0.tar.gz (197 kB)
[K     |████████████████████████████████| 197 kB 5.2 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.0.0-py3-none-any.whl size=193022 sha256=87bedfbacc210734c28547e66b66350da6e3a6fcc5b3423ca1ef828e96cc51cd
  Stored in directory: /root/.cache/pip/wheels/ec/29/4d/3cfe7452ac7d8d83b1930f8a6205c3c9649b24e80f9029fc38
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.0.0


In [3]:
!pip install gensim

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
import pickle
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import json
from sklearn import metrics as m
import numpy as np
from os import path
import utils
from utils import preprocessing
from utils import feature_extraction
from sklearn.calibration import CalibratedClassifierCV

In [5]:
def read_files(data_dir='/content/drive/My Drive/nlp/data'):
    """Load the train/test split of the un-preprocessed review dataset.

    Args:
        data_dir: directory containing the four CSV files
            ``x_train_without_preprocessing.csv``, ``x_test_without_preprocessing.csv``,
            ``y_train_without_preprocessing.csv`` and ``y_test_without_preprocessing.csv``.
            Defaults to the project's data folder on the mounted Google Drive.

    Returns:
        Tuple ``(x_train, x_test, y_train, y_test)``. The x values are
        DataFrames whose ``review`` column is read through ``str`` (so empty
        cells stay ``''`` instead of becoming NaN); the y values are the
        flattened 1-D arrays of the label files.
    """
    def _csv(name):
        # Build the full path of one dataset file inside data_dir.
        return path.join(data_dir, name + '_without_preprocessing.csv')

    x_train = pd.read_csv(_csv('x_train'), converters={'review': str})
    x_test = pd.read_csv(_csv('x_test'), converters={'review': str})
    # Label files are flattened so sklearn receives 1-D targets.
    y_train = pd.read_csv(_csv('y_train')).values.ravel()
    y_test = pd.read_csv(_csv('y_test')).values.ravel()

    return x_train, x_test, y_train, y_test

In [6]:
def json_metrics(file_name, prediction_model, embedding, metrics, df):
    """Append one experiment record to a JSON file holding a list of records.

    The record stores the model name, the embedding name, the metrics object
    and the rows of ``df`` (as a list of per-row dicts). If ``file_name``
    already exists, its JSON list is loaded and extended; otherwise a new
    one-element list is written.

    Args:
        file_name: path of the JSON file to create or extend.
        prediction_model: value stored under ``'Model'``.
        embedding: value stored under ``'User embedding'``.
        metrics: JSON-serialisable metrics stored under ``'Metrics'``.
        df: DataFrame whose rows are stored under ``'Data'``.
    """
    entry = {
        'Model': prediction_model,
        'User embedding': embedding,
        'Metrics': metrics,
        'Data': df.to_dict('records'),
    }

    # Start from the history already on disk, if there is one.
    history = []
    if path.isfile(file_name):
        with open(file_name) as existing:
            history = json.load(existing)

    history.append(entry)

    with open(file_name, 'w') as out:
        json.dump(history, out, indent=4)


def metrics(y_test, y_pred, target_names):
    """Combine binary confusion-matrix counts with a classification report.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        target_names: display names forwarded to ``classification_report``.

    Returns:
        dict holding the four confusion-matrix cells (as Python ints) followed
        by the entries of the dict-form classification report.

    Note:
        The flattened confusion matrix is unpacked into exactly four values,
        so this only works when the matrix is 2x2.
    """
    true_neg, false_pos, false_neg, true_pos = m.confusion_matrix(
        y_true=y_test, y_pred=y_pred
    ).ravel()

    combined = {
        'True negative': int(true_neg),
        'False positive': int(false_pos),
        'False negative': int(false_neg),
        'True positive': int(true_pos),
    }
    # Report entries are merged last, so they win on any key collision.
    combined.update(
        m.classification_report(y_test, y_pred, target_names=target_names, output_dict=True)
    )
    return combined

def metrics_for_neutral(y_test, y_pred, target_names):
    """Tally binary predictions against five-level sentiment labels.

    True labels 0 and 1 count as negative, 3 and 4 as positive, and 2 as
    neutral. Predictions are expected to be 0 (negative) or 1 (positive);
    pairs with any other value are not counted.

    Args:
        y_test: true labels (list, array or pandas Series).
        y_pred: predicted labels, same length as ``y_test``.
        target_names: unused; accepted so the signature mirrors ``metrics``.

    Returns:
        dict with the counts 'True negative', 'False positive',
        'False negative', 'True positive', 'Neutral positive' (neutral item
        predicted 1) and 'Neutral negative' (neutral item predicted 0).

    Raises:
        ValueError: if ``y_test`` and ``y_pred`` differ in length.
    """
    if len(y_test) != len(y_pred):
        raise ValueError(
            f'y_test and y_pred must have the same length, got {len(y_test)} and {len(y_pred)}'
        )

    counts = {
        'True negative': 0,
        'False positive': 0,
        'False negative': 0,
        'True positive': 0,
        'Neutral positive': 0,
        'Neutral negative': 0,
    }

    # Iterate over values rather than indexing with y_test[i]: on a pandas
    # Series, [i] is a label lookup and fails when the index is not 0..n-1.
    for true, prediction in zip(y_test, y_pred):
        if prediction == 1:
            if true in (0, 1):
                counts['False positive'] += 1
            elif true in (3, 4):
                counts['True positive'] += 1
            elif true == 2:
                counts['Neutral positive'] += 1
        elif prediction == 0:
            if true in (0, 1):
                counts['True negative'] += 1
            elif true in (3, 4):
                counts['False negative'] += 1
            elif true == 2:
                counts['Neutral negative'] += 1

    return counts

# Logistic regression

## Tuning

In [None]:
x_train, x_test, y_train, y_test = read_files()

#TFIDF
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

# Grid search over logistic-regression hyperparameters with 3-fold CV.
# The grid is split per solver: 'lbfgs' and 'newton-cg' only support the l2
# penalty, so combining them with 'l1' would only schedule fits that fail.
c_values = np.logspace(-4, 4, 20)
logModel = LogisticRegression()
param_grid = [
    {'penalty' : ['l1', 'l2'],
     'C' : c_values,
     'solver' : ['liblinear'],
     'max_iter' : [500, 1000]
    },
    {'penalty' : ['l2'],
     'C' : c_values,
     'solver' : ['lbfgs', 'newton-cg'],
     'max_iter' : [500, 1000]
    }
]
clf = GridSearchCV(logModel, param_grid = param_grid, cv=3, verbose=True)
best_clf = clf.fit(x_train, y_train)
best_clf.best_estimator_

In [None]:
# Print the best accuracy score for the training dataset
print(f'The best accuracy score for the training dataset is {best_clf.best_score_:.4f}')

# Print the hyperparameters for the best score
print(f'The best hyperparameters are {best_clf.best_params_}')

# Print the best accuracy score for the testing dataset
print(f'The accuracy score for the testing dataset is {best_clf.score(x_test, y_test):.4f}')

print(best_clf.best_estimator_.get_params())

## Best model

In [None]:
x_train, x_test, y_train, y_test = read_files()

#TFIDF
#_, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

#COUNT VECTORIZER
_, x_train, x_test = feature_extraction.get_count_vector(x_train['review'], x_test['review'], ngram_range=(1,2), min_df=0.0, remove_stopwords=False)

# Logistic regression on the count-vector features.
model = LogisticRegression(C=29.763514416313132, penalty='l2', max_iter=500)
lr_fit = model.fit(x_train, y_train)

predict = model.predict(x_test)
probab = model.predict_proba(x_test)

# classification_report assigns target_names in sorted label order (0, 1).
# Label 1 is the positive class (see the tn/fp/fn/tp unpacking in metrics),
# so the names must be given as ['Negative', 'Positive'].
metric = metrics(y_test, predict, ['Negative', 'Positive'])
json_metrics('/content/drive/My Drive/nlp/json/LR_cv_without_preprocessing_neutral.json', 'LR without preproces neutral', 'cv', metric, pd.DataFrame())

# Persist per-sample labels, predictions and class probabilities.
df= pd.DataFrame({'Id': np.arange(y_test.shape[0]), 'Label': y_test, 'Prediction': predict, 'Probability': probab.tolist()})
df.to_csv('/content/drive/My Drive/nlp/probab/lr_cv_without_preprocessing_neutral.csv')

## Word2vec

In [None]:
# NOTE(review): the saved output of this cell is "TypeError: ignored" and the
# traceback is not preserved, so the failing call cannot be identified from
# the notebook alone — re-run the cell to diagnose it.
x_train, x_test, y_train, y_test = read_files()

#WORD2VEC
word2vec_model = feature_extraction.create_word2vec_model(x_train['review'], x_test['review']) 
x_train, x_test = feature_extraction.get_word2vec_embedding(word2vec_model, x_train['review'], x_test['review']) 

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

lr_predict = model.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, lr_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

TypeError: ignored

## GloVe

In [None]:
x_train, x_test, y_train, y_test = read_files()

#GLOVE
# __file__ is not defined inside a notebook, so resolve the GloVe file against
# the current working directory (set to the project folder in the first cell).
# NOTE(review): assumes glove.6B.200d.txt is stored in that folder — confirm.
dirname = os.getcwd()
filepath = os.path.join(dirname, 'glove.6B.200d.txt')

word2vec_output_file = 'glove.6B.200d' + '.word2vec'

glove_model = feature_extraction.load_glove_model(filepath, word2vec_output_file)
x_train, x_test = feature_extraction.get_glove_embedding(glove_model, x_train['review'], x_test['review'])

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

# Save the model; the context manager closes the file handle deterministically.
with open('model_logistic_regression_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

lr_predict = model.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, lr_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

# SVM

In [None]:
x_train, x_test, y_train, y_test = read_files()

#TFIDF
#dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

#COUNT VECTORIZER
_, x_train, x_test = feature_extraction.get_count_vector(x_train['review'], x_test['review'], ngram_range=(1,2), min_df=0.0, remove_stopwords=False)

# Linear SVM trained with SGD (hinge loss). It is seeded like the other SGD
# cells in this notebook so repeated runs give the same model.
model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
# Hinge-loss SGD has no predict_proba; the calibration wrapper provides it.
clf = CalibratedClassifierCV(model) 
model = clf.fit(x_train, y_train)

predict = clf.predict(x_test)
probab = clf.predict_proba(x_test)

# classification_report assigns target_names in sorted label order (0, 1).
# Label 1 is the positive class (see the tn/fp/fn/tp unpacking in metrics),
# so the names must be given as ['Negative', 'Positive'].
metric = metrics(y_test, predict, ['Negative', 'Positive'])
json_metrics('/content/drive/My Drive/nlp/json/SGDC_without_preprocessing.json', 'SGDC without preproces', 'cv', metric, pd.DataFrame())

df= pd.DataFrame({'Id': np.arange(y_test.shape[0]), 'Label': y_test, 'Prediction': predict, 'Probability': probab.tolist()})
df.to_csv('/content/drive/My Drive/nlp/probab/SGDC_cv_without_preprocessing.csv')

In [None]:
# Load the data and build TF-IDF features inside this cell. With these lines
# commented out the cell silently reused whatever x_train / x_test the
# previous cell left behind (count vectors), while its outputs are labelled
# 'TFIDF'.
x_train, x_test, y_train, y_test = read_files()

#TFIDF
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

model = LinearSVC()
# LinearSVC has no predict_proba; the calibration wrapper provides it.
clf = CalibratedClassifierCV(model) 
model = clf.fit(x_train, y_train)

predict = clf.predict(x_test)
probab = clf.predict_proba(x_test)

# classification_report assigns target_names in sorted label order (0, 1).
# Label 1 is the positive class (see the tn/fp/fn/tp unpacking in metrics),
# so the names must be given as ['Negative', 'Positive'].
metric = metrics(y_test, predict, ['Negative', 'Positive'])
json_metrics('/content/drive/My Drive/nlp/json/LinearSVC_without_preprocessing.json', 'LinearSVC without preproces', 'TFIDF', metric, pd.DataFrame())

# Column is named 'Prediction' (capitalised) to match the other probability CSVs.
df= pd.DataFrame({'Id': np.arange(y_test.shape[0]), 'Label': y_test, 'Prediction': predict, 'Probability': probab.tolist()})
df.to_csv('/content/drive/My Drive/nlp/probab/LinearSVC_tfidf_without_preprocessing.csv')

## Tuning

In [None]:
x_train, x_test, y_train, y_test = read_files()

#TFIDF
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

# SVC with default hyperparameters on the TF-IDF features.
model = SVC()
svm = model.fit(x_train, y_train)

# Save the model; the context manager closes the file handle deterministically.
with open('model_svm_SVC_alone_tfidf_without_preprocessing.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, svm_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# Reload the raw text first: x_train / x_test may already hold feature
# matrices from an earlier cell, in which case indexing them with ['review']
# fails.
x_train, x_test, y_train, y_test = read_files()
_, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

# 'gamma' is only used by the rbf/poly/sigmoid kernels. With a linear kernel
# it has no effect, so searching over it only multiplies the number of
# identical fits; it is therefore left out of the grid.
param_grid = {'C': [0.1, 1, 10, 100], 
              'kernel': ['linear']}
 
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3)
grid.fit(x_train, y_train)

# print best parameter after tuning
print(grid.best_params_)
print(grid.best_estimator_)

# Evaluate the refitted best estimator on the held-out test set.
grid_predictions = grid.predict(x_test)
print(classification_report(y_test, grid_predictions))

## Word2Vec

In [None]:
# Reload the raw text first: x_train / x_test may already hold feature
# matrices from an earlier cell, in which case indexing them with ['review']
# fails.
x_train, x_test, y_train, y_test = read_files()

word2vec_model = feature_extraction.create_word2vec_model(x_train['review'], x_test['review'])
x_train, x_test = feature_extraction.get_word2vec_embedding(word2vec_model, x_train['review'], x_test['review'])

model = LinearSVC()
svm = model.fit(x_train, y_train)

# Save the model; the context manager closes the file handle deterministically.
with open('model_svm_SVC_word2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, svm_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# Uses the x_train / x_test / y_train / y_test left in the kernel by the
# previous cell (no data is loaded here).
model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
svm = model.fit(x_train, y_train)

# Save the model; the context manager closes the file handle deterministically.
with open('model_svm_SGD_word2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, svm_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

## GloVe

In [None]:
# Reload the raw text first: x_train / x_test may already hold feature
# matrices from an earlier cell, in which case indexing them with ['review']
# fails.
x_train, x_test, y_train, y_test = read_files()

# __file__ is not defined inside a notebook, so resolve the GloVe file against
# the current working directory (set to the project folder in the first cell).
# NOTE(review): assumes glove.6B.200d.txt is stored in that folder — confirm.
dirname = os.getcwd()
filepath = os.path.join(dirname, 'glove.6B.200d.txt')

word2vec_output_file = 'glove.6B.200d' + '.word2vec'

glove_model = feature_extraction.load_glove_model(filepath, word2vec_output_file)
x_train, x_test = feature_extraction.get_glove_embedding(glove_model, x_train['review'], x_test['review'])

model = LinearSVC()
svm = model.fit(x_train, y_train)

# Save the model; the context manager closes the file handle deterministically.
with open('model_svm_SVC_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, svm_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# Uses the x_train / x_test / y_train / y_test left in the kernel by the
# previous cell (no data is loaded here).
model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
svm = model.fit(x_train, y_train)

# Save the model; the context manager closes the file handle deterministically.
with open('model_svm_SGD_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm.predict(x_test)

# labels=[1, 0] ties 'Positive' to label 1 and 'Negative' to label 0; without
# it the names would be applied in sorted label order (0, 1), i.e. swapped.
report = classification_report(y_test, svm_predict, labels=[1, 0], target_names=['Positive', 'Negative'])
print(report)

cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

# Neutral dataset with best models

In [None]:
def get_tfidf(x_train, x_test, test, ngram_range = None):
    """Vectorise three text collections with one TF-IDF vocabulary.

    The vectorizer (``min_df=0.0002``) is fitted on ``x_train`` only and then
    applied to all three inputs, so they share the same feature columns.

    Args:
        x_train: training texts; the vocabulary is learned from these.
        x_test: test texts.
        test: additional texts to transform with the same vocabulary.
        ngram_range: optional ``(min_n, max_n)`` tuple; when omitted the
            vectorizer's default n-gram range is used.

    Returns:
        Tuple ``(x_train_vector, x_test_vector, test_vector)``.
    """
    options = {'min_df': 0.0002}
    if ngram_range is not None:
        options['ngram_range'] = ngram_range

    vectorizer = TfidfVectorizer(**options)
    vectorizer.fit(x_train)

    return tuple(vectorizer.transform(texts) for texts in (x_train, x_test, test))

In [None]:
# Load neutral_dataset_without_preprocessing.csv; the 'Phrase' column is read
# through str so every entry is a string.
neutral_df = pd.read_csv('/content/drive/My Drive/nlp/data/neutral_dataset_without_preprocessing.csv', converters = {'Phrase': str})
# Rename the columns to the 'review' / 'sentiment' names used by the cells below.
neutral_df.rename(columns = {'Phrase':'review', 'Sentiment':'sentiment'}, inplace = True)

x_train, x_test, y_train, y_test = read_files()
# Fit TF-IDF on the training reviews and transform the test reviews and the
# neutral-dataset phrases with the same vocabulary. x_train / x_test are
# overwritten with their feature matrices; `test` holds the neutral features.
x_train, x_test, test = get_tfidf(x_train['review'], x_test['review'], neutral_df['review'], ngram_range=(1,2))

<8529x136888 sparse matrix of type '<class 'numpy.float64'>'
	with 197346 stored elements in Compressed Sparse Row format>

## Logistic Regression

In [None]:
# Train logistic regression on the TF-IDF training features produced by the
# get_tfidf cell above (relies on x_train / y_train / test from that cell).
model = LogisticRegression(C=29.763514416313132, penalty='l2', max_iter=500)
lr_fit = model.fit(x_train, y_train)

# Predict labels and class probabilities for the neutral-dataset phrases.
predict = model.predict(test)
probab = model.predict_proba(test)

In [None]:
# One row per neutral-dataset entry: its SentenceId, original sentiment label,
# predicted label and the list of class probabilities. Written without the index.
df= pd.DataFrame({'Id': neutral_df['SentenceId'].to_numpy(), 'Label': neutral_df['sentiment'], 'Prediction': predict, 'Probability': probab.tolist()})
df.to_csv('/content/drive/My Drive/nlp/probab/Neutral_LogisticRegression_tfidf_without_preprocessing_both.csv', index=False)

In [None]:
# Tally the predictions against the five-level sentiment labels. The third
# argument is not used by metrics_for_neutral.
metrics_for_neutral(neutral_df['sentiment'], predict, ['1','0'])

{'False negative': 738,
 'False positive': 803,
 'Neutral negative': 872,
 'Neutral positive': 783,
 'True negative': 2469,
 'True positive': 2864}

## LinearSVC

In [None]:
# Train a LinearSVC wrapped in CalibratedClassifierCV on the TF-IDF training
# features (relies on x_train / y_train / test from the get_tfidf cell above).
# The wrapper is what provides predict_proba below.
model = LinearSVC()
clf = CalibratedClassifierCV(model) 
# Note: `model` is rebound to the fitted calibrated classifier here.
model = clf.fit(x_train, y_train)

# Predict labels and class probabilities for the neutral-dataset phrases.
predict = clf.predict(test)
probab = clf.predict_proba(test)

In [None]:
# One row per neutral-dataset entry: its SentenceId, original sentiment label,
# predicted label and the list of class probabilities. Written without the index.
df= pd.DataFrame({'Id': neutral_df['SentenceId'].to_numpy(), 'Label': neutral_df['sentiment'], 'Prediction': predict, 'Probability': probab.tolist()})
df.to_csv('/content/drive/My Drive/nlp/probab/Neutral_LinearSVC_tfidf_without_preprocessing_both.csv', index=False)

In [None]:
# Tally the predictions against the five-level sentiment labels. The third
# argument is not used by metrics_for_neutral.
metrics_for_neutral(neutral_df['sentiment'], predict, ['1','0'])

{'False negative': 744,
 'False positive': 783,
 'Neutral negative': 879,
 'Neutral positive': 776,
 'True negative': 2489,
 'True positive': 2858}