In [None]:
# Mount Google Drive in the Colab runtime, then make the project folder the
# working directory; the rest of the notebook reads and writes files using
# paths relative to it.
import os

from google.colab import drive

drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/nlp')

Mounted at /content/drive


In [None]:
!pip install emoji

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-1.7.0.tar.gz (175 kB)
[K     |████████████████████████████████| 175 kB 5.3 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171046 sha256=75e157c59e1b1123f2d8e2a6c355e5a14405932a47878daf24e398db85f24aa5
  Stored in directory: /root/.cache/pip/wheels/8a/4e/b6/57b01db010d17ef6ea9b40300af725ef3e210cb1acfb7ac8b6
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-1.7.0


In [None]:
import pickle

import pandas as pd
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

import utils
from utils import preprocessing
from utils import feature_extraction

In [None]:
def read_files():
  """Load the train/test splits from CSV files in the working directory.

  The feature files (``x_train.csv``, ``x_test.csv``) are returned as
  DataFrames whose ``review`` column is forced to ``str``. The label files
  (``y_train.csv``, ``y_test.csv``) are flattened to 1-D arrays.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test)``.
  """
  feature_frames = [
      pd.read_csv(csv_name, converters={'review': str})
      for csv_name in ('x_train.csv', 'x_test.csv')
  ]
  label_arrays = [
      pd.read_csv(csv_name).values.ravel()
      for csv_name in ('y_train.csv', 'y_test.csv')
  ]

  return feature_frames[0], feature_frames[1], label_arrays[0], label_arrays[1]

# Logistic regression

In [None]:
# Logistic regression on TF-IDF features: load the splits, vectorise, fit,
# save the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#TFIDF
# Requests 1- and 2-gram features with stop-word removal disabled.
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_logistic_regression_tfidf.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

lr_predict = model.predict(x_test)

# classification_report applies target_names to the labels in sorted order, so
# 'Positive' is printed for label 0 and 'Negative' for label 1.
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

LogisticRegression(C=1, max_iter=500, random_state=123)
              precision    recall  f1-score   support

    Positive       0.92      0.88      0.90      3824
    Negative       0.88      0.92      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3381  295]
 [ 440 3384]]


In [None]:
# Disabled sanity check: reload the pickled TF-IDF logistic-regression model
# and score it on the test split.
# NOTE(review): this cell still carries a recorded output although all of its
# code is commented out, so that output comes from an earlier run; re-run the
# cell or clear the output.
# NOTE(review): pickle.load can execute arbitrary code; only load .sav files
# produced by this notebook.
#loaded_model = pickle.load(open('model_logistic_regression_tfidf.sav', 'rb'))
#result = loaded_model.score(x_test, y_test)
#print(result)

0.8845333333333333


In [None]:
# Logistic regression on count-vector features: load the splits, vectorise,
# fit, save the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#COUNT VECTORIZER
# Requests 1- and 2-gram counts, min_df=0.0, stop-word removal disabled; the
# first return value is not used here.
_, x_train, x_test = feature_extraction.get_count_vector(x_train['review'], x_test['review'], ngram_range=(1,2), min_df=0.0, remove_stopwords=False)

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_logistic_regression_cv.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

lr_predict = model.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

LogisticRegression(C=1, max_iter=500, random_state=123)
              precision    recall  f1-score   support

    Positive       0.91      0.89      0.90      3824
    Negative       0.89      0.91      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3357  319]
 [ 416 3408]]


In [None]:
# Logistic regression on word2vec embeddings: load the splits, embed, fit,
# save the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#WORD2VEC
# NOTE(review): the test reviews are passed to model creation as well as the
# training reviews; confirm that using test text here is intended.
word2vec_model = feature_extraction.create_word2vec_model(x_train['review'], x_test['review'])
x_train, x_test = feature_extraction.get_word2vec_embedding(word2vec_model, x_train['review'], x_test['review'])

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_logistic_regression_word2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

lr_predict = model.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

In [None]:
# Logistic regression on GloVe embeddings: load the splits, embed, fit, save
# the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#GLOVE
# A notebook has no __file__ (referencing it raises NameError), so the GloVe
# file is resolved against the working directory, like the CSV inputs.
dirname = os.getcwd()
filepath = os.path.join(dirname, 'glove.6B.200d.txt')

word2vec_output_file = 'glove.6B.200d' + '.word2vec'

glove_model = feature_extraction.load_glove_model(filepath, word2vec_output_file)
x_train, x_test = feature_extraction.get_glove_embedding(glove_model, x_train['review'], x_test['review'])

model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=123)
lr_fit = model.fit(x_train, y_train)
print(lr_fit)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_logistic_regression_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

lr_predict = model.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, lr_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, lr_predict, labels=[1,0])
print(cm)

# SVM

In [None]:
# Linear SVM (SGDClassifier with hinge loss) on TF-IDF features: load the
# splits, vectorise, fit, save the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#TFIDF
# Requests 1- and 2-gram features with stop-word removal disabled.
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed for the cells that run after this one.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SGD_tfidf.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.92      0.88      0.90      3824
    Negative       0.88      0.93      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3401  275]
 [ 471 3353]]


In [None]:
# Linear SVM (SGDClassifier with hinge loss) on count-vector features: load
# the splits, vectorise, fit, save the model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#COUNT VECTORIZER
# Requests 1- and 2-gram counts, min_df=0.0, stop-word removal disabled; the
# first return value is not used here.
_, x_train, x_test = feature_extraction.get_count_vector(x_train['review'], x_test['review'], ngram_range=(1,2), min_df=0.0, remove_stopwords=False)

model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed for the cells that run after this one.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SGD_cv.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.91      0.90      0.90      3824
    Negative       0.89      0.91      0.90      3676

    accuracy                           0.90      7500
   macro avg       0.90      0.90      0.90      7500
weighted avg       0.90      0.90      0.90      7500

[[3331  345]
 [ 400 3424]]


In [None]:
# LinearSVC on TF-IDF features: load the splits, vectorise, fit, save the
# model, then report on the test split.
x_train, x_test, y_train, y_test = read_files()

#TFIDF
# Requests 1- and 2-gram features with stop-word removal disabled.
dictionary, x_train, x_test = feature_extraction.get_tfidf_vector(x_train['review'], x_test['review'], remove_stopwords=False, ngram_range=(1,2))

# LinearSVC is referenced by its own name: the `svm` name may have been
# rebound to a fitted estimator by an earlier cell, in which case
# svm.LinearSVC() raises AttributeError.
model = LinearSVC()
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SVC_tfidf.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

              precision    recall  f1-score   support

    Positive       0.92      0.90      0.91      3824
    Negative       0.90      0.92      0.91      3676

    accuracy                           0.91      7500
   macro avg       0.91      0.91      0.91      7500
weighted avg       0.91      0.91      0.91      7500

[[3366  310]
 [ 387 3437]]


In [None]:
# LinearSVC on word2vec embeddings.
# Reload the raw splits first: without this, x_train/x_test still hold the
# feature matrices left by the previously executed cell, and indexing them
# with 'review' fails.
x_train, x_test, y_train, y_test = read_files()

#WORD2VEC
# NOTE(review): the test reviews are passed to model creation as well as the
# training reviews; confirm that using test text here is intended.
word2vec_model = feature_extraction.create_word2vec_model(x_train['review'], x_test['review'])
x_train, x_test = feature_extraction.get_word2vec_embedding(word2vec_model, x_train['review'], x_test['review'])

# LinearSVC is referenced by its own name: the `svm` name may have been
# rebound to a fitted estimator by an earlier cell, in which case
# svm.LinearSVC() raises AttributeError.
model = LinearSVC()
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SVC_word2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# Linear SVM (SGDClassifier with hinge loss) on the word2vec embeddings.
# This cell does not build features itself: it reuses the x_train/x_test
# embeddings left in the kernel by the word2vec cell directly above, so that
# cell must be run first.
model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed for the cells that run after this one.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SGD_word2vec.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# LinearSVC on GloVe embeddings.
# Reload the raw splits first: without this, x_train/x_test still hold the
# feature matrices left by the previously executed cell, and indexing them
# with 'review' fails.
x_train, x_test, y_train, y_test = read_files()

#GLOVE
# A notebook has no __file__ (referencing it raises NameError), so the GloVe
# file is resolved against the working directory, like the CSV inputs.
dirname = os.getcwd()
filepath = os.path.join(dirname, 'glove.6B.200d.txt')

word2vec_output_file = 'glove.6B.200d' + '.word2vec'

glove_model = feature_extraction.load_glove_model(filepath, word2vec_output_file)
x_train, x_test = feature_extraction.get_glove_embedding(glove_model, x_train['review'], x_test['review'])

# LinearSVC is referenced by its own name: the `svm` name may have been
# rebound to a fitted estimator by an earlier cell, in which case
# svm.LinearSVC() raises AttributeError.
model = LinearSVC()
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SVC_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

In [None]:
# Linear SVM (SGDClassifier with hinge loss) on the GloVe embeddings.
# This cell does not build features itself: it reuses the x_train/x_test
# embeddings left in the kernel by the GloVe cell directly above, so that
# cell must be run first.
model = SGDClassifier(loss='hinge', max_iter=500, random_state=123)
# Bound to svm_fit rather than svm so the imported sklearn `svm` module is not
# shadowed for any cell that runs after this one.
svm_fit = model.fit(x_train, y_train)

# Save the fitted model; the context manager closes the file handle, which the
# bare open(...) passed to pickle.dump never did.
with open('model_svm_SGD_glove.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

svm_predict = svm_fit.predict(x_test)

# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(y_test, svm_predict, target_names=['Positive', 'Negative'])
print(report)

# Rows (true) and columns (predicted) are ordered label 1 first, then label 0.
cm = confusion_matrix(y_test, svm_predict, labels=[1,0])
print(cm)

# MLP Classifier

In [None]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
import spacy

# spaCy pipeline whose document vectors are used as features by convert_data.
nlp_lg = spacy.load('en_core_web_lg')

# Review texts as plain arrays (the 'review' column is forced to str on read).
x_train = pd.read_csv('x_train.csv', converters={'review': str})['review'].values
x_test = pd.read_csv('x_test.csv', converters={'review': str})['review'].values

# Labels flattened to 1-D arrays.
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

#converting the data to a vector representation
def convert_data(corpus):
    """Return the spaCy document vector of every text in ``corpus``.

    Args:
        corpus: Iterable of strings to run through the ``nlp_lg`` pipeline.

    Returns:
        A list with one ``doc.vector`` per input text, in input order.
    """
    # Language.pipe processes the texts as a batched stream instead of making
    # one full pipeline call per document; the resulting Docs, and therefore
    # the vectors, are the same and come back in input order.
    return [doc.vector for doc in nlp_lg.pipe(corpus)]


# Replace the raw review texts with their document vectors.
x_train = convert_data(x_train)
x_test = convert_data(x_test)

#MLP classifier
# Three hidden layers (300, 150, 50 units), ReLU activations, Adam solver;
# random_state is fixed so the fit is repeatable.
classifier = MLPClassifier(hidden_layer_sizes=(300,150,50), max_iter=300, activation = 'relu',solver='adam',random_state=1)
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)

#creating the confusion matrix
# confusion_matrix expects (y_true, y_pred); the arguments were previously
# swapped, which printed the transpose of the matrix (rows = predictions).
# The accuracy below is unaffected because the diagonal is the same either way.
cm = confusion_matrix(y_test, y_pred)
diagonal_sum = cm.trace()
sum_of_all_elements = cm.sum()
accuracy = diagonal_sum / sum_of_all_elements

#printing the results of MLP
print("MLPClassifier accuracy: ", accuracy)
print("\nConfusion matrix: \n")
print(cm)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the MLP predictions on the test split.
# target_names follow the sorted labels (0 -> 'Positive', 1 -> 'Negative').
# NOTE(review): confirm this matches the label encoding of the y_*.csv files.
report = classification_report(
    y_true=y_test,
    y_pred=y_pred,
    target_names=['Positive', 'Negative'],
)
print(report)

# Confusion matrix with rows (true) and columns (predicted) ordered label 1
# first, then label 0.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[1, 0])
print(cm)