In [None]:
'''
    Afshin Karimi
    99210431
'''

## Initialaize

In [1]:
# import libraries
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
import gensim
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import pickle
from prettytable import PrettyTable

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
def analysis(labels, predictions):
    print(f"Report: Classification\n{classification_report(labels, predictions, target_names=['positive','negative'])}")
    print(f"Matrix: Confusion\n{confusion_matrix(labels, predictions)}")
    print(f"Accuracy: \n{accuracy_score(labels, predictions)}")

In [3]:
# connect to my drive for loading dataset
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [36]:
# load dataset
df = pd.read_csv('/content/gdrive/My Drive/Datasets/dataset.csv')
df.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",negative
1,If you're a layman interested in quantum theor...,negative
2,It's amazing that this no talent actor Chapa g...,negative
3,This must be one of the most overrated Spanish...,negative
4,Some critics have compared Chop Shop with the ...,positive


In [37]:
# convert sentiment results to number {0,1}
df['sentiment'].replace('negative',0,inplace=True)
df['sentiment'].replace('positive',1,inplace=True)
df.head()

Unnamed: 0,comment,sentiment
0,"Oh my god, it just doesn't get any worse than ...",0
1,If you're a layman interested in quantum theor...,0
2,It's amazing that this no talent actor Chapa g...,0
3,This must be one of the most overrated Spanish...,0
4,Some critics have compared Chop Shop with the ...,1


### Preprocess

In [38]:
def preprocess(df, mode = 'low'):
    df_copy = df.copy()
    # convert all comments to lowercase
    df_copy["comment"] = df_copy["comment"].str.lower()
    # remove numbers from comments
    df_copy['comment'] = df_copy['comment'].str.replace('\d+', '')
    # remove special characters
    df_copy.replace(r'[^A-Za-z0-9 ]+', '', regex=True,inplace=True)
    # word tokenize
    df_copy['comment']=df_copy['comment'].apply(word_tokenize)
    if mode == 'high':
        # stop words removal
        stopword = stopwords.words('english')
        df_copy['comment'] = df_copy['comment'].apply(lambda x: [item for item in x if item not in stopword])
        # lemma
        lemmatizer = WordNetLemmatizer()
        df_copy['comment'] = df_copy['comment'].apply(lambda x: [lemmatizer.lemmatize(w) for w in x])
        # stemming
        snowball_stemmer = SnowballStemmer('english')
        df_copy['comment'] = df_copy['comment'].apply(lambda x: [snowball_stemmer.stem(w) for w in x])
        # Removal of Frequent words
        cnt = Counter()
        for text in df_copy["comment"].values:
            for word in text:
                cnt[word] += 1
        FREQWORDS = set([w for (w, wc) in cnt.most_common(6)])
        df_copy["comment"] = df_copy["comment"].apply(lambda text: list([word for word in text if word not in FREQWORDS]))
        # Removal of Rare words
        n_rare_words = 10
        RAREWORDS = set([w for (w, wc) in cnt.most_common()[:-n_rare_words-1:-1]])
        df_copy["comment"] = df_copy["comment"].apply(lambda text: list([word for word in text if word not in RAREWORDS]))
    return df_copy

In [7]:
df_low = preprocess(df, mode = 'low')
df_high = preprocess(df, mode = 'high')

### Vectorization

In [34]:
def vectorize(df, vectorization_type = 'bow', is_preprocessed = True , bow_features_num = 1000, w2v_size = 200):
    bow_model = None
    df_comments = df['comment']
    if vectorization_type == 'bow':
        bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=bow_features_num)
        if is_preprocessed:
            df_comments = [' '.join([str(elem) for elem in sublist]) for sublist in df['comment']]
        bow_model = bow_vectorizer.fit_transform(df_comments)
        return bow_model
    elif vectorization_type == 'w2v':
        w2v_vectorizer = gensim.models.Word2Vec(df['comment'],
                     min_count=20,
                     size=w2v_size,
                     workers=8)
        wordvec_arrays = np.zeros((len(df['comment']), w2v_size)) 
        for i in range(len(df['comment'])):
            wordvec_arrays[i,:] = word_vector(df['comment'][i], w2v_size,w2v_vectorizer)
        w2v_model = pd.DataFrame(wordvec_arrays)
        return w2v_model
    bow_vectorizer.fit_transform(df['comment'])
    return

# create a vector for each comment by taking the average of the vectors of the words present in the comment.
def word_vector(tokens, size, w2v_vectorizer):
    vec = np.zeros(size).reshape((1, size))
    count = 0
    for word in tokens:
        try:
            vec += w2v_vectorizer[word].reshape((1, size))
            count += 1.
        except KeyError:  # handling the case where the token is not in vocabulary
            continue
    if count != 0:
        vec /= count
    return vec

In [None]:
model = vectorize(df, is_preprocessed = False)
model_low = vectorize(df_low, vectorization_type='bow')
model_high = vectorize(df_high, vectorization_type='bow')
model_w2v = vectorize(df_high, vectorization_type='w2v')

In [None]:
import warnings
warnings.filterwarnings("ignore")

## Logistic Regression

In [None]:
lr_dict = {}

def log_reg(model, labels, mode = 0):
    X_train, X_test, y_train, y_test = train_test_split(model, labels, train_size=0.8)
    for c in [0.01, 0.05, 0.25, 0.5, 1]:
        results = []
        lr = LogisticRegression(C=c)
        lr.fit(X_train, y_train)
        acc_score = accuracy_score(y_test, lr.predict(X_test))
        results.append(c)
        print('Accuracy for C=%s: %s'
            % (c, acc_score))
    nl = '\n'
    best_c = max(results)
    print(f'{nl}So the best C is : {best_c}')
    lr_model = LogisticRegression(C=best_c)
    lr_model.fit(X_train, y_train)
    predictions = lr_model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    if mode == 0:
        lr_dict['LR without preprocess(BoW)'] = score
    elif mode == 1:
        lr_dict['LR with low preprocess(BoW)'] = score
    elif mode == 2:
        lr_dict['LR with high preprocess(BoW)'] = score
    elif mode == 3:
        lr_dict['LR with high preprocess(w2v)'] = score
    analysis(y_test,predictions)
    return lr_model

In [None]:
# Bag-of-Words without preprocess
lr_model1 = log_reg(model,df['sentiment'])

Accuracy for C=0.01: 0.8674444444444445
Accuracy for C=0.05: 0.8688888888888889
Accuracy for C=0.25: 0.8668888888888889
Accuracy for C=0.5: 0.8666666666666667
Accuracy for C=1: 0.8661111111111112

So the best C is : 1
Report: Classification
              precision    recall  f1-score   support

    positive       0.87      0.86      0.87      4495
    negative       0.86      0.87      0.87      4505

    accuracy                           0.87      9000
   macro avg       0.87      0.87      0.87      9000
weighted avg       0.87      0.87      0.87      9000

Matrix: Confusion
[[3864  631]
 [ 574 3931]]
Accuracy: 
0.8661111111111112


In [None]:
# Bag-of-Words with low preprocess
lr_model2 = log_reg(model_low, df['sentiment'], mode=1)

Accuracy for C=0.01: 0.8666666666666667
Accuracy for C=0.05: 0.8654444444444445
Accuracy for C=0.25: 0.8643333333333333
Accuracy for C=0.5: 0.8628888888888889
Accuracy for C=1: 0.8632222222222222

So the best C is : 1
Report: Classification
              precision    recall  f1-score   support

    positive       0.87      0.86      0.86      4510
    negative       0.86      0.87      0.86      4490

    accuracy                           0.86      9000
   macro avg       0.86      0.86      0.86      9000
weighted avg       0.86      0.86      0.86      9000

Matrix: Confusion
[[3875  635]
 [ 596 3894]]
Accuracy: 
0.8632222222222222


In [None]:
# Bag-of-Words with high preprocess
lr_model3 = log_reg(model_high, df['sentiment'], mode=2)

Accuracy for C=0.01: 0.8563333333333333
Accuracy for C=0.05: 0.8561111111111112
Accuracy for C=0.25: 0.8544444444444445
Accuracy for C=0.5: 0.8541111111111112
Accuracy for C=1: 0.8537777777777777

So the best C is : 1
Report: Classification
              precision    recall  f1-score   support

    positive       0.86      0.84      0.85      4507
    negative       0.84      0.87      0.86      4493

    accuracy                           0.85      9000
   macro avg       0.85      0.85      0.85      9000
weighted avg       0.85      0.85      0.85      9000

Matrix: Confusion
[[3788  719]
 [ 597 3896]]
Accuracy: 
0.8537777777777777


In [None]:
# Word2Vec with high preprocess
lr_model4 = log_reg(model_w2v, df['sentiment'], mode=3)

Accuracy for C=0.01: 0.8631111111111112
Accuracy for C=0.05: 0.871
Accuracy for C=0.25: 0.874
Accuracy for C=0.5: 0.8747777777777778
Accuracy for C=1: 0.8768888888888889

So the best C is : 1
Report: Classification
              precision    recall  f1-score   support

    positive       0.88      0.87      0.88      4469
    negative       0.88      0.88      0.88      4531

    accuracy                           0.88      9000
   macro avg       0.88      0.88      0.88      9000
weighted avg       0.88      0.88      0.88      9000

Matrix: Confusion
[[3899  570]
 [ 538 3993]]
Accuracy: 
0.8768888888888889


In [None]:
lr_dict

{'LR with high preprocess(BoW)': 0.8537777777777777,
 'LR with high preprocess(w2v)': 0.8768888888888889,
 'LR with low preprocess(BoW)': 0.8632222222222222,
 'LR without preprocess(BoW)': 0.8661111111111112}

## KNN

In [None]:
knn_dict = {}

def best_k(X_train, y_train):
    param_grid = {'n_neighbors':np.arange(9,12)}
    knn = KNeighborsClassifier()
    knn_cv= GridSearchCV(knn,param_grid,cv=5)
    knn_cv.fit(X_train, y_train)
    nl = '\n'
    bestk = knn_cv.best_params_['n_neighbors']
    print(f'Best score is {knn_cv.best_score_} for k = {bestk} {nl}')
    return bestk

def KNN(model, labels, mode = 0):
    X_train, X_test, y_train, y_test = train_test_split(model, labels, train_size=0.8)
    best_k_num = best_k(X_train, y_train)
    #Setup a knn classifier with k neighbors
    knn_model = KNeighborsClassifier(n_neighbors=best_k_num)
    #Fit the model
    knn_model.fit(X_train, y_train)
    predictions = knn_model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    if mode == 0:
        knn_dict['kNN without preprocess(BoW)'] = score
    elif mode == 1:
        knn_dict['kNN with low preprocess(BoW)'] = score
    elif mode == 2:
        knn_dict['kNN with high preprocess(BoW)'] = score
    elif mode == 3:
        knn_dict['kNN with high preprocess(w2v)'] = score
    analysis(y_test,predictions)
    return knn_model


In [None]:
# Bag-of-Words without preprocess (KNN)
knn_model1 = KNN(model,df['sentiment'])

Best score is 0.6520277777777779 for k = 10 

Report: Classification
              precision    recall  f1-score   support

    positive       0.67      0.65      0.66      4587
    negative       0.64      0.66      0.65      4413

    accuracy                           0.65      9000
   macro avg       0.65      0.65      0.65      9000
weighted avg       0.65      0.65      0.65      9000

Matrix: Confusion
[[2975 1612]
 [1496 2917]]
Accuracy: 
0.6546666666666666


In [None]:
# Bag-of-Words with low preprocess (KNN)
knn_model2 = KNN(model_low,df['sentiment'],mode=1)

Best score is 0.6554166666666666 for k = 10 

Report: Classification
              precision    recall  f1-score   support

    positive       0.65      0.66      0.65      4435
    negative       0.66      0.66      0.66      4565

    accuracy                           0.66      9000
   macro avg       0.66      0.66      0.66      9000
weighted avg       0.66      0.66      0.66      9000

Matrix: Confusion
[[2916 1519]
 [1570 2995]]
Accuracy: 
0.6567777777777778


In [None]:
# Bag-of-Words with high preprocess (KNN)
knn_model3 = KNN(model_high,df['sentiment'],mode=2)

Best score is 0.69675 for k = 11 

Report: Classification
              precision    recall  f1-score   support

    positive       0.69      0.70      0.70      4473
    negative       0.70      0.69      0.70      4527

    accuracy                           0.70      9000
   macro avg       0.70      0.70      0.70      9000
weighted avg       0.70      0.70      0.70      9000

Matrix: Confusion
[[3131 1342]
 [1395 3132]]
Accuracy: 
0.6958888888888889


In [None]:
# Word2Vec with high preprocess (KNN)
knn_model4 = KNN(model_w2v, df['sentiment'], mode=3)

Best score is 0.8166944444444445 for k = 11 

Report: Classification
              precision    recall  f1-score   support

    positive       0.81      0.84      0.82      4513
    negative       0.83      0.80      0.82      4487

    accuracy                           0.82      9000
   macro avg       0.82      0.82      0.82      9000
weighted avg       0.82      0.82      0.82      9000

Matrix: Confusion
[[3789  724]
 [ 899 3588]]
Accuracy: 
0.8196666666666667


In [None]:
knn_dict

{'kNN with high preprocess(BoW)': 0.6958888888888889,
 'kNN with high preprocess(w2v)': 0.8196666666666667,
 'kNN with low preprocess(BoW)': 0.6567777777777778,
 'kNN without preprocess(BoW)': 0.6546666666666666}

## SVM

In [None]:
svm_dict = {}

def svm(model, labels, mode = 0):
    X_train, X_test, y_train, y_test = train_test_split(model, labels, train_size=0.8)
    svm_model = SGDClassifier()
    svm_model.fit(X_train, y_train)
    predictions = svm_model.predict(X_test)
    score = accuracy_score(y_test, predictions)
    if mode == 0:
        svm_dict['SVM without preprocess(BoW)'] = score
    elif mode == 1:
        svm_dict['SVM with low preprocess(BoW)'] = score
    elif mode == 2:
        svm_dict['SVM with high preprocess(BoW)'] = score
    elif mode == 3:
        svm_dict['SVM with high preprocess(w2v)'] = score
    analysis(y_test,predictions)
    return svm_model


In [None]:
# Bag-of-Words without preprocess (SVM)
svm_model1 = svm(model,df['sentiment'])

Report: Classification
              precision    recall  f1-score   support

    positive       0.92      0.77      0.84      4476
    negative       0.80      0.93      0.86      4524

    accuracy                           0.85      9000
   macro avg       0.86      0.85      0.85      9000
weighted avg       0.86      0.85      0.85      9000

Matrix: Confusion
[[3431 1045]
 [ 310 4214]]
Accuracy: 
0.8494444444444444


In [None]:
# Bag-of-Words with low preprocess (SVM)
svm_model2 = svm(model_low, df['sentiment'], mode=1)

Report: Classification
              precision    recall  f1-score   support

    positive       0.84      0.88      0.86      4476
    negative       0.88      0.83      0.85      4524

    accuracy                           0.86      9000
   macro avg       0.86      0.86      0.86      9000
weighted avg       0.86      0.86      0.86      9000

Matrix: Confusion
[[3955  521]
 [ 781 3743]]
Accuracy: 
0.8553333333333333


In [None]:
# Bag-of-Words with high preprocess (SVM)
svm_model3 = svm(model_high, df['sentiment'], mode=2)

Report: Classification
              precision    recall  f1-score   support

    positive       0.89      0.80      0.84      4476
    negative       0.82      0.90      0.86      4524

    accuracy                           0.85      9000
   macro avg       0.86      0.85      0.85      9000
weighted avg       0.86      0.85      0.85      9000

Matrix: Confusion
[[3583  893]
 [ 443 4081]]
Accuracy: 
0.8515555555555555


In [None]:
# Word2Vec with high preprocess (SVM)
svm_model4 = svm(model_w2v, df['sentiment'], mode=3)

Report: Classification
              precision    recall  f1-score   support

    positive       0.90      0.78      0.84      4476
    negative       0.81      0.91      0.86      4524

    accuracy                           0.85      9000
   macro avg       0.86      0.85      0.85      9000
weighted avg       0.85      0.85      0.85      9000

Matrix: Confusion
[[3506  970]
 [ 388 4136]]
Accuracy: 
0.8491111111111111


In [None]:
svm_dict

{'SVM with high preprocess(BoW)': 0.8515555555555555,
 'SVM with high preprocess(w2v)': 0.8491111111111111,
 'SVM with low preprocess(BoW)': 0.8553333333333333,
 'SVM without preprocess(BoW)': 0.8494444444444444}

## Evaluate and save best results

In [None]:
def print_results(result_dict):
    t = PrettyTable(['model', 'accuracy'])
    for key, val in result_dict.items():
        t.add_row([key, val])
    print(t)    

In [16]:
def save_model(model, model_name):
    model_name = model_name + '.pkl'
    with open(model_name,'wb') as f:
        pickle.dump(model,f)

In [None]:
print_results(lr_dict)

+------------------------------+--------------------+
|            model             |      accuracy      |
+------------------------------+--------------------+
|  LR without preprocess(BoW)  | 0.8661111111111112 |
| LR with low preprocess(BoW)  | 0.8632222222222222 |
| LR with high preprocess(BoW) | 0.8537777777777777 |
| LR with high preprocess(w2v) | 0.8768888888888889 |
+------------------------------+--------------------+


In [None]:
# save LR with high preprocess(w2v) according to above results
save_model(lr_model4,'LR')

In [None]:
print_results(knn_dict)

+-------------------------------+--------------------+
|             model             |      accuracy      |
+-------------------------------+--------------------+
|  kNN without preprocess(BoW)  | 0.6546666666666666 |
|  kNN with low preprocess(BoW) | 0.6567777777777778 |
| kNN with high preprocess(BoW) | 0.6958888888888889 |
| kNN with high preprocess(w2v) | 0.8196666666666667 |
+-------------------------------+--------------------+


In [None]:
# save kNN with high preprocess(w2v) according to above results
save_model(knn_model4,'kNN')

In [None]:
print_results(svm_dict)

+-------------------------------+--------------------+
|             model             |      accuracy      |
+-------------------------------+--------------------+
|  SVM without preprocess(BoW)  | 0.8494444444444444 |
|  SVM with low preprocess(BoW) | 0.8553333333333333 |
| SVM with high preprocess(BoW) | 0.8515555555555555 |
| SVM with high preprocess(w2v) | 0.8491111111111111 |
+-------------------------------+--------------------+


In [None]:
# save SVM with low preprocess(BoW) according to above results
save_model(svm_model2,'SVM')

## MLP

In [48]:
bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=500)
df_comments = [' '.join([str(elem) for elem in sublist]) for sublist in df_high['comment']]
model_high = bow_vectorizer.fit_transform(df_comments)
pickle.dump(bow_vectorizer,open('bow_vectorizer.pkl','wb'))

In [49]:
X = model_high
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [50]:
from sklearn.neural_network import MLPClassifier

mlp_model = MLPClassifier(hidden_layer_sizes=250, activation='relu', max_iter=300, warm_start=True)
mlp_model.partial_fit(X_train, y_train,classes=np.unique(y_train))

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=250, learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=300,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=True)

In [23]:
predictions = mlp_model.predict(X_test)
analysis(y_test,predictions)

Report: Classification
              precision    recall  f1-score   support

    positive       0.85      0.87      0.86      4480
    negative       0.87      0.85      0.86      4520

    accuracy                           0.86      9000
   macro avg       0.86      0.86      0.86      9000
weighted avg       0.86      0.86      0.86      9000

Matrix: Confusion
[[3900  580]
 [ 693 3827]]
Accuracy: 
0.8585555555555555


In [24]:
pickle.dump(mlp_model,open('best.pkl','wb'))