In [1]:
import pandas as pd
import numpy as np
import text_normalizer as tn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import SGDClassifier, LogisticRegression


In [6]:
dataset = pd.read_csv('movie_reviews.csv',index_col=0)

In [7]:
dataset.head()

Unnamed: 0,sentiment,Review
0,0,Imagine The Big Chill with a cast of twenty-so...
1,0,I'd have to say that I've seen worse Sci Fi Ch...
2,0,Director Fabio Barreto got a strange Academy N...
3,0,Pretty bad PRC cheapie which I rarely bother t...
4,1,This is a very intriguing short movie by David...


In [9]:
# Trainning and test datasets

reviews = np.array(dataset['Review'])
sentiments = np.array(dataset['sentiment'])

train_reviews = reviews[:35000]
train_sentiments = sentiments[:35000]
test_reviews = reviews[35000:]
test_sentiments = sentiments[35000:]

In [11]:
%%time
# Normalization 

norm_train_reviews = tn.normalize_corpus(train_reviews)
norm_test_reviews = tn.normalize_corpus(test_reviews)

Wall time: 1h 30min 42s


## Supervised Models -> LR and SVM

### Feature Engineering

In [13]:
# build BOW features on train reviews
cv = CountVectorizer(binary=False, min_df=0.0, max_df=1.0, ngram_range=(1,2))
cv_train_features = cv.fit_transform(norm_train_reviews)

# build TFIDF features on train reviews
tv = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0, ngram_range=(1,2),
                     sublinear_tf=True)
tv_train_features = tv.fit_transform(norm_train_reviews)

In [14]:
# transform test reviews into features
cv_test_features = cv.transform(norm_test_reviews)
tv_test_features = tv.transform(norm_test_reviews)

In [15]:
print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (35000, 2107020)  Test features shape: (15000, 2107020)
TFIDF model:> Train features shape: (35000, 2107020)  Test features shape: (15000, 2107020)


## Model Trainning

In [16]:
lr = LogisticRegression(penalty='l2', max_iter=100, C=1)
svm = SGDClassifier(loss='hinge', max_iter=100)

### Predictions

lr using BOW

In [25]:
%%time
lr.fit(cv_train_features, train_sentiments)
prediction_lr_bow = lr.predict(cv_test_features)

Wall time: 1min 2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Lr using TFIDF

In [26]:
%%time
lr.fit(tv_train_features, train_sentiments)
prediction_lr_tdidf = lr.predict(tv_test_features)

Wall time: 46.2 s


SVM using BOW

In [27]:
%%time
svm.fit(cv_train_features, train_sentiments)
prediction_svm_bow = lr.predict(cv_test_features)

Wall time: 1.74 s


SVM using TFIDF

In [28]:
%%time
svm.fit(tv_train_features, train_sentiments)
prediction_svm_tdidf = lr.predict(tv_test_features)

Wall time: 1.69 s


## Evaluation

In [21]:
# Evaluation Function
def evaluation_report(test, prediction):

    
    correct_docs = (test == prediction).sum()
    numdocs_wrong = test.shape[0] - correct_docs

    # Confusion Matrix
    true_pos = (test == 1) & (prediction == 1)
    false_pos = (test == 0) & (prediction == 1)
    false_neg = (test == 1) & (prediction == 0)
    true_neg = (test == 0) & (prediction == 0)
    
    
    # Accuracy
    fraction_wrong = numdocs_wrong/len(test)
    # Recall score
    recall_score = true_pos.sum() / (true_pos.sum() + false_neg.sum())
    # Precision Score
    precision_score = true_pos.sum() / (true_pos.sum() + false_pos.sum())
    # F1 Score
    f1_score = 2 * (precision_score * recall_score) / (precision_score + recall_score)

    
    # Printing the evaluation 
    print('Model Performance metrics: ')
    print('-'*20)
    print('Accuracy of the model is {:.2%}'.format(1-fraction_wrong))
    
    print('Recall score is {:.2%}'.format(recall_score))
    #print(re)
    print('Precision score is {:.3}'.format(precision_score))

    print('F1 Score is {:.2}'.format(f1_score))

    
    print('\n Model Classification report:')
    print('-'*20)
    target = ['positive', 'negative']
    print(classification_report(test, prediction, target_names=target))
    
    print('\n Prediction Confusion Matrix:')
    print('-'*20)
    confusion_matrix = pd.DataFrame([[true_pos.sum(), false_neg.sum()],[false_pos.sum().sum(),true_neg.sum()]],
                         columns=pd.MultiIndex(levels=[['predicted:'], 
                                                    ['positive', 'negative']], 
                                                    codes=[[0,0],[0,1]]),
                         index = pd.MultiIndex(levels=[['Actual:'], 
                                                    ['positive', 'negative']], 
                                                    codes=[[0,0],[0,1]]))
    print(confusion_matrix)

In [29]:
print('Logistic Regression results with Bow:')
evaluation_report(test_sentiments, prediction_lr_bow)

Logistic Regression results with Bow:
Model Performance metrics: 
--------------------
Accuracy of the model is 90.13%
Recall score is 91.22%
Precision score is 0.893
F1 Score is 0.9

 Model Classification report:
--------------------
              precision    recall  f1-score   support

    positive       0.91      0.89      0.90      7507
    negative       0.89      0.91      0.90      7493

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000


 Prediction Confusion Matrix:
--------------------
                 predicted:         
                   positive negative
Actual: positive       6835      658
        negative        823     6684


In [30]:
print('Logistic Regression results with TD-IDF:')
evaluation_report(test_sentiments, prediction_lr_tdidf)

Logistic Regression results with TD-IDF:
Model Performance metrics: 
--------------------
Accuracy of the model is 89.15%
Recall score is 90.66%
Precision score is 0.88
F1 Score is 0.89

 Model Classification report:
--------------------
              precision    recall  f1-score   support

    positive       0.90      0.88      0.89      7507
    negative       0.88      0.91      0.89      7493

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


 Prediction Confusion Matrix:
--------------------
                 predicted:         
                   positive negative
Actual: positive       6793      700
        negative        927     6580


In [31]:
print('SVM results with BOW:')
evaluation_report(test_sentiments, prediction_svm_bow)

SVM results with BOW:
Model Performance metrics: 
--------------------
Accuracy of the model is 84.97%
Recall score is 78.86%
Precision score is 0.898
F1 Score is 0.84

 Model Classification report:
--------------------
              precision    recall  f1-score   support

    positive       0.81      0.91      0.86      7507
    negative       0.90      0.79      0.84      7493

    accuracy                           0.85     15000
   macro avg       0.85      0.85      0.85     15000
weighted avg       0.85      0.85      0.85     15000


 Prediction Confusion Matrix:
--------------------
                 predicted:         
                   positive negative
Actual: positive       5909     1584
        negative        671     6836


In [32]:
print('SVM results with TD-IDF:')
evaluation_report(test_sentiments, prediction_svm_tdidf)

SVM results with TD-IDF:
Model Performance metrics: 
--------------------
Accuracy of the model is 89.15%
Recall score is 90.66%
Precision score is 0.88
F1 Score is 0.89

 Model Classification report:
--------------------
              precision    recall  f1-score   support

    positive       0.90      0.88      0.89      7507
    negative       0.88      0.91      0.89      7493

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000


 Prediction Confusion Matrix:
--------------------
                 predicted:         
                   positive negative
Actual: positive       6793      700
        negative        927     6580
