In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import re,html,json
import seaborn as sns
import nltk as nt
from nltk.corpus import stopwords
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import Pipeline

In [4]:
df = pd.read_excel("Cleaned_Data.xlsx")

In [5]:
df.columns

Index(['Unnamed: 0', 'Review', 'Title', 'Spoiler_flag', 'Synopsis',
       'Cosine_Similarity', 'doc_similarity', 'tokens'],
      dtype='object')

In [6]:
df.drop("Unnamed: 0",axis=1,inplace= True)

In [7]:
df['Spoiler_flag'].value_counts()

0    12663
1    10216
Name: Spoiler_flag, dtype: int64

In [8]:
df.isnull().sum()

Review               1
Title                0
Spoiler_flag         0
Synopsis             0
Cosine_Similarity    0
doc_similarity       0
tokens               0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df.isnull().sum()

Review               0
Title                0
Spoiler_flag         0
Synopsis             0
Cosine_Similarity    0
doc_similarity       0
tokens               0
dtype: int64

In [11]:
X = df[["Review"]]

In [12]:
y = df['Spoiler_flag']

In [13]:
X_train, X_test, y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=1)
X_train_2, X_val, y_train_2, y_val = train_test_split(X_train,y_train,test_size=0.2,random_state=1)

In [14]:
print(X_train.shape)
print(y_train.shape)
print(X_train_2.shape)
print(y_train_2.shape)

(20590, 1)
(20590,)
(16472, 1)
(16472,)


In [15]:
X_train = pd.concat([X_train,X_train_2])
X_train.shape

(37062, 1)

In [16]:
y_train = pd.concat([y_train,y_train_2])
y_train.shape

(37062,)

In [17]:
def execute_model(x,y,val_x,val_y,model,vectorizer):
    
    #Training the model.
    x_vectorized = vectorizer.fit_transform(x['Review'])
    model.fit(x_vectorized,y)
    
    #Running on validation.
    val_x_vectorized = vectorizer.transform(val_x['Review'])
    Y_pred_val = model.predict(val_x_vectorized)
    
    print('Accuracy with the model {} is {} :'.format(type(model).__name__,accuracy_score(Y_pred_val,val_y)))
    print("Classification Report is")
    print(classification_report(val_y,Y_pred_val))
    
    #Confusion Matrix 
    cm = confusion_matrix(y_val, Y_pred_val)
    # Extract the false negatives count from the confusion matrix
    false_negatives = cm[1, 0]

    # Calculate the false negative ratio
    false_neg_ratio = false_negatives / sum(cm[0])

    # Print the false negative ratio
    print("False negative ratio: ", false_neg_ratio)
    
    return model,vectorizer

# Model Implementation with Count Vectorizer 

In [18]:
cv = CountVectorizer(stop_words='english')
lr = LogisticRegression(max_iter=100,C=1)

In [19]:
lr_model,vectorizer = execute_model(X_train,y_train,X_val,y_val,lr,cv)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with the model LogisticRegression is 0.921321029626032 :
Classification Report is
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      2243
           1       0.95      0.88      0.91      1875

    accuracy                           0.92      4118
   macro avg       0.92      0.92      0.92      4118
weighted avg       0.92      0.92      0.92      4118

False negative ratio:  0.10254123941150245


In [20]:
sgd = SGDClassifier(loss='log_loss',max_iter=100)
sgd_model,vectorizer = execute_model(X_train,y_train,X_val,y_val,sgd,cv)

Accuracy with the model SGDClassifier is 0.917921321029626 :
Classification Report is
              precision    recall  f1-score   support

           0       0.89      0.97      0.93      2243
           1       0.96      0.85      0.90      1875

    accuracy                           0.92      4118
   macro avg       0.93      0.91      0.92      4118
weighted avg       0.92      0.92      0.92      4118

False negative ratio:  0.12394115024520731


# Model Implementation with Tfid Vectorizer

In [21]:
tv = TfidfVectorizer(stop_words='english')
lr = LogisticRegression(max_iter=100,C=1)

In [22]:
lr_model,vectorizer = execute_model(X_train,y_train,X_val,y_val,lr,tv)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy with the model LogisticRegression is 0.7948033025740651 :
Classification Report is
              precision    recall  f1-score   support

           0       0.78      0.86      0.82      2243
           1       0.81      0.71      0.76      1875

    accuracy                           0.79      4118
   macro avg       0.80      0.79      0.79      4118
weighted avg       0.80      0.79      0.79      4118

False negative ratio:  0.24164065983058403


In [23]:
sgd = SGDClassifier(loss='log_loss',max_iter=100)
sgd_model,vectorizer = execute_model(X_train,y_train,X_val,y_val,sgd,tv)

Accuracy with the model SGDClassifier is 0.7520641087906751 :
Classification Report is
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      2243
           1       0.78      0.64      0.70      1875

    accuracy                           0.75      4118
   macro avg       0.76      0.74      0.74      4118
weighted avg       0.76      0.75      0.75      4118

False negative ratio:  0.30450289790459206


# Grid Search On Count vectorizer

In [27]:
def execute_grid_search(x,y,val_x,val_y,model,vectorizer):
    print(type(model).__name__)
    if type(model).__name__ == 'LogisticRegression':
        
        print("Model is {}".format(type(model).__name__))
        
        param_grid_logistic = {'penalty' : ['l1', 'l2'],
                               'C' : np.logspace(-4, 4, 20),
                               'solver' : ['liblinear'],
                               'max_iter' : [1000]}
        clf = GridSearchCV(model,param_grid=param_grid_logistic,cv=5)
        
        #Training the model.
        x_vectorized = vectorizer.fit_transform(x['Review'])
        print('Running Grid Search')
        best_fit = clf.fit(x_vectorized,y)
        
        print("best Params are:",best_fit.best_params_)
    
        #Running on validation.
        val_x_vectorized = vectorizer.transform(val_x['Review'])
        Y_pred_val = best_fit.predict(val_x_vectorized)

        print('Accuracy with the model {} is {} :'.format(type(model).__name__,accuracy_score(Y_pred_val,val_y)))
        print("Classification Report is")
        print(classification_report(val_y,Y_pred_val))

        #Confusion Matrix 
        cm = confusion_matrix(y_val, Y_pred_val)
        # Extract the false negatives count from the confusion matrix
        false_negatives = cm[1, 0]

        # Calculate the false negative ratio
        false_neg_ratio = false_negatives / sum(cm[0])

        # Print the false negative ratio
        print("False negative ratio: ", false_neg_ratio)

        return best_fit,vectorizer
    elif type(model).__name__ == 'SGDClassifier':
        
        print("Model is {}".format(type(model).__name__))
              
        param_grid_sgd = [{'alpha' : [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3],
                           'max_iter' : [1000],
                           'loss': ['log'],
                           'penalty': ['l2','l1']}]
        
        clf = GridSearchCV(model,param_grid=param_grid_logistic,cv=5)
        
        #Training the model.
        x_vectorized = vectorizer.fit_transform(x['Review'])
        print('Running Grid Search')
        best_fit = clf.fit(x_vectorized,y)
        
        print("best Params are:",best_fit.best_params_)
        #Running on validation.
        val_x_vectorized = vectorizer.transform(val_x['Review'])
        Y_pred_val = best_fit.predict(val_x_vectorized)

        print('Accuracy with the model {} is {} :'.format(type(model).__name__,accuracy_score(Y_pred_val,val_y)))
        print("Classification Report is")
        print(classification_report(val_y,Y_pred_val))

        #Confusion Matrix 
        cm = confusion_matrix(y_val, Y_pred_val)
        # Extract the false negatives count from the confusion matrix
        false_negatives = cm[1, 0]

        # Calculate the false negative ratio
        false_neg_ratio = false_negatives / sum(cm[0])

        # Print the false negative ratio
        print("False negative ratio: ", false_neg_ratio)

        return best_fit,vectorizer

In [28]:
cv = CountVectorizer(stop_words='english')
lr = LogisticRegression()

In [None]:
lr_model,vectorizer = execute_grid_search(X_train,y_train,X_val,y_val,lr,cv)

LogisticRegression
Model is LogisticRegression
Running Grid Search
