IMPORTING FIRST DATASET

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
#Load the first dataset from Dataset.csv (path is relative to the working directory)
data_imbd = pd.read_csv('Dataset.csv')

In [None]:
#Number of (rows, columns) in the loaded frame
data_imbd.shape 

(5000, 2)

In [None]:
#Preview the first rows of the dataset
data_imbd.head()

Unnamed: 0,text,label
0,I always wrote this series off as being a comp...,0
1,1st watched 12/7/2002 - 3 out of 10(Dir-Steve ...,0
2,This movie was so poorly written and directed ...,0
3,The most interesting thing about Miryang (Secr...,1
4,"when i first read about ""berlin am meer"" i did...",0


In [None]:
#Count the null values in each column
data_imbd.isnull().sum()

text     0
label    0
dtype: int64

In [None]:
#Check the class balance: number of rows for each value of the label column
data_imbd['label'].value_counts()

1    2505
0    2495
Name: label, dtype: int64

In [None]:
#Inputs (x1) come from the 'text' column, targets (y1) from the 'label' column
x1, y1 = data_imbd['text'], data_imbd['label']

DATA CLEANING, DATA PREPROCESSING

In [None]:
import string 
import re
import nltk

In [None]:
#Fetch every NLTK resource used by text_data_cleaning.
#'punkt_tab' (word_tokenize) and 'omw-1.4' (WordNetLemmatizer) are needed by
#newer NLTK releases in addition to the classic 'punkt' and 'wordnet' packages.
#On releases that do not know an id, nltk.download reports the failure and
#returns False instead of raising, so requesting the extras is harmless.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
#Data cleaning and preprocessing

#NLTK resources shared by every call to text_data_cleaning; filled on first use.
_CLEANING_RESOURCES = {}


def _get_cleaning_resources():
    """Return the (lemmatizer, stopword set) pair, building it on first use."""
    if not _CLEANING_RESOURCES:
        _CLEANING_RESOURCES['lemmatizer'] = nltk.WordNetLemmatizer()
        #A frozenset gives constant-time membership tests; the list returned by
        #NLTK would be scanned linearly for every token.
        _CLEANING_RESOURCES['stopwords'] = frozenset(
            nltk.corpus.stopwords.words('english')
        )
    return _CLEANING_RESOURCES['lemmatizer'], _CLEANING_RESOURCES['stopwords']


def text_data_cleaning(sentence):
    """Tokenize a sentence and return its cleaned, lemmatized tokens.

    Steps, in order: tokenize with nltk.word_tokenize, lower-case each token,
    drop English stopwords, drop tokens that are not purely alphabetic
    (punctuation, numbers, contraction fragments), then lemmatize the rest
    with WordNetLemmatizer.

    The stopword list and the lemmatizer are created once and reused, rather
    than being rebuilt for every sentence.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        Cleaned tokens in their original order (empty list if nothing survives).
    """
    lemmatizer, stopwords = _get_cleaning_resources()
    lowered = (token.lower() for token in nltk.word_tokenize(sentence))
    return [
        lemmatizer.lemmatize(token)
        for token in lowered
        if token not in stopwords and token.isalpha()
    ]

In [None]:
#Sanity-check the data cleaning function on a short sentence
text_data_cleaning("Hello, It's a beautiful day outside there!")

['hello', 'beautiful', 'day', 'outside']

VECTORIZATION FEATURE ENGINEERING (TF-IDF)

In [None]:
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
#TF-IDF vectorizer that tokenizes with text_data_cleaning.
#token_pattern=None states explicitly that the default regex tokenizer is unused,
#which avoids scikit-learn's "token_pattern will not be used" warning when a
#custom tokenizer is supplied.
tfidf = TfidfVectorizer(tokenizer=text_data_cleaning, token_pattern=None)

In [None]:
#Support Vector Classifier
#random_state seeds the solver's internal data shuffling so repeated runs give
#the same model (same seed as the train/test split).
classifier = LinearSVC(random_state=0)

In [None]:
#Logistic Regression Classifier (default hyperparameters)
classifier2 = LogisticRegression()

In [None]:
#Random Forest Classifier
#random_state fixes the bootstrap sampling and feature selection so the reported
#metrics are reproducible across runs (same seed as the train/test split).
classifier3 = RandomForestClassifier(random_state=0)

SPLIT THE DATASET INTO TEST AND TRAIN DATASET

In [None]:
#Hold out 20% of the samples for testing; the remaining 80% are used for training
from sklearn.model_selection import train_test_split

x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1,
    y1,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Sizes of the training and test splits
x1_train.shape, x1_test.shape

((4000,), (1000,))

In [None]:
#Preview the first training texts
x1_train.head()

2913    Damn, I thought I'd seen some bad westerns. Ca...
3275    First off I am in my mid 40's. Been watchin ho...
775     This is an absolutely true and faithful adapta...
217     "Rush in Rio" is, no doubt, one of the most ex...
1245    There were so many things wrong with this movi...
Name: text, dtype: object

TRAINING USING SUPPORT VECTOR CLASSIFIER

In [None]:
#Build the Support Vector Classifier pipeline: TF-IDF vectorization, then classification.
#clone() gives this pipeline its own unfitted copy of the vectorizer. Pipeline does not
#copy its steps, so passing `tfidf` directly would make every pipeline share (and refit)
#the same vectorizer object.
clf = Pipeline([('tfidf', clone(tfidf)), ('clf', classifier)])

In [None]:
#Train the Support Vector Classifier pipeline on the training split
clf.fit(x1_train, y1_train) 

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7fbd608ef440>)),
                ('clf', LinearSVC())])

TRAINING USING LOGISTIC REGRESSION CLASSIFIER

In [None]:
#Build the Logistic Regression pipeline: TF-IDF vectorization, then classification.
#clone() gives this pipeline its own unfitted copy of the vectorizer. Pipeline does not
#copy its steps, so passing `tfidf` directly would make every pipeline share (and refit)
#the same vectorizer object.
clf2 = Pipeline([('tfidf', clone(tfidf)), ('clf', classifier2)])

In [None]:
#Train the Logistic Regression pipeline on the training split
clf2.fit(x1_train, y1_train) 

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7fbd608ef440>)),
                ('clf', LogisticRegression())])

TRAINING USING RANDOM FOREST CLASSIFIER

In [None]:
#Build the Random Forest pipeline: TF-IDF vectorization, then classification.
#clone() gives this pipeline its own unfitted copy of the vectorizer. Pipeline does not
#copy its steps, so passing `tfidf` directly would make every pipeline share (and refit)
#the same vectorizer object.
clf3 = Pipeline([('tfidf', clone(tfidf)), ('clf', classifier3)])

In [None]:
#Train the Random Forest pipeline on the training split
clf3.fit(x1_train, y1_train) 

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7fbd608ef440>)),
                ('clf', RandomForestClassifier())])

PREDICT TEST SET RESULTS FOR SVM

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
#Predict labels for the test texts with the Support Vector Classifier pipeline
y_pred1 = clf.predict(x1_test)

In [None]:
#Confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y1_test, y_pred1)

array([[403,  86],
       [ 55, 456]])

In [None]:
#Classification report (per-class precision, recall, f1-score and support)
print(classification_report(y1_test, y_pred1))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85       489
           1       0.84      0.89      0.87       511

    accuracy                           0.86      1000
   macro avg       0.86      0.86      0.86      1000
weighted avg       0.86      0.86      0.86      1000



In [None]:
#Accuracy of the Support Vector Classifier pipeline on the test set
accuracy_score(y1_test, y_pred1)

0.859

PREDICT TEST SET RESULTS FOR LOGISTIC REGRESSION

In [None]:
#Predict labels for the test texts with the Logistic Regression pipeline
y_pred2 = clf2.predict(x1_test)

In [None]:
#Confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y1_test, y_pred2)

array([[387, 102],
       [ 51, 460]])

In [None]:
#Classification report (per-class precision, recall, f1-score and support)
print(classification_report(y1_test, y_pred2))

              precision    recall  f1-score   support

           0       0.88      0.79      0.83       489
           1       0.82      0.90      0.86       511

    accuracy                           0.85      1000
   macro avg       0.85      0.85      0.85      1000
weighted avg       0.85      0.85      0.85      1000



In [None]:
#Accuracy of the Logistic Regression pipeline on the test set
accuracy_score(y1_test, y_pred2)

0.847

PREDICT TEST SET RESULTS FOR RANDOM FOREST 

In [None]:
#Predict labels for the test texts with the Random Forest pipeline
y_pred3 = clf3.predict(x1_test)

In [None]:
#Confusion matrix (rows: true labels, columns: predicted labels)
confusion_matrix(y1_test, y_pred3)

array([[404,  85],
       [ 92, 419]])

In [None]:
#Classification report (per-class precision, recall, f1-score and support)
print(classification_report(y1_test, y_pred3))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       489
           1       0.83      0.82      0.83       511

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000



In [None]:
#Accuracy of the Random Forest pipeline on the test set
accuracy_score(y1_test, y_pred3)

0.823

PREDICTING OTHER REVIEWS FOR SUPPORT VECTOR CLASSIFIER

In [None]:
#Example positive review (a single unseen text passed as a one-element list)
clf.predict(["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before ."])

array([1])

In [None]:
#Example negative review (a single unseen text passed as a one-element list)
clf.predict(["the little girl is also pretty good but unfortunately her character doesn't really do all that much ."])

array([0])

PREDICTING OTHER REVIEWS FOR LOGISTIC REGRESSION CLASSIFIER

In [None]:
#Example positive review (a single unseen text passed as a one-element list)
clf2.predict(["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before ."])

array([1])

In [None]:
#Example negative review (a single unseen text passed as a one-element list)
clf2.predict(["the little girl is also pretty good but unfortunately her character doesn't really do all that much ."])

array([0])

PREDICTING OTHER REVIEWS FOR RANDOM FOREST CLASSIFIER

In [None]:
#Example positive review (a single unseen text passed as a one-element list)
clf3.predict(["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before ."])

array([1])

In [None]:
#Example negative review (a single unseen text passed as a one-element list)
#NOTE(review): the recorded output for this cell is array([1]), whereas the other two
#pipelines returned array([0]) for the same text, so the Random Forest appears to
#misclassify this example in the recorded run.
clf3.predict(["the little girl is also pretty good but unfortunately her character doesn't really do all that much ."])

array([1])