## Necessary Import

In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.pipeline import Pipeline


In [2]:
nlp=spacy.load('en_core_web_sm')
df = pd.read_csv("imdb_labelled.txt", sep='\t', header=None, names=['Text', 'Label'])
df.head()

Unnamed: 0,Text,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
# Check data is balanced or Not
df.Label.value_counts()

Label
1    386
0    362
Name: count, dtype: int64

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    748 non-null    object
 1   Label   748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


## Data Preprocess remove Stop Words and Punctuation

In [5]:
def preprocess(text):
    doc=nlp(text)
    no_stop_word=[token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    return " ".join(no_stop_word)

In [6]:
df['new_text']=df['Text'].apply(preprocess)

In [7]:
df.head()

Unnamed: 0,Text,Label,new_text
0,"A very, very, very slow-moving, aimless movie ...",0,slow move aimless movie distressed drift young...
1,Not sure who was more lost - the flat characte...,0,sure lose flat character audience nearly half ...
2,Attempting artiness with black & white and cle...,0,attempt artiness black white clever camera ang...
3,Very little music or anything to speak of.,0,little music speak
4,The best scene in the movie was when Gerardo i...,1,good scene movie Gerardo try find song keep ru...


In [8]:
df['new_text'].iloc[0]

'slow move aimless movie distressed drift young man  '

In [9]:
df['Text'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

### Split the Data into Train and Test for Bag of Word

In [10]:
X_train,X_test,y_train,y_test=train_test_split(df.new_text,df.Label,test_size=0.2,random_state=42)

In [11]:
X_train.shape

(598,)

In [12]:
X_test.shape

(150,)

### Use CountVectorizer which convert a collection of text documents into a matrix of token counts.

In [13]:
v=CountVectorizer()
X_train_v=v.fit_transform(X_train)

In [14]:
X_test_v=v.transform(X_test)
X_test.shape

(150,)

In [15]:
X_train_v.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
X_train_v.shape

(598, 2214)

### Train on Several Model and Print Classification Report

In [17]:
d={'Naive Bayes':MultinomialNB(),'Random Forest':RandomForestClassifier(),
  'Decision Tree':DecisionTreeClassifier(),'KNN':KNeighborsClassifier()}

for k,v in d.items():
    model=d[k]
    model.fit(X_train_v,y_train)
    y_pred=model.predict(X_test_v)
    print(f"{k} using Bag of Word")
    print(classification_report(y_test,y_pred))
    print('\n')

Naive Bayes using Bag of Word
              precision    recall  f1-score   support

           0       0.86      0.72      0.79        76
           1       0.76      0.88      0.81        74

    accuracy                           0.80       150
   macro avg       0.81      0.80      0.80       150
weighted avg       0.81      0.80      0.80       150



Random Forest using Bag of Word
              precision    recall  f1-score   support

           0       0.80      0.59      0.68        76
           1       0.67      0.85      0.75        74

    accuracy                           0.72       150
   macro avg       0.74      0.72      0.72       150
weighted avg       0.74      0.72      0.72       150



Decision Tree using Bag of Word
              precision    recall  f1-score   support

           0       0.71      0.72      0.71        76
           1       0.71      0.69      0.70        74

    accuracy                           0.71       150
   macro avg       0.71      0

###  USE TF-IDF  Works and add on of CountVectorizer along with Token Count Handle the Relevancy

In [18]:
from sklearn.model_selection import train_test_split
X_train1,X_test1,y_train1,y_test1=train_test_split(df.new_text,df.Label,test_size=0.2,random_state=42)

### Train on Several Model and Print Classification Report

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('TF-IDF', TfidfVectorizer()),
    ('Naive Bayes', MultinomialNB())
])

clf.fit(X_train1,y_train1)


In [20]:
y_pred1=clf.predict(X_test1)


In [21]:
print("Naive Bayes using TF-IDF")
print(classification_report(y_test,y_pred1))

Naive Bayes using TF-IDF
              precision    recall  f1-score   support

           0       0.88      0.70      0.78        76
           1       0.74      0.91      0.82        74

    accuracy                           0.80       150
   macro avg       0.81      0.80      0.80       150
weighted avg       0.81      0.80      0.80       150



In [22]:

clf = Pipeline([
    ('TF-IDF', TfidfVectorizer()),
    ('Naive Bayes', KNeighborsClassifier())
])

clf.fit(X_train1,y_train1)

In [23]:
y_pred2=clf.predict(X_test1)

In [24]:
print("KNN using TF-IDF")
print(classification_report(y_test,y_pred2))

KNN using TF-IDF
              precision    recall  f1-score   support

           0       0.77      0.66      0.71        76
           1       0.69      0.80      0.74        74

    accuracy                           0.73       150
   macro avg       0.73      0.73      0.73       150
weighted avg       0.73      0.73      0.73       150

