In [1]:
import pandas as pd
import numpy as np

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
import os 
import glob

folder_path = "movie_reviews\\"

In [3]:
pos_review = "movie_reviews\\" + 'pos'

pos_text_files = glob.glob(f"{pos_review}\\*.txt")
# pos_text_files

In [4]:
neg_review = "movie_reviews\\" + 'neg'

neg_text_files = glob.glob(f"{neg_review}\\*.txt")
# neg_text_files

In [5]:
import re
review_list =[]

for file_name in pos_text_files:
    f = open(file_name)
    text = f.read()
    text = re.sub('[^A-Za-z]+'," ",text)
    f.close()
    review_list.append(text)
    

for file_name in neg_text_files:
    f = open(file_name)
    text = f.read()
    text = re.sub('[^A-Za-z]+'," ",text)
    f.close()
    review_list.append(text)

In [6]:
len(review_list)

2000

In [7]:
review_list[1]

'every now and then a movie comes along from a suspect studio with every indication that it will be a stinker and to everybody s surprise perhaps even the studio the film becomes a critical darling mtv films election a high school comedy starring matthew broderick and reese witherspoon is a current example did anybody know this film existed a week before it opened the plot is deceptively simple george washington carver high school is having student elections tracy flick reese witherspoon is an over achiever with her hand raised at nearly every question way way high mr m matthew broderick sick of the megalomaniac student encourages paul a popular but slow jock to run and paul s nihilistic sister jumps in the race as well for personal reasons the dark side of such sleeper success is that because expectations were so low going in the fact that this was quality stuff made the reviews even more enthusiastic than they have any right to be you can t help going in with the baggage of glowing r

In [8]:
pos_target = np.ones(len(pos_text_files), dtype=int)
neg_target = np.zeros(len(neg_text_files), dtype=int)
pos_target.shape

(1000,)

In [9]:
y = np.append(pos_target,neg_target)
y.shape

(2000,)

## Bag of wards >> count vectorizer 

In [10]:
count_vect = CountVectorizer(stop_words='english',min_df=0.03)
X_count_vect = count_vect.fit_transform(review_list)
X_count_vect.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [11]:
x = pd.DataFrame(X_count_vect.toarray(),columns = count_vect.get_feature_names())
x



Unnamed: 0,ability,able,absolutely,academy,accent,accept,accident,act,acted,acting,...,written,wrong,wrote,yeah,year,years,yes,york,young,younger
0,0,0,0,0,2,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,2,0,0,0,0,0
3,0,0,0,1,0,0,0,5,0,1,...,0,0,0,0,0,1,0,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,2,0,1,0,0,0
1996,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1997,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0


In [12]:
print(count_vect.get_feature_names())

['ability', 'able', 'absolutely', 'academy', 'accent', 'accept', 'accident', 'act', 'acted', 'acting', 'action', 'actions', 'actor', 'actors', 'actress', 'acts', 'actual', 'actually', 'adaptation', 'add', 'added', 'addition', 'adds', 'admit', 'adult', 'adults', 'adventure', 'affair', 'age', 'agent', 'ago', 'ahead', 'air', 'alan', 'alien', 'aliens', 'alive', 'allen', 'allow', 'allowed', 'allows', 'amazing', 'america', 'american', 'amusing', 'angry', 'animal', 'animated', 'animation', 'annoying', 'answer', 'anti', 'apart', 'apartment', 'apparent', 'apparently', 'appeal', 'appealing', 'appear', 'appearance', 'appears', 'appreciate', 'approach', 'appropriate', 'aren', 'army', 'arrives', 'art', 'artist', 'aside', 'ask', 'asked', 'asking', 'asks', 'aspect', 'aspects', 'ass', 'assistant', 'atmosphere', 'attack', 'attempt', 'attempts', 'attention', 'attitude', 'attractive', 'audience', 'audiences', 'available', 'average', 'avoid', 'award', 'aware', 'away', 'awful', 'baby', 'background', 'bad',



# tf-idf Vectorizer

In [13]:
tfidf_vect = TfidfVectorizer(stop_words='english',min_df=0.05)
x_tfidf_vect = tfidf_vect.fit_transform(review_list)
x_tfidf_vect

<2000x886 sparse matrix of type '<class 'numpy.float64'>'
	with 213487 stored elements in Compressed Sparse Row format>

In [14]:
x_tfidf_vect.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.10235647, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.10225682, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [15]:
x = pd.DataFrame(x_tfidf_vect.toarray(), columns = tfidf_vect.get_feature_names())
x

Unnamed: 0,ability,able,absolutely,act,acting,action,actor,actors,actress,actual,...,writers,writing,written,wrong,wrote,year,years,yes,york,young
0,0.000000,0.0,0.000000,0.000000,0.050373,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.044504,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.0,0.102356,0.000000,0.068145,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.128293,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.0,0.000000,0.289285,0.042533,0.0,0.000000,0.043055,0.0,0.069053,...,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.040127,0.000000,0.063777,0.043447
4,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.054598,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.080876,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.000000,0.0,0.051310,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.064312,0.000000,0.047589,0.000000,0.000000
1996,0.074700,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.043402,0.043500,0.000000,0.000000,0.000000
1997,0.102257,0.0,0.000000,0.085858,0.000000,0.0,0.069705,0.000000,0.0,0.000000,...,0.0,0.0,0.00000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1998,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.070130,0.000000,0.0,0.000000,...,0.0,0.0,0.07528,0.078095,0.0,0.000000,0.000000,0.000000,0.000000,0.000000


In [16]:
y.shape

(2000,)

In [17]:
x.shape

(2000, 886)

# Train model

In [18]:
x_train,x_test, y_train,y_test = train_test_split(x,y, test_size=0.25, stratify=y)

In [19]:
x_train.shape,x_test.shape, y_train.shape,y_test.shape 

((1500, 886), (500, 886), (1500,), (500,))

In [20]:
gnb = GaussianNB()
gnb.fit(x_train,y_train)

### Testing Data Evaluation 

In [21]:
## Test Data Evaluation 
y_pred = gnb.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_test,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_test,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[200  50]
 [ 63 187]]
Accuracy Score = 0.774
classification Report = 
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       250
           1       0.79      0.75      0.77       250

    accuracy                           0.77       500
   macro avg       0.77      0.77      0.77       500
weighted avg       0.77      0.77      0.77       500



### Traininig Data Evaluation 

In [22]:
## Training Data Evaluation 
y_pred = gnb.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_train,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_train,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[648 102]
 [116 634]]
Accuracy Score = 0.8546666666666667
classification Report = 
               precision    recall  f1-score   support

           0       0.85      0.86      0.86       750
           1       0.86      0.85      0.85       750

    accuracy                           0.85      1500
   macro avg       0.85      0.85      0.85      1500
weighted avg       0.85      0.85      0.85      1500



# multinominalNB

In [23]:
mnb= MultinomialNB()
mnb.fit(x_train,y_train)

In [24]:
## Test Data Evaluation 
y_pred = mnb.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_test,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_test,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[209  41]
 [ 47 203]]
Accuracy Score = 0.824
classification Report = 
               precision    recall  f1-score   support

           0       0.82      0.84      0.83       250
           1       0.83      0.81      0.82       250

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500



In [25]:
## Training Data Evaluation 
y_pred = mnb.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_train,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_train,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[635 115]
 [113 637]]
Accuracy Score = 0.848
classification Report = 
               precision    recall  f1-score   support

           0       0.85      0.85      0.85       750
           1       0.85      0.85      0.85       750

    accuracy                           0.85      1500
   macro avg       0.85      0.85      0.85      1500
weighted avg       0.85      0.85      0.85      1500



# BernoulliNB

In [26]:
bnb = BernoulliNB()
bnb.fit(x_train,y_train)

In [27]:
## Test Data Evaluation 
y_pred = bnb.predict(x_test)

cnf_matrix = confusion_matrix(y_test,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_test,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_test,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[218  32]
 [ 60 190]]
Accuracy Score = 0.816
classification Report = 
               precision    recall  f1-score   support

           0       0.78      0.87      0.83       250
           1       0.86      0.76      0.81       250

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500



In [28]:
## Training Data Evaluation 
y_pred = bnb.predict(x_train)

cnf_matrix = confusion_matrix(y_train,y_pred)
print(f"Confusion Matrix=\n{cnf_matrix}")

acc = accuracy_score(y_train,y_pred)
print(f"Accuracy Score = {acc}")

clf_report = classification_report(y_train,y_pred)
print(f"classification Report = \n {clf_report}")

Confusion Matrix=
[[641 109]
 [157 593]]
Accuracy Score = 0.8226666666666667
classification Report = 
               precision    recall  f1-score   support

           0       0.80      0.85      0.83       750
           1       0.84      0.79      0.82       750

    accuracy                           0.82      1500
   macro avg       0.82      0.82      0.82      1500
weighted avg       0.82      0.82      0.82      1500



In [29]:
review1 = """This is one of the best theatrical experiences I've had and I'm so happy someone has taken the practical route rather than throwing everyone into a green screen. I watched the first one many times and I can't believe this just topped it by a huge margin. Tom Cruise will be written as the most passionate filmmaker in history. I mean to put a whole cast in a bunch of f-18 jets and act in a jet as well as film yourself is a huge deal. If this movie does not cross a billion then there is something wrong with taste that people have these days. So called marvel fans. This is what you call a cinematic experience. Not some cropped cgi scenes."""

In [30]:
text = re.sub('[^A-Za-z]+'," ",review1)

text_list = text.split()
text_list = [" ".join(text_list)]
text_list


['This is one of the best theatrical experiences I ve had and I m so happy someone has taken the practical route rather than throwing everyone into a green screen I watched the first one many times and I can t believe this just topped it by a huge margin Tom Cruise will be written as the most passionate filmmaker in history I mean to put a whole cast in a bunch of f jets and act in a jet as well as film yourself is a huge deal If this movie does not cross a billion then there is something wrong with taste that people have these days So called marvel fans This is what you call a cinematic experience Not some cropped cgi scenes']

In [31]:
user_data_vector = tfidf_vect.transform(text_list)
arr = user_data_vector.toarray()
print(arr.shape)
result = gnb.predict(arr)
result

(1, 886)




array([1])

In [32]:
u1 = """If you were a late teen or in your early twenties in the mid 1980's the world was very different. No computers, no mobile phones, no internet, no DVD's. We had cars though, and bikes, and we loved them, and we loved films too. The original Top Gun captured this moment in time perfectly, and gave us a thrilling ride like we had never seen before. The humour, the games, the bikes, the aircraft and my word, those flying scenes. We went back to the cinema to see it again and again, and spent the following decades quoting the movie. As time went on, it remained like a static snapshot in time to perfectly represent that magical point in our lives for so many of us.

Now, 36 years later, we are a generation that has lost our parents, we've had our own children who have moved on themselves, and we now approach the end of our own careers and our young selves are gone forever.

This film is the missing bookend to that whole generation. The original was there for the start of our young adult lives, and this new film now marks the end. It's magnificent.

I'm 55, but yesterday, just for one last night, I was 19 again. Thank you."""
