In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

### Importing the review files 
There are two separate files containing reviews: one file holds the positive reviews and the other holds the negative reviews.

In [2]:
# Load the positive reviews, one review per line.
# A newline cannot be passed to `pd.read_csv` as the column separator on
# current pandas releases, so the file is read line by line instead; this
# also avoids CSV quote handling on reviews that contain `"` characters.
with open('netflix/pos.txt', encoding='latin-1') as pos_file:
    pos_lines = [line.rstrip('\n') for line in pos_file if line.strip()]

# mood = 1 marks a positive review.
pos_rev = pd.DataFrame({'review': pos_lines, 'mood': 1})
pos_rev.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [3]:
# Load the negative reviews, one review per line.
# A newline cannot be passed to `pd.read_csv` as the column separator on
# current pandas releases, so the file is read line by line instead; this
# also avoids CSV quote handling on reviews that contain `"` characters.
with open('netflix/negative.txt', encoding='latin-1') as neg_file:
    neg_lines = [line.rstrip('\n') for line in neg_file if line.strip()]

# mood = 0 marks a negative review.
neg_rev = pd.DataFrame({'review': neg_lines, 'mood': 0})
neg_rev.head()

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0


In [4]:
# Stack positive and negative reviews into one frame with a fresh 0..n-1 index.
df = pd.concat([pos_rev, neg_rev], ignore_index=True)
df.head()

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1


In [5]:
def clean(text):
    """Normalise a raw review string before it is vectorised.

    The following steps are applied in order:

    1. lower-case the text;
    2. remove ``[...]`` spans (brackets and their content);
    3. remove ``http://`` / ``https://`` / ``www.`` URLs;
    4. remove ``<...>`` tags;
    5. remove every ASCII punctuation character;
    6. remove newline characters;
    7. remove every token that contains a digit.

    Whitespace is not collapsed, so removed tokens may leave double spaces.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # All patterns are raw strings: sequences such as `\[`, `\S`, `\w` and `\d`
    # are regex escapes, not valid Python string escapes, and trigger
    # invalid-escape warnings when written in ordinary string literals.
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

In [6]:
# Run every review through `clean`; the function is passed directly, no wrapper needed.
df['review'] = df['review'].apply(clean)

In [7]:
# Preview the first rows after `clean` has been applied to the `review` column.
df.head()

Unnamed: 0,review,mood
0,the rock is destined to be the centurys new ...,1
1,the gorgeously elaborate continuation of the ...,1
2,effective but tootepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,emerges as something rare an issue movie that...,1


In [8]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    df['review'],
    df['mood'],
    test_size=0.2,
    random_state=44,
)

In [15]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the fitted
# vectorizer to transform the test reviews, so the test split does not
# influence what the vectorizer learns.
vectorizer = TfidfVectorizer()
train_vec = vectorizer.fit_transform(xtrain)
test_vec  = vectorizer.transform(xtest)

In [10]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel SVM trained on the TF-IDF training features.
# `fit` returns the estimator itself, so construction and training are chained.
classifier = SVC(kernel='linear').fit(train_vec, ytrain)

# Evaluate on the held-out test features.
y_pred = classifier.predict(test_vec)
print(classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.76      0.75      1066
           1       0.75      0.74      0.75      1067

    accuracy                           0.75      2133
   macro avg       0.75      0.75      0.75      2133
weighted avg       0.75      0.75      0.75      2133



In [11]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes trained on the same TF-IDF training features.
# `fit` returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(train_vec, ytrain)

# Evaluate on the held-out test features.
y_pred_clf = clf.predict(test_vec)
print(classification_report(ytest, y_pred_clf))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77      1066
           1       0.77      0.75      0.76      1067

    accuracy                           0.76      2133
   macro avg       0.76      0.76      0.76      2133
weighted avg       0.76      0.76      0.76      2133



In [58]:
import joblib
# Write the fitted Naive Bayes classifier and the fitted TF-IDF vectorizer to
# disk. Note that 'transform.pkl' holds the vectorizer, not a model.
joblib.dump(clf,'naive_clf.pkl')
joblib.dump(vectorizer, 'transform.pkl')

['transform.pkl']

In [33]:
txt = 'I  good after watching the movie'
# The vectorizer expects an iterable of documents. Wrapping the cleaned string
# in a one-element list keeps it as a single document; calling `list(...)` on
# the string would split it into one "document" per character and yield one
# prediction per character.
txt = [clean(txt)]
vec = vectorizer.transform(txt).toarray()
pred = clf.predict(vec)
pred_proba = clf.predict_proba(vec)

In [38]:
# NOTE(review): no visible cell assigns `input`; the array displayed for this
# cell presumably came from a since-removed cell that shadowed the built-in
# `input`. Confirm, then remove this cell or replace it with the intended variable.
input

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [45]:
def predictions(txt):
    """Convert one raw text string into its TF-IDF feature row.

    The text is cleaned with `clean`, wrapped as a single document and
    transformed by the module-level `vectorizer`; the result is returned as
    a dense array. Despite the name, this returns features, not model
    predictions.
    """
    documents = [clean(txt)]
    return vectorizer.transform(documents).toarray()

In [64]:
# Show the predicted class and the class probabilities, one per line.
print(pred, pred_proba, sep='\n')

[0]
[[0.7510856 0.2489144]]


In [78]:
import gradio as gr

# Display labels indexed by predicted class id: 0 -> 'Negative', 1 -> 'Positive'
# (the `mood` column is 0 for negative reviews and 1 for positive ones).
a = ['Negative', 'Positive']

def pred(txt):
    """Classify a piece of text and return its label.

    Parameters
    ----------
    txt : str
        Raw input text.

    Returns
    -------
    str
        The entry of `a` selected by the predicted class id
        ('Negative' or 'Positive').
    """
    # `predictions` already runs `clean` on its input, so the raw text is
    # passed straight through instead of being cleaned twice.
    features = predictions(txt)
    # A single document is classified, so only the first prediction is used.
    label_idx = clf.predict(features)[0]
    return a[label_idx]


# Wrap `pred` in a Gradio interface with a text input and a text output,
# then start the local web app.
demo = gr.Interface(fn=pred, inputs="text", outputs="text")
demo.launch()


Running on local URL:  http://127.0.0.1:7865/

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x2a04ecd2760>, 'http://127.0.0.1:7865/', None)

In [85]:
def pred(txt):
    """Classify a piece of text and return its label with class probabilities.

    Parameters
    ----------
    txt : str
        Raw input text.

    Returns
    -------
    tuple
        ``(label, probabilities)`` where ``label`` is the entry of `a`
        selected by the predicted class id and ``probabilities`` is the
        `predict_proba` row for the input (ordered as ``clf.classes_``).
    """
    # `predictions` already runs `clean` on its input, so the raw text is
    # passed straight through instead of being cleaned twice.
    features = predictions(txt)
    # A single document is classified, so only the first row of each result is used.
    label_idx = clf.predict(features)[0]
    class_proba = clf.predict_proba(features)[0]
    return a[label_idx], class_proba
# Try the label-plus-probabilities `pred` on a short example sentence.
txt = 'I feel sad'
pred(txt)

('Positive', array([0.49709788, 0.50290212]))