### This is a basic model on Netflix sentimental analysis

In [1]:
# importing the important libraries

import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import string
from nltk.stem import WordNetLemmatizer

In [2]:
# On positive review

pos_rev = pd.read_csv("C:/Users/clintonrozarioe/Documents/Ebin/LB_DSC_AI&ML_Course material/NLP by Afsaan/Netflix/pos.txt", encoding='latin-1', header = None, sep = '\n')

In [3]:
pos_rev

Unnamed: 0,0
0,the rock is destined to be the 21st century's ...
1,"the gorgeously elaborate continuation of "" the..."
2,effective but too-tepid biopic
3,if you sometimes like to go to the movies to h...
4,"emerges as something rare , an issue movie tha..."
...,...
5326,both exuberantly romantic and serenely melanch...
5327,mazel tov to a film about a family's joyous li...
5328,standing in the shadows of motown is the best ...
5329,it's nice to see piscopo again after all these...


In [4]:
#creating target variable

pos_rev['mood'] = 1
pos_rev

Unnamed: 0,0,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [5]:
pos_rev.rename(columns={0: 'review'}, inplace = True)

In [6]:
pos_rev

Unnamed: 0,review,mood
0,the rock is destined to be the 21st century's ...,1
1,"the gorgeously elaborate continuation of "" the...",1
2,effective but too-tepid biopic,1
3,if you sometimes like to go to the movies to h...,1
4,"emerges as something rare , an issue movie tha...",1
...,...,...
5326,both exuberantly romantic and serenely melanch...,1
5327,mazel tov to a film about a family's joyous li...,1
5328,standing in the shadows of motown is the best ...,1
5329,it's nice to see piscopo again after all these...,1


In [7]:
# On negative review

neg_rev = pd.read_csv("C:/Users/clintonrozarioe/Documents/Ebin/LB_DSC_AI&ML_Course material/NLP by Afsaan/Netflix/negative.txt", encoding='latin-1', header = None, sep = '\n')

In [8]:
neg_rev['mood'] = 0

In [9]:
neg_rev.rename(columns={0: 'review'}, inplace = True)

In [10]:
neg_rev

Unnamed: 0,review,mood
0,"simplistic , silly and tedious.",0
1,"it's so laddish and juvenile , only teenage bo...",0
2,exploitative and largely devoid of the depth o...,0
3,[garbus] discards the potential for pathologic...,0
4,a visually flashy but narratively opaque and e...,0
...,...,...
5326,a terrible movie that some people will neverth...,0
5327,there are many definitions of 'time waster' bu...,0
5328,"as it stands , crocodile hunter has the hurrie...",0
5329,the thing looks like a made-for-home-video qui...,0


In [11]:
# initialize stopwords and lemma

sw = stopwords.words('english')

lemma = WordNetLemmatizer()

In [12]:
# Cleaning the positive data

#converting into lower case ----> removing puncutations -----.stopwords ----> lemmatizing ------> join to string

pos_rev['review'] = pos_rev.review.apply(lambda x : x.lower())

pos_rev['review'] = pos_rev.review.apply(lambda x : " ".join([word for word in x.split() if word not in sw]))

pos_rev['review'] = pos_rev.review.apply(lambda x : " ".join([word for word in x.split() if word not in string.punctuation]))

pos_rev['review'] = pos_rev.review.apply(lambda x : " ".join([lemma.lemmatize(word) for word in nltk.word_tokenize(x)]))

In [13]:
pos_rev

Unnamed: 0,review,mood
0,rock destined 21st century 's new conan he 's ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective too-tepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue movie that 's hon...,1
...,...,...
5326,exuberantly romantic serenely melancholy time ...,1
5327,mazel tov film family 's joyous life acting yi...,1
5328,standing shadow motown best kind documentary o...,1
5329,nice see piscopo year chaykin headly priceless,1


In [14]:
# Cleaning the negative data

neg_rev['review'] = neg_rev.review.apply(lambda x : x.lower())

neg_rev['review'] = neg_rev.review.apply(lambda x : " ".join([word for word in x.split() if word not in sw]))

neg_rev['review'] = neg_rev.review.apply(lambda x : " ".join([word for word in x.split() if word not in string.punctuation]))

neg_rev['review'] = neg_rev.review.apply(lambda x : " ".join([lemma.lemmatize(word, pos='v') for word in nltk.word_tokenize(x)]))

In [15]:
neg_rev

Unnamed: 0,review,mood
0,simplistic silly tedious .,0
1,laddish juvenile teenage boys could possibly f...,0
2,exploitative largely devoid depth sophisticati...,0
3,[ garbus ] discard potential pathological stud...,0
4,visually flashy narratively opaque emotionally...,0
...,...,...
5326,terrible movie people nevertheless find move,0
5327,many definitions 'time waster ' movie must sur...,0
5328,stand crocodile hunter hurry badly cobble look...,0
5329,thing look like made-for-home-video quickie,0


In [16]:
# contactenate both dataframes

common_rev = pd.concat([pos_rev, neg_rev], axis=0).reset_index(drop=True)

In [17]:
common_rev

Unnamed: 0,review,mood
0,rock destined 21st century 's new conan he 's ...,1
1,gorgeously elaborate continuation lord ring tr...,1
2,effective too-tepid biopic,1
3,sometimes like go movie fun wasabi good place ...,1
4,emerges something rare issue movie that 's hon...,1
...,...,...
10657,terrible movie people nevertheless find move,0
10658,many definitions 'time waster ' movie must sur...,0
10659,stand crocodile hunter hurry badly cobble look...,0
10660,thing look like made-for-home-video quickie,0


In [18]:
# train_test split

X_train, X_test, y_train, y_test = train_test_split(common_rev['review'].values, common_rev['mood'].values, test_size=0.2, random_state=101)

In [19]:
train_data = pd.DataFrame({'review': X_train, 'mood': y_train})
test_data = pd.DataFrame({'review': X_test, 'mood': y_test})

In [20]:
train_data

Unnamed: 0,review,mood
0,put washington honest work man john q archibal...,0
1,poignant familiar story young person suspended...,1
2,timely director could ever dreamed quietly lyr...,1
3,film virtually choke self-consciousness,0
4,film take inside rhythm subject experience watch,1
...,...,...
8524,branagh forceful non-shakespeare screen perfor...,1
8525,movie friday fan critics damn already like sor...,0
8526,perhaps heaviest joyless movie ever make giant...,0
8527,film rival live fine little amuse-bouche keep ...,1


In [21]:
test_data

Unnamed: 0,review,mood
0,important movie reminder power film move u mak...,1
1,i 've never seen heard anything quite like fil...,1
2,ending leave unfulfilled performance enjoy mem...,1
3,surface lovers-on-the-run crime flick lot comm...,1
4,walk remember shrewd enough activate girlish t...,0
...,...,...
2128,bullock good job working natural likability,1
2129,result memorable least interesting,1
2130,apparently design reverie memory regret thing ...,0
2131,movie insecure capacity excite churn one two f...,0


In [23]:
# vectorization

vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(train_data['review'])
test_vectors = vectorizer.transform(test_data['review'])

In [27]:
# SVM / Naive Bayes in NLP

#SVM

from sklearn import svm
from sklearn.metrics import classification_report

In [28]:
classifier = svm.SVC()
classifier.fit(train_vectors, train_data['mood'])

SVC()

In [29]:
# Prediction

pred = classifier.predict(test_vectors)

In [30]:
report = classification_report(test_data['mood'], pred, output_dict=True)

In [31]:
report

{'0': {'precision': 0.8367729831144465,
  'recall': 0.8471035137701804,
  'f1-score': 0.8419065596979707,
  'support': 1053},
 '1': {'precision': 0.8491096532333646,
  'recall': 0.8388888888888889,
  'f1-score': 0.8439683278993945,
  'support': 1080},
 'accuracy': 0.8429442100328176,
 'macro avg': {'precision': 0.8429413181739056,
  'recall': 0.8429962013295347,
  'f1-score': 0.8429374437986825,
  'support': 2133},
 'weighted avg': {'precision': 0.8430193983645317,
  'recall': 0.8429442100328176,
  'f1-score': 0.8429504929645144,
  'support': 2133}}

In [34]:
print(f"positive {report['1']['recall']}")
print(f"negative {report['0']['recall']}")

positive 0.8388888888888889
negative 0.8471035137701804


In [36]:
import joblib

In [37]:
joblib.dump(classifier, 'netflix75.pkl')

['netflix75.pkl']

In [38]:
joblib.dump(vectorizer, 'tfidf_vector_model.pkl')  # dumping the model/scaler so that the prediction is done correctly
# this is to save the vocabulary

['tfidf_vector_model.pkl']

In [40]:
vectorizer.get_feature_names()

['00',
 '000',
 '007',
 '10',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '10th',
 '11',
 '110',
 '112',
 '11th',
 '12',
 '127',
 '129',
 '12th',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '146',
 '15',
 '15th',
 '16',
 '163',
 '168',
 '17',
 '170',
 '179',
 '18',
 '180',
 '1899',
 '18th',
 '19',
 '1915',
 '1930s',
 '1934',
 '1937',
 '1938',
 '1940s',
 '1949',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1958',
 '1959',
 '1960',
 '1960s',
 '1962',
 '1967',
 '1970s',
 '1971',
 '1972',
 '1975',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1982',
 '1984',
 '1986',
 '1987',
 '1990',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '2000',
 '2002',
 '20th',
 '21',
 '21st',
 '22',
 '24',
 '2455',
 '25',
 '2525',
 '26',
 '270',
 '28k',
 '30',
 '300',
 '3000',
 '30s',
 '33',
 '37',
 '3d',
 '40',
 '400',
 '40s',
 '45',
 '451',
 '48',
 '4ever',
 '50',
 '500',
 '50s',
 '51',
 '51st',
 '52',
 '53',
 '5ths',
 '60',
 '60s',
 '65',
 '65th',
 '66',
 '70',
 '70s',

In [55]:
# PREDICTION of USER DATA

#load all the models
tfidf = joblib.load('tfidf_vector_model.pkl')
model = joblib.load('netflix75.pkl')

#Prediction

user_data = ['great movie']

vector = tfidf.transform(user_data).toarray()  #toarray is given here so that we can see the array
my_pred = model.predict(vector)

if my_pred[0] ==  1:
    print("Positive review")
else:
    print("Negative review")

Positive review
