In [1]:
#import the required ibraries
import pandas as pd
import string
from nltk.corpus import stopwords

In [2]:
#import dataset into pandas dataframe
df_imdbrating = pd.read_csv('datasets/imdb_labelled.txt',sep='\t',names=['comment','rating'])
df_imdbrating.head()

Unnamed: 0,comment,rating
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [3]:
df_imdbrating['rating'].unique()

array([0, 1], dtype=int64)

In [5]:
df_imdbrating.groupby('rating').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
rating,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,Definitely worth checking out.,2


In [6]:
df_imdbrating.shape

(748, 2)

In [10]:
#view the length of message & add it as new column
df_imdbrating['length'] = df_imdbrating['comment'].apply(len)
df_imdbrating.head()

Unnamed: 0,comment,rating,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [12]:
#convert the comments into bag-of-words using CountVectorizer
#i.e., convert text into numerical data
from sklearn.feature_extraction.text import CountVectorizer
bag_of_words = CountVectorizer().fit(df_imdbrating['comment'])
bag_of_words.vocabulary_

{'very': 2875,
 'slow': 2408,
 'moving': 1752,
 'aimless': 92,
 'movie': 1750,
 'about': 37,
 'distressed': 749,
 'drifting': 787,
 'young': 3041,
 'man': 1641,
 'not': 1815,
 'sure': 2571,
 'who': 2969,
 'was': 2921,
 'more': 1735,
 'lost': 1607,
 'the': 2642,
 'flat': 1036,
 'characters': 432,
 'or': 1864,
 'audience': 196,
 'nearly': 1779,
 'half': 1207,
 'of': 1839,
 'whom': 2973,
 'walked': 2909,
 'out': 1877,
 'attempting': 192,
 'artiness': 169,
 'with': 2989,
 'black': 290,
 'white': 2967,
 'and': 125,
 'clever': 479,
 'camera': 377,
 'angles': 131,
 'disappointed': 731,
 'became': 249,
 'even': 893,
 'ridiculous': 2202,
 'as': 174,
 'acting': 56,
 'poor': 2004,
 'plot': 1988,
 'lines': 1577,
 'almost': 104,
 'non': 1807,
 'existent': 922,
 'little': 1583,
 'music': 1762,
 'anything': 148,
 'to': 2698,
 'speak': 2454,
 'best': 276,
 'scene': 2270,
 'in': 1359,
 'when': 2959,
 'gerardo': 1135,
 'is': 1424,
 'trying': 2763,
 'find': 1019,
 'song': 2435,
 'that': 2641,
 'keeps': 1

In [21]:
comment_bagofwords = bag_of_words.transform(df_imdbrating['comment'])
print(comment_bagofwords.shape)
print(comment_bagofwords.toarray())

(748, 3051)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [23]:
#convert the word count occurrences into Term frequency for better prediction
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_values = TfidfTransformer().fit(comment_bagofwords)

In [24]:
comment_tfidf = tfidf_values.transform(comment_bagofwords)
print(comment_tfidf.shape)
print(comment_tfidf.toarray())

(748, 3051)
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [25]:
#train the model to predict good or bad ratings
from sklearn.naive_bayes import MultinomialNB
rating_model = MultinomialNB().fit(comment_tfidf,df_imdbrating['rating'])

In [26]:
#predict the rating to test the model
message = df_imdbrating.comment[2]
print(message)

#converting text into numerical data or bag of words
message_bagofwords = bag_of_words.transform([message])
print(message_bagofwords)

#converting numerical data into term frequency
message_tfidf = tfidf_values.transform(message_bagofwords)
print(message_tfidf)

print('Predicted value : ',rating_model.predict(message_tfidf))
print('expected value : ',df_imdbrating.rating[2])

Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  
  (0, 56)	1
  (0, 104)	1
  (0, 125)	3
  (0, 131)	1
  (0, 169)	1
  (0, 174)	1
  (0, 192)	1
  (0, 249)	1
  (0, 290)	1
  (0, 377)	1
  (0, 479)	1
  (0, 731)	1
  (0, 893)	1
  (0, 922)	1
  (0, 1577)	1
  (0, 1735)	1
  (0, 1750)	1
  (0, 1807)	1
  (0, 1988)	1
  (0, 2004)	1
  (0, 2202)	1
  (0, 2642)	3
  (0, 2921)	1
  (0, 2967)	1
  (0, 2989)	1
  (0, 2989)	0.1239814215160318
  (0, 2967)	0.20360981013595386
  (0, 2921)	0.10218579619255413
  (0, 2642)	0.18882422136959232
  (0, 2202)	0.21883741399227408
  (0, 2004)	0.21304814769129865
  (0, 1988)	0.16376781723311842
  (0, 1807)	0.22568466270938267
  (0, 1750)	0.10218579619255413
  (0, 1735)	0.16098462622608598
  (0, 1577)	0.2080332571159879
  (0, 922)	0.2600967785812006
  (0, 893)	0.15154628867074116
  (0, 731)	0.21304814769129865
  (0, 479)	0.21883741399227408
  (

In [29]:
#lets predict a good comment
#predict the rating to test the model
goodmessage = 'This movie is awesome and worthy to spend time!'

#converting text into numerical data or bag of words
message_bagofwords = bag_of_words.transform([goodmessage])

#converting numerical data into term frequency
message_tfidf = tfidf_values.transform(message_bagofwords)

print('Predicted value : ',rating_model.predict(message_tfidf)[0])

Predicted value :  1


In [31]:
#lets predict a bad comment
badmessage = 'such a bad movie i ever saw phew!!'

#converting text into numerical data or bag of words
message_bagofwords = bag_of_words.transform([badmessage])

#converting numerical data into term frequency
message_tfidf = tfidf_values.transform(message_bagofwords)

print('Predicted value : ',rating_model.predict(message_tfidf)[0])

Predicted value :  0
