## Movie_review-prediction

In [1]:
# Toy training corpus: the first four reviews are positive, the last three negative.
x = [
    "This was an awesome awesome movie!",
    "Great Movie. I liked it a lot.",
    "Happy Ending. Awesome acting by the hero.",
    "loved it. truly great.",
    "bad, not upto the mark.",
    "could have been better.",
    "surely a disappointing movie",
]

In [2]:
# Labels aligned index-for-index with x: 1 = positive review, 0 = negative review.
y = [1, 1, 1, 1,   # the four positive reviews
     0, 0, 0]      # the three negative reviews

In [3]:
## Task: given these reviews and their classes (positive or negative), predict the class of a new, unseen review.

In [4]:
# Test data: unseen reviews whose class the trained model has to predict.
x_test = [
    "I was happy happy and I loved the acting in the movie",
    "The movie I saw was bad",
]

## 1. Cleaning:

In [5]:
from nltk.tokenize import RegexpTokenizer #import regular expression tokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [6]:
# Shared text-processing helpers used by the cleaning step below.
ps = PorterStemmer()  # stems each word (e.g. "awesome" -> "awesom")
# NOTE: stopwords.words() needs the NLTK "stopwords" corpus to be installed
# (nltk.download('stopwords')); a set is used for fast membership tests.
en_stopwords = set(stopwords.words('english'))
# r'...' is a raw string literal (backslashes are not treated as escapes); the
# pattern \w+ matches runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')

In [7]:
def getCleanReview(review):
    """Normalize a raw review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace HTML line-break tags with spaces,
    tokenize with the module-level ``tokenizer``, drop tokens found in
    ``en_stopwords`` and stem the remaining tokens with ``ps``.

    Args:
        review: Raw review text (str).

    Returns:
        str: The cleaned review; an empty string if no token survives.
    """
    review = review.lower()
    # Replace every break tag, not only the doubled "<br /><br />" form:
    # a lone "<br />" would otherwise reach the tokenizer and come out as a
    # spurious "br" token. Extra spaces are harmless, the tokenizer skips them.
    review = review.replace("<br />", " ")

    tokens = tokenizer.tokenize(review)
    # Stopwords are filtered before stemming, then the survivors are stemmed.
    stemmed_tokens = [ps.stem(token) for token in tokens if token not in en_stopwords]

    return ' '.join(stemmed_tokens)

In [8]:
# List comprehension: run every training review through the cleaning pipeline.
x_clean = [getCleanReview(review) for review in x]

In [9]:
# Clean the test reviews with exactly the same pipeline as the training reviews.
x_test_clean = [getCleanReview(review) for review in x_test]

In [10]:
# Inspect the cleaned training reviews (lowercased, stopwords removed, stemmed).
print(x_clean)

['awesom awesom movi', 'great movi like lot', 'happi end awesom act hero', 'love truli great', 'bad upto mark', 'could better', 'sure disappoint movi']


In [11]:
# Inspect the cleaned test reviews produced by the same getCleanReview pipeline.
print(x_test_clean)

['happi happi love act movi', 'movi saw bad']


## 2. Vectorization

In [12]:
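# scikit-learn's fit() expects every sample to be a numeric vector of the same length, but each review can
# contain a different number of words. So every review is encoded as a vector of word counts over one shared
# vocabulary; most entries are zero, which is why the vectorizer returns a sparse matrix.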

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
# Bag-of-words vectorizer; the vocabulary is learned from the training reviews only.
cv = CountVectorizer()

# fit_transform() learns the vocabulary and returns the sparse count matrix in one
# step; toarray() converts it to a dense array so it can be printed and inspected.
x_vect = cv.fit_transform(x_clean).toarray()

In [15]:
print(x_vect)
# The vectorizer builds a vocabulary of all the words that occur in the reviews.
# Each row is one review and each column one vocabulary word; an entry is the
# number of times that word occurs in that review.
# x_clean contains 18 distinct words, so every row has length 18.
#
# Example: 'awesom' occurs twice in the first review, hence the 2 in the first row.

[[0 2 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 1 1 0 0 1 0 0 0]
 [1 1 0 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1]
 [0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0]]


In [16]:
print(x_vect.shape) # (rows, columns): 7 training examples and 18 features (vocabulary words).

(7, 18)


In [17]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2 (its
# replacement, get_feature_names_out(), requires >= 1.0). The fitted
# vocabulary_ mapping (term -> column index) exists on every version, so the
# ordered feature list is derived from it instead.
print(sorted(cv.vocabulary_, key=cv.vocabulary_.get))

# This is the whole vocabulary; a word's position in the list is its column index
# in x_vect (0-based). For example 'awesom' has index 1, i.e. it is the 2nd word.
# In this way each example is represented in the form of a sparse vector.

['act', 'awesom', 'bad', 'better', 'could', 'disappoint', 'end', 'great', 'happi', 'hero', 'like', 'lot', 'love', 'mark', 'movi', 'sure', 'truli', 'upto']


## Vectorization on the test set

In [18]:
# fit_transform() must only be called on the training data, because nothing should
# be learned from the test data. For the test set only transform() is called, which
# reuses the training vocabulary so the number of features is the same in both.
x_test_vect = cv.transform(x_test_clean).toarray()


In [19]:
# Count vectors of the test reviews; words absent from the training vocabulary
# (such as 'saw') have no column and are therefore ignored.
print(x_test_vect)

[[1 0 0 0 0 0 0 0 2 0 0 0 1 0 1 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0]]


In [20]:
# get_feature_names() was removed in scikit-learn 1.2; build the same ordered
# list from the fitted vocabulary_ mapping (term -> column index) instead.
sorted(cv.vocabulary_, key=cv.vocabulary_.get)

['act',
 'awesom',
 'bad',
 'better',
 'could',
 'disappoint',
 'end',
 'great',
 'happi',
 'hero',
 'like',
 'lot',
 'love',
 'mark',
 'movi',
 'sure',
 'truli',
 'upto']

In [21]:
x_test_vect.shape # 2 test reviews, encoded over the same 18 features as the training matrix.

(2, 18)

## 3. Multinomial Naive Bayes

In [22]:
from sklearn.naive_bayes import MultinomialNB

In [23]:
# Multinomial Naive Bayes classifier created with its default hyperparameters.
mnb = MultinomialNB()

In [24]:
print(mnb) # Show the estimator's textual representation.

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [25]:
# Training: fit the classifier on the training count vectors and their labels.

mnb.fit(x_vect,y)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [26]:
# Prediction: one predicted class per test review (1 = positive, 0 = negative).

mnb.predict(x_test_vect)

array([1, 0])

### So, the model predicts that the first review is positive and the second review is negative, which matches the actual sentiment of both test reviews.

In [28]:
# Class probabilities for each test review; columns follow mnb.classes_,
# i.e. column 0 is P(negative) and column 1 is P(positive).
mnb.predict_proba(x_test_vect)

array([[0.09332629, 0.90667371],
       [0.61699717, 0.38300283]])

In [45]:
mnb.score(x_vect,y) # Accuracy on the training data itself (not on held-out data), so it is an optimistic estimate.

1.0

## Multivariate Bernoulli Event Model

In [46]:
from sklearn.naive_bayes import BernoulliNB

In [47]:
bnb = BernoulliNB(binarize=0.0) # binarize is the threshold: feature values above 0.0 are treated as 1 (present), the rest as 0.

In [48]:
print(bnb) # Show the estimator's textual representation.

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)


In [49]:
bnb.fit(x_vect,y) # Train on the same count vectors and labels used for the multinomial model.

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [50]:
bnb.predict_proba(x_test_vect)
# These probabilities are slightly different from the Multinomial Naive Bayes ones.
# That makes sense: Bernoulli Naive Bayes ignores the frequencies and only looks at
# the occurrence of each feature, i.e. whether the feature occurs or not.

array([[0.07647628, 0.92352372],
       [0.68830318, 0.31169682]])

In [51]:
bnb.predict(x_test_vect) # Gives the same predicted classes as the multinomial model (MNB).

array([1, 0])

In [52]:
bnb.score(x_vect,y) # Accuracy on the training data itself, not on held-out data.

1.0