In [1]:
# Importing the NLTK library
import nltk

In [2]:
# Imporiting the relevant libraries
import pandas as pd
import numpy as np
import random

In [3]:
# Importing  tokenizers
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
# Importing the movie reviews corpus
from nltk.corpus import movie_reviews

### **Overview of the corpus**

In [5]:
# Get the file IDs of the corpus
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [6]:
# Let's check the categories of the reviews
movie_reviews.categories()

['neg', 'pos']

In [7]:
# Let's check the first file ID in raw text 
first_review = movie_reviews.raw('neg/cv000_29416.txt')
first_review

'plot : two teen couples go to a church party , drink and then drive . \nthey get into an accident . \none of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . \nwhat\'s the deal ? \nwatch the movie and " sorta " find out . . . \ncritique : a mind-fuck movie for the teen generation that touches on a very cool idea , but presents it in a very bad package . \nwhich is what makes this review an even harder one to write , since i generally applaud films which attempt to break the mold , mess with your head and such ( lost highway & memento ) , but there are good and bad ways of making all types of films , and these folks just didn\'t snag this one correctly . \nthey seem to have taken this pretty neat concept , but executed it terribly . \nso what are the problems with the movie ? \nwell , its main problem is that it\'s simply too jumbled . \nit starts off " normal " but then downshifts into this " fantasy " world in which you , as an audience membe

In [8]:
# let's check the lengths of positive and negative reviews 
print("#total negative reviews :", len(movie_reviews.fileids('neg')))
print("#total positive reviews :", len(movie_reviews.fileids('pos')))

#total negative reviews : 1000
#total positive reviews : 1000


In [9]:
# Creating a list which will contain the file IDs and its correspodning categories
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

In [10]:
# let's have a look at our documents list
print(documents[0])         # the first element of the documents list

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')


In [11]:
# Shuffling the documents list
random.shuffle(documents)

### **Feature Extraction**

To classify the text into any category, we need to convert the word into features. So, this will help in training our classifier.

In our first case, we will be using the **top-N words feature**.

1. So, with this approach, we will be using the top 2000 most frequent words.

2. We'll create a feature set which will contain the top-N words in a boollean form and the categories of the reviews.

3. We have already shuffled the documents list which contain all the 2000 review texts - both positive and negative reviews. From these, we will be taking the top 2000 most common words and will create the feature set.

4. After that we will be segregating the data to training and testing sets.

In [12]:
# Print out the total number of word tokens in the reviews corpus
len(movie_reviews.words())

1583820

#### Creating a list of all word tokens

In [13]:
# First, we need to take all the words present in the corpus and store them in a list
all_words_tokens = [token.lower() for token in movie_reviews.words()]

# Print out the first 10 words
print(all_words_tokens[:10])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']


#### Frequency Distribution of all the word tokens

--> This will calculate the number if occurrence of each word in the entire list of words.

In [14]:
# Importing the FreqDist (Frequency Distribution) from nltk
from nltk import FreqDist

In [15]:
# Calculating the freq dist of the all the word tokens
all_words_tokens_freq = FreqDist(all_words_tokens)
print(all_words_tokens_freq)

<FreqDist with 39768 samples and 1583820 outcomes>


So, we have **39768 unique** word tokens out of the total word tokens i.e. 1583820.

In [16]:
# Let's have a look for the top 15 most common word tokens (15 most frequently occurring word tokens)
all_words_tokens_freq.most_common(15)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595)]

We can see some punctuations and stopwords in the data. And most frequently occurring words are either punction marks or stopwords.

#### Removing Stopwords and Punctuations

In [17]:
# Importing stopwords and punctuations
from nltk.corpus import stopwords
import string               # for punctuations

In [18]:
# Getting the English stopwords
stopwords_eng = stopwords.words("english")
print(stopwords_eng)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [19]:
# Creating a function to remove stopwords and punctuations
def remove_punc_stopwords(txt):
    """
        1. First we will remove punctutations
        2. Then, we will remove stopwords
        3. Lastly, we will return the clean word tokens
    """
    nopunc = [char for char in txt if char not in string.punctuation]
    no_stops = [word for word in nopunc if word.lower() not in stopwords_eng]
    return no_stops

In [20]:
# Running the function on all word tokens
all_words_tokens_cleaned = remove_punc_stopwords(all_words_tokens)

In [21]:
# Let's see the lengths of the all word tokens list prior and after removing stopwords and punctuations
print("Original len of all word tokens = ", len(all_words_tokens))
print("After removal of stopwords and punctuations,  len of all word tokens = ", len(all_words_tokens_cleaned))

Original len of all word tokens =  1583820
After removal of stopwords and punctuations,  len of all word tokens =  710578


In [22]:
#### Frequency Distribution of all the word tokens after removing punctuations and stopwords
all_words_tokens_cleaned_freq = FreqDist(all_words_tokens_cleaned)

In [23]:
# Now let's see the top 15 most common words 
all_words_tokens_cleaned_freq.most_common(15)

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906)]

In [24]:
print(all_words_tokens_freq)
print(all_words_tokens_cleaned_freq)

<FreqDist with 39768 samples and 1583820 outcomes>
<FreqDist with 39586 samples and 710578 outcomes>


In [25]:
710578/1583820 

0.4486482049727873

So, now we can see that the frequently occurring word tokens are not stopwords and punctuations. It got changed to some more meaningful word tokens when we print the top 15 most common words.

Now, we are going to create word feature using 2000 most common words.

In [26]:
# Let's check the length of all using words
print(len(all_words_tokens_cleaned_freq))

39586


In [27]:
100*2000/40000

5.0

We will be using around 5% of the most common words.

In [28]:
# Most common words (2000 freq words)
most_common_word_tokens = all_words_tokens_cleaned_freq.most_common(2000)

# print top 10 most common words
print(most_common_word_tokens[:10])

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049)]


In [29]:
# Least 10 freq words (botton 10 common words)
print(most_common_word_tokens[1990:])

[('remain', 64), ('anna', 64), ('moved', 64), ('asking', 64), ('genuinely', 64), ('rain', 64), ('path', 64), ('aware', 64), ('causes', 64), ('international', 64)]


In [30]:
# Since the elements of the most_common_word_tokens list are in the form of tuples, we need to extract the first element of each tuple to get the words as word features
word_features = [token[0] for token in most_common_word_tokens]

# Print out the top 10 word features
print(word_features[:10])

['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much']


In [31]:
# Length of the word_features list
len(word_features)

2000

#### Creating a Feature Set

--> We'll create a function to get the word features as a set.

In [32]:
# Creating a function to get the features (words) in a dictionary
def doc_features(doc):
    
    # creating a set for all the unique words present in a document
    doc_words = set(doc)
    
    # creating an empty features list
    features = {}
    
    # Will iterate through all the words present in the word_features list
    for word in word_features:
        
        # Get that word and see its presence in the document (will return a bollean value)
        features[word] = (word in doc_words)
    
    return features

In [33]:
# Get the file ID of the first positive review
movie_reviews.fileids('pos')[0]

'pos/cv000_29590.txt'

In [34]:
# Let's use the function on the first positive review
print(doc_features(movie_reviews.words('pos/cv000_29590.txt')))

{'film': True, 'one': True, 'movie': False, 'like': True, 'even': True, 'good': True, 'time': True, 'story': False, 'would': True, 'much': True, 'character': False, 'also': False, 'get': True, 'two': False, 'well': True, 'characters': False, 'first': True, '--': False, 'see': True, 'way': False, 'make': True, 'life': False, 'really': True, 'films': True, 'plot': False, 'little': True, 'people': False, 'could': True, 'scene': False, 'man': False, 'bad': True, 'never': True, 'best': False, 'new': True, 'scenes': True, 'many': False, 'director': False, 'know': False, 'movies': False, 'action': False, 'great': True, 'another': True, 'love': True, 'go': True, 'made': True, 'us': False, 'big': True, 'end': True, 'something': False, 'back': True, 'still': False, 'world': True, 'seems': True, 'work': False, 'makes': False, 'however': True, 'every': False, 'though': True, 'better': True, 'real': False, 'audience': False, 'enough': True, 'seen': False, 'take': False, 'around': False, 'going': Fa

In [35]:
# Now, we are going to create a feature set which will contain the word features of the review and its correspoding category
feature_sets = [(doc_features(review), category) for (review, category) in documents]

print(feature_sets[0])

({'film': True, 'one': True, 'movie': False, 'like': True, 'even': True, 'good': True, 'time': True, 'story': False, 'would': True, 'much': True, 'character': True, 'also': True, 'get': True, 'two': True, 'well': True, 'characters': True, 'first': True, '--': False, 'see': True, 'way': True, 'make': True, 'life': True, 'really': True, 'films': True, 'plot': True, 'little': True, 'people': True, 'could': False, 'scene': True, 'man': False, 'bad': True, 'never': True, 'best': True, 'new': False, 'scenes': True, 'many': False, 'director': True, 'know': True, 'movies': False, 'action': True, 'great': True, 'another': True, 'love': True, 'go': False, 'made': False, 'us': False, 'big': True, 'end': False, 'something': True, 'back': False, 'still': True, 'world': True, 'seems': True, 'work': True, 'makes': True, 'however': True, 'every': False, 'though': True, 'better': False, 'real': False, 'audience': False, 'enough': True, 'seen': True, 'take': True, 'around': True, 'going': True, 'year': 

### **Model Training**

Now, we will create training and testing sets. 

In [36]:
# Training set and Testing set
train_data = feature_sets[:1600]
test_data = feature_sets[1600:]

In [37]:
# Length of training set
len(train_data)

1600

In [38]:
# Length of testing set
len(test_data)

400

In [39]:
(1600/2000, 400/2000)

(0.8, 0.2)

Training our model i.e. classifier.

We will be using the **Naive Bayes Classifier**.

**Base Model**

In [40]:
# Importing the NaiveBayesClassifier from nltk
from nltk import NaiveBayesClassifier

# Creating an instance of our classifier and training the model
base_model = NaiveBayesClassifier.train(train_data)

Testing our base model.

In [41]:
# Importing classify from nltk
from nltk import classify

# Calculating the accuracy of the base model 
accuracy_score = classify.accuracy(base_model, test_data)
print("Accuracy Score of Base Model : {}%".format(100 * accuracy_score))

Accuracy Score of Base Model : 82.5%


In [42]:
# Show 10 most informative features
print(base_model.show_most_informative_features(10))

Most Informative Features
             outstanding = True              pos : neg    =     10.3 : 1.0
             wonderfully = True              pos : neg    =      8.1 : 1.0
                   mulan = True              pos : neg    =      7.3 : 1.0
                  seagal = True              neg : pos    =      6.9 : 1.0
                  poorly = True              neg : pos    =      6.6 : 1.0
                   damon = True              pos : neg    =      5.6 : 1.0
                  wasted = True              neg : pos    =      5.4 : 1.0
                   hanks = True              pos : neg    =      5.2 : 1.0
                    anna = True              pos : neg    =      5.0 : 1.0
                    jedi = True              pos : neg    =      5.0 : 1.0
None


The above result basically gives the **likelihood ratios** of the 10 most common. 

It shows that the word **"outstanding"** is used in positive reviews **10.3** times more often than it is used in negative reviews. The word **"poorly"** is used in negative reviwes **6.6** times more than it's used in positive reviews.

From this, we can have an idea about our **base_model**'s classification. A review will have a higher chance of getting a tag as positive if it contains words like **outstanding** and **wonderfully**. Likewise, a review has a higher chance of getting a negative tag if it contains words like **poorly**, **wasted**, etc.

Let's see how our base model gives result to some new reviews.

In [43]:
# Creating a new review
new_review = "I hated the movie. It was a disaster.Poor direction and bad acting"

# Creating word tokens
new_review_tokens = word_tokenize(new_review)

# Creating the word feature set
new_review_set = doc_features(new_review_tokens)

In [44]:
# let's test the classifier on the custom review
print(base_model.classify(new_review_set))

neg


So, we can see that the output comes out as **negative**.

In [45]:
# Let's see the probability of getting the above result
prob = base_model.prob_classify(new_review_set)
print("Maximum proba of getting the above classification =", prob.max())
print("Proba of getting a negative tag for the given review =", prob.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob.prob('pos'))

Maximum proba of getting the above classification = neg
Proba of getting a negative tag for the given review = 0.99999907581857
Proba of getting a positive tag for the given review = 9.241814448779119e-07


In [46]:
# Let's take another custom review
new_review_1 = "It was an awesome movie. The direction was perfect. I loved it."

# Getting the word tokens
new_review_1_tokens = word_tokenize(new_review_1)

# Getting the feature set
new_review_1_set = doc_features(new_review_1_tokens)

In [47]:
# let's test the classifier on the custom review
print(base_model.classify(new_review_1_set))

neg


In [48]:
# Let's see the probability of getting the above result
prob_1 = base_model.prob_classify(new_review_1_set)
print("Maximum proba of getting the above classification =", prob_1.max())
print("Proba of getting a negative tag for the given review =", prob_1.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob_1.prob('pos'))

Maximum proba of getting the above classification = neg
Proba of getting a negative tag for the given review = 0.9999828688920718
Proba of getting a positive tag for the given review = 1.713110791317439e-05


So, our **base_model** is not able to classify a positive review correctly. We have used the **top-N words** feature in our **base_model**. While creating the training and testing sets, we can have imbalance classes of positive and negative reviews.

### **Bag-of-Words Approach**

Now, we will be using a **Bag-of-words** feature for our model. We'll be using **unigrams** (item having a single word i.e. the N-gram of size 1. E.g. bad.  So, it's just a token with only one word and we will unigramns features).

1. In this approach, while creating the feature set we will be using all the useful words of each review. We will basically create two lists which will contain words for each review category (one for postive reviews and another for negative reviews).

2. Then, we will create a function (bag-of-words) which will clean the word tokens for each review texts. Here, we will remove punctuations and stopwords and the function will return a dictionary of cleaned words. We, will use a dictionary because the dictionary will not take duplicate words and thus will oncly contain unique word tokens.

3. After that, we will be using the bag-of-words function for positive reviews and negative reviews lists alongwith that, it will assign the positive and negative categories for the concerned reviews.

4. Now, we will be taking a fixed number of positive and negative reviews for both the training and testing sets.

5. So, this will ensure a balanced classes (categories) of the reviews for our model training.

#### Creating the word tokens lists for both **positive** and **negative** reviews.

In [49]:
# Now, will create two list for postive and negative reviews word tokens

# Positive reviews word tokens list
pos_word_tokens = []
for fileid in movie_reviews.fileids('pos'):
    word_tokens = movie_reviews.words(fileid)
    pos_word_tokens.append(word_tokens)

# Negative reviews word tokens list
neg_word_tokens = []
for fileid in movie_reviews.fileids('neg'):
    word_tokens = movie_reviews.words(fileid)
    neg_word_tokens.append(word_tokens)  

In [50]:
# Print out the first pos review of pos_word_tokens
print(pos_word_tokens[0][:30])

['films', 'adapted', 'from', 'comic', 'books', 'have', 'had', 'plenty', 'of', 'success', ',', 'whether', 'they', "'", 're', 'about', 'superheroes', '(', 'batman', ',', 'superman', ',', 'spawn', ')', ',', 'or', 'geared', 'toward', 'kids', '(']


In [51]:
# Print out the first neg review of pos_word_tokens
print(neg_word_tokens[0][:30])

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his']


#### **Bag-of-Words** function

In [52]:
# Creating the bag-of-words function which will return a dictionary of cleanded words
def bag_of_words(tokens):

    # Using the remove_punc_stopwords function to remove the stopwords and punctuations
    clean_word_tokens = remove_punc_stopwords(tokens)

    # storing the cleaned words in a dictionary
    words_dict = dict([char, True] for char in clean_word_tokens)
    return words_dict

In [53]:
# Let's check our bag-of-words function
print(bag_of_words(['the', 'the', 'a', 'good', 'bad', 'the', 'poor', 'love', 'awesome']))

{'good': True, 'bad': True, 'poor': True, 'love': True, 'awesome': True}


#### Creating Feature Sets for both positive and negative reviews.

In [54]:
# Positive reviews feature set
pos_feature_set = []
for tokens in pos_word_tokens:
    pos_feature_set.append((bag_of_words(tokens), 'pos'))

# Negative reviews feature set
neg_feature_set = []
for tokens in neg_word_tokens:
    neg_feature_set.append((bag_of_words(tokens), 'neg'))

In [55]:
# Printing out the first element of the pos feature set
print(pos_feature_set[0])

({'films': True, 'adapted': True, 'comic': True, 'books': True, 'plenty': True, 'success': True, 'whether': True, 'superheroes': True, 'batman': True, 'superman': True, 'spawn': True, 'geared': True, 'toward': True, 'kids': True, 'casper': True, 'arthouse': True, 'crowd': True, 'ghost': True, 'world': True, 'never': True, 'really': True, 'book': True, 'like': True, 'hell': True, 'starters': True, 'created': True, 'alan': True, 'moore': True, 'eddie': True, 'campbell': True, 'brought': True, 'medium': True, 'whole': True, 'new': True, 'level': True, 'mid': True, '80s': True, '12': True, 'part': True, 'series': True, 'called': True, 'watchmen': True, 'say': True, 'thoroughly': True, 'researched': True, 'subject': True, 'jack': True, 'ripper': True, 'would': True, 'saying': True, 'michael': True, 'jackson': True, 'starting': True, 'look': True, 'little': True, 'odd': True, 'graphic': True, 'novel': True, '500': True, 'pages': True, 'long': True, 'includes': True, 'nearly': True, '30': Tru

In [56]:
# Printing out the first element of the neg feature set
print(neg_feature_set[0])

({'plot': True, 'two': True, 'teen': True, 'couples': True, 'go': True, 'church': True, 'party': True, 'drink': True, 'drive': True, 'get': True, 'accident': True, 'one': True, 'guys': True, 'dies': True, 'girlfriend': True, 'continues': True, 'see': True, 'life': True, 'nightmares': True, 'deal': True, 'watch': True, 'movie': True, 'sorta': True, 'find': True, 'critique': True, 'mind': True, 'fuck': True, 'generation': True, 'touches': True, 'cool': True, 'idea': True, 'presents': True, 'bad': True, 'package': True, 'makes': True, 'review': True, 'even': True, 'harder': True, 'write': True, 'since': True, 'generally': True, 'applaud': True, 'films': True, 'attempt': True, 'break': True, 'mold': True, 'mess': True, 'head': True, 'lost': True, 'highway': True, 'memento': True, 'good': True, 'ways': True, 'making': True, 'types': True, 'folks': True, 'snag': True, 'correctly': True, 'seem': True, 'taken': True, 'pretty': True, 'neat': True, 'concept': True, 'executed': True, 'terribly': 

In [57]:
# Lengths of the feature sets
print("Length of the pos feature set :", len(pos_feature_set))
print("Length of the neg feature set :", len(neg_feature_set))

Length of the pos feature set : 1000
Length of the neg feature set : 1000


In [58]:
# Shuffling the feature sets
random.shuffle(pos_feature_set)
random.shuffle(neg_feature_set)

### **New Model Training**

In [59]:
# Creating training set and testing set by taking 20% of pos reviews and 20% of neg reviews for the testing set
# and remaining will be for the training set
test_data1 = pos_feature_set[:200] + neg_feature_set[:200]
train_data1 = pos_feature_set[200:] + neg_feature_set[200:]

In [60]:
# Printing out the lengths of the training and testing datasets
print("Length of the test_data1 :", len(test_data1))
print("Length of the train_data1 :", len(train_data1))

Length of the test_data1 : 400
Length of the train_data1 : 1600


Training our datasets with **NaiveBayesClassifier**

**Model1**

In [61]:
# Creating an instance of NaiveBayesClassifier and training it
model1 = NaiveBayesClassifier.train(train_data1)

In [62]:
# Calculating the accuracy of the model1 
accuracy_score1 = classify.accuracy(model1, test_data1)
print("Accuracy Score of Model1 : {}%".format(100 * accuracy_score1))

Accuracy Score of Model1 : 67.5%


In [63]:
# Show 10 most informative features
print(model1.show_most_informative_features(10))

Most Informative Features
               insulting = True              neg : pos    =     17.0 : 1.0
                   sucks = True              neg : pos    =     14.3 : 1.0
              schumacher = True              neg : pos    =     12.3 : 1.0
                  avoids = True              pos : neg    =     11.7 : 1.0
             outstanding = True              pos : neg    =     11.2 : 1.0
              astounding = True              pos : neg    =     11.0 : 1.0
                  finest = True              pos : neg    =     11.0 : 1.0
                    slip = True              pos : neg    =     11.0 : 1.0
                 insipid = True              neg : pos    =     10.3 : 1.0
            similarities = True              pos : neg    =     10.3 : 1.0
None


In [64]:
# Creating a new review
new_review = "I hated the movie. It was a disaster.Poor direction and bad acting"

# Creating word tokens
new_review_tokens = word_tokenize(new_review)

# Creating the word feature set
new_review_set = doc_features(new_review_tokens)

In [65]:
# let's test the classifier on the custom review
print(model1.classify(new_review_set))

neg


In [66]:
# Let's see the probability of getting the above result
prob_new = model1.prob_classify(new_review_set)
print("Maximum proba of getting the above classification =", prob_new.max())
print("Proba of getting a negative tag for the given review =", prob_new.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob_new.prob('pos'))

Maximum proba of getting the above classification = neg
Proba of getting a negative tag for the given review = 0.6693792564832514
Proba of getting a positive tag for the given review = 0.33062074351697973


In [67]:
# Let's take another custom review
new_review_1 = "It was an awesome movie. The direction was perfect. I loved it."

# Getting the word tokens
new_review_1_tokens = word_tokenize(new_review_1)

# Getting the feature set
new_review_1_set = doc_features(new_review_1_tokens)

In [68]:
# let's test the classifier on the custom review
print(model1.classify(new_review_1_set))

pos


In [69]:
# Let's see the probability of getting the above result
prob_new_1 = model1.prob_classify(new_review_1_set)
print("Maximum proba of getting the above classification =", prob_new_1.max())
print("Proba of getting a negative tag for the given review =", prob_new_1.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob_new_1.prob('pos'))

Maximum proba of getting the above classification = pos
Proba of getting a negative tag for the given review = 0.15574850984325253
Proba of getting a positive tag for the given review = 0.844251490157979


So, the above model - **model1** is much better than the **base_model**. The model1 has correctly classified both the custom reviews. This shows an improvement in our modelling by using the Bag-of-words approach and using balanced datasets. Next, we will try using **Bi-grams** features for our model.

**Bi-gram** - Item having two words, i.e. the N-gram of size 2. E.g. "Very bad".

Similary, we also have **Tri-Grams** (trigrams) - Item having three words, i.e. The N-gram of size 3. E.g. "Not very bad".

All these grams are known as **N-grams**. N-grams are continuous sequences of N words or symbols or tokens in a document. E.g “Medium blog” is a 2-gram (a bigram), “A Medium blog post” is a 4-gram, and “Write on Medium” is a 3-gram (trigram).

Now, we will be defining 3 functions:
1. For removing punctuations and stopwords.
2. For getting only unigram features.
3. For getting the bigram fetaures.

In [70]:
# Importing the ngrams 
from nltk import ngrams

In [71]:
# We already created for clean words
# remove_punc_stopwords()
# But we will be modifying it later for getting bigrams

In [72]:
# Let's create a bag-of-words func for extracting only the unigram features
def bag_of_unigrams(tokens):
    words_dict_uni = dict([token, True] for token in tokens)
    return words_dict_uni


# Creating a bag-of-words fucn for extracting for bigrams
def bag_of_ngrams(tokens, n=2):
    
    # A list for storing the bigrams
    words_ngrams = []

    # Iterating through items that being created as bigrams and appending them in the words_ngrams
    for item in iter(ngrams(tokens, n)):
        words_ngrams.append(item)

    # Will create a dictionary to store the unique bigram words
    words_dict_bi = dict([token, True] for token in words_ngrams)
    return words_dict_bi

In [73]:
# Let's check our above functions on sample texts
txt = "The movie was amazing. Brilliantly played all roles by all the actors. It was very good."
word_tokens_sample = word_tokenize(txt)
print("Sample word tokens of the given text :", word_tokens_sample, '\n')

# First removing puncs and stopwords
cleaned_words_sample = remove_punc_stopwords(word_tokens_sample)
print("Cleaned Words :", cleaned_words_sample, '\n')

# Making unigrams
print("Unigrams :", bag_of_unigrams(cleaned_words_sample), '\n')

# Making bigrams
print("Bigrams :", bag_of_ngrams(cleaned_words_sample))

Sample word tokens of the given text : ['The', 'movie', 'was', 'amazing', '.', 'Brilliantly', 'played', 'all', 'roles', 'by', 'all', 'the', 'actors', '.', 'It', 'was', 'very', 'good', '.'] 

Cleaned Words : ['movie', 'amazing', 'Brilliantly', 'played', 'roles', 'actors', 'good'] 

Unigrams : {'movie': True, 'amazing': True, 'Brilliantly': True, 'played': True, 'roles': True, 'actors': True, 'good': True} 

Bigrams : {('movie', 'amazing'): True, ('amazing', 'Brilliantly'): True, ('Brilliantly', 'played'): True, ('played', 'roles'): True, ('roles', 'actors'): True, ('actors', 'good'): True}


Cleaned word tokens are fine for unigrams but **cleaning words can be a disadvantage for bigrams because the cleaning process can omit important word tokens for bigrams**.

E.g. stopwords like over, under, so, very, etc. are important for bigrams.

In [74]:
# Let's use the uncleaned sample word tokens for getting bigrams
print(bag_of_ngrams(word_tokens_sample))

{('The', 'movie'): True, ('movie', 'was'): True, ('was', 'amazing'): True, ('amazing', '.'): True, ('.', 'Brilliantly'): True, ('Brilliantly', 'played'): True, ('played', 'all'): True, ('all', 'roles'): True, ('roles', 'by'): True, ('by', 'all'): True, ('all', 'the'): True, ('the', 'actors'): True, ('actors', '.'): True, ('.', 'It'): True, ('It', 'was'): True, ('was', 'very'): True, ('very', 'good'): True, ('good', '.'): True}


In [75]:
# We can create a new stopwords list especially for bigrams by subtracting important words from the whole set of stopwords
important_words = ['above', 'below', 'off', 'over', 'under', 'more', 'most', 'such', 'no', 'nor', 'not', 'only', 'so', 'than', 'too', 'very', 'just', 'but']

stopwords_eng_for_bigrams = set(stopwords_eng) - set(important_words)
stopwords_eng_for_bigrams = list(stopwords_eng_for_bigrams)

In [76]:
# Printing out the stopwords for bigrams
print(stopwords_eng_for_bigrams)

['there', 'during', 'had', 'were', 'does', 'that', "mustn't", 'theirs', 'did', 'hadn', 'have', 'herself', 'themselves', 'and', 'here', 'shan', 'on', 'into', 'is', 'm', 'wouldn', "didn't", "wouldn't", 'between', 'because', "you'd", "isn't", 'wasn', 'mustn', 'when', 'our', 'until', 'isn', "you'll", 'shouldn', 'weren', 'ma', 'a', 's', 'down', 'they', 'y', 'to', 'few', 'hasn', 've', 'how', 'same', 'mightn', 'them', 'further', 'then', 'has', 'my', 'yourself', 'myself', 'it', 'couldn', 'through', 'him', 'each', 'won', 'he', "it's", 'been', "doesn't", 'his', "weren't", 'both', "couldn't", "you've", 'hers', 'himself', 'if', 'ourselves', 'who', 'while', 'with', 'in', "haven't", "hadn't", 'again', 'll', "mightn't", 'don', 'this', 'be', 'having', 'can', 'out', 'your', 'are', 'of', 'itself', 'before', "shouldn't", 'should', 'd', 'against', 'yours', "should've", 'will', 'you', "don't", "she's", 'ain', 'those', 'yourselves', 'ours', 'any', 'we', 'at', 'as', 'why', "that'll", 'needn', "shan't", 'her'

In [77]:
# Creating a function to remove stopwords and punctuations for bigrams
def remove_punc_stopwords_for_bigrams(txt):
    """
        1. First we will remove punctutations
        2. Then, we will remove stopwords for bigrams
        3. Lastly, we will return the clean word tokens
    """
    nopunc = [char for char in txt if char not in string.punctuation]
    no_stops = [word for word in nopunc if word.lower() not in stopwords_eng_for_bigrams]
    return no_stops

In [78]:
# First removing puncs and stopwords
cleaned_words_sample = remove_punc_stopwords(word_tokens_sample)
cleaned_words_sample_for_bigrams = remove_punc_stopwords_for_bigrams(word_tokens_sample)
print("Cleaned Words :", cleaned_words_sample, '\n')
print("Cleaned Words for bigrams:", cleaned_words_sample_for_bigrams, '\n')

# Making unigrams
print("Unigrams :", bag_of_unigrams(cleaned_words_sample), '\n')

# Making bigrams
print("Bigrams :", bag_of_ngrams(cleaned_words_sample_for_bigrams))

Cleaned Words : ['movie', 'amazing', 'Brilliantly', 'played', 'roles', 'actors', 'good'] 

Cleaned Words for bigrams: ['movie', 'amazing', 'Brilliantly', 'played', 'roles', 'actors', 'very', 'good'] 

Unigrams : {'movie': True, 'amazing': True, 'Brilliantly': True, 'played': True, 'roles': True, 'actors': True, 'good': True} 

Bigrams : {('movie', 'amazing'): True, ('amazing', 'Brilliantly'): True, ('Brilliantly', 'played'): True, ('played', 'roles'): True, ('roles', 'actors'): True, ('actors', 'very'): True, ('very', 'good'): True}


In [79]:
# Now, let's combine unigram and bigram features of the sample text
unigram_feats = bag_of_unigrams(cleaned_words_sample)
bigram_feats = bag_of_ngrams(cleaned_words_sample_for_bigrams)

all_features = unigram_feats.copy()
all_features.update(bigram_feats)
print(all_features)

{'movie': True, 'amazing': True, 'Brilliantly': True, 'played': True, 'roles': True, 'actors': True, 'good': True, ('movie', 'amazing'): True, ('amazing', 'Brilliantly'): True, ('Brilliantly', 'played'): True, ('played', 'roles'): True, ('roles', 'actors'): True, ('actors', 'very'): True, ('very', 'good'): True}


#### Creating a function that extracts all features - unigrams and bigrams combined.

In [80]:
# Creating a function extract all features
def bag_of_all_words(tokens, n=2):
    clean_word_tokens = remove_punc_stopwords(tokens)
    clean_word_tokens_bigrams = remove_punc_stopwords_for_bigrams(tokens)

    unigram_feats = bag_of_unigrams(clean_word_tokens)
    bigram_feats = bag_of_ngrams(clean_word_tokens_bigrams)

    all_features = unigram_feats.copy()
    all_features.update(bigram_feats)

    return all_features

In [81]:
# Let's print out the bag of all words for the sample text
print(bag_of_all_words(word_tokens_sample))

{'movie': True, 'amazing': True, 'Brilliantly': True, 'played': True, 'roles': True, 'actors': True, 'good': True, ('movie', 'amazing'): True, ('amazing', 'Brilliantly'): True, ('Brilliantly', 'played'): True, ('played', 'roles'): True, ('roles', 'actors'): True, ('actors', 'very'): True, ('very', 'good'): True}


Let's use the pos_word_tokens and neg_word_tokens that we have already created.

In [82]:
#### Creating Feature Sets for both postive and negative reviews
pos_feature_set_new = []
for tokens in pos_word_tokens:
    pos_feature_set_new.append((bag_of_all_words(tokens), 'pos'))

neg_feature_set_new = []
for tokens in neg_word_tokens:
    neg_feature_set_new.append((bag_of_all_words(tokens), 'neg'))

In [83]:
# Shuffling the feature sets
random.shuffle(pos_feature_set_new)
random.shuffle(neg_feature_set_new)

### **New Model Training with both Unigrams and Bigrams combined**

In [84]:
# Creating training set and testing set by taking 20% of pos reviews and 20% of neg reviews for the testing set
# and remaining will be for the training set
test_data2 = pos_feature_set_new[:200] + neg_feature_set_new[:200]
train_data2 = pos_feature_set_new[200:] + neg_feature_set_new[200:]

In [85]:
# Printing out the lengths of the training and testing datasets
print("Length of the test_data2 :", len(test_data2))
print("Length of the train_data2 :", len(train_data2))

Length of the test_data2 : 400
Length of the train_data2 : 1600


Training our dataset with **NaiveBayesClassifier**

**Model2**

In [86]:
# Creating an instance of NaiveBayesClassifier and training it
model2 = NaiveBayesClassifier.train(train_data2)

In [87]:
# Calculating the accuracy of the model1 
accuracy_score2 = classify.accuracy(model2, test_data2)
print("Accuracy Score of Model2 : {}%".format(100 * accuracy_score2))

Accuracy Score of Model2 : 78.25%


In [88]:
# Creating a new review
new_review = "I hated the movie. It was a disaster.Poor direction and bad acting"

# Creating word tokens
new_review_tokens = word_tokenize(new_review)

# Creating the word feature set
new_review_set = doc_features(new_review_tokens)

In [89]:
# let's test the classifier on the custom review
print(model2.classify(new_review_set))

neg


In [90]:
# Let's see the probability of getting the above result
prob_new_model2 = model2.prob_classify(new_review_set)
print("Maximum proba of getting the above classification =", prob_new_model2.max())
print("Proba of getting a negative tag for the given review =", prob_new_model2.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob_new_model2.prob('pos'))

Maximum proba of getting the above classification = neg
Proba of getting a negative tag for the given review = 0.6536501530044657
Proba of getting a positive tag for the given review = 0.34634984699519883


In [91]:
# Let's take another custom review
new_review_1 = "It was an awesome movie. The direction was perfect. I loved it."

# Getting the word tokens
new_review_1_tokens = word_tokenize(new_review_1)

# Getting the feature set
new_review_1_set = doc_features(new_review_1_tokens)

In [92]:
# let's test the classifier on the custom review
print(model2.classify(new_review_1_set))

pos


In [93]:
# Let's see the probability of getting the above result
prob_new_2_model2 = model2.prob_classify(new_review_1_set)
print("Maximum proba of getting the above classification =", prob_new_2_model2.max())
print("Proba of getting a negative tag for the given review =", prob_new_2_model2.prob('neg'))
print("Proba of getting a positive tag for the given review =", prob_new_2_model2.prob('pos'))

Maximum proba of getting the above classification = pos
Proba of getting a negative tag for the given review = 0.13485834819774806
Proba of getting a positive tag for the given review = 0.865141651802668


From the above results - accuracy scores of **model1** and **model2**, we can conclude that **model2** is an improvement over the **model1**.

**model1 accuracy = 67.5%   (having only unigram features)**

**model2 accuracy = 78.25%  (having both unigram and bigram features)**

So, combining both unigrams and bigrams will be (sometimes) a better option in NLP.

Note: I haven't displayed the result and code for show_most_informative_features, because I'm getting some errors in it. It's mostly related to unsupported oprations between datatypes and I'm not able to come around with solutions.