In [32]:
## Importing libraries
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
## To print the fileid into a txt file
with open("fileid.txt", "w") as f:
    for category in movie_reviews.categories():
        for fileid in movie_reviews.fileids(category):
            f.write(fileid + "\n")

In [2]:
## Creation of a list for each review and its category
documents = []
for category in movie_reviews.categories():
    # print(category)
    
    for fileid in movie_reviews.fileids(category):
        # print(fileid)
        review_word_list = list(movie_reviews.words(fileid))
        
        document = (review_word_list, category)
        documents.append(document)

In [3]:
documents[0]

(['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'generally',
  'a

Documents contains ordered reviews from neg to positives. That is why a shuffle is needed here to make it more random

In [4]:
random.shuffle(documents)

Now let's preprocess the documents text by removing **stop words**, **punctuation** and **lower casing**. Also, I using **Lemmatization** here to normalize the reviews text.

In [66]:
all_words = []
stop_words = set(stopwords.words("english"))
ps = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in movie_reviews.words():
    
    ## Getting rid of stop words and punctuations
    if word not in stop_words and word not in string.punctuation and word not in string.digits:
        # all_words.append(ps.stem(word.lower())) ## Stemming here 
        all_words.append(lemmatizer.lemmatize(word.lower())) ## Lemmatization

In [67]:
## Let's check out the most common words
all_words = nltk.FreqDist(all_words)
all_words.most_common(20)

[('film', 11053),
 ('movie', 6977),
 ('one', 6028),
 ('character', 3879),
 ('like', 3789),
 ('time', 2979),
 ('get', 2814),
 ('scene', 2671),
 ('make', 2634),
 ('even', 2568),
 ('good', 2429),
 ('story', 2345),
 ('would', 2109),
 ('much', 2049),
 ('also', 1967),
 ('well', 1921),
 ('life', 1913),
 ('two', 1911),
 ('see', 1885),
 ('way', 1882)]

In [68]:
len(all_words)

35152

In [69]:
all_words

FreqDist({'film': 11053, 'movie': 6977, 'one': 6028, 'character': 3879, 'like': 3789, 'time': 2979, 'get': 2814, 'scene': 2671, 'make': 2634, 'even': 2568, ...})

In [70]:
all_words["bad"]

1395

In [71]:
## Setting up the most common words 
word_features = []
for common_word in all_words.most_common(20000):
    word_features.append(common_word[0])

In [72]:
len(word_features)

20000

In [73]:
def find_features(feature_doc):
    """
    This function helps us to generate a dictionary with each most common word as key
    and its presence or not (True or false) in a particular review as value.

    Parameters
    ----------
    feature_doc : str
        text of a review.

    Returns
    -------
    features : dict
        key: a most common word
        value: True or False.

    """
    words = set(feature_doc)
    
    features = {}
    for word in word_features:
        is_feature_in_words = word in words
        
        features[word] = is_feature_in_words
        
    return features

In [75]:
find_features(movie_reviews.words("pos/cv686_13900.txt"))

{'film': True,
 'movie': True,
 'one': True,
 'character': False,
 'like': False,
 'time': True,
 'get': True,
 'scene': False,
 'make': False,
 'even': False,
 'good': True,
 'story': False,
 'would': True,
 'much': False,
 'also': False,
 'well': False,
 'life': False,
 'two': False,
 'see': True,
 'way': True,
 'first': True,
 '--': False,
 'go': True,
 'year': False,
 'thing': True,
 'take': False,
 'plot': True,
 'really': False,
 'come': False,
 'little': False,
 'know': False,
 'people': False,
 'could': False,
 'man': False,
 'bad': True,
 'work': False,
 'never': True,
 'director': False,
 'best': False,
 'end': False,
 'performance': False,
 'new': False,
 'look': False,
 'many': False,
 'action': False,
 'actor': False,
 'u': False,
 'love': False,
 'play': False,
 'star': False,
 'role': False,
 'show': False,
 'great': False,
 'another': True,
 'find': False,
 'made': False,
 'audience': False,
 'back': False,
 'give': True,
 'big': False,
 'world': False,
 'something': Fa

We are joining here the features previously created with its category (positive or negative)

In [76]:
featureSets = []
for (review, category) in documents:
    feature = (find_features(review), category)
    featureSets.append(feature)

In [77]:
featureSets[-1]

({'film': True,
  'movie': False,
  'one': True,
  'character': True,
  'like': True,
  'time': True,
  'get': True,
  'scene': False,
  'make': False,
  'even': True,
  'good': False,
  'story': False,
  'would': True,
  'much': True,
  'also': True,
  'well': True,
  'life': False,
  'two': True,
  'see': False,
  'way': False,
  'first': True,
  '--': True,
  'go': True,
  'year': False,
  'thing': True,
  'take': True,
  'plot': False,
  'really': False,
  'come': False,
  'little': True,
  'know': False,
  'people': True,
  'could': False,
  'man': False,
  'bad': True,
  'work': True,
  'never': True,
  'director': True,
  'best': True,
  'end': False,
  'performance': False,
  'new': True,
  'look': False,
  'many': False,
  'action': False,
  'actor': False,
  'u': False,
  'love': False,
  'play': True,
  'star': False,
  'role': True,
  'show': False,
  'great': False,
  'another': True,
  'find': True,
  'made': False,
  'audience': False,
  'back': True,
  'give': False,
  

In [16]:
len(featureSets)

2000

**Creating Training and Test Set**

In [25]:
training_set = featureSets[:1800]
test_set = featureSets[1800:]

In [30]:
training_set[0]

({'film': True,
  'movie': True,
  'one': True,
  'character': True,
  'like': False,
  'time': True,
  'get': True,
  'scene': False,
  'make': True,
  'even': False,
  'good': False,
  'story': True,
  'would': False,
  'much': False,
  'also': True,
  'well': True,
  'life': True,
  'two': True,
  'see': False,
  'way': True,
  'first': False,
  '--': False,
  'go': False,
  'year': True,
  'thing': False,
  'take': False,
  'plot': False,
  'really': False,
  'come': False,
  'little': True,
  'know': False,
  'people': True,
  'could': True,
  'man': True,
  'bad': False,
  'work': False,
  'never': False,
  'director': True,
  'best': True,
  'end': True,
  'performance': False,
  'new': True,
  'look': False,
  'many': True,
  'action': True,
  'actor': False,
  'u': False,
  'love': False,
  'play': False,
  'star': False,
  'role': True,
  'show': False,
  'great': True,
  'another': False,
  'find': False,
  'made': False,
  'audience': False,
  'back': False,
  'give': False

In [78]:
test_set[-1]

({'film': True,
  'movie': False,
  'one': True,
  'character': True,
  'like': True,
  'time': True,
  'get': True,
  'scene': False,
  'make': False,
  'even': True,
  'good': False,
  'story': False,
  'would': True,
  'much': True,
  'also': True,
  'well': True,
  'life': False,
  'two': True,
  'see': False,
  'way': False,
  'first': True,
  '--': True,
  'go': True,
  'year': False,
  'thing': True,
  'take': True,
  'plot': False,
  'really': False,
  'come': False,
  'little': True,
  'know': False,
  'people': True,
  'could': False,
  'man': False,
  'bad': True,
  'work': True,
  'never': True,
  'director': True,
  'best': True,
  'end': False,
  'performance': False,
  'new': True,
  'look': False,
  'many': False,
  'action': False,
  'actor': False,
  'u': False,
  'love': False,
  'play': True,
  'star': False,
  'role': True,
  'show': False,
  'great': False,
  'another': True,
  'find': True,
  'made': False,
  'audience': False,
  'back': True,
  'give': False,
  

**Applying a classification algorithm**

In [79]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

In [80]:
## Calcilating the accuracy
acc = nltk.classify.accuracy(classifier, test_set)

In [81]:
print("Accuracy: {0:.2f}%".format(100*acc))

Accuracy: 83.50%


In [82]:
## Most informative features
classifier.show_most_informative_features()

Most Informative Features
             outstanding = True              pos : neg    =     13.1 : 1.0
              schumacher = True              neg : pos    =      9.0 : 1.0
             wonderfully = True              pos : neg    =      8.9 : 1.0
                   damon = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.8 : 1.0
                  finest = True              pos : neg    =      7.8 : 1.0
                   mulan = True              pos : neg    =      6.4 : 1.0
                  poorly = True              neg : pos    =      5.8 : 1.0
             beautifully = True              pos : neg    =      5.5 : 1.0
                   worst = True              neg : pos    =      5.4 : 1.0


# Now let's use the classify to predict one sentence

In [99]:
test_sentence = "This is a bad and worst movie ever!"

In [100]:
# Our new feature
test_sent_features = {word: (word in word_tokenize(test_sentence.lower())) for word in all_words}
test_sent_features

{'film': False,
 'movie': True,
 'one': False,
 'character': False,
 'like': False,
 'time': False,
 'get': False,
 'scene': False,
 'make': False,
 'even': False,
 'good': False,
 'story': False,
 'would': False,
 'much': False,
 'also': False,
 'well': False,
 'life': False,
 'two': False,
 'see': False,
 'way': False,
 'first': False,
 '--': False,
 'go': False,
 'year': False,
 'thing': False,
 'take': False,
 'plot': False,
 'really': False,
 'come': False,
 'little': False,
 'know': False,
 'people': False,
 'could': False,
 'man': False,
 'bad': True,
 'work': False,
 'never': False,
 'director': False,
 'best': False,
 'end': False,
 'performance': False,
 'new': False,
 'look': False,
 'many': False,
 'action': False,
 'actor': False,
 'u': False,
 'love': False,
 'play': False,
 'star': False,
 'role': False,
 'show': False,
 'great': False,
 'another': False,
 'find': False,
 'made': False,
 'audience': False,
 'back': False,
 'give': False,
 'big': False,
 'world': False,
 

In [101]:
## Let's classify our new feature
classifier.classify(test_sent_features)

'neg'