# Importing movie_reviews dataset

In [1]:
from nltk.corpus import movie_reviews

In [2]:
movie_reviews.categories() #output labels

['neg', 'pos']

In [3]:
movie_reviews.fileids()

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt',
 'neg/cv005_29357.txt',
 'neg/cv006_17022.txt',
 'neg/cv007_4992.txt',
 'neg/cv008_29326.txt',
 'neg/cv009_29417.txt',
 'neg/cv010_29063.txt',
 'neg/cv011_13044.txt',
 'neg/cv012_29411.txt',
 'neg/cv013_10494.txt',
 'neg/cv014_15600.txt',
 'neg/cv015_29356.txt',
 'neg/cv016_4348.txt',
 'neg/cv017_23487.txt',
 'neg/cv018_21672.txt',
 'neg/cv019_16117.txt',
 'neg/cv020_9234.txt',
 'neg/cv021_17313.txt',
 'neg/cv022_14227.txt',
 'neg/cv023_13847.txt',
 'neg/cv024_7033.txt',
 'neg/cv025_29825.txt',
 'neg/cv026_29229.txt',
 'neg/cv027_26270.txt',
 'neg/cv028_26964.txt',
 'neg/cv029_19943.txt',
 'neg/cv030_22893.txt',
 'neg/cv031_19540.txt',
 'neg/cv032_23718.txt',
 'neg/cv033_25680.txt',
 'neg/cv034_29446.txt',
 'neg/cv035_3343.txt',
 'neg/cv036_18385.txt',
 'neg/cv037_19798.txt',
 'neg/cv038_9781.txt',
 'neg/cv039_5963.txt',
 'neg/cv040_8829.txt',
 'neg/cv041_22364.txt',


In [4]:
#How to access the words for the particular text(let say 5)
movie_reviews.words(movie_reviews.fileids()[5])

['capsule', ':', 'in', '2176', 'on', 'the', 'planet', ...]

In [5]:
#nltk library requires input in the form of dictionary where each list has the words:freq format with the label attached to each text document
documents=[]
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid),category))
documents
documents[0:3]

[(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg'),
 (['the', 'happy', 'bastard', "'", 's', 'quick', 'movie', ...], 'neg'),
 (['it', 'is', 'movies', 'like', 'these', 'that', 'make', ...], 'neg')]

In [6]:
#initially positive reviews are followed by negative reviews ..so before train test split  let's shuffle them
import random
random.shuffle(documents)
documents[0:8]

[(['i', 'have', 'a', 'soft', 'spot', 'in', 'my', 'heart', ...], 'pos'),
 (['it', "'", 's', 'always', 'a', 'bad', 'sign', 'when', ...], 'neg'),
 (['when', 'i', 'originally', 'saw', 'the', 'trailer', ...], 'neg'),
 (['don', "'", 't', 'let', 'this', 'movie', 'fool', ...], 'neg'),
 (['when', 'it', 'comes', 'to', 'the', 'average', ...], 'neg'),
 (['in', 'the', 'continuation', 'of', 'warner', ...], 'neg'),
 (['"', 'crazy', '/', 'beautiful', '"', 'suffers', ...], 'pos'),
 (['after', 'sixteen', 'years', 'francis', 'ford', ...], 'pos')]

# Removing stopwords and punctuations

In [7]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [8]:
from nltk.corpus import stopwords
import string
stop=set(stopwords.words('english'))
punc=string.punctuation
stop=list(stop)+list(punc)


In [9]:
from nltk.corpus.reader import wordnet

# Lemmatizing the document

In [10]:
def getpos(pos):
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    elif pos.startswith('V'):
        return wordnet.VERB
    else :
        return wordnet.NOUN

In [11]:
from nltk import pos_tag

In [12]:
def clean_words(document):
    output_words=[]
    for word in document:
        if word.lower() not in stop:
            pos=pos_tag([word]) 
            clean_word=lemmatizer.lemmatize(word,pos=getpos(pos[0][1]))
            output_words.append(clean_word)
    return output_words

In [13]:
documents=[(clean_words(document),category) for document,category in documents]
documents[0:3]

[(['soft',
   'spot',
   'heart',
   'pure',
   'amoral',
   'sleaze',
   'like',
   'showgirl',
   'say',
   'typically',
   'enjoy',
   'sit',
   'eszterhaz',
   'sex',
   'fill',
   'cinematic',
   'tabloid',
   'ish',
   'trash',
   'wild',
   'thing',
   'best',
   'sleazy',
   'film',
   'ever',
   'see',
   'fill',
   'great',
   'performance',
   'graphic',
   'sex',
   'three',
   'way',
   'sex',
   'rape',
   'charge',
   'double',
   'cross',
   'plot',
   'twist',
   'genitalia',
   'gender',
   'graphic',
   'violence',
   'death',
   'witty',
   'humor',
   'best',
   'denise',
   'richards',
   'criticize',
   'film',
   'disgust',
   'raucous',
   'plain',
   'wrong',
   'fundamentally',
   'pointless',
   'criticize',
   'citizen',
   'kane',
   'disturb',
   'titanic',
   'lofty',
   'epic',
   'wild',
   'thing',
   'film',
   'succeed',
   'endeavor',
   'admire',
   '--',
   'admire',
   'entertain',
   'fun',
   'time',
   'movie',
   'must',
   'admit',
   'fron

# Splitting the document

In [14]:
from sklearn.model_selection import train_test_split
training_document=documents[0:1500]
testing_document=documents[1500:]


In [15]:
all_words=[]
for doc in training_document:
    all_words.extend(doc[0])
all_words

['soft',
 'spot',
 'heart',
 'pure',
 'amoral',
 'sleaze',
 'like',
 'showgirl',
 'say',
 'typically',
 'enjoy',
 'sit',
 'eszterhaz',
 'sex',
 'fill',
 'cinematic',
 'tabloid',
 'ish',
 'trash',
 'wild',
 'thing',
 'best',
 'sleazy',
 'film',
 'ever',
 'see',
 'fill',
 'great',
 'performance',
 'graphic',
 'sex',
 'three',
 'way',
 'sex',
 'rape',
 'charge',
 'double',
 'cross',
 'plot',
 'twist',
 'genitalia',
 'gender',
 'graphic',
 'violence',
 'death',
 'witty',
 'humor',
 'best',
 'denise',
 'richards',
 'criticize',
 'film',
 'disgust',
 'raucous',
 'plain',
 'wrong',
 'fundamentally',
 'pointless',
 'criticize',
 'citizen',
 'kane',
 'disturb',
 'titanic',
 'lofty',
 'epic',
 'wild',
 'thing',
 'film',
 'succeed',
 'endeavor',
 'admire',
 '--',
 'admire',
 'entertain',
 'fun',
 'time',
 'movie',
 'must',
 'admit',
 'front',
 'however',
 'directly',
 'center',
 'age',
 'group',
 'film',
 'engineer',
 'appeal',
 'imagine',
 'show',
 'hard',
 'give',
 'plot',
 'summary',
 'without

In [16]:
from nltk import FreqDist
freq=FreqDist(all_words) #saare words ka count ke sath store
common=freq.most_common(2000) # top k words
features=[i[0] for i in common]
features

['film',
 'movie',
 'one',
 'make',
 'like',
 'character',
 'get',
 'see',
 'go',
 'time',
 'well',
 'scene',
 'even',
 'good',
 'story',
 'take',
 'would',
 'much',
 'also',
 'give',
 'come',
 'life',
 'way',
 'two',
 'bad',
 'look',
 'seem',
 'know',
 'first',
 'end',
 '--',
 'year',
 'work',
 'thing',
 'plot',
 'say',
 'really',
 'play',
 'little',
 'show',
 'people',
 'love',
 'could',
 'man',
 'star',
 'great',
 'performance',
 'never',
 'director',
 'best',
 'try',
 'new',
 'many',
 'big',
 'want',
 'actor',
 'action',
 'watch',
 'find',
 'u',
 'role',
 'think',
 'act',
 'another',
 'audience',
 'something',
 'back',
 'turn',
 'world',
 'still',
 'day',
 'set',
 'however',
 'old',
 'use',
 'guy',
 'cast',
 'begin',
 'interest',
 'every',
 'comedy',
 'enough',
 'part',
 'though',
 'feel',
 'last',
 'around',
 'right',
 'real',
 'run',
 'point',
 'may',
 'john',
 'woman',
 'write',
 'effect',
 'actually',
 'fact',
 'friend',
 'name',
 'almost',
 'young',
 'funny',
 'script',
 'noth

In [17]:
def get_feature_dict(words):
    word_set=set(words)
    current_feature={}
    for f in features:
        current_feature[f]= f in word_set
    return current_feature

In [18]:
get_feature_dict(training_document[0][0])

{'film': True,
 'movie': True,
 'one': True,
 'make': True,
 'like': True,
 'character': True,
 'get': False,
 'see': True,
 'go': False,
 'time': True,
 'well': False,
 'scene': True,
 'even': True,
 'good': True,
 'story': True,
 'take': True,
 'would': False,
 'much': True,
 'also': False,
 'give': True,
 'come': False,
 'life': True,
 'way': True,
 'two': True,
 'bad': False,
 'look': False,
 'seem': False,
 'know': True,
 'first': True,
 'end': False,
 '--': True,
 'year': False,
 'work': True,
 'thing': True,
 'plot': True,
 'say': True,
 'really': False,
 'play': False,
 'little': True,
 'show': True,
 'people': True,
 'love': True,
 'could': True,
 'man': False,
 'star': True,
 'great': True,
 'performance': True,
 'never': False,
 'director': True,
 'best': True,
 'try': False,
 'new': False,
 'many': False,
 'big': False,
 'want': True,
 'actor': True,
 'action': False,
 'watch': False,
 'find': True,
 'u': True,
 'role': False,
 'think': False,
 'act': True,
 'another': Fals

In [19]:
training_data= [(get_feature_dict(doc),category) for doc,category in training_document]

In [20]:
testing_data= [(get_feature_dict(doc),category) for doc,category in testing_document]

In [21]:
training_data[0]

({'film': True,
  'movie': True,
  'one': True,
  'make': True,
  'like': True,
  'character': True,
  'get': False,
  'see': True,
  'go': False,
  'time': True,
  'well': False,
  'scene': True,
  'even': True,
  'good': True,
  'story': True,
  'take': True,
  'would': False,
  'much': True,
  'also': False,
  'give': True,
  'come': False,
  'life': True,
  'way': True,
  'two': True,
  'bad': False,
  'look': False,
  'seem': False,
  'know': True,
  'first': True,
  'end': False,
  '--': True,
  'year': False,
  'work': True,
  'thing': True,
  'plot': True,
  'say': True,
  'really': False,
  'play': False,
  'little': True,
  'show': True,
  'people': True,
  'love': True,
  'could': True,
  'man': False,
  'star': True,
  'great': True,
  'performance': True,
  'never': False,
  'director': True,
  'best': True,
  'try': False,
  'new': False,
  'many': False,
  'big': False,
  'want': True,
  'actor': True,
  'action': False,
  'watch': False,
  'find': True,
  'u': True,
  '

# Usin nltk Naive Bayes Classifier

In [22]:
import nltk
from nltk import NaiveBayesClassifier
clf=NaiveBayesClassifier.train(training_data)
nltk.classify.accuracy(clf,testing_data)

0.81

# Accuracy of 0.81 is achieved on the testing dataset

In [23]:
clf.show_most_informative_features(10)

Most Informative Features
                  seagal = True              neg : pos    =     11.9 : 1.0
                   damon = True              pos : neg    =     10.0 : 1.0
             outstanding = True              pos : neg    =      8.5 : 1.0
                  prinze = True              neg : pos    =      6.7 : 1.0
             wonderfully = True              pos : neg    =      6.1 : 1.0
              schumacher = True              neg : pos    =      5.9 : 1.0
               fantastic = True              pos : neg    =      5.8 : 1.0
                    lame = True              neg : pos    =      5.6 : 1.0
                   awful = True              neg : pos    =      5.4 : 1.0
              ridiculous = True              neg : pos    =      5.4 : 1.0


# SVM Classifier of sklearn

In [24]:
from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
svc=SVC()
classifier_sklearn=SklearnClassifier(svc)

In [25]:
classifier_sklearn.train(training_data)
nltk.classify.accuracy(classifier_sklearn,testing_data)

0.812

# Accuracy of 0.812 by svm classifier of sklearn