# Illustrate Text Classification

In [1]:
# Libraries
import nltk

In [2]:
# Get Data from NLTK Corpus
nltk. download ('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\diegoortizmatajira\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [3]:
from nltk.corpus import movie_reviews

# Dataset

In [4]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [5]:
len(movie_reviews.words())

1583820

In [6]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [7]:
movie_reviews.fileids()[-6:-1]

['pos/cv994_12270.txt',
 'pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt']

# Text Preprocessing

In [8]:
reviews = ' '.join(movie_reviews.words())
reviews[:300]

'plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what \' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the '

## remove punctuation

In [9]:
import string

In [10]:
filtered_reviews = reviews.translate(str.maketrans('', '', string.punctuation))
filtered_reviews[:300]

'plot  two teen couples go to a church party  drink and then drive  they get into an accident  one of the guys dies  but his girlfriend continues to see him in her life  and has nightmares  what  s the deal  watch the movie and  sorta  find out    critique  a mind  fuck movie for the teen generation '

## Make lower and Tokenize

In [11]:
from nltk.tokenize import word_tokenize

In [12]:
tokens = word_tokenize(filtered_reviews.lower())
tokens[:10]

['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink']

## Stopwords removal

In [13]:
from nltk.corpus import stopwords

In [14]:
stopwords_eng = stopwords.words('English')
filtered_tokens = [word for word in tokens if word not in stopwords_eng]
filtered_tokens [:30]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation']

## Frequency of words

In [15]:
counter_dict = nltk.FreqDist(filtered_tokens)
counter_dict.most_common(10)

[('film', 9519),
 ('one', 5853),
 ('movie', 5774),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2170),
 ('would', 2110),
 ('much', 2050)]

## Create document with Categories

In [16]:
docs = [(list (movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]


In [28]:
# docs[1989:1990]

## Feature Extraction

In [30]:
word_features = [word[0] for word in counter_dict.most_common(3000) ]
word_features[:10]

['film',
 'one',
 'movie',
 'like',
 'even',
 'good',
 'time',
 'story',
 'would',
 'much']

In [31]:
def search_features (doc) :
    words = set(doc)
    features = {}
    for w in word_features:
        features [w] = (w in words)
    return features

In [32]:
len(docs), docs[0][1], docs[0] [0]

(2000,
 'neg',
 ['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'g

In [33]:
search_features(docs[0][0])

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': True,
 'good': True,
 'time': False,
 'story': False,
 'would': True,
 'much': False,
 'character': True,
 'also': True,
 'get': True,
 'two': True,
 'well': True,
 'characters': True,
 'first': False,
 'see': True,
 'way': True,
 'make': True,
 'life': True,
 'really': True,
 'films': True,
 'plot': True,
 'little': True,
 'people': True,
 'could': False,
 'scene': False,
 'bad': True,
 'man': False,
 'never': False,
 'best': False,
 'new': True,
 'scenes': True,
 'many': False,
 'director': True,
 'know': True,
 'movies': True,
 'action': False,
 'great': False,
 'another': False,
 'love': False,
 'go': True,
 'made': False,
 'us': True,
 'big': True,
 'end': False,
 'something': False,
 'back': True,
 'still': True,
 'world': True,
 'seems': True,
 'work': False,
 'makes': True,
 'however': False,
 'every': True,
 'though': False,
 'better': False,
 'real': False,
 'audience': True,
 'enough': False,
 'seen': False

In [34]:
featureSet = [(search_features(doc), category) for (doc, category) in docs]
featureSet[0]

({'film': True,
  'one': True,
  'movie': True,
  'like': True,
  'even': True,
  'good': True,
  'time': False,
  'story': False,
  'would': True,
  'much': False,
  'character': True,
  'also': True,
  'get': True,
  'two': True,
  'well': True,
  'characters': True,
  'first': False,
  'see': True,
  'way': True,
  'make': True,
  'life': True,
  'really': True,
  'films': True,
  'plot': True,
  'little': True,
  'people': True,
  'could': False,
  'scene': False,
  'bad': True,
  'man': False,
  'never': False,
  'best': False,
  'new': True,
  'scenes': True,
  'many': False,
  'director': True,
  'know': True,
  'movies': True,
  'action': False,
  'great': False,
  'another': False,
  'love': False,
  'go': True,
  'made': False,
  'us': True,
  'big': True,
  'end': False,
  'something': False,
  'back': True,
  'still': True,
  'world': True,
  'seems': True,
  'work': False,
  'makes': True,
  'however': False,
  'every': True,
  'though': False,
  'better': False,
  'real':

## Training and Test Split

In [35]:
len(featureSet)

2000

In [36]:
X_train = featureSet [:1600]
X_test = featureSet [1600:]

## Classification

In [37]:
classifier = nltk.NaiveBayesClassifier.train(X_train)
classifier

<nltk.classify.naivebayes.NaiveBayesClassifier at 0x20478461610>

In [38]:
classifier.show_most_informative_features(10)

Most Informative Features
               ludicrous = True              neg : pos    =     15.4 : 1.0
             outstanding = True              pos : neg    =     10.8 : 1.0
                   mulan = True              pos : neg    =     10.5 : 1.0
                   inept = True              neg : pos    =     10.0 : 1.0
              whatsoever = True              neg : pos    =      8.8 : 1.0
                  seagal = True              neg : pos    =      8.2 : 1.0
                 idiotic = True              neg : pos    =      8.0 : 1.0
                   damon = True              pos : neg    =      8.0 : 1.0
                  finest = True              pos : neg    =      7.4 : 1.0
                 freddie = True              neg : pos    =      7.4 : 1.0


## Evaluate the model

In [39]:
model_accuracy = nltk.classify.accuracy(classifier, X_test)
model_accuracy * 100

73.75