# Illustrate Text Classification

In [22]:
# Libraries
import nltk

In [23]:
# Get Data from NLTK Corpus
nltk. download ('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\diegoortizmatajira\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [4]:
from nltk.corpus import movie_reviews

# Dataset

In [6]:
movie_reviews.words()

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [8]:
len(movie_reviews.words())

1583820

In [10]:
movie_reviews.fileids()[:5]

['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt',
 'neg/cv004_12641.txt']

In [11]:
movie_reviews.fileids()[-6:-1]

['pos/cv994_12270.txt',
 'pos/cv995_21821.txt',
 'pos/cv996_11592.txt',
 'pos/cv997_5046.txt',
 'pos/cv998_14111.txt']

# Text Preprocessing

In [13]:
reviews = ' '.join(movie_reviews.words())
reviews[:300]

'plot : two teen couples go to a church party , drink and then drive . they get into an accident . one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . what \' s the deal ? watch the movie and " sorta " find out . . . critique : a mind - fuck movie for the '

## remove punctuation

In [14]:
import string

In [17]:
filtered_reviews = reviews.translate(str.maketrans('', '', string.punctuation))
filtered_reviews[:300]

'plot  two teen couples go to a church party  drink and then drive  they get into an accident  one of the guys dies  but his girlfriend continues to see him in her life  and has nightmares  what  s the deal  watch the movie and  sorta  find out    critique  a mind  fuck movie for the teen generation '

## Make lower and Tokenize

In [19]:
from nltk.tokenize import word_tokenize

In [20]:
tokens = word_tokenize(filtered_reviews.lower())
tokens[:10]

['plot', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', 'drink']

## Stopwords removal

In [24]:
from nltk.corpus import stopwords

In [25]:
stopwords_eng = stopwords.words('English')
filtered_tokens = [word for word in tokens if word not in stopwords_eng]
filtered_tokens [:30]

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation']

## Frequency of words

In [26]:
counter_dict = nltk.FreqDist(filtered_tokens)
counter_dict.most_common(10)

[('film', 9519),
 ('one', 5853),
 ('movie', 5774),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2170),
 ('would', 2110),
 ('much', 2050)]

## Create document with Categories

In [35]:
docs = [(list (movie_reviews.words(fileid)), category)
for category in movie_reviews.categories()
for fileid in movie_reviews.fileids(category)]


In [39]:
docs[1989:1990]

[(['lisa',
   'cholodenko',
   "'",
   's',
   '"',
   'high',
   'art',
   ',',
   '"',
   'is',
   'an',
   'intelligent',
   ',',
   'quiet',
   'drama',
   '.',
   'its',
   'strongest',
   'quality',
   ',',
   'aside',
   'from',
   'the',
   'top',
   '-',
   'notch',
   'central',
   'performances',
   ',',
   'is',
   'the',
   'perceptive',
   'way',
   'in',
   'which',
   'the',
   'film',
   ',',
   'also',
   'written',
   'by',
   'cholodenko',
   ',',
   'observes',
   'its',
   'characters',
   '.',
   'they',
   'are',
   'all',
   'flawed',
   'people',
   ',',
   'some',
   'more',
   'troubled',
   'than',
   'others',
   ',',
   'but',
   'they',
   'are',
   'not',
   'judged',
   '.',
   'judging',
   'the',
   'characters',
   'in',
   'this',
   'picture',
   'would',
   'be',
   'a',
   'creative',
   'misstep',
   'on',
   'the',
   'filmmakers',
   "'",
   'parts',
   ',',
   'because',
   'no',
   'one',
   ',',
   'no',
   'matter',
   'how',
   'bad',
  

## Feature Extraction

In [32]:
word_features = [word for word in counter_dict.most_common(3000) ]
word_features[:10]

[('film', 9519),
 ('one', 5853),
 ('movie', 5774),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2170),
 ('would', 2110),
 ('much', 2050)]

In [31]:
def search_features (doc) :
    words = set(doc)
    features = {}
    for w in word_features:
        features [w] = (w in words)
    return features

In [41]:
len(docs), docs[0][1], docs[0] [0]

(2000,
 'neg',
 ['plot',
  ':',
  'two',
  'teen',
  'couples',
  'go',
  'to',
  'a',
  'church',
  'party',
  ',',
  'drink',
  'and',
  'then',
  'drive',
  '.',
  'they',
  'get',
  'into',
  'an',
  'accident',
  '.',
  'one',
  'of',
  'the',
  'guys',
  'dies',
  ',',
  'but',
  'his',
  'girlfriend',
  'continues',
  'to',
  'see',
  'him',
  'in',
  'her',
  'life',
  ',',
  'and',
  'has',
  'nightmares',
  '.',
  'what',
  "'",
  's',
  'the',
  'deal',
  '?',
  'watch',
  'the',
  'movie',
  'and',
  '"',
  'sorta',
  '"',
  'find',
  'out',
  '.',
  '.',
  '.',
  'critique',
  ':',
  'a',
  'mind',
  '-',
  'fuck',
  'movie',
  'for',
  'the',
  'teen',
  'generation',
  'that',
  'touches',
  'on',
  'a',
  'very',
  'cool',
  'idea',
  ',',
  'but',
  'presents',
  'it',
  'in',
  'a',
  'very',
  'bad',
  'package',
  '.',
  'which',
  'is',
  'what',
  'makes',
  'this',
  'review',
  'an',
  'even',
  'harder',
  'one',
  'to',
  'write',
  ',',
  'since',
  'i',
  'g

In [42]:
search_features(docs[0][0])

{('film', 9519): False,
 ('one', 5853): False,
 ('movie', 5774): False,
 ('like', 3690): False,
 ('even', 2565): False,
 ('good', 2411): False,
 ('time', 2411): False,
 ('story', 2170): False,
 ('would', 2110): False,
 ('much', 2050): False,
 ('character', 2020): False,
 ('also', 1967): False,
 ('get', 1949): False,
 ('two', 1912): False,
 ('well', 1906): False,
 ('characters', 1859): False,
 ('first', 1836): False,
 ('see', 1749): False,
 ('way', 1693): False,
 ('make', 1642): False,
 ('life', 1589): False,
 ('really', 1565): False,
 ('films', 1536): False,
 ('plot', 1513): False,
 ('little', 1502): False,
 ('people', 1456): False,
 ('could', 1427): False,
 ('scene', 1397): False,
 ('bad', 1396): False,
 ('man', 1396): False,
 ('never', 1375): False,
 ('best', 1333): False,
 ('new', 1292): False,
 ('scenes', 1274): False,
 ('many', 1269): False,
 ('director', 1237): False,
 ('know', 1217): False,
 ('movies', 1206): False,
 ('action', 1172): False,
 ('great', 1149): False,
 ('another',

In [45]:
featureSet = [(search_features(doc), category) for (doc, category) in docs]
featureSet[0]

({('film', 9519): False,
  ('one', 5853): False,
  ('movie', 5774): False,
  ('like', 3690): False,
  ('even', 2565): False,
  ('good', 2411): False,
  ('time', 2411): False,
  ('story', 2170): False,
  ('would', 2110): False,
  ('much', 2050): False,
  ('character', 2020): False,
  ('also', 1967): False,
  ('get', 1949): False,
  ('two', 1912): False,
  ('well', 1906): False,
  ('characters', 1859): False,
  ('first', 1836): False,
  ('see', 1749): False,
  ('way', 1693): False,
  ('make', 1642): False,
  ('life', 1589): False,
  ('really', 1565): False,
  ('films', 1536): False,
  ('plot', 1513): False,
  ('little', 1502): False,
  ('people', 1456): False,
  ('could', 1427): False,
  ('scene', 1397): False,
  ('bad', 1396): False,
  ('man', 1396): False,
  ('never', 1375): False,
  ('best', 1333): False,
  ('new', 1292): False,
  ('scenes', 1274): False,
  ('many', 1269): False,
  ('director', 1237): False,
  ('know', 1217): False,
  ('movies', 1206): False,
  ('action', 1172): False