In [35]:
import sklearn
import nltk
from sklearn.datasets import load_files
from nltk.corpus import movie_reviews


In [36]:
# Locate the movie_reviews corpus through NLTK's data search path rather than a
# machine-specific absolute path, so the notebook runs wherever the corpus has
# been installed (e.g. with nltk.download('movie_reviews')).
# nltk.data.find raises LookupError, with download instructions, if it is missing.
# NOTE(review): assumes the corpus is unpacked on disk (the downloader's default),
# since load_files needs a real directory -- confirm for zip-only installs.
moviedir = str(nltk.data.find('corpora/movie_reviews'))


In [37]:
# Read every review file found under moviedir into a Bunch (data, target,
# target_names), shuffling the document order, then show how many were loaded.
movie_train = load_files(container_path=moviedir, shuffle=True)
len(movie_train.data)


2000

In [38]:
# Target names ("classes") are generated automatically from the subfolder names
# of the corpus directory; a name's position in this list is its integer label.
movie_train.target_names


['neg', 'pos']

In [39]:
# Peek at the first 500 bytes of the first review (documents are loaded as raw bytes)
movie_train.data[0][:500]


b"arnold schwarzenegger has been an icon for action enthusiasts , since the late 80's , but lately his films have been very sloppy and the one-liners are getting worse . \nit's hard seeing arnold as mr . freeze in batman and robin , especially when he says tons of ice jokes , but hey he got 15 million , what's it matter to him ? \nonce again arnold has signed to do another expensive blockbuster , that can't compare with the likes of the terminator series , true lies and even eraser . \nin this so cal"

### Detour: CountVectorizer and TF-IDF on toy sentences

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
# Turn off pretty printing of jupyter notebook... it generates long lines.
# The `%pprint` magic only toggles the current state, so re-running the cell
# would switch pretty printing back ON; set the plain-text formatter flag
# explicitly so the cell is idempotent.
get_ipython().display_formatter.formatters['text/plain'].pprint = False


Pretty printing has been turned ON


In [41]:
# Three toy "documents" used to demonstrate the vectorizers below
sents = [
    'A rose is a rose is a rose is a rose.',
    'Oh, what a fine day it is.',
    "It ain't over till it's over, I tell you!!",
]

In [42]:
# Count vectorizer that tokenizes with NLTK's word_tokenize instead of the
# vectorizer's built-in token pattern; min_df=1 keeps any token that appears
# in at least one document.
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)


In [43]:
# Learn the vocabulary from sents and turn them into a sparse matrix of word
# frequency counts (one row per sentence).
sents_counts = foovec.fit_transform(sents)
# After fitting, foovec holds the vocabulary dictionary, which maps each unique
# token to its column index in sents_counts.
foovec.vocabulary_

{'a': 4,
 'rose': 14,
 'is': 9,
 '.': 3,
 'oh': 12,
 ',': 2,
 'what': 17,
 'fine': 7,
 'day': 6,
 'it': 10,
 'ai': 5,
 "n't": 11,
 'over': 13,
 'till': 16,
 "'s": 1,
 'i': 8,
 'tell': 15,
 'you': 18,
 '!': 0}

In [44]:
# sents_counts has shape (document count, number of unique tokens): 3 by 19 here
sents_counts.shape

(3, 19)

In [45]:
# Expand the sparse count matrix into a dense array for inspection; column j
# holds the count of the token whose vocabulary_ index is j.
sents_counts.toarray()


array([[0, 0, 0, 1, 4, 0, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0, 0],
       [0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0],
       [2, 1, 1, 0, 0, 1, 0, 0, 1, 0, 2, 1, 0, 2, 0, 1, 1, 0, 1]],
      dtype=int64)

In [46]:
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values.
# The transformer learns the IDF weights from sents_counts and applies them in one step.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
sents_tfidf = tfidf_transformer.fit_transform(sents_counts)

In [47]:
# Dense view of the TF-IDF values for the toy sentences.
# Each row is scaled to unit Euclidean length (TfidfTransformer's default norm='l2'),
# and terms that are found across many docs are weighted down by the IDF factor.
sents_tfidf.toarray()


array([[0.        , 0.        , 0.        , 0.13650997, 0.54603988,
        0.        , 0.        , 0.        , 0.        , 0.40952991,
        0.        , 0.        , 0.        , 0.        , 0.71797683,
        0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.28969526, 0.28969526, 0.28969526,
        0.        , 0.38091445, 0.38091445, 0.        , 0.28969526,
        0.28969526, 0.        , 0.38091445, 0.        , 0.        ,
        0.        , 0.        , 0.38091445, 0.        ],
       [0.47282517, 0.23641258, 0.17979786, 0.        , 0.        ,
        0.23641258, 0.        , 0.        , 0.23641258, 0.        ,
        0.35959573, 0.23641258, 0.        , 0.47282517, 0.        ,
        0.23641258, 0.23641258, 0.        , 0.23641258]])

In [48]:
# Build the vectorizer for the real corpus and encode every review as a row of
# token counts. min_df=2 drops tokens that occur in fewer than two documents.
# Recorded results: the full ~25K-token vocabulary gave 82.2% accuracy; adding
# max_features=3000 (top 3000 words only) gave 78.5%.
movie_vec = CountVectorizer(
    min_df=2,
    tokenizer=nltk.word_tokenize,
)
movie_counts = movie_vec.fit_transform(movie_train.data)

In [49]:
# Column index assigned to the token 'screen' (None if it is not in the vocabulary)
movie_vec.vocabulary_.get('screen')


19604

In [50]:
# (number of reviews, vocabulary size) of the sparse count matrix
movie_counts.shape


(2000, 25280)

In [51]:
# Fit a fresh TF-IDF transformer on the movie counts. This rebinds
# tfidf_transformer, replacing the instance that was fit on the toy sentences.
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit_transform(movie_counts)

In [52]:
# TF-IDF re-weights the counts but keeps the same (reviews, vocabulary) shape
movie_tfidf.shape


(2000, 25280)

In [53]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; random_state pins the split so
# the reported scores are reproducible.
# NOTE(review): movie_vec and tfidf_transformer were fit on all 2000 reviews
# before this split, so vocabulary and IDF statistics include the test
# documents. Consider splitting the raw texts first and fitting on train only.
split_parts = train_test_split(
    movie_tfidf,
    movie_train.target,
    test_size=0.20,
    random_state=12,
)
docs_train, docs_test, y_train, y_test = split_parts

In [54]:
# Train a Multinomial Naive Bayes classifier on the TF-IDF features of the training split
clf = MultinomialNB().fit(docs_train, y_train)

In [55]:
# Predict labels for the held-out reviews and report standard binary metrics.
# The metric functions are imported explicitly: a bare `import sklearn` does not
# guarantee that the `sklearn.metrics` submodule is loaded as an attribute, so
# `sklearn.metrics.<fn>` only works when another import happens to load it first.
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

y_pred = clf.predict(docs_test)
# Precision, recall and F1 use the default positive label 1, i.e. the 'pos' class.
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('Precision: {}'.format(precision_score(y_test, y_pred)))
print('Recall: {}'.format(recall_score(y_test, y_pred)))
print('F1: {}'.format(f1_score(y_test, y_pred)))

Accuracy: 0.82
Precision: 0.8315217391304348
Recall: 0.788659793814433
F1: 0.8095238095238095


In [56]:
# Build the confusion matrix for the test split: rows are the true labels and
# columns are the predicted labels, in label order (0 = 'neg', 1 = 'pos').
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[175,  31],
       [ 41, 153]], dtype=int64)