# Part 2: Train My Own Machine Learning Model
#### train a simple ML model to analyze movie reviews

In [3]:
#download corpus
from nltk import download

download('movie_reviews')
download('punkt') # for tokenization
download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [4]:
# cleanup data
from nltk.corpus import stopwords
from string import punctuation

stopwords_eng = stopwords.words('english')

# remove stopwords and punctuation
def extract_features(words):
    return [w for w in words if w not in stopwords_eng and w not in punctuation]

# pair each word with # of its occurence
def bag_of_words(words):
    bag = {}
    for w in words:
        bag[w] = bag.get(w,0)+1
    return bag

In [5]:
# apply cleaning procedures to pre-tokenized reviews
from nltk.corpus import movie_reviews

reviews_pos = []
reviews_neg = []

for fileid in movie_reviews.fileids('pos'):
    words = extract_features(movie_reviews.words(fileid))
    reviews_pos.append((bag_of_words(words), 'pos'))
    
for fileid in movie_reviews.fileids('neg'):
    words = extract_features(movie_reviews.words(fileid))
    reviews_neg.append((bag_of_words(words), 'neg'))

In [6]:
# split data into train (80%) and test (20%)
from random import seed, shuffle 

# set seed
seed(0) 

# shuffle reviews
shuffle(reviews_pos)
shuffle(reviews_neg)

# split into train and test
split_pct = .80 # 80% to train

def split_set(review_set):
    split = int(len(review_set)*split_pct)
    return (review_set[:split], review_set[split:])

pos_train, pos_test = split_set(reviews_pos)
neg_train, neg_test = split_set(reviews_neg)

train_set = pos_train + neg_train
test_set  = pos_test + neg_test

In [7]:
# train model using NaiveBayes
from nltk.classify import NaiveBayesClassifier

# each entry of train_set has X (bag of words) and Y ('pos' or 'neg')
model = NaiveBayesClassifier.train(train_set)

In [8]:
# check model accuracy
from nltk.classify.util import accuracy

print('accuracy: {}% correct'.format(100*accuracy(model, test_set)))

accuracy: 70.75% correct


In [9]:
# save the model
import pickle

model_file = open('sa_classifier.pickle','wb')
pickle.dump(model, model_file)
model_file.close()