In [1]:
#Get the movie sentiment corpus data
import os
import numpy as np
from collections import Counter

# Vocabulary size: only this many of the most frequent words are kept.
words_to_select = 5000
# Number of examples expected in the input corpus.
total_examples = 2000
# Root folder of the corpus; change this to wherever your files are stored.
corpus_path = './corpus/'
# One sub-folder per sentiment class, read in this order.
sub_directories = ['pos', 'neg']

def get_data(root=None, subdirs=None, top_n=None):
    """Scan the corpus and build the vocabulary of most frequent words.

    Every file under ``root/<subdir>`` is read and split on whitespace.
    Tokens that are not purely alphabetic, or that are a single character
    long, are discarded before the ranking is computed.

    Parameters
    ----------
    root : str, optional
        Corpus root folder. Defaults to the notebook-level ``corpus_path``.
    subdirs : sequence of str, optional
        Class sub-folders to read. Defaults to ``sub_directories``. Files in
        a folder named ``'pos'`` are counted as positive examples; files in
        any other folder are counted as negative examples.
    top_n : int, optional
        Number of most frequent words to keep. Defaults to
        ``words_to_select``.

    Returns
    -------
    list
        ``[positive_ex, negative_ex, dictionary]`` where the first two items
        are file counts and ``dictionary`` is a list of ``(word, count)``
        pairs ordered from most to least frequent.
    """
    # Fall back to the notebook configuration when no override is given, so
    # a bare get_data() call behaves as before.
    if root is None:
        root = corpus_path
    if subdirs is None:
        subdirs = sub_directories
    if top_n is None:
        top_n = words_to_select

    positive_ex = 0
    negative_ex = 0
    word_counts = Counter()
    for subdir in subdirs:
        sentiment = os.path.join(root, subdir)
        # os.listdir returns names in arbitrary order; sorting keeps the
        # tie-breaking between equally frequent words reproducible.
        files = [os.path.join(sentiment, f) for f in sorted(os.listdir(sentiment))]
        if subdir == 'pos':
            positive_ex += len(files)
        else:
            negative_ex += len(files)
        for path in files:
            # The context manager closes each file as soon as it is read.
            with open(path, 'r') as handle:
                word_counts.update(handle.read().split())

    # Keep only alphabetic words longer than one character.
    kept = Counter({
        word: count
        for word, count in word_counts.items()
        if word.isalpha() and len(word) > 1
    })
    return [positive_ex, negative_ex, kept.most_common(top_n)]

In [2]:
def extract_features_frequency(dictionary, root=None, subdirs=None, n_features=None):
    """Build the document-by-word count matrix for the corpus.

    Row ``r`` corresponds to the ``r``-th file encountered when walking
    ``subdirs`` in order (file names sorted within each folder). Column ``c``
    holds how many times the word ``dictionary[c][0]`` occurs in that file.

    Parameters
    ----------
    dictionary : sequence
        Vocabulary entries whose first element is the word, e.g. the
        ``(word, count)`` pairs returned by ``Counter.most_common``.
    root : str, optional
        Corpus root folder. Defaults to the notebook-level ``corpus_path``.
    subdirs : sequence of str, optional
        Class sub-folders to read. Defaults to ``sub_directories``.
    n_features : int, optional
        Number of columns in the matrix. Defaults to ``words_to_select``.
        ``dictionary`` must not contain more entries than this.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of files found, n_features)``.
    """
    # Fall back to the notebook configuration when no override is given.
    if root is None:
        root = corpus_path
    if subdirs is None:
        subdirs = sub_directories
    if n_features is None:
        n_features = words_to_select

    # Word -> column(s) lookup built once, replacing a full vocabulary scan
    # for every token of every document. A list of columns is kept so a word
    # that appears more than once in `dictionary` still fills each column.
    columns_of = {}
    for col, entry in enumerate(dictionary):
        columns_of.setdefault(entry[0], []).append(col)

    # Collect the files first so the matrix is sized from the documents
    # actually present instead of a hard-coded example count.
    paths = []
    for subdir in subdirs:
        sentiment = os.path.join(root, subdir)
        # Sorted for a reproducible row order (os.listdir order is arbitrary).
        paths.extend(os.path.join(sentiment, f) for f in sorted(os.listdir(sentiment)))

    features_matrix = np.zeros((len(paths), n_features))
    for docId, path in enumerate(paths):
        # The context manager closes each file as soon as it is read.
        with open(path, 'r') as handle:
            doc_counts = Counter(handle.read().split())
        for word, count in doc_counts.items():
            for col in columns_of.get(word, ()):
                features_matrix[docId, col] = count
    return features_matrix

In [3]:
# Scan the corpus for class sizes and vocabulary, then build the count matrix.
positive_ex, negative_ex, dictionary = get_data()
word_features_matrix = extract_features_frequency(dictionary)

In [6]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold

# Corpus size derived from the file counts returned by get_data().
total_examples = positive_ex + negative_ex
total_splits = 10

# The feature matrix is filled in sub_directories order ('pos' then 'neg'),
# so the first positive_ex rows are positive reviews.
# Label convention used here: 0 = positive, 1 = negative.
labels = np.zeros(total_examples)
labels[positive_ex:] = 1

# Fail early with a readable message if features and labels are out of sync.
if word_features_matrix.shape[0] != total_examples:
    raise ValueError(
        'Feature matrix has %d rows but %d labelled examples were found'
        % (word_features_matrix.shape[0], total_examples)
    )

folds = StratifiedKFold(n_splits=total_splits)

model = MultinomialNB()
# Each accumulator is [correct predictions, total predictions], pooled over
# all folds.
test_accuracy_h = [0.0, 0.0]
train_accuracy_h = [0.0, 0.0]
for train_indices, test_indices in folds.split(word_features_matrix, labels):

    # Integer-array indexing selects the fold rows in one step instead of
    # building a Python list of row arrays.
    X_train = word_features_matrix[train_indices]
    X_test = word_features_matrix[test_indices]
    Y_train = labels[train_indices]
    Y_test = labels[test_indices]

    # fit() re-estimates the model from scratch on every fold.
    model.fit(X_train, Y_train)
    train_result = model.predict(X_train)
    test_result = model.predict(X_test)

    train_accuracy_h[0] += np.count_nonzero(train_result == Y_train)
    test_accuracy_h[0] += np.count_nonzero(test_result == Y_test)
    train_accuracy_h[1] += len(train_result)
    test_accuracy_h[1] += len(test_result)

train_acc = (train_accuracy_h[0] * 100) / train_accuracy_h[1]
test_acc = (test_accuracy_h[0] * 100) / test_accuracy_h[1]
print('Train accuracy is ', round(train_acc, 2 ), '%')
print('Test accuracy is ', round(test_acc, 2 ), '%')

Train accuracy is  90.43 %
Test accuracy is  82.1 %
