In [5]:
import os, string
from nltk.corpus import stopwords
from collections import Counter
# specify directory to load
directory = '/home/chandresh/ckm/data/movies review/review_polarity/txt_sentoken/'
vocab_dict ='movie_dict.txt'
outfile    = 'processed_review.txt'
url ="http://www.cs.cornell.edu/people/pabo/movie-review-data/review_polarity.tar.gz"

In [7]:
with open(directory+'pos/'+'cv000_29590.txt','r') as fid:
    print(fid.readlines())
    

["films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . \n", "for starters , it was created by alan moore ( and eddie campbell ) , who brought the medium to a whole new level in the mid '80s with a 12-part series called the watchmen . \n", 'to say moore and campbell thoroughly researched the subject of jack the ripper would be like saying michael jackson is starting to look a little odd . \n', 'the book ( or " graphic novel , " if you will ) is over 500 pages long and includes nearly 30 more that consist of nothing but footnotes . \n', "in other words , don't dismiss this film because of its source . \n", "if you can get past the whole comic book thing , you might find another stumbling block in from hell's directors , albert and allen hughes . \n", "getting the hughes brothers

In [2]:
"""
Step 1. Data loading
"""
data=[]
def process_docs(directory):
    for filename in os.listdir(directory):
		# skip files that do not have the right extension
        if not filename.endswith(".txt"):
            continue		
		# create the full path of the file to open
        path = directory + '/' + filename
		# load document
        with open(path,'r') as fid:
           data.append(fid.read())
        #print("loaded:", filename)          
    print("processed ",len(data), " docs")

process_docs(directory+'pos')
process_docs(directory+'neg')

processed  1000  docs
processed  2000  docs


In [3]:
"""
step 2. data cleaning

Remove punctuation from words (e.g. ‘what’s’).
Removing tokens that are just punctuation (e.g. ‘-‘).
Removing tokens that contain numbers (e.g. ’10/10′).
Remove tokens that have one character (e.g. ‘a’).
Remove tokens that don’t have much meaning (e.g. ‘and’)

Some ideas:

We can filter out punctuation from tokens using the string translate() function.
We can remove tokens that are just punctuation or contain numbers by using an isalpha() check on each token.
We can remove English stop words using the list loaded using NLTK.
We can filter out short tokens by checking their length.
"""
def clean_doc(data):
    # split into tokens by white space
    for i,doc in enumerate(data):
        tokens = doc.split()
        # remove punctuation from each token
        table = str.maketrans('', '', string.punctuation)
        tokens = [w.translate(table) for w in tokens]
        # remove remaining tokens that are not alphabetic
        tokens = [word for word in tokens if word.isalpha()]
        # filter out stop words
        stop_words = set(stopwords.words('english'))
        tokens = [w for w in tokens if not w in stop_words]
        # filter out short tokens
        tokens = [word for word in tokens if len(word) > 1]
        #print(tokens)
        data[i]=tokens

# clean the doc
clean_doc(data)


In [4]:
#print cleand doc
print(data[0])

['rated', 'strong', 'language', 'sexual', 'dialogue', 'drug', 'use', 'crude', 'humor', 'violence', 'brief', 'nudity', 'starring', 'ben', 'affleck', 'matt', 'damon', 'linda', 'fiorentino', 'salma', 'hayek', 'alan', 'rickman', 'chris', 'rock', 'kevin', 'smith', 'jason', 'mewes', 'jason', 'lee', 'george', 'carlin', 'alanis', 'morissette', 'running', 'time', 'minutes', 'huge', 'fan', 'kevin', 'smith', 'expecting', 'lot', 'newest', 'project', 'dogma', 'might', 'kevins', 'best', 'work', 'date', 'funny', 'smart', 'foulmouthed', 'dialogue', 'unexpectedly', 'serious', 'undertone', 'besides', 'going', 'god', 'jesus', 'christ', 'actually', 'tries', 'tell', 'people', 'thing', 'rather', 'isnt', 'little', 'movie', 'premiered', 'sundance', 'little', 'film', 'called', 'clerks', 'little', 'film', 'went', 'become', 'huge', 'video', 'sensation', 'really', 'cant', 'meet', 'person', 'green', 'earth', 'seen', 'clerks', 'hailed', 'critics', 'one', 'best', 'movies', 'year', 'soon', 'sequel', 'coming', 'kevin'

In [8]:
"""
Step 3. Build the vocabulary
"""
 # define vocab
vocab = Counter()

for doc in data:
    vocab.update(doc)
# print lenngth of the vocab
print(len(vocab))    
# print the top most_common words
print(vocab.most_common(50))
# print lenngth of the vocab
print(len(vocab))  
# keep vocab with words whose freq is at least 5
min_freq = 5
vocab = [w for w, c in vocab.items() if c>=min_freq]  

# save the vocab
with open(directory+vocab_dict,'w') as fid:
    lines ="\n".join(vocab)
    fid.write(lines)

46557
[('film', 8860), ('one', 5521), ('movie', 5440), ('like', 3553), ('even', 2555), ('good', 2320), ('time', 2283), ('story', 2118), ('films', 2102), ('would', 2042), ('much', 2024), ('also', 1965), ('characters', 1947), ('get', 1921), ('character', 1906), ('two', 1825), ('first', 1768), ('see', 1730), ('well', 1694), ('way', 1668), ('make', 1590), ('really', 1563), ('little', 1491), ('life', 1472), ('plot', 1451), ('people', 1420), ('movies', 1416), ('could', 1395), ('bad', 1374), ('scene', 1373), ('never', 1364), ('best', 1301), ('new', 1277), ('many', 1268), ('doesnt', 1267), ('man', 1266), ('scenes', 1265), ('dont', 1210), ('know', 1207), ('hes', 1150), ('great', 1141), ('another', 1111), ('love', 1089), ('action', 1078), ('go', 1075), ('us', 1065), ('director', 1056), ('something', 1048), ('end', 1047), ('still', 1038)]
46557


In [9]:
"""
step 4. Save the prepared data for modeling
Q. why to do that?
A. Decouple the data preparation from modeling

Q. How to use dict to clean the reviews
A. here are the steps:
    1. Load dict
    2. process each doc, remvoing tokens not in dict
    3. save the doc
"""
lines=''
for doc in data:
    tokens = [w for w in doc if w in vocab]
    line = " ".join(tokens)
    lines+=line+'\n'
    
with open(directory+outfile,'w') as fid:
    fid.write(lines)    


# Modeling starts here

In [19]:
from collections import defaultdict, Counter
import numpy as np
import random
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# specify directory to load
directory = '/home/chandresh/ckm/data/movies review/review_polarity/txt_sentoken/'
inpfile    = 'processed_review.txt'
vocab_dict ='movie_dict.txt'


In [20]:
"""
Step 1. load the data
"""
data =[]
with open(directory+inpfile,'r') as fid:
    for doc in fid:
         tokens = doc.strip('\n')
         data.append(tokens)

In [21]:
print(data[0])

rated strong language sexual dialogue drug use crude humor violence brief nudity starring ben affleck matt damon linda fiorentino salma hayek alan rickman chris rock kevin smith jason mewes jason lee george carlin running time minutes huge fan kevin smith expecting lot newest project dogma might kevins best work date funny smart foulmouthed dialogue unexpectedly serious besides going god jesus christ actually tries tell people thing rather isnt little movie premiered sundance little film called clerks little film went become huge video sensation really cant meet person green earth seen clerks hailed critics one best movies year soon sequel coming kevin made another movie mallrats flopped horribly thought best movie ever made pretty good little flick panned critics hailed audiences chasing amy came kevins biggest success date get great little movie name dogma wow cast ok plot movie linda fiorentino plays regular woman works abortion clinic one day house voice god appears house much surp

In [22]:
"""
step 2. load the dictionary
"""
vocabulary=defaultdict(int)

with open(directory+vocab_dict,'r') as fid:
    words = fid.readlines()
    words = [w.strip('\n') for w in words]
    vocabulary = {w:i for i, w in enumerate(words)}
    

In [23]:
"""
step 3. Using tf-idf model for encoding documents
"""

# define data matrix X and label array y
pos_label = [1]*1000
neg_label = [0]*1000
y = np.array(pos_label+neg_label)
tfidf = TfidfVectorizer(ngram_range=(1,1), vocabulary=vocabulary)
X = tfidf.fit_transform(data)

# clear data to free up space
del vocabulary
del data


In [24]:
#split data into train/test
# step 1. shuffle the data
n,d = X.shape
index = list(range(0,n))
random.shuffle(index)

X = X[index, :]
y = y[index]

# step 2. split into train/test in 2/3-1/3
Xtrain = X[:1500,:]
ytrain = y[:1500]

Xtest = X[1500:,:]
ytest = y[1500:]

del X
del y

In [25]:
print(Xtrain.shape)

(1500, 14803)


In [30]:
"""
custome logistic regression
"""
class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=10000, fit_intercept=True, verbose=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.w = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.w)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.w -= self.lr * gradient
            
            if(self.verbose == True and i % 1000 == 0):
                z = np.dot(X, self.w)
                h = self.__sigmoid(z)
                print('loss: ',self.__loss(h, y))
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.w))
    
    def predict(self, X, threshold=0.5):
        return self.predict_prob(X) >= threshold

#Evaluation
model = LogisticRegression(lr=0.1, num_iter=10000)
Xtrain =Xtrain.toarray()
Xtest = Xtest.toarray()
model.fit(Xtrain, ytrain)
preds = model.predict(Xtest)
# accuracy
print('accuracy',(preds == ytest).mean())


loss:  0.6931066665423955
loss:  0.6557437667477811
loss:  0.6225077958395836
loss:  0.5927778675403912
loss:  0.5660398731625527
loss:  0.5418622769752304
loss:  0.5198852255815232
loss:  0.4998092593566594
loss:  0.4813849306187443
loss:  0.4644038283644683
accuracy 0.83


In [29]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

logreg.fit(Xtrain, ytrain)
#logreg.score(Xtrain, ytrain)
#logreg.score(Xtest, ytest)
y_hat = logreg.predict(Xtest)
#print accuracy
from sklearn.metrics import accuracy_score
print("test accuracy:",accuracy_score(ytest,y_hat))

##print (confusion matrix
from sklearn.metrics import confusion_matrix
from util import plot_confusion_matrix
import matplotlib.pyplot as plt
# Compute confusion matrix
cnf_matrix = confusion_matrix(ytest, y_hat)
np.set_printoptions(precision=2)
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=np.unique(y_hat),title='Confusion matrix, without normalization')
    

test accuracy: 0.85
Confusion matrix, without normalization
[[213  33]
 [ 42 212]]
