In [4]:
import os
import numpy as np
import pandas as pd

In [5]:
# Root folder of the extracted dataset and the numeric label for each
# sentiment sub-folder.
basepath = 'aclImdb'
labels = { 'pos': 1, 'neg': 0 }

# Collect one DataFrame per (split, sentiment) folder rather than one per
# review file: constructing a one-row frame for every file is very slow,
# while concatenating the per-folder frames later yields the same rows in
# the same order.
df_all = []
for i in ( 'test', 'train' ):
    for j in ( 'pos', 'neg' ):

        path = os.path.join( basepath, i, j )
        rows = []
        for file in os.listdir(path):

            with open( os.path.join( path, file ), 'r', encoding = 'utf-8' ) as f:
                txt = f.read()
            rows.append( [ txt, labels[j] ] )

        # an empty folder contributes nothing, exactly as before
        if rows:
            df = pd.DataFrame(rows)
            df_all.append(df)

In [6]:
# Stack every collected frame into a single table; ignore_index discards the
# per-frame row labels and numbers the rows 0..n-1.
movie = pd.concat( objs = df_all, ignore_index = True )

# The frames carry positional column labels, so give them readable names.
movie.columns = [ 'review', 'sentiment' ]

# Preview the first few rows.
movie.head()

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [7]:
# run nltk.download("stopwords") once beforehand to download the stopword corpus
import os
import string
import argparse
import numpy as np
from collections import Counter
from nltk.corpus import stopwords

In [8]:
def tokenize( folder, ngrams ):
    """Tokenize every text file in `folder` into cleaned n-gram tokens.

    Each file is read as UTF-8, stripped of punctuation and digits,
    lower-cased, split on whitespace and filtered against the NLTK English
    stop-word list. For every distinct size in `ngrams` the remaining words
    are emitted as tokens; sizes above 1 join consecutive words with '_'.

    Parameters
    ----------
    folder : str
        Directory whose entries are all read as text files.
    ngrams : iterable of int
        N-gram sizes to generate, e.g. ( 1, 2 ). Duplicates are ignored.

    Returns
    -------
    list of str
        Tokens of all files and all n-gram sizes in a single flat list.
    """
    # remove numbers and punctuation marks
    # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    # (built once per call; it does not depend on the file being read)
    table = str.maketrans({ key: None for key in string.punctuation + string.digits })

    # The review text loses its punctuation before the stop-word filter runs,
    # so "don't" shows up as "dont". Add the punctuation-stripped form of every
    # stop word so that those contractions are filtered as well.
    raw_stop_words = stopwords.words("english")
    stop_words = set(raw_stop_words)
    stop_words.update( w.translate(table) for w in raw_stop_words )

    sizes = set(ngrams)
    tokens = []
    for file in os.listdir(folder):
        # explicit encoding: the platform default is locale dependent and can
        # fail or mis-decode non-ASCII reviews
        with open( os.path.join( folder, file ), 'r', encoding = 'utf-8' ) as f:
            text = f.read()

        # convert to lower case, remove stop words; split() already discards
        # leading and trailing whitespace
        words = text.translate(table).lower().split()
        words = [ w for w in words if w not in stop_words ]

        for ngram in sizes:
            if ngram == 1:
                tokens.extend(words)
            else:
                # slide a window of `ngram` words over the document
                for i in range( len(words) - ngram + 1 ):
                    tokens.append( '_'.join( words[ i:(i + ngram) ] ) )
    return tokens


def build_dict( folder, ngrams ):
    """Count the occurrences of every n-gram token found in `folder`.

    The tokens are whatever `tokenize( folder, ngrams )` returns; the result
    is a `Counter` mapping each token to the number of times it appears.
    """
    return Counter( tokenize( folder, ngrams ) )

In [9]:
def compute_ratio( pos_counts, neg_counts, alpha = 1 ):
    """Compute the smoothed log-count ratio of every token.

    For each token in either mapping, `alpha` is added to its positive and
    negative count, each count vector is normalised to sum to 1, and the
    natural log of the positive / negative ratio is taken.

    Parameters
    ----------
    pos_counts, neg_counts : mapping of token -> count
        A `Counter` or a plain dict; a token missing from one mapping
        counts as 0 there. Tokens must be mutually sortable (e.g. all str).
    alpha : number, default 1
        Smoothing value added to every count.

    Returns
    -------
    word_dict : dict
        Token -> position of that token in `r`.
    r : numpy.ndarray
        Log-count ratios, one float per token.
    """
    # Sort the union of tokens so that each token receives the same index on
    # every run; plain set order for strings changes between interpreter
    # sessions because of hash randomisation.
    vocabulary = sorted( set(pos_counts) | set(neg_counts) )

    # assign each token an index number
    word_dict = { token: index for index, token in enumerate(vocabulary) }

    # smoothed count vectors, aligned with the indices in word_dict
    p = np.array( [ pos_counts.get( t, 0 ) for t in vocabulary ], dtype = float ) + alpha
    q = np.array( [ neg_counts.get( t, 0 ) for t in vocabulary ], dtype = float ) + alpha

    p /= p.sum()
    q /= q.sum()
    r = np.log( p / q )
    return word_dict, r

In [10]:
# Folders holding the positive and the negative training reviews.
# NOTE(review): these are 'pos1' / 'neg1', not the 'pos' / 'neg' folders read
# by the data-loading cell -- confirm they exist and are the intended subset.
basepath = 'aclImdb'
pos_path, neg_path = (
    os.path.join( basepath, 'train', label ) for label in ( 'pos1', 'neg1' )
)

# Count unigrams and bigrams separately for each class (positive first).
ngrams = ( 1, 2 )
pos_counts, neg_counts = (
    build_dict( path, ngrams ) for path in ( pos_path, neg_path )
)

In [11]:
# Token -> index lookup and the per-token log-count ratios; alpha = 1 is the
# smoothing value added to every count.
word_dict, r = compute_ratio(
    pos_counts = pos_counts,
    neg_counts = neg_counts,
    alpha = 1,
)

http://ai.stanford.edu/~amaas/data/sentiment/

https://github.com/mesnilgr/nbsvm

https://github.com/lrei/nbsvm

https://github.com/vivekn/sentiment

https://github.com/vsl9/Sentiment-Analysis-with-Convolutional-Networks