In [None]:
import glob
import math
import numpy as np
import os
import re
import time
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import sentiwordnet as swn

In [None]:
def createData(file):
    """
    Label every line of a text file with SentiWordNet and store it on disk.

    Each line is tokenized with ``tokenize``; for every token that
    SentiWordNet knows, the (positive - negative) score of the first
    synset returned is added to the line's score. Lines with a positive
    total are written to ``data/pos/<n>_1.txt``, lines with a negative
    total to ``data/neg/<n>_0.txt``, and neutral lines (total == 0) are
    dropped. ``<n>`` is a single counter shared by both classes, so file
    numbers are unique across the two directories.

    The ``data/pos`` and ``data/neg`` directories must already exist.

    Params:
      file....path to a text file with one document per line
    Returns:
      None. The labeled documents are written to disk.
    """
    cnt = 0
    # Context managers guarantee that the input and every output file are
    # closed even if scoring or writing raises part-way through.
    with open(file, "r") as fileReader:
        for r in fileReader:
            score = 0
            for t in tokenize(r):
                try:
                    synset = list(swn.senti_synsets(t))[0]
                except IndexError:
                    # Token has no SentiWordNet entry: it contributes nothing.
                    continue
                score += synset.pos_score() - synset.neg_score()

            if score > 0:
                out_path = "data/pos/" + str(cnt) + "_1.txt"
            elif score < 0:
                out_path = "data/neg/" + str(cnt) + "_0.txt"
            else:
                # Neutral line: not part of the data set.
                continue

            with open(out_path, "w") as out:
                out.write(r)
            cnt += 1

In [None]:
def _first_line(filename):
    """Return the first line of ``filename`` ('' if the file is empty)."""
    with open(filename, 'r') as fp:
        return fp.readline()


def read_data(path):
    """
    Read the labeled documents stored under ``path`` and write a short
    summary of what was found to ``<path>/classifierAnswers.txt``.

    Only the first line of every ``*.txt`` file in ``<path>/pos`` and
    ``<path>/neg`` is used as the document text. The documents are
    returned sorted by their text.

    The summary file contains four ``key:value`` lines: the number of
    positive files, the first positive record (by sorted file name), the
    number of negative files and the third negative record (by sorted
    file name). A sample record that does not exist (no positive files,
    or fewer than three negative files) is written as an empty value
    instead of aborting the load.

    Params:
      path....path to files
    Returns:
      docs.....numpy array of strings, one per document
      labels...numpy array of ints, 1=positive, 0=negative label.
               Inferred from the sub-directory the file was found in
               ('pos' is 1, 'neg' is 0)
    """
    # Sort both listings: glob order is arbitrary, and the sample records
    # below are picked by position, so sorting makes them reproducible.
    pos_fnames = sorted(glob.glob(os.path.join(path, 'pos', '*.txt')))
    neg_fnames = sorted(glob.glob(os.path.join(path, 'neg', '*.txt')))

    data = [(1, _first_line(f).strip("\n")) for f in pos_fnames]
    data += [(0, _first_line(f).strip("\n")) for f in neg_fnames]
    data.sort(key=lambda x: x[1])

    pos_record = _first_line(pos_fnames[0]).strip() if pos_fnames else ''
    neg_record = (_first_line(neg_fnames[2]).strip()
                  if len(neg_fnames) > 2 else '')
    summary = [
        ('posFiles', len(pos_fnames)),
        ('posRecord', pos_record),
        ('negFiles', len(neg_fnames)),
        ('negRecord', neg_record),
    ]

    # Write the summary, one "key:value" per line, no trailing newline.
    with open(path + "/classifierAnswers.txt", "w") as op_file:
        op_file.write("\n".join(k + ":" + str(v) for k, v in summary))

    return np.array([d[1] for d in data]), np.array([d[0] for d in data])

In [None]:
def tokenize(doc, keep_internal_punct=False):
    """
    Tokenize a string into lowercase tokens.

    If keep_internal_punct is False, every maximal run of word characters
    (letters, numbers and underscore) is a token.
    If keep_internal_punct is True, the string is split on whitespace and
    only the leading and trailing non-word characters of each piece are
    removed, so punctuation inside a word is retained. E.g., in the
    examples below, the token "isn't" is maintained when
    keep_internal_punct=True; otherwise, it is split into "isn" and "t".
    Pieces that consist only of punctuation produce no token.

    Params:
      doc....a string.
      keep_internal_punct...see above
    Returns:
      a numpy array containing the resulting tokens.

    >>> tokenize(" Hi there! Isn't this fun?", keep_internal_punct=False).tolist()
    ['hi', 'there', 'isn', 't', 'this', 'fun']
    >>> tokenize("Hi there! Isn't this fun? ", keep_internal_punct=True).tolist()
    ['hi', 'there', "isn't", 'this', 'fun']
    >>> tokenize("??necronomicon?? geträumte sünden.<br>Hi", True).tolist()
    ['necronomicon', 'geträumte', 'sünden.<br>hi']
    >>> tokenize("??necronomicon?? geträumte sünden.<br>Hi", False).tolist()
    ['necronomicon', 'geträumte', 'sünden', 'br', 'hi']
    >>> tokenize("hi - there ??", True).tolist()
    ['hi', 'there']
    """
    if keep_internal_punct:
        stripped = (re.sub(r"^\W+|\W+$", "", word) for word in doc.split())
        # Filter AFTER stripping: a piece such as "??" or "-" is non-empty
        # before stripping but must not yield an empty-string token.
        tokens = [word for word in stripped if word]
    else:
        tokens = re.findall(r"\w+", doc)
    # Lowercase last so the word boundaries are found on the original text.
    return np.array([word.lower() for word in tokens])

In [None]:
def do_vectorize(docs, tokenizer_fn=tokenize, min_df=1,
                 max_df=1., binary=True, ngram_range=(1,1)):
    """
    Convert a collection of documents into a sparse csr_matrix, where
    each row is a document and each column represents a unique term.
    Use sklearn's CountVectorizer: http://goo.gl/eJ2PJ5
    Params:
        docs............sequence of document strings (the text itself,
                        not file names)
        tokenizer_fn....the function used to tokenize each document
        min_df..........remove terms from the vocabulary that don't appear
                        in at least this many documents
        max_df..........remove terms from the vocabulary that appear in more
                        than this fraction of documents
        binary..........If true, each documents is represented by a binary
                        vector, where 1 means a term occurs at least once in
                        the document. If false, the term frequency is used instead.
        ngram_range.....A tuple (n,m) means to use phrases of length n to m inclusive.
                        E.g., (1,2) means consider unigrams and bigrams.
    Return:
        A tuple (X, vec), where X is the csr_matrix of feature vectors,
        and vec is the fitted CountVectorizer object.
    """
    # input='content' tells CountVectorizer that `docs` holds the raw text.
    # The only accepted values are 'filename', 'file' and 'content'.
    vec = CountVectorizer(input='content', tokenizer=tokenizer_fn,
                          binary=binary, min_df=min_df, max_df=max_df,
                          ngram_range=ngram_range)

    X = vec.fit_transform(docs)
    return (X, vec)

In [None]:
def accuracy_score(truth, predicted):
    """ Compute accuracy of predictions.

    Params:
      truth.......array (or list) of true labels (0 or 1)
      predicted...array (or list) of predicted labels (0 or 1)
    Returns:
      The fraction of positions where truth and predicted agree, as a float.
    Raises:
      ZeroDivisionError if truth is empty.
    """
    # Coerce to arrays so `==` is always element-wise; comparing two plain
    # lists would yield a single bool and a silently wrong score.
    truth = np.asarray(truth)
    predicted = np.asarray(predicted)
    return int(np.count_nonzero(truth == predicted)) / len(truth)

In [None]:
def cross_validation_accuracy(clf, X, labels, k):
    """
    Run k-fold cross-validation and report the mean test and train accuracy.

    The folds come from the KFold class imported by this file and are
    shuffled without a fixed seed, so results can differ between runs.
    For every fold the classifier is re-fit on the training rows and then
    scored, with accuracy_score, on both the training and the held-out rows.

    Params:
      clf......A classifier exposing fit and predict (e.g. LogisticRegression).
      X........A csr_matrix of features, one row per instance.
      labels...The true labels for each instance in X; indexed with the
               fold index arrays.
      k........The number of cross-validation folds.

    Returns:
      A tuple (mean test accuracy, mean train accuracy) averaged over
      the k folds.
    """
    folds = KFold(len(labels), n_folds=k, shuffle=True)
    test_scores = []
    train_scores = []
    for train_idx, test_idx in folds:
        clf.fit(X[train_idx], labels[train_idx])

        # Accuracy on the rows the model was just trained on.
        train_predictions = clf.predict(X[train_idx])
        train_scores.append(accuracy_score(labels[train_idx], train_predictions))

        # Accuracy on the held-out rows of this fold.
        test_predictions = clf.predict(X[test_idx])
        test_scores.append(accuracy_score(labels[test_idx], test_predictions))

    return np.mean(test_scores), np.mean(train_scores)

In [None]:
def main():
    """
    Put it all together.

    Builds the labeled data set from data/movie_reviews.txt, reads it
    back, vectorizes it, runs 5-fold cross-validation with a logistic
    regression classifier and prints the accuracies and the elapsed time.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # replacement for measuring elapsed time.
    start_time = time.perf_counter()

    # exist_ok avoids the check-then-create race and the duplicated tests.
    for directory in ('data/pos', 'data/neg'):
        os.makedirs(directory, exist_ok=True)

    # Do sentiment analysis on movie reviews
    createData("data/movie_reviews.txt")

    # read data.
    docs, labels = read_data("data")

    # Do vectorize (the fitted vectorizer itself is not needed here)
    matrix, _ = do_vectorize(docs)

    # Do k-fold Cross validation; the helper returns (test, train) accuracy.
    clf = LogisticRegression(random_state=42, C=1, penalty='l2')
    print('average cross validation test accuracy=%.4f train accuracy=%.4f' %
          cross_validation_accuracy(clf, matrix, labels, 5))
    end_time = time.perf_counter()

    print("Total running time :%f" % (end_time - start_time))
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()