In [8]:
#import statements
import numpy as np
from pandas import DataFrame
import pandas as pd
from sklearn.cross_validation import train_test_split,KFold
from sklearn.feature_extraction.text import CountVectorizer
from collections import defaultdict
from collections import Counter
import re
from scipy.special import comb
import itertools
from collections import Counter 
from sknn.mlp import Classifier, Convolution, Layer

### Data Processing

In [219]:
#read the file
def readFile(fileName):
    with open(fileName,'r') as f:
        lines = f.readlines()
    return lines

In [220]:
lines_read = readFile('imdb_labelled.txt')

In [221]:
'''
Function desc: Remove non-alphanumeric content from the text 
'''


def stripnonalphanumeric(string):
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [222]:
def tokenize(text):
    x =  re.sub(r'\d+', '', text)
    y =  re.sub(r'_+', '', x)
    return re.findall(r'\w+', y)

In [223]:
def split_data(sent):
    neg_sent=[]
    pos_sent=[]
    for s in sent:
        tab_sep_data = s.split('\t')
        if int(tab_sep_data[1]) == 0:
            neg_sent.append(tab_sep_data[0])
        else:
            pos_sent.append(tab_sep_data[0])
    return pos_sent,neg_sent

In [224]:
pos_sent,neg_sent=split_data(lines_read)

In [225]:
def prepare_data(pos_sent,neg_sent):
    
    x=pos_sent+neg_sent
    x=[stripnonalphanumeric(s) for s in x]
    x=[s.split(' ') for s in x]
    pos_lab = [[1,0] for p in pos_sent]
    neg_lab = [[0,1] for n in neg_sent]
    y=np.concatenate([pos_lab,neg_lab],0)
    return x,y

In [226]:
x,y=prepare_data(pos_sent,neg_sent)

In [227]:
def pool_data(x):
    append_word = '!!FILL!!'
    max_sent_length = max(len(s) for s in x)
    created_data = []
    for i in range(len(x)):
        sent = x[i]
        fills = max_sent_length - len(sent)
        n_sent = sent + [append_word] *fills
        created_data.append(n_sent)
    return created_data

In [228]:
data=pool_data(x)

In [229]:
def vocab_dict_builder(sent):
    w_count = Counter(itertools.chain(*sent))
    voc_dict = [x[0] for x in w_count.most_common()]
    voc_idx_map = {x: i for i, x in enumerate(voc_dict)}
    return [voc_dict,voc_idx_map]

In [230]:
words,words_idx=vocab_dict_builder(data)

In [231]:
def feat_vec(data,words,words_idx):
    x=np.array([[words_idx[i] for i in d] for d in data])
    return x

In [232]:
x=feat_vec(data,words,words_idx)

In [233]:
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)


In [24]:
def rec_create_data(data):
    z = data
    for j in z:
        j.insert(0,1)   
    return z

In [25]:
def rec_find_alpha(class_labels):
    alpha1 = {}
    class_count1 = {}
    for i in class_labels:
        if i[0] not in class_count1.keys():
            class_count1[i[0]] = 1
        else:
            class_count1[i[0]] += 1
    classes1 = class_count1.keys()
    for j in  class_count1:
        alpha1[j] = class_count1[j]*1.0/len(class_labels)
    return classes1,class_count1,alpha1

## Initial Assumption of Weights

In [127]:
def rec_assume_weights(g,h,n,nc):
    s = np.random.randn(g,n)/10000
    w = np.random.randn(h,(g+1))/10000
    v = (np.random.randn(nc-1,(h+1))-10)/10000
    return s,w,v

### Function of Indicator


In [27]:
def rec_indicator(y,cl):
    ind = np.zeros([len(y),len(cl)])
    for k in range(len(cl)):
        for l in range(len(y)):
            if y[l][0] == cl[k]:
                ind[l][k] = 1
    return ind

### Sigmoid Functions

In [28]:
def rec_find_sigmoid(ws,datas):
    sigmoid = []
    for ii in datas:
        tmp = []
        for jj in range(len(ws)):
            xz = np.dot(ws[jj].transpose(),ii)
            tmp.append(1.0/(1+np.exp(-xz)))
        sigmoid.append(tmp)
    sigmoid =rec_create_data(sigmoid)
    return sigmoid

In [247]:
def rec_find_sigmoid1(ws,datas):
    sigmoid = []
    for ii in datas:
        tmp = []
        for jj in range(len(ws)):
            xz = np.dot(ws[jj].transpose(),ii)
            tmp.append(np.tanh(xz))
        sigmoid.append(tmp)
    sigmoid =rec_create_data(sigmoid)
    return sigmoid

### SoftMax Function

In [29]:
def rec_softmax(theta,data):
    soft = np.zeros([len(data),len(theta)])
    for i in range(len(data)):
        s = 0
        for j in range(len(theta)):
            s = s + np.exp(np.dot(theta[j].T,data[i]))
        for k in range(len(theta)):
            xy = np.exp(np.dot(theta[k].transpose(),data[i]))
            soft[i][k] = (xy*1.0)/s
    return soft

## Code for layer V

In [154]:
def outputlayer(datam,wm,sm,indic,lr,v):
    qy = rec_find_sigmoid1(sm,datam)
    zy = rec_find_sigmoid1(wm,qy)
    yhat = rec_softmax(v,zy)
    for n in range(len(v)):
        sumsv = 0
        for m in range(len(yhat)):
            sumsv = sumsv + np.dot((yhat[m][n] - indic[m][n]) , zy[n])
        v[n]= v[n] - (lr*sumsv)
    return v,yhat,zy,qy

### Code for layer W

In [155]:
def rec_intrim(yh,y,v,j,k):
    sv = 0
    for ll in range(len(v)):
        xx = yh[k][ll]- y[k][ll]
        sv = sv + xx*v[ll][j]
    return sv

In [156]:
def firstlayer(v,z,yhh,dataq,indic,w,lr):
    for j in range(len(w)):
        sq = 0
        for i in range(len(dataq)):
            x = np.dot(rec_intrim(yhh,indic,v,j,i),z[i][j])
            y = np.dot((1-z[i][j]),dataq[i])
            sq = sq + np.dot(x,y)
        w[j] = w[j] - (lr*sq)
    return w

## Code for Layer S

In [157]:
def secondlayer(v,w,yhh,indic,q,data,s,lr):
    
    for i in range(len(s)):
        sums = 0
        for l in range(len(data)): 
            sumw =0
            for j in range(len(w)):
                sumv = 0
                for k in range(len(v)):
                    sumv = sumv + np.dot((yhh[l][k]-indic[l][k]),v[k][j])
                tmp1 = np.dot((1-w[j][i]),w[j][i])
                sumw = sumw + sumv*np.dot(tmp1,q[l][i])
            tmp2 = np.dot((1-q[l][i]),q[l][i])
            sums = sums + sumw*(np.dot(tmp2,data[l]))
        s[i] = s[i] - lr*sums
    return s     

### Code for Gradient Descent with RNN

In [251]:
def grad_desc(W,V,S,data,indic,lr,ic):
    for cnt in range(ic):
        V,yh,Z,Q = outputlayer(data,W,S,indic,lr,V)
        W1 = firstlayer(V,Z,yh,Q,indic,W,lr)
        W = firstlayer(V,Z,yh,Q,indic,W1,lr)
        S = secondlayer(V,W1,yh,indic,Q,data,S,lr)
    return V,W,S

In [245]:
def grad_desc1(W,V,S,data,indic,lr,ic):
    for cnt in range(ic):
        V,yh,Z,Q = outputlayer(data,W,S,indic,lr,V)
        W1 = firstlayer(V,Z,yh,Q,indic,W,lr)
        W = firstlayer(V,Z,yh,Q,indic,W1,lr)
        W = W + 0.002*W1
        S1 = secondlayer(V,W,yh,indic,Q,data,S,lr)
        S = secondlayer(V,W1,yh,indic,Q,data,S1,lr)
        S = S + 0.002*S1
    return V,W,S

### Training Module

In [252]:
def rec_train(data,label,g,h,cl,lr,ic):
    s0,w0,v0 = rec_assume_weights(g,h,data.shape[1],len(cl))
    ind = rec_indicator(label,cl)
    Vf,Wf,Sf = grad_desc(w0,v0,s0,data,ind,lr,ic)
    return Vf,Wf,Sf

### Prediction Module

In [279]:
def rec_prediction(w,v,s,data,cl):
    Qp = rec_find_sigmoid1(s,data)
    Zp = rec_find_sigmoid1(w,Qp)
    pre = rec_softmax(v,Zp)  
    ll = []
    for j in range(len(pre)):
        if pre[j][0] > np.mean(pre):
            ll.append([1])
        else:
            ll.append([0])
    return np.array(ll)

### Evaluation Module

In [260]:
def find_confusion_matrix(clabels,actual,predicted):
    cm= []
    for i in clabels:
        tmp =[0]*len(clabels)
        for j in range(len(actual)):
            if actual[j][0] == i and actual[j][0] == predicted[j][0]:
                tmp[clabels.index(i)] += 1
            elif actual[j][0] == i and actual[j][0] != predicted[j][0]:
                tmp[clabels.index(predicted[j][0])] += 1
        cm.append(tmp)
    return np.array(cm)
def find_accuracy(matrix):
    return np.trace(matrix)*1.0/np.sum(matrix)
def find_precision(matrix):
    pres = []
    x = np.sum(matrix,axis=0)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                pres.append(matrix[i][j]*1.0/x[i])
    return pres
def find_recall(matrix):
    rec = []
    x = np.sum(matrix,axis=1)
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            if i == j:
                rec.append(matrix[i][j]*1.0/x[i])
    return rec
def find_fmeasure(prec,rec):
    tmp = []
    for i,j in zip(prec,rec):
        tmp.append(2.0*(i*j)/(i+j))
    return tmp

In [261]:
def evaluation(clabels,acutal,predicted):
    confmatrix = find_confusion_matrix(clabels,acutal,predicted)
    print "Confusion Matrix"
    print confmatrix
    accuracy = find_accuracy(confmatrix)
    print "Accuracy", accuracy
    precision = find_precision(confmatrix)
    print "Precision", precision
    recall = find_recall(confmatrix)
    print "Recall", recall
    f_score =find_fmeasure(precision,recall)
    print "F_score", f_score

In [262]:
yy = []
yl = []
for i in y:
    if i[0] == 1:
        yy.append([1])
        yl.append(1)
    else:
        yy.append([0])
        yl.append(0)
yy = np.array(yy)
yl = np.array(yl)

## Experiment 1

provide the data,labels,number of featues in S, number of features in W, the classes, Learning rate, iteration count

In [263]:
classes,class_count,alpha = rec_find_alpha(yy)
Vf,Wf,Sf = rec_train(x,yy,10,10,classes,0.000454,100)

In [280]:
YY =rec_prediction(Wf,Vf,Sf,x,classes)

In [41]:
evaluation(classes,yy,YY)

Confusion Matrix
[[283 217]
 [260 240]]
Accuracy 0.523
Precision [0.52117863720073665, 0.52516411378555794]
Recall [0.56599999999999995, 0.47999999999999998]
F_score [0.54266538830297217, 0.5015673981191221]


## Experiment 2

provide the data,labels,number of featues in S, number of features in W, the classes, Learning rate, iteration count

In [42]:
Vf1,Wf1,Sf1 = rec_train(x,yy,30,30,classes,0.000454,300)

In [44]:
YY1 =rec_prediction(Wf1,Vf1,Sf1,x,classes)

In [45]:
evaluation(classes,yy,YY1)

Confusion Matrix
[[289 211]
 [233 267]]
Accuracy 0.556
Precision [0.55363984674329503, 0.55857740585774063]
Recall [0.57799999999999996, 0.53400000000000003]
F_score [0.56555772994129161, 0.54601226993865037]


## Experiment 3

## Recuurent Neural Network with complete features
provide the data,labels,number of featues in S, number of features in W, the classes, Learning rate, iteration count

In [43]:
Vf2,Wf2,Sf2 = rec_train(x,yy,80,80,classes,0.000454,10)

In [47]:
YY2 =rec_prediction(Wf2,Vf2,Sf2,x,classes)

In [213]:
evaluation(classes,yy,YY2)

Confusion Matrix
[[500   0]
 [ 83 417]]
Accuracy 0.917
Precision [0.85763293310463118, 1.0]
Recall [1.0, 0.83399999999999996]
F_score [0.92336103416435833, 0.90948745910577966]


## Experiment 4
provide the data,labels,number of featues in S, number of features in W, the classes, Learning rate, iteration count

In [None]:
Vf3,Wf3,Sf3 = rec_train(x,yy,20,30,classes,0.0000454,17)

In [None]:
YY3 =rec_prediction(Wf3,Vf3,Sf3,x,classes)

In [199]:
YY3 = np.append(YY3,[[0]]*127,axis = 0)

In [200]:
evaluation(classes,yy,YY3)

Confusion Matrix
[[142 358]
 [383 117]]
Accuracy 0.259
Precision [0.27047619047619048, 0.24631578947368421]
Recall [0.28399999999999997, 0.23400000000000001]
F_score [0.27707317073170729, 0.23999999999999999]


## Data for Logistic Regression

In [40]:
import io
filename = "imdb_labelled.txt"
r = io.open(filename, encoding='utf8').readlines()
review = []
Y = []
for i in r:
    x = i.split('\t')
    review.append(x[0])
    Y.append([float(x[-1])])
Y = np.array(Y)

## Countvectorizer Object

In [205]:
def count_vectorize(filenames, tokenizer_fn=tokenize, min_df=3,
                 max_df=.8, binary=True, ngram_range=(1,3)):
   
    vectorizer = CountVectorizer(tokenizer = tokenizer_fn, min_df=min_df, 
                                     max_df=max_df, binary=binary, ngram_range=ngram_range, 
                                 dtype = 'int',analyzer='word',token_pattern='(?u)\b\w\w+\b',encoding='utf-8' )
        
    X = vectorizer.fit_transform(review)
    return X,vectorizer
    
matrix, cv = count_vectorize(review)
print ('matrix represents %d documents with %d features' % (matrix.shape[0], matrix.shape[1]))
print('first doc has terms:\n%s' % (str(sorted(matrix[0].nonzero()[1]))))

matrix represents 1000 documents with 1441 features
first doc has terms:
[0, 27, 31, 32, 724, 756, 757, 774, 1019, 1311, 1316, 1438]


In [211]:
matrix1, vec = count_vectorize(review,ngram_range=(1,1))
matrix2, vec = count_vectorize(review,ngram_range=(1,2))

In [83]:
X =np.array(matrix.todense())

In [175]:
Vf4,Wf4,Sf4 = rec_train(X,Y,1000,1000,classes,0.001,10)

In [240]:
YY4=rec_prediction(Wf4,Vf4,Sf4,X,classes)

In [241]:
evaluation(classes,Y,YY4)

Confusion Matrix
[[  0 500]
 [  0 500]]
Accuracy 0.5
Precision [nan, 0.5]
Recall [0.0, 1.0]
F_score [nan, 0.66666666666666663]


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [44]:
def logistic():
    return LogisticRegression(C=1.0,random_state=42)

## Logistic Regression - TriGram

In [206]:
logistic_classifier = logistic()
logistic_classifier.fit(matrix, yl)
logistic_predictions = logistic_classifier.predict(matrix)
print "The acuracy through logisteic regression is ",accuracy_score(yl, logistic_predictions)

The acuracy through logisteic regression is  0.939


## Logistic Regression - UniGram

In [210]:
logistic_classifier.fit(matrix1, yl)
logistic_predictions = logistic_classifier.predict(matrix1)
print "The acuracy through logisteic regression is ",accuracy_score(yl, logistic_predictions)

The acuracy through logisteic regression is  0.867


## Logistic Regression - BiGram

In [212]:
logistic_classifier.fit(matrix2, yl)
logistic_predictions = logistic_classifier.predict(matrix2)
print "The acuracy through logisteic regression is ",accuracy_score(yl, logistic_predictions)

The acuracy through logisteic regression is  0.933
