#Naive Bayes model - Binarizing the features 

# Import statements

In [6]:
#import statements
import numpy as np
from pandas import DataFrame
import pandas as pd
from sklearn.cross_validation import train_test_split,KFold
from collections import defaultdict
from collections import Counter
import re
from scipy.special import comb


# Read data section

In [7]:
#read the file
def readFile(fileName):
    with open(fileName,'r') as f:
        lines = f.readlines()
    return lines

In [8]:
lines_read = readFile('imdb_labelled.txt')

# Prepare and transform data section

In [9]:
'''
Function desc: Split the data into texts and class labels
'''

def data_process(lines_read):
    x=[]
    y=[]
    xl=[]
    for i in range(len(lines_read)):
        split = lines_read[i].split()
        y.append(int(split[-1]))
        x.append(" ".join ([word.lower() for word in split[0:len(split)-1]]))
    return x,y
    

In [10]:
x,y=data_process(lines_read)

# Model parameter calculation

In [11]:

'''
    Function description: Calculate the prior alpha for each class

'''

def find_prior(y):
    key = np.unique(y)
    prior = [0.0 for i in range(len(key))]
    cls_split={i:0 for i in key }
    for i in y:
        if i in key:
            cls_split[i]+=1
        else:
            cls_split[i]+=1
    for i in range(len(cls_split.values())):
        prior[i] = 1.*cls_split.values()[i]/len(y)
    return prior
            
        
    

In [12]:
prior=find_prior(y)
print prior

[0.5, 0.5]


In [13]:
'''
Function desc: Remove non-alphanumeric content from the text 
'''


def stripnonalphanumeric(x):
    return re.findall('\w+',x.lower())

In [19]:
'''
Function desc: Build a dictionary of unique words. The text is tokenized and the occurence of each word is mapped. 
               In this function, care is taken to remove the stop words which would help in classifying the texts accurately
               
               Input: Text
               Output: unique words in the corpus

'''

def vocab_builder(x,y):
    word = {}
    bag_of_words = []
    for doc_id in range(0,len(x)):
        split_text = stripnonalphanumeric(x[doc_id]) # strip text 
        for i in split_text:
            bag_of_words.append(i)
    words =  dict(Counter(bag_of_words)) # word counter 
    bag_of_words =[]
    for i in words:
        if words[i] < 50: #remove words 
            bag_of_words.append(i)
    return (bag_of_words)
    
    

In [20]:
v=vocab_builder(x,y)

In [275]:
'''
Function desc: Create a feature vector for the given texts
               Mark the number of times the word appears
               
               Returns: Feature vetor

'''



def create_feature_vector(x,c):
    f_v = [np.zeros((len(c)),dtype=int) for i in range(len(x))]
    feat_vect = []
    for i in range(len(x)):
        co = Counter(stripnonalphanumeric(x[i]))
        for j in x[i].split():
            if j in c:
                ix=c.index(j)
                f_v[i][ix] = co[j]
    return [i.tolist() for i in f_v]

In [276]:
feat_vector = create_feature_vector(x,v)

In [277]:
'''
Function desc: To partition the given data into the respective classes
               Input: Feature vector and class labels
               Output: Partitioned data

'''




def data_partition(feat_vector,y):
    zipped =zip(feat_vector,y) #combine feature vector with its class labels
    pos_rev=[]
    neg_rev=[]
    for i in range(len(feat_vector)):
        if zipped[i][1] == 1:
            pos_rev.append(zipped[i][0]) #putthe data into negative list
        else:
            neg_rev.append(zipped[i][0]) #putthe data into positive list
    return pos_rev,neg_rev
    
    
        

In [278]:
pos,neg = data_partition(feat_vector,y)

# Model parameters calculation

In [279]:
'''
Function desc: To calulate the parameter alpha(j|y=i), where j is the jth word in the sentence and i is the class label
               Input: segregated data bases on the class label
               
               Output: Parameters of the distribution


'''


def calculate_parameter(pos,neg):
    eps=0.5 # for smoothing
    labels = np.unique(y)
    alpha={}
    P = np.sum(pos) # total no. of words in positive class
    N = np.sum(neg) # total no. of words in negative class
    alpha_pos = (np.sum(pos,axis=0)+eps)/((len(pos)*P)+len(labels)*eps)  #positive parameters
    alpha_neg = ((np.sum(neg,axis=0))+eps)/((len(neg)*N)+len(labels)*eps) #negative parameters
    alpha[labels[0]] = alpha_neg
    alpha[labels[1]] =alpha_pos
    return alpha # dictionary containing both alphas
    
    
        

In [280]:
alpha=calculate_parameter(pos,neg)
print alpha

{0: array([  3.28191449e-07,   3.28191449e-07,   3.28191449e-07, ...,
         9.84574346e-07,   3.28191449e-07,   2.95372304e-06]), 1: array([  2.80819837e-07,   2.80819837e-07,   8.42459510e-07, ...,
         2.80819837e-07,   8.42459510e-07,   2.52737853e-06])}


In [259]:
'''
Funtion desc: Membership funtion, whih provides the membership values for each class
              Input: 1) sample to be predicted
                     2) Model parameters
                     3) Prior probability
                     
        Output: Membership values for each class


'''

def g_x(x,alpha,prior):
    d=[]
    P = np.sum(x) #total number words in the corpus
    for i in alpha:
        p=0
        l_t1 = 0
        for j in range(len(x)):
            l_t1 += np.log(comb(P,x[j])*(alpha[i][j]**x[j])*((1-alpha[i][j])**(P-x[j]))) # using the formula
        p=l_t1+prior[i]
        d.append(p)
    return d
    

In [260]:
'''
Funtion description: Predict the class labels of the supplied sample. 
                     Call the membership function for each sample on the classes and find the maximum value, which 
                     gives the class of the particular sample.
                     
                     Input: Samples(x) and True class labels(y)

'''


def predict_labels(x,y):
    predicted=[]
    cls=np.unique(y)
    g = []
    for i in x:
        g.append(g_x(i,alpha,prior))  #call membership funtction for each sample
    for i in g:
        predicted.append(cls[i.index(max(i))]) #get the max value's index and get the corresponding class
    return predicted

    
    

In [261]:
pred = predict_labels(feat_vector,y)

In [262]:
'''
Function desc: To calculate the model parameters suh as precision, recall and f-measure from the confusion-matrix
Inuput: Actual and predicted values
Output: model parameters


Classification accuracy
(TP + TN) / (TP + TN + FP + FN)
Error rate
(FP + FN) / (TP + TN + FP + FN)

Precision: (or Positive predictive value)
proportion of predicted positives which
are actual positive
TP / (TP + FP)
Recall: proportion of actual positives
which are predicted positive
TP / (TP + FN)

Error rate: 1- classification accuracy

'''

def model_eval(actual,predicted):
    Truth= pd.Series(actual,name = 'Truth' )
    Predicted = pd.Series(predicted,name='Predicted')
    confusion_matrix = pd.crosstab(Truth, Predicted)
    arr_cm = confusion_matrix.as_matrix()
    diag = arr_cm.diagonal() 
    accuracy = float(sum(diag))/np.sum(arr_cm)
    precision = np.divide(diag,1.0*(np.sum(arr_cm,axis=0)))
    recall = np.divide(diag,1.0*(np.sum(arr_cm,axis=1)))
    fmeasure = 2*((precision * recall)/(precision + recall))
    return confusion_matrix,accuracy,precision,recall,fmeasure

# Model Evaluation

In [281]:
'''Function to perform 10-fold cross validation. In this method the test and train indices are split using using the inbuit
   'KFold' function.
    Calculation of parameters is done namely (prior and likelihood) for each of the train set
    With the model parameters the modelis tested on the test data
    Model parameters such as precision,reall,accuracy and error rate is calculated
    Input 1: 'x' 
    Input 2: True predicted values
    Input 3: No of folds (10 by default)

    Output: Predicted values, Model parameters 
'''
def x_fold_validation(x,y,nfolds=10):
    confusion_matrix_list=[]
    accuracy_list=[]
    precision_list=[]
    recall_list=[]
    fmeasure_list=[]
    y=np.array(y)
    x=np.array(x)
    cv = KFold(len(y), nfolds,shuffle=True,random_state=23) #inbuilt function to split the indices
    for train_idx, test_idx in cv:
        prior = find_prior(y[train_idx])
        pos,neg = data_partition(x[train_idx],y[train_idx])
        alfa = calculate_parameter(pos,neg)
        predict = predict_labels(x[test_idx],y[test_idx])
        conf_matrix,accuracy,precision,recall,fmeasure = model_eval(y[test_idx].tolist(),predict)
        print conf_matrix
        confusion_matrix_list.append(conf_matrix)
        accuracy_list.append(accuracy.tolist())
        precision_list.append(precision.tolist())
        recall_list.append(recall.tolist())
        fmeasure_list.append(fmeasure.tolist())
    print '################################'
    print 'Average of the model parameters'
    print '################################'
    print 'Error rate:', (1-np.mean(accuracy_list,axis=0))
    print 'Accuracy:',np.mean(accuracy_list)
    print 'Precision:', np.mean(precision_list,axis=0)
    print 'Recall:', np.mean(recall_list,axis=0)
    print 'F-measure:', np.mean(fmeasure_list,axis=0)

In [282]:
x_fold_validation(feat_vector,y)

Predicted   0   1
Truth            
0          49   3
1           1  47
Predicted   0   1
Truth            
0          45   0
1           4  51
Predicted   0   1
Truth            
0          40   4
1           2  54
Predicted   0   1
Truth            
0          46   2
1           4  48
Predicted   0   1
Truth            
0          52   2
1           5  41
Predicted   0   1
Truth            
0          49   1
1           8  42
Predicted   0   1
Truth            
0          56   1
1           2  41
Predicted   0   1
Truth            
0          44   4
1           2  50
Predicted   0   1
Truth            
0          47   1
1           4  48
Predicted   0   1
Truth            
0          53   1
1           7  39
################################
Average of the model parameters
################################
Error rate: 0.058
Accuracy: 0.942
Precision: [ 0.92696191  0.96179753]
Recall: [ 0.96124659  0.92110362]
F-measure: [ 0.94294667  0.94001407]
