## Generative Models - Text classification using Multinomial Naive Bayes 

This workbook has been classified into 2 sections:

1. Implementation of Multinomial Naive Bayes 
2. Model improvement using different techniques
    * Split the training data set into training and validation set
    * Replacing frequency f with log(1+f)
    * Removing stop words & reducing the size of the vocabulary
    
In the first section, I was able to achieve an accuracy of about 78%.

In the second section:
   * By splitting the training data into training and validation, an accuracy of 85% was achieved
   * Replacing the frequency f of a word ina document by log(1+f) didn't make much impact on the accuracy. An accuracy of 79% was achieved
   * By removing the stop words and by reducing the size of the vocabulary, I was able to get an accuary of about 80%

### Implementation of Multinomial Naive Bayes

In [1]:
## Import all the libraries
import pandas as pd;
import math;
import pandas.tools.util as tools;
import numpy as np;
from sklearn.cross_validation import train_test_split;

In [2]:
%cd 20news-bydate/matlab

/Users/Deepthi/Documents/DSE/DSE210 - Statistics and Probability/Day 2/Assignment/Work/20news-bydate/matlab


In [3]:
## Import all the datasets

test_data = pd.read_table('test.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])
train_data = pd.read_table('train.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])

test_label = pd.read_table('test.label',sep=' ',header = None, names = ['label_idx'])
train_label = pd.read_table('train.label',sep=' ',header = None, names = ['label_idx'])

test_map = pd.read_table('test.map',sep=' ',header = None, names = ['class','label_idx'])
train_map = pd.read_table('train.map',sep=' ',header = None, names = ['class','label_idx'])

vocabulary = pd.read_table('vocabulary.txt',header = None, names = ['words'])


In [4]:
## Data preparation

train_label = train_label.reset_index()
train_label = train_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
train_label['DocIdx'] = train_label['DocIdx']+1 # To match the document index in the data

test_label = test_label.reset_index()
test_label = test_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
test_label['DocIdx'] = test_label['DocIdx']+1 # To match the document index in the data

vocabulary = vocabulary.reset_index()
vocabulary = vocabulary.rename(columns = {'index':'WordIdx', 'words':'words'})
vocabulary['WordIdx'] = vocabulary['WordIdx']+1

## Merge train_data and train_label to get labels for each document in the train_data
train_data_2 = pd.merge(train_data, train_label,how = 'left', on='DocIdx')

In [5]:
## Calculate pi (document frequency) - priors
pi = pd.DataFrame(train_data_2.groupby(['label_idx']).DocIdx.nunique()).reset_index()
pi = pi.rename(columns={'DocIdx':'num_doc'})
total_docs = sum(pi.num_doc)
pi['pi'] = pi['num_doc']/total_docs 
pi['log_pi'] = pi['pi'].apply(lambda x: math.log(x)) ## Apply log

In [6]:
## Include all the words in the vocabulary to ensure higher accuracy on the test data

classes = train_map['label_idx']
vocabulary2 = vocabulary['WordIdx']
train_data_5 = pd.DataFrame(index= pd.MultiIndex.from_arrays(tools.cartesian_product([vocabulary2.tolist(),classes.tolist()]),names=['WordIdx','label_idx'])).reset_index()
train_data_3 = pd.DataFrame(train_data_2.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()
train_data_4 = pd.merge(train_data_5, train_data_3,how = 'left', on=(['WordIdx','label_idx']))
train_data_4['Frequency'].fillna(0,inplace=True)


In [7]:
## Calculate conditional probabilities P(word/class)

# Numerator = total wi in classj
df1 = pd.DataFrame(train_data_4.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()

# Denominator = total words in classj
df2 = pd.DataFrame(train_data_4.groupby(['label_idx']).Frequency.sum()).reset_index()
cond_prob = pd.merge(df1,df2,how='left', on = 'label_idx')
cond_prob = cond_prob.rename(columns={'label_idx':'label_idx', 'WordIdx': 'WordIdx', 'Frequency_x':'numerator', 'Frequency_y':'denominator'})
cond_prob['cond_prob'] = (cond_prob['numerator']+1)/(cond_prob['denominator']+len(vocabulary)) ## Laplace smoothing
cond_prob['cond_prob_log'] = cond_prob['cond_prob'].apply(lambda x: math.log(x))

In [8]:
## Create a routine 

def naivebayes(dat,doc):
    mask = dat[dat['DocIdx']==doc]['WordIdx']
    cond_prob_doc1 = cond_prob[cond_prob['WordIdx'].isin(mask)]
    cond_prob_doc1 = pd.merge(cond_prob_doc1,dat[dat['DocIdx']==doc], how = 'left', on = 'WordIdx')
    cond_prob_doc1['calc'] = cond_prob_doc1['Frequency']*cond_prob_doc1['cond_prob_log']
    cond_prob_doc1_2 = pd.DataFrame(cond_prob_doc1.groupby(['label_idx']).calc.sum()).reset_index()
    cond_prob_doc1_2 = pd.merge(cond_prob_doc1_2,pi,how='left',on='label_idx')
    cond_prob_doc1_2['final'] = cond_prob_doc1_2['log_pi']+cond_prob_doc1_2['calc']
    return int(cond_prob_doc1_2[cond_prob_doc1_2['final'] == cond_prob_doc1_2['final'].max()]['label_idx'])

label = []

for i in range(1,test_data['DocIdx'].nunique()+1):
    label.append(naivebayes(test_data,i))


In [9]:
original_label = list(test_label['label_idx'])
error_calc = pd.DataFrame({'label':label,'original_label':original_label})
error_calc['error'] = (error_calc['label']<>error_calc['original_label']).astype('int')
accuracy = 100 - float(error_calc['error'].sum())*100/float(error_calc['label'].count())
print 'Accuracy % =', accuracy

Accuracy % = 78.107928048


## Model improvement using different techniques

### Split the training data set into training and validation set

In [10]:
# %load 'Improve performance - Split the train data into test and train data.py'

## Import data

train_data_raw = pd.read_table('train.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])
train_label = pd.read_table('train.label',sep=' ',header = None, names = ['label_idx'])
train_map = pd.read_table('train.map',sep=' ',header = None, names = ['class','label_idx'])

## Data preparation

train_label = train_label.reset_index()
train_label = train_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
train_label['DocIdx'] = train_label['DocIdx']+1 # To match the document index in the data

train_docs_to_split = pd.DataFrame(train_data_raw['DocIdx'].unique())
train_docs_to_split.columns = ['DocIdx']

## To see how the smaller training and validation set impacts the performance
## Split the training data into training and validation dataset in the ratio 80:20
train_docs, test_docs = train_test_split(train_docs_to_split, test_size = 0.2) 

train_docs= list(train_docs.reset_index()['DocIdx'])
test_docs= list(test_docs.reset_index()['DocIdx'])

test_data = train_data_raw[train_data_raw['DocIdx'].isin(test_docs)]
train_data = train_data_raw[train_data_raw['DocIdx'].isin(train_docs)]

test_label = train_label[train_label['DocIdx'].isin(test_docs)]

## Use train data to build a Multinomial Naive Bayes model

## Merge train_data and train_label to get labels for each document in the train_data
train_data_2 = pd.merge(train_data, train_label,how = 'left', on='DocIdx')

vocabulary = pd.DataFrame(train_data['WordIdx'].unique())
vocabulary.columns = ['WordIdx']

## Calculate pi (document frequency) - priors
pi = pd.DataFrame(train_data_2.groupby(['label_idx']).DocIdx.nunique()).reset_index()
pi = pi.rename(columns={'DocIdx':'num_doc'})
total_docs = sum(pi.num_doc)
pi['pi'] = pi['num_doc']/total_docs 
pi['log_pi'] = pi['pi'].apply(lambda x: math.log(x))

## Include all the words in the vocabulary

classes = train_map['label_idx']
vocabulary2 = vocabulary['WordIdx']
train_data_5 = pd.DataFrame(index= pd.MultiIndex.from_arrays(tools.cartesian_product([vocabulary2.tolist(),classes.tolist()]),names=['WordIdx','label_idx'])).reset_index()
train_data_3 = pd.DataFrame(train_data_2.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()
train_data_4 = pd.merge(train_data_5, train_data_3,how = 'left', on=(['WordIdx','label_idx']))
train_data_4['Frequency'].fillna(0,inplace=True)

## Calculate conditional probabilities P(word/class)

# Numerator = total w1 in class1
df1 = pd.DataFrame(train_data_4.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()

# Denominator = total words in class1
df2 = pd.DataFrame(train_data_4.groupby(['label_idx']).Frequency.sum()).reset_index()
cond_prob = pd.merge(df1,df2,how='left', on = 'label_idx')
cond_prob = cond_prob.rename(columns={'label_idx':'label_idx', 'WordIdx': 'WordIdx', 'Frequency_x':'numerator', 'Frequency_y':'denominator'})
cond_prob['cond_prob'] = (cond_prob['numerator']+1)/(cond_prob['denominator']+len(vocabulary)) ## Laplace smoothing
cond_prob['cond_prob_log'] = cond_prob['cond_prob'].apply(lambda x: math.log(x))

## Create a routine 

def naivebayes(dat,doc):
    mask = dat[dat['DocIdx']==doc]['WordIdx']
    cond_prob_doc1 = cond_prob[cond_prob['WordIdx'].isin(mask)]
    cond_prob_doc1 = pd.merge(cond_prob_doc1,dat[dat['DocIdx']==doc], how = 'left', on = 'WordIdx')
    cond_prob_doc1['calc'] = cond_prob_doc1['Frequency']*cond_prob_doc1['cond_prob_log']
    cond_prob_doc1_2 = pd.DataFrame(cond_prob_doc1.groupby(['label_idx']).calc.sum()).reset_index()
    cond_prob_doc1_2 = pd.merge(cond_prob_doc1_2,pi,how='left',on='label_idx')
    cond_prob_doc1_2['final'] = cond_prob_doc1_2['log_pi']+cond_prob_doc1_2['calc']
    return int(cond_prob_doc1_2[cond_prob_doc1_2['final'] == cond_prob_doc1_2['final'].max()]['label_idx'])

label = []

for i in test_data['DocIdx'].unique():
    label.append(naivebayes(test_data,i))

original_label = list(test_label['label_idx'])
error_calc = pd.DataFrame({'label':label,'original_label':original_label})
error_calc['error'] = (error_calc['label']<>error_calc['original_label']).astype('int')
accuracy = 100 - float(error_calc['error'].sum())*100/float(error_calc['label'].count())
print 'Accuracy % =', accuracy

Accuracy % = 86.3354037267


### Replacing frequency f with log(1+f)

In [11]:
# %load 'Improve performance - replace frequency f with (1+f).py'

## Improve performance by replacing the frequency f of a word in a document by log(1+f)

## Import data

test_data = pd.read_table('test.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])
train_data = pd.read_table('train.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])

test_label = pd.read_table('test.label',sep=' ',header = None, names = ['label_idx'])
train_label = pd.read_table('train.label',sep=' ',header = None, names = ['label_idx'])

test_map = pd.read_table('test.map',sep=' ',header = None, names = ['class','label_idx'])
train_map = pd.read_table('train.map',sep=' ',header = None, names = ['class','label_idx'])

vocabulary = pd.read_table('vocabulary.txt',header = None, names = ['words'])


## Data preparation

train_label = train_label.reset_index()
train_label = train_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
train_label['DocIdx'] = train_label['DocIdx']+1 # To match the document index in the data

test_label = test_label.reset_index()
test_label = test_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
test_label['DocIdx'] = test_label['DocIdx']+1 # To match the document index in the data

vocabulary = vocabulary.reset_index()
vocabulary = vocabulary.rename(columns = {'index':'WordIdx', 'words':'words'})
vocabulary['WordIdx'] = vocabulary['WordIdx']+1


## Use train data to build a Multinomial Naive Bayes model

## Merge train_data and train_label to get labels for each document in the train_data
train_data_2 = pd.merge(train_data, train_label,how = 'left', on='DocIdx')


train_data['one_plus_freq_log']= train_data['Frequency'].apply(lambda x: math.log(x+1))
test_data['one_plus_freq_log']= test_data['Frequency'].apply(lambda x: math.log(x+1))

## Calculate pi (document frequency) - priors
pi = pd.DataFrame(train_data_2.groupby(['label_idx']).DocIdx.nunique()).reset_index()
pi = pi.rename(columns={'DocIdx':'num_doc'})
total_docs = sum(pi.num_doc)
pi['pi'] = pi['num_doc']/total_docs 
pi['log_pi'] = pi['pi'].apply(lambda x: math.log(x))


## Include all the words in the vocabulary

classes = train_map['label_idx']
vocabulary2 = vocabulary['WordIdx']
train_data_5 = pd.DataFrame(index= pd.MultiIndex.from_arrays(tools.cartesian_product([vocabulary2.tolist(),classes.tolist()]),names=['WordIdx','label_idx'])).reset_index()
train_data_3 = pd.DataFrame(train_data_2.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()
train_data_4 = pd.merge(train_data_5, train_data_3,how = 'left', on=(['WordIdx','label_idx']))
train_data_4['Frequency'].fillna(0,inplace=True)

## Calculate conditional probabilities P(word/class)

# Numerator = total w1 in class1
df1 = pd.DataFrame(train_data_4.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()

# Denominator = total words in class1
df2 = pd.DataFrame(train_data_4.groupby(['label_idx']).Frequency.sum()).reset_index()
cond_prob = pd.merge(df1,df2,how='left', on = 'label_idx')
cond_prob = cond_prob.rename(columns={'label_idx':'label_idx', 'WordIdx': 'WordIdx', 'Frequency_x':'numerator', 'Frequency_y':'denominator'})
cond_prob['cond_prob'] = (cond_prob['numerator']+1)/(cond_prob['denominator']+len(vocabulary)) ## Laplace smoothing
cond_prob['cond_prob_log'] = cond_prob['cond_prob'].apply(lambda x: math.log(x))

## Create a routine 

def naivebayes(dat,doc):
    mask = dat[dat['DocIdx']==doc]['WordIdx']
    cond_prob_doc1 = cond_prob[cond_prob['WordIdx'].isin(mask)]
    cond_prob_doc1 = pd.merge(cond_prob_doc1,dat[dat['DocIdx']==doc], how = 'left', on = 'WordIdx')
    cond_prob_doc1['calc'] = cond_prob_doc1['one_plus_freq_log']+cond_prob_doc1['cond_prob_log']
    cond_prob_doc1_2 = pd.DataFrame(cond_prob_doc1.groupby(['label_idx']).calc.sum()).reset_index()
    cond_prob_doc1_2 = pd.merge(cond_prob_doc1_2,pi,how='left',on='label_idx')
    cond_prob_doc1_2['final'] = cond_prob_doc1_2['log_pi']+cond_prob_doc1_2['calc']
    return int(cond_prob_doc1_2[cond_prob_doc1_2['final'] == cond_prob_doc1_2['final'].max()]['label_idx'])

label = []

for i in range(1,test_data['DocIdx'].nunique()+1):
    label.append(naivebayes(test_data,i))


original_label = list(test_label['label_idx'])
error_calc = pd.DataFrame({'label':label,'original_label':original_label})
error_calc['error'] = (error_calc['label']<>error_calc['original_label']).astype('int')
accuracy = 100 - float(error_calc['error'].sum())*100/float(error_calc['label'].count())
print 'Accuracy % =', accuracy

Accuracy % = 78.7341772152


### Removing stop words & reducing the size of the vocabulary


In [12]:
# %load 'Improve performance - Remove stopwords.py'
## Improve performance by removing stop words

## Import data
test_data_raw = pd.read_table('test.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])
train_data_raw = pd.read_table('train.data',sep=' ',header = None, names = ['DocIdx','WordIdx','Frequency'])

test_label = pd.read_table('test.label',sep=' ',header = None, names = ['label_idx'])
train_label = pd.read_table('train.label',sep=' ',header = None, names = ['label_idx'])

test_map = pd.read_table('test.map',sep=' ',header = None, names = ['class','label_idx'])
train_map = pd.read_table('train.map',sep=' ',header = None, names = ['class','label_idx'])

vocabulary = pd.read_table('vocabulary.txt',header = None, names = ['words'])

stop_words = pd.read_table("stop_words.txt", header = None, names = ['stop_words'])
stop_words = list(stop_words['stop_words'])

vocabulary2 = vocabulary[~vocabulary['words'].isin(stop_words)]


## Data preparation

train_label = train_label.reset_index()
train_label = train_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
train_label['DocIdx'] = train_label['DocIdx']+1 # To match the document index in the data

test_label = test_label.reset_index()
test_label = test_label.rename(columns={'index': 'DocIdx', 'label_idx': 'label_idx'})
test_label['DocIdx'] = test_label['DocIdx']+1 # To match the document index in the data

vocabulary2 = vocabulary2.reset_index()
vocabulary2 = vocabulary2.rename(columns = {'index':'WordIdx', 'words':'words'})
vocabulary2['WordIdx'] = vocabulary2['WordIdx']+1


test_data = test_data_raw[test_data_raw['WordIdx'].isin(list(vocabulary2['WordIdx']))]
train_data = train_data_raw[train_data_raw['WordIdx'].isin(list(vocabulary2['WordIdx']))]

## Use train data to build a Multinomial Naive Bayes model

## Merge train_data and train_label to get labels for each document in the train_data
train_data_2 = pd.merge(train_data, train_label,how = 'left', on='DocIdx')

test_label = test_label[test_label['DocIdx'].isin(list(test_data['DocIdx']))]

train_label = train_label[train_label['DocIdx'].isin(list(train_data['DocIdx']))]

## Calculate pi (document frequency) - priors
pi = pd.DataFrame(train_data_2.groupby(['label_idx']).DocIdx.nunique()).reset_index()
pi = pi.rename(columns={'DocIdx':'num_doc'})
total_docs = sum(pi.num_doc)
pi['pi'] = pi['num_doc']/total_docs 
pi['log_pi'] = pi['pi'].apply(lambda x: math.log(x))


## Include all the words in the vocabulary

classes = train_map['label_idx']
vocabulary2 = vocabulary2['WordIdx']
train_data_5 = pd.DataFrame(index= pd.MultiIndex.from_arrays(tools.cartesian_product([vocabulary2.tolist(),classes.tolist()]),names=['WordIdx','label_idx'])).reset_index()
train_data_3 = pd.DataFrame(train_data_2.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()
train_data_4 = pd.merge(train_data_5, train_data_3,how = 'left', on=(['WordIdx','label_idx']))
train_data_4['Frequency'].fillna(0,inplace=True)

## Calculate conditional probabilities P(word/class)

# Numerator = total w1 in class1
df1 = pd.DataFrame(train_data_4.groupby(['label_idx','WordIdx']).Frequency.sum()).reset_index()

# Denominator = total words in class1
df2 = pd.DataFrame(train_data_4.groupby(['label_idx']).Frequency.sum()).reset_index()
cond_prob = pd.merge(df1,df2,how='left', on = 'label_idx')
cond_prob = cond_prob.rename(columns={'label_idx':'label_idx', 'WordIdx': 'WordIdx', 'Frequency_x':'numerator', 'Frequency_y':'denominator'})
cond_prob['cond_prob'] = (cond_prob['numerator']+1)/(cond_prob['denominator']+len(vocabulary)) ## Laplace smoothing
cond_prob['cond_prob_log'] = cond_prob['cond_prob'].apply(lambda x: math.log(x))

## Create a routine 

def naivebayes(dat,doc):
    mask = dat[dat['DocIdx']==doc]['WordIdx']
    cond_prob_doc1 = cond_prob[cond_prob['WordIdx'].isin(mask)]
    cond_prob_doc1 = pd.merge(cond_prob_doc1,dat[dat['DocIdx']==doc], how = 'left', on = 'WordIdx')
    cond_prob_doc1['calc'] = cond_prob_doc1['Frequency']*cond_prob_doc1['cond_prob_log']
    cond_prob_doc1_2 = pd.DataFrame(cond_prob_doc1.groupby(['label_idx']).calc.sum()).reset_index()
    cond_prob_doc1_2 = pd.merge(cond_prob_doc1_2,pi,how='left',on='label_idx')
    cond_prob_doc1_2['final'] = cond_prob_doc1_2['log_pi']+cond_prob_doc1_2['calc']
    return int(cond_prob_doc1_2[cond_prob_doc1_2['final'] == cond_prob_doc1_2['final'].max()]['label_idx'])

label = []

for i in test_data['DocIdx'].unique():
    label.append(naivebayes(test_data,i))

original_label = list(test_label['label_idx'])
error_calc = pd.DataFrame({'label':label,'original_label':original_label})
error_calc['error'] = (error_calc['label']<>error_calc['original_label']).astype('int')
accuracy = 100 - float(error_calc['error'].sum())*100/float(error_calc['label'].count())
print 'Accuracy % =', accuracy

Accuracy % = 80.2905117271
