## Problem Statement

Using the 20 Newsgroups data set: 
http://qwone.com/~jason/20Newsgroups/

The data consists of  19,000 documents, each from one of 20 newsgroups. 

Create a naive bayes model that can classify documents based on how often words appear in the text.

In [1]:
%cd ~/Documents/DSE/c1steven/DSE210/20news-bydate 2/matlab

/Users/cjstev/Documents/DSE/c1steven/DSE210/20news-bydate 2/matlab


In [2]:
import pandas as pd
import numpy
from numpy import log
from math import isnan
from time import time
from numpy.random import uniform
import random


In [3]:
## start timer
time1 = time()

In [4]:
## Read and Format Data

wordlist = pd.read_csv("train.data", header=None, sep=r"\s+")
wordlist.columns = ['docIdx', 'wordIdx' ,'counts']
label = pd.read_csv("train.label", header=None, sep=r"\s+")
maps = pd.read_csv("train.map", header=None, sep=r"\s+")
label.reset_index(inplace=True)
label['index']=label['index']+1  # to fix indices to rownumbers
label.columns = ['docIdx','genreID']
maps.columns=['genreName','genreID']
vocab = pd.read_csv("../../vocabulary.txt", header=None, sep=r"\s+")
vocab.reset_index(inplace=True)
vocab['index']=vocab['index']+1  # to fix indices to rownumbers
vocab.columns = ['wordIdx','wordName']
vocabCount = vocab  ## create a second vocab list that will later be appended for smoothing purposes
vocabCount['counts'] = 1

In [5]:
## Merge DFs
mergedDF = wordlist.merge(label).merge(maps).merge(vocab)

In [6]:
## split train and validation
docselector = pd.DataFrame(mergedDF['docIdx'].drop_duplicates())
numpy.random.seed(1)
docselector['rand']=uniform(size=len(docselector))
mergedDF = mergedDF.merge(docselector)
mergedTrain = mergedDF[mergedDF.rand<=.8]
mergedValid = mergedDF[mergedDF.rand>.8]

In [7]:
## Calculate pi sub j (probability of each genre occuring)
docCount = mergedTrain[['genreID', 'docIdx']].drop_duplicates()
docCount = docCount.groupby(['genreID']).count().reset_index()
docCount['PIj']=docCount['docIdx']/sum(docCount.docIdx)

## Calculate baseline train / valid prediction accuracy 

In [8]:
## Create Dictionary of probability distributions of words in genres
MNdict = {}
for i in mergedTrain.genreID.unique():
    byGenre = mergedTrain[mergedTrain['genreID']==i]
    byGenre = pd.concat([byGenre[['wordName','wordIdx','counts']],vocabCount[['wordName','wordIdx','counts']]])
    byGenre = byGenre.groupby(['wordName','wordIdx']).sum()
    byGenre.reset_index(inplace=True)
    byGenre['Pj']=byGenre['counts']/sum(byGenre['counts'])
    byGenre = byGenre[['wordName','wordIdx','Pj']]
    MNdict[i] = byGenre

In [9]:
## convert dictionairy distribution to one dataframe.  bring in PIj to get ready to classify
mastertrain = pd.DataFrame()
for i in MNdict:
    trainset = MNdict[i]
    trainset['genreID'] = i
    trainset['PIj'] = float(docCount[docCount.genreID==i]['PIj'])
    mastertrain=pd.concat([mastertrain,trainset])

In [10]:
### classify each document in the testDF
bigmerged = pd.merge(mastertrain,mergedValid[['docIdx','wordIdx','counts']])  ## drop actual genre for merging
bigmerged['calc']=log(bigmerged['Pj'])+bigmerged['counts']*log(bigmerged['Pj'])
bigmerged = bigmerged[['docIdx','genreID','calc']].groupby(['docIdx','genreID']).sum()
bigmerged = bigmerged.reset_index()
idx = bigmerged.groupby(['docIdx'])['calc'].transform(max) == bigmerged['calc']  ## select argmax for each document
testresults = bigmerged[idx]

In [11]:
## Check results
testresults.rename(columns={'genreID':'predgenre'},inplace=True)
testcompare = pd.merge(testresults,mergedValid)
sum(testcompare.predgenre == testcompare.genreID)/float(len(testcompare))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)


0.8270985943377549

#### Baseline accuracy is 82.7%

## Test using Log(1+f)

In [12]:
## Create Dictionary of probability distributions of words in genres
MNdict = {}
for i in mergedTrain.genreID.unique():
    byGenre = mergedTrain[mergedTrain['genreID']==i]
    byGenre = pd.concat([byGenre[['wordName','wordIdx','counts']],vocabCount[['wordName','wordIdx','counts']]])
    byGenre = byGenre.groupby(['wordName','wordIdx']).sum()
    byGenre.reset_index(inplace=True)
    byGenre['counts'] = log(1+byGenre['counts'])
    byGenre['Pj']=byGenre['counts']/sum(byGenre['counts'])
    byGenre = byGenre[['wordName','wordIdx','Pj']]
    MNdict[i] = byGenre

In [13]:
## convert dictionairy distribution to one dataframe.  bring in PIj to get ready to classify
mastertrain = pd.DataFrame()
for i in MNdict:
    trainset = MNdict[i]
    trainset['genreID'] = i
    trainset['PIj'] = float(docCount[docCount.genreID==i]['PIj'])
    mastertrain=pd.concat([mastertrain,trainset])

In [14]:
### classify each document in the testDF
bigmerged = pd.merge(mastertrain,mergedValid[['docIdx','wordIdx','counts']])  ## drop actual genre for merging
bigmerged['calc']=log(bigmerged['Pj'])+log(1+bigmerged['counts'])*log(bigmerged['Pj'])
bigmerged = bigmerged[['docIdx','genreID','calc']].groupby(['docIdx','genreID']).sum()
bigmerged = bigmerged.reset_index()
idx = bigmerged.groupby(['docIdx'])['calc'].transform(max) == bigmerged['calc']  ## select argmax for each document
testresults = bigmerged[idx]

In [15]:
## Check results
testresults.rename(columns={'genreID':'predgenre'},inplace=True)
testcompare = pd.merge(testresults,mergedValid)
sum(testcompare.predgenre == testcompare.genreID)/float(len(testcompare))

0.83998713126113644

#### Accuracy is slightly boosted to 84.0%, and improvement of 1.3%

## Remove Stopwords

In [16]:
from stop_words import get_stop_words
stop_words = get_stop_words('en')

In [17]:
## split train and validation
docselector = pd.DataFrame(mergedDF['docIdx'].drop_duplicates())
numpy.random.seed(1)
docselector['rand']=uniform(size=len(docselector))
mergedDF = mergedDF.merge(docselector)
mergedTrain = mergedDF[mergedDF.rand<=.8]
mergedValid = mergedDF[mergedDF.rand>.8]

In [18]:
mergedTrain = mergedTrain[~(mergedTrain['wordName'].isin(stop_words))]
mergedValid = mergedValid[~(mergedValid['wordName'].isin(stop_words))]

In [19]:
## Create Dictionary of probability distributions of words in genres
MNdict = {}
for i in mergedTrain.genreID.unique():
    byGenre = mergedTrain[mergedTrain['genreID']==i]
    byGenre = pd.concat([byGenre[['wordName','wordIdx','counts']],vocabCount[['wordName','wordIdx','counts']]])
    byGenre = byGenre.groupby(['wordName','wordIdx']).sum()
    byGenre.reset_index(inplace=True)
    byGenre['counts'] = log(1+byGenre['counts'])
    byGenre['Pj']=byGenre['counts']/sum(byGenre['counts'])
    byGenre = byGenre[['wordName','wordIdx','Pj']]
    MNdict[i] = byGenre

In [20]:
## convert dictionairy distribution to one dataframe.  bring in PIj to get ready to classify
mastertrain = pd.DataFrame()
for i in MNdict:
    trainset = MNdict[i]
    trainset['genreID'] = i
    trainset['PIj'] = float(docCount[docCount.genreID==i]['PIj'])
    mastertrain=pd.concat([mastertrain,trainset])

In [21]:
### classify each document in the testDF
bigmerged = pd.merge(mastertrain,mergedValid[['docIdx','wordIdx','counts']])  ## drop actual genre for merging
bigmerged['calc']=log(bigmerged['Pj'])+log(1+bigmerged['counts'])*log(bigmerged['Pj'])
bigmerged = bigmerged[['docIdx','genreID','calc']].groupby(['docIdx','genreID']).sum()
bigmerged = bigmerged.reset_index()
idx = bigmerged.groupby(['docIdx'])['calc'].transform(max) == bigmerged['calc']  ## select argmax for each document
testresults = bigmerged[idx]

In [22]:
## Check results
testresults.rename(columns={'genreID':'predgenre'},inplace=True)
testcompare = pd.merge(testresults,mergedValid)
sum(testcompare.predgenre == testcompare.genreID)/float(len(testcompare))

0.84334386863458899

#### A small boost in accuracy to 84.3%

## Refit new model and test on orig test set

In [23]:
docselector = pd.DataFrame(mergedDF['docIdx'].drop_duplicates())
mergedDF = mergedDF.merge(docselector)

In [24]:
mergedDF = mergedDF[~(mergedDF['wordName'].isin(stop_words))]

In [25]:
## Create Dictionary of probability distributions of words in genres
MNdict = {}
for i in mergedDF.genreID.unique():
    byGenre = mergedDF[mergedDF['genreID']==i]
    byGenre = pd.concat([byGenre[['wordName','wordIdx','counts']],vocabCount[['wordName','wordIdx','counts']]])
    byGenre = byGenre.groupby(['wordName','wordIdx']).sum()
    byGenre.reset_index(inplace=True)
    byGenre['counts'] = log(1+byGenre['counts'])
    byGenre['Pj']=byGenre['counts']/sum(byGenre['counts'])
    byGenre = byGenre[['wordName','wordIdx','Pj']]
    MNdict[i] = byGenre

In [26]:
## convert dictionairy distribution to one dataframe.  bring in PIj to get ready to classify
mastertrain = pd.DataFrame()
for i in MNdict:
    trainset = MNdict[i]
    trainset['genreID'] = i
    trainset['PIj'] = float(docCount[docCount.genreID==i]['PIj'])
    mastertrain=pd.concat([mastertrain,trainset])

In [27]:
## Read in test data
testdata = pd.read_csv("test.data", header=None, sep=r"\s+")
testdata.columns = ['docIdx', 'wordIdx' ,'counts']
testlabel = pd.read_csv("test.label", header=None, sep=r"\s+")
testmaps = pd.read_csv("test.map", header=None, sep=r"\s+")
testlabel.reset_index(inplace=True)
testlabel['index']=testlabel['index']+1
testlabel.columns = ['docIdx','genreID']
testmaps.columns=['genreName','genreID']
testDF = testdata.merge(testlabel).merge(testmaps).merge(vocab[['wordIdx','wordName']])

In [28]:
## New features
testDF = testDF[~(testDF['wordName'].isin(stop_words))]
#testDF['counts']=log(1+testDF['counts'])

In [29]:
### classify each document in the testDF
bigmerged = pd.merge(mastertrain,testDF[['docIdx','wordIdx','counts']])  ## drop actual genre for merging
bigmerged['calc']=log(bigmerged['Pj'])+log(1+bigmerged['counts'])*log(bigmerged['Pj'])
bigmerged = bigmerged[['docIdx','genreID','calc']].groupby(['docIdx','genreID']).sum()
bigmerged = bigmerged.reset_index()
idx = bigmerged.groupby(['docIdx'])['calc'].transform(max) == bigmerged['calc']  ## select argmax for each document
testresults = bigmerged[idx]

In [30]:
## Check results
testresults.rename(columns={'genreID':'predgenre'},inplace=True)
testcompare = pd.merge(testresults,testDF)
sum(testcompare.predgenre == testcompare.genreID)/float(len(testcompare))

0.79886930486425645

#### A small increase in accuracy from 79.4% to 79.9%