# Text Classification with Naive Bayes
CS 6375.004 Machine Learning,  Assignment 4

Authors: Jianjun  Du, Bo Huang

In [1]:
import os
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from itertools import chain
import numpy as np
import pandas as pd
import math


## data input and preprocess
5 categories have been chosen. The letter headers were removed. Only English words were kept, the other non-english tokens were removed. All the words are lower cases. All words were stemmed.

In [2]:
def databuilding(path):
    traindata=[]
    for root, dirs, files in os.walk(path):
         for file in files:
            with open(os.path.join(root, file), "r") as auto:
                file_content=auto.read()

                # set the pattern for the word, only english words are kept
                pattern=r'[A-Za-z]+' 
                tokenizer = RegexpTokenizer(pattern)            
                tokens = tokenizer.tokenize(file_content)

                # transfer to lower case
                tokens=[w.lower() for w in tokens]

                # stem the words, so that all the words are in the basic format
                porter = nltk.PorterStemmer()
                tokens=[porter.stem(t) for t in tokens]

                # get rid of stop words
                tokens = [word for word in tokens if word not in stopwords.words('english')]

                #get rid of header
                for i in range(len(tokens)):
                    if i<len(tokens) and tokens[i]=='lines':
                        tokens=tokens[i+1:]

                # label the class the document belongs to
                tokens.append(root)
                traindata.append(tokens)
                
    return traindata          

In [3]:
traindata=databuilding("20news-bydate-train")

# split the data to features and classes
X=[]
y=[]
for data in traindata:
    X.append(data[:-1])
    y.append(data[-1])

# build up features, which are the unique words from all the documents
flatterned=list(chain.from_iterable(X))
dictionary=set(flatterned)
words=list(dictionary)

# initialize the dataframe; and all of them initialized as 1, for the smoothing 
columns=len(words)
rows=len(traindata)
trainning=np.ones((rows,columns))

# finishing the building of dataframe
for i in range(len(X)):
    for j in range(len(words)):
        for word in X[i]:
            if word==words[j]:
                trainning[i][j] =trainning[i][j]+1

In [4]:
testdata=databuilding("20news-bydate-test")

# split the data to features and classes
X_test=[]
y_test=[]
for data in testdata:
    X_test.append(data[:-1])
    y_test.append(data[-1])
    
rows=len(testdata)
testing=np.ones((rows,columns))

# test dataframe needs to use the same features as the training data, even if they have some different words.
for i in range(len(X_test)):
    for j in range(len(words)):
        for word in X_test[i]:
            if word==words[j]:
                testing[i][j]=testing[i][j]+1


In [5]:
# convert the string labels to numbers
tmp1=[w.split("\\")[1] for w in y]
tmp2=[w.split("\\")[1] for w in y_test]
cmap={'comp.graphics':0, 'rec.autos':1, 'sci.crypt':2, 'sci.space':3, 'soc.religion.christian':4 }
y_train=[cmap[word] for word in tmp1]
y_test=[cmap[word] for word in tmp2]

In [6]:
# conver trainning data table to the check table 
data_train=pd.DataFrame(trainning)
data_train.columns=words
data_train['classes']=y_train

tmp=data_train.groupby('classes').sum().values
total=tmp.sum(axis=1).reshape(-1,1)
model=np.divide(tmp,total)

In [7]:
# predict the test data

tmp=pd.DataFrame(y_train)
tmp.columns=['classes']
pc=tmp.groupby('classes')['classes'].count().values/tmp.shape[0]

p=np.zeros((testing.shape[0],model.shape[0]))

for i in range(len(testing)):
    for j in range(model.shape[0]):
        for k in range(testing.shape[1]):
            if testing[i][k]!=1:
                p[i][j]=testing[i][k]*math.log(model[j][k])+math.log(pc[j])+p[i][j]

In [8]:
predicted=np.argmax(p,axis=1)
count=0
for i in range(len(predicted)):
    if predicted[i]==y_test[i]:
        count = count+1

accuracy=count/len(y_test)
print("the accuracy  is: "+str(accuracy))
        

the accuracy  is: 0.6041561074505829
