In [1]:
import re, string
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

Defining all the function to read data, extract features and traing the model

In [2]:
#Read the datasets
path = 'C:\\tmp\\'
filePrefix = 'training_'
categories=['ARTS','SPORTS']
minWordLength = 3
maxWordLength = 20
totArticles = 0

dataset={}
allFeatures=set()
for category in categories:
    fileName=path+filePrefix+category.lower()
    f=open(fileName,'r')
    dataset[category]=f.readlines()
    f.close
    totArticles+=len(dataset[category])

# print dataset['ARTS']
# print '==========================='
# print dataset['SPORTS']

Calculating the probabilities for each category (you can define any number of categories)

In [3]:
feature_count = {}
category_count = {}
probCat = {}

# Calculate the probabilities for each category
for category in categories:
    probCat[category]=len(dataset[category])*1.0/totArticles
    print ("%s - p(%s)=%s" % (category,category,probCat[category]))

ARTS - p(ARTS)=0.5
SPORTS - p(SPORTS)=0.5


Calculating term probabilities 𝑝(𝑡|C) and 𝑝(𝑡)

In [4]:
freqWord = {}
wordCounts = {}

def buildFrequencies(dataset):
    for category in categories:
        freqWordCat = {}
        count = 0
        for article in dataset[category]:
            #You can do a lot of optimization here
            words = [w for w in word_tokenize(article)]
            count+=len(words)
            for word in words:
                allFeatures.add(word)
                if word in freqWordCat:
                    freqWordCat[word] = freqWordCat[word]+1
                else:
                    freqWordCat[word] = 1
        freqWord[category] = freqWordCat
        wordCounts[category] = count


#Generate frequencies
buildFrequencies(dataset)

print ("Checking Frequencies for word 'team':")
print ("F('team'|'ARTS')=%s" % freqWord['ARTS']['team'])
print ("F('team'|'SPORTS')=%s" % freqWord['SPORTS']['team'])

Checking Frequencies for word 'team':
F('team'|'ARTS')=1
F('team'|'SPORTS')=19


From frequencies to probabilities

In [5]:
def getTermProbability(word):
    count = 0
    total = 0
    for category in categories:
        total += wordCounts[category]
        if word in freqWord[category]:
            count+=freqWord[category][word]
    return count*1.0/total

def getTermCondProbability(word,category):
    count = 0
    total = wordCounts[category]

    if word in freqWord[category]:
        count=freqWord[category][word]
    else:
        #Apply Laplace Smoothing
        count=1.0/(len(freqWord[category])+len(allFeatures))
    
    return count*1.0/total
    
print ("probability for word 'team' - p('team')=%s" % getTermProbability('team'))
print ("probability for word 'team' in ARTS - p('team'|'ARTS')=%s" % getTermCondProbability('team','ARTS'))
print ("probability for word 'team' in SPORTS - p('team'|'SPORTS')=%s" % getTermCondProbability('team','SPORTS'))

probability for word 'team' - p('team')=0.00320821302534
probability for word 'team' in ARTS - p('team'|'ARTS')=0.000304043782305
probability for word 'team' in SPORTS - p('team'|'SPORTS')=0.00645161290323


Build Naive Bayes Classifier

In [6]:
def NaiveBayesClassifier(article):
    words = [w for w in word_tokenize(article)]
    results={}
    for category in categories:
        pCat = probCat[category]
        pNumerator = 1.0
        for word in words:
            pN = getTermCondProbability(word,category)
            pNumerator*= pN
        pClassification = pNumerator*pCat
        results[category] = pClassification
    
    pMax = 0.0
    predictedClass = ''
    for category in categories:
        if results[category]>pMax:
            pMax = results[category]
            predictedClass = category

    #print ('The article has been assigned to class "%s" with a probability of %s' % (predictedClass,pMax))
    return predictedClass

article = "Without Any Title at Stake, Cavaliers Relive Rally Past Warriors"
NaiveBayesClassifier(article)


'SPORTS'

Testing the Classif

In [9]:
f=open('C:\\tmp\\testing.txt','r')
lines=f.readlines()
f.close

correct = 0
total = len(lines)
index = 1

TP=0.0
TN=0.0
FP=0.0
FN=0.0

F=0.0
precision = 0.0
recall = 0.0

for line in lines:
    elems = line.split('\t')
    article=elems[0]
    category=elems[1][:-1]
    predictedCategory = NaiveBayesClassifier(article)
    
    print '%s. Prediction[%s] Class[%s]' % (index,predictedCategory,category)
    index+=1
    
    #Calculating quality measures
    if (predictedCategory == category):
        correct+=1
        if (category == categories[1]):
            TP+=1
        else:
            TN+=1
    else:
        if (predictedCategory == categories[1]):
            FN+=1
        else:
            FP+=1

precision = TP/(TP+FP)
recall = TP/(TP+FN)
F=2*(precision*recall)/(precision+recall)

print ('\nThe classifer was correct %s out of %s or %s' % (correct,total,correct*1.0/total))
print 'precision=%s' % precision
print 'recall=%s' % recall
print 'F=%s' % F

1. Prediction[SPORTS] Class[SPORTS]
2. Prediction[ARTS] Class[SPORTS]
3. Prediction[ARTS] Class[SPORTS]
4. Prediction[SPORTS] Class[SPORTS]
5. Prediction[ARTS] Class[SPORTS]
6. Prediction[ARTS] Class[ARTS]
7. Prediction[ARTS] Class[ARTS]
8. Prediction[ARTS] Class[ARTS]
9. Prediction[SPORTS] Class[ARTS]
10. Prediction[SPORTS] Class[ARTS]

The classifer was correct 5 out of 10 or 0.5
precision=0.4
recall=0.5
F=0.444444444444
