In [1]:
import os
import sqlite3 as sqlite   # replaces import stmt from book
import re
import math

In [2]:
def getwords(doc):
    """Return the distinct words of ``doc`` as a ``{word: 1}`` dictionary.

    The text is split on runs of non-word characters (anything other than
    letters, digits and underscore), each piece is lower-cased, and only
    pieces that are 3 to 19 characters long are kept.

    Args:
        doc: The text to tokenise.

    Returns:
        A dict whose keys are the unique words, in order of first
        appearance, each mapped to 1.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence
    # and triggers a warning on current Python versions.
    splitter = re.compile(r'\W+')  # different than book

    # Length is checked on the raw piece; lower-casing happens afterwards.
    words = [s.lower() for s in splitter.split(doc) if 2 < len(s) < 20]

    # Collapse duplicates while keeping first-seen order.
    return dict.fromkeys(words, 1)

In [3]:
class basic_classifier:
    """Counting model: tracks how often each feature occurs per category.

    ``getfeatures`` is a callable that turns an item into an iterable of
    features. The class keeps two tables:

    * ``fc`` -- feature -> {category: number of trained items containing it}
    * ``cc`` -- category -> number of trained items
    """

    def __init__(self,getfeatures,filename=None):
        # ``filename`` is accepted but never used by this class.
        self.getfeatures=getfeatures
        # feature -> {category: count}
        self.fc={}
        # category -> item count
        self.cc={}

    def incf(self,f,cat):
        """Add one to the count of feature ``f`` within category ``cat``."""
        per_category=self.fc.setdefault(f, {})
        per_category[cat]=per_category.get(cat, 0)+1

    def incc(self,cat):
        """Add one to the number of items recorded for category ``cat``."""
        self.cc[cat]=self.cc.get(cat, 0)+1

    def fcount(self,f,cat):
        """Return, as a float, how often ``f`` was seen in ``cat`` (0.0 if never)."""
        return float(self.fc.get(f, {}).get(cat, 0))

    def catcount(self,cat):
        """Return the item count of ``cat`` as a float, or the int 0 if unknown."""
        if cat not in self.cc:
            return 0
        return float(self.cc[cat])

    def totalcount(self):
        """Return the number of items trained across every category."""
        total=0
        for n in self.cc.values():
            total+=n
        return total

    def categories(self):
        """Return a view of all category names seen so far."""
        return self.cc.keys()

    def train(self,item,cat):
        """Record ``item`` as one example of category ``cat``."""
        # Every feature of the item is credited to the category ...
        for feature in self.getfeatures(item):
            self.incf(feature,cat)
        # ... and the category itself gains one item.
        self.incc(cat)

    def fprob(self,f,cat):
        """Return the fraction of items in ``cat`` that contained ``f``.

        Yields 0 when the category has no items at all.
        """
        items_in_cat=self.catcount(cat)
        if items_in_cat==0:
            return 0
        return self.fcount(f,cat)/items_in_cat

    def weightedprob(self,f,cat,prf,weight=1.0,ap=0.5):
        """Blend ``prf(f, cat)`` with an assumed probability ``ap``.

        ``weight`` says how many observations the assumed probability is
        worth; the more often ``f`` has been seen in any category, the more
        the measured probability dominates.
        """
        measured=prf(f,cat)

        # Occurrences of the feature summed over all categories.
        seen=sum(self.fcount(f,c) for c in self.categories())

        # Weighted average of the assumed and the measured probability.
        return ((weight*ap)+(seen*measured))/(weight+seen)

In [4]:
class naivebayes(basic_classifier):   # change for basic_classifier
    """Naive Bayes classifier built on the counts kept by ``basic_classifier``.

    A category's score for an item is the product of the weighted
    probabilities of the item's features, multiplied by the share of
    training items that belong to the category. Per-category thresholds
    let ``classify`` refuse to answer when the winner is not far enough
    ahead of the other categories.
    """

    def __init__(self,getfeatures):
        basic_classifier.__init__(self,getfeatures)  # change for basic_classifier
        # category -> factor by which it must beat every other category
        self.thresholds={}

    def docprob(self,item,cat):
        """Return the product of ``weightedprob`` over the features of ``item``.

        An item without features yields 1.
        """
        features=self.getfeatures(item)

        # Multiply the probabilities of all the features together
        p=1
        for f in features: p*=self.weightedprob(f,cat,self.fprob)
        return p

    def prob(self,item,cat):
        """Return ``docprob(item, cat)`` scaled by the category's share of items.

        Raises:
            ZeroDivisionError: if nothing has been trained yet.
        """
        catprob=self.catcount(cat)/self.totalcount()
        docprob=self.docprob(item,cat)
        return docprob*catprob

    def setthreshold(self,cat,t):
        """Set the threshold factor for category ``cat`` to ``t``."""
        self.thresholds[cat]=t

    def getthreshold(self,cat):
        """Return the threshold factor for ``cat`` (1.0 when none was set)."""
        return self.thresholds.get(cat, 1.0)

    def classify(self,item,default=None):
        """Return the best category for ``item``, or ``default`` if undecided.

        ``default`` is returned when no category scores above zero (this
        includes a classifier with no trained categories) or when another
        category's score times the winner's threshold exceeds the winner's
        score.
        """
        probs={}
        # Find the category with the highest probability
        best=None
        best_prob=0.0
        for cat in self.categories():
            probs[cat]=self.prob(item,cat)
            if probs[cat]>best_prob:
                best_prob=probs[cat]
                best=cat

        # No category scored above zero, so there is no winner to compare
        # against; previously this path hit an unbound ``best``.
        if best_prob<=0.0:
            return default

        # Make sure the probability exceeds threshold*next best
        for cat in probs:
            if cat==best: continue
            if probs[cat]*self.getthreshold(best)>probs[best]: return default
        return best

In [5]:
def sampletrain(cl, base_dir='./emails', categories=('college', 'promotional')):
    """Train ``cl`` on every line of every file under ``<base_dir>/<cat>/train``.

    Each line, stripped of surrounding whitespace, is passed to
    ``cl.train`` as a separate item labelled with the folder's category.
    A file is read completely before any of its lines is trained, so a
    file that cannot be read or decoded contributes nothing.

    Args:
        cl: Object exposing ``train(item, cat)``.
        base_dir: Directory that holds one sub-directory per category.
        categories: Category names; each must have a ``train`` sub-directory.

    Raises:
        OSError: if a category's ``train`` directory cannot be listed.
    """
    for cat in categories:
        train_dir = os.path.join(base_dir, cat, 'train')
        for name in os.listdir(train_dir):
            try:
                with open(os.path.join(train_dir, name), "r") as f:
                    lines = f.readlines()
            except (OSError, UnicodeDecodeError):
                # Best effort: skip sub-directories and files that cannot be
                # opened or decoded. Anything else (Ctrl-C, a bug in the
                # classifier) is allowed to surface instead of being hidden.
                continue

            for line in lines:
                cl.train(line.strip(), cat)
#     cl.train('Nobody owns the water.','good')
#     cl.train('the quick rabbit jumps fences','good')
#     cl.train('buy pharmaceuticals now','bad')
#     cl.train('make quick money at the online casino','bad')
#     cl.train('the quick brown fox jumps','good')

In [6]:
# Counting-only classifier (it has no classify method); the following cell
# trains it and prints the per-category item counts.
cl = basic_classifier(getwords)

In [7]:
# Train on both email folders, then report how many items were recorded.
# sampletrain adds to the counts already held by cl, so re-running this cell
# without re-creating cl increases the totals again.
sampletrain(cl)
print("")
print("Total items:", cl.totalcount())
print("Categories:", cl.categories())
for cat in cl.categories():
    # catcount returns a float for known categories, hence values like 39.0
    print(cat, cl.catcount(cat))


Total items: 53
Categories: dict_keys(['college', 'promotional'])
college 39.0
promotional 14.0


In [15]:
# Fresh naive Bayes model trained on both folders, then a single one-word
# query; the value of the last expression is the cell's output.
cl = naivebayes(getwords)
sampletrain(cl)
cl.classify('odu', default='unknown')

'college'

In [17]:
# Classify every whitespace-separated word of the college test emails and
# tally the predicted categories (the results used to be thrown away).
cl = naivebayes(getwords)
sampletrain(cl)

word_predictions = {}   # predicted category -> number of words
for file in os.listdir('./emails/college/test/'):
    try:
        with open(f"./emails/college/test/{file}", "r") as f:
            words = f.read().split()
    except (OSError, UnicodeDecodeError):
        # Skip unreadable or undecodable files only; a bare except here also
        # swallowed KeyboardInterrupt, so the loop could not be interrupted.
        continue

    for word in words:
        predicted = cl.classify(word, default='unknown')
        word_predictions[predicted] = word_predictions.get(predicted, 0) + 1

word_predictions

In [10]:
# Classify every line of the test emails in both folders and tally, per true
# folder, which category was predicted (the results used to be thrown away).
cl = naivebayes(getwords)
sampletrain(cl)

line_predictions = {}   # true category -> {predicted category: number of lines}
for true_cat in ('college', 'promotional'):
    tally = line_predictions.setdefault(true_cat, {})
    for file in os.listdir(f'./emails/{true_cat}/test/'):
        try:
            with open(f"./emails/{true_cat}/test/{file}", "r") as f:
                lines = f.readlines()
        except (OSError, UnicodeDecodeError):
            # Skip unreadable or undecodable files only; a bare except here
            # also swallowed KeyboardInterrupt and hid classifier errors.
            continue

        # Strips the newline character
        for line in lines:
            predicted = cl.classify(line.strip(), default='unknown')
            tally[predicted] = tally.get(predicted, 0) + 1

line_predictions


In [11]:
# for file in os.listdir('./emails/promotional/test/'):
#     try:
#         with open(f"./emails/promotional/test/{file}", "r") as f:
#             #lines = f.read().split()
#             Lines = f.readlines() 

#             count = 0
#             # Strips the newline character 
#             for line in Lines: 
#                 print("Line{}: {}".format(count, line.strip()))  
#             print({file})
#     except:
#         continue

In [12]:
# def remove_newlines(fname):
#     flist = open(fname).readlines()
# #     for s in flist:
# #         print(s.rstrip('\n'))
#     return [s.rstrip('\n') for s in flist]


# for file in os.listdir('./emails/college/'):
#     try:
#         print({file})
#         print(remove_newlines(f"./emails/college/{file}"))
#     except:
#         continue