In [23]:
import pandas as pd 
import numpy as np 
from collections import defaultdict
import re 

In [9]:

# Load the pitches CSV and discard the columns that the rest of the notebook never reads.
unused_columns = ['Season_Epi_code', 'Pitched_Business_Identifier', 'Deal_Shark']
data_df = pd.read_csv("../data/Sharktankpitchesdeals.csv").drop(columns=unused_columns)


In [10]:
# Preview the first five rows of the loaded frame.
data_df.head(n=5)

Unnamed: 0,Pitched_Business_Desc,Deal_Status
0,a functional slip worn under a wedding gown th...,1
1,hair-care products made with pheromones . Laid...,0
2,a notebook that can scan contents to cloud ser...,0
3,painting classes with wine served . Wine & Des...,1
4,a mixing bowl with a built-in scoop . Peoples ...,1


In [66]:
# Show the complete (untruncated) description text of the first pitch.
data_df['Pitched_Business_Desc'].iloc[0]

'a functional slip worn under a wedding gown that allows the wearer to use the restroom on their own . Bridal Buddy is a lightweight slip worn under the gown that lets brides go to the bathroom while wearing it. When nature calls, the bride can bag up her bustle to safely relieve herself without making a mess.'

In [12]:
# Split the rows, in file order (no shuffling), into 70% train / 10% validation / 20% test.
# Positional .iloc slices are used instead of np.split(DataFrame, ...): np.split goes
# through DataFrame.swapaxes, which pandas has deprecated, while the slices below yield
# exactly the same three partitions.
train_end = int(.7 * len(data_df))
validate_end = int(.8 * len(data_df))

train = data_df.iloc[:train_end]
validate = data_df.iloc[train_end:validate_end]
test = data_df.iloc[validate_end:]

# Every row must land in exactly one partition.
assert len(train) + len(validate) + len(test) == len(data_df)

# Separate the features (everything but the label) from the label column for each partition.
train_data = train.drop(columns='Deal_Status')
train_labels = train['Deal_Status']

validate_data = validate.drop(columns='Deal_Status')
validate_labels = validate['Deal_Status']

test_data = test.drop(columns='Deal_Status')
test_labels = test['Deal_Status']


In [38]:
# Sanity-check the size of each partition and the container type of the features.
# NOTE(review): all four bare expressions are echoed in the saved output, so the kernel
# presumably runs with InteractiveShell.ast_node_interactivity = "all" — confirm; with the
# default setting only the last expression would be displayed.
train_data.shape
validate_data.shape
test_data.shape
type(train_data)

(494, 1)

(70, 1)

(142, 1)

pandas.core.frame.DataFrame

In [20]:
# Name of the first (per the shapes shown above, the only) feature column.
train_data.columns[0]


'Pitched_Business_Desc'

In [25]:
# Scratch check of the two regex steps used for text cleaning. Each substitution is applied
# to the raw `data` string independently (they are not chained in this cell).
# Raw string literals are used so that `\s` reaches the regex engine without Python
# reporting an invalid string escape sequence.
data = "Menu is  123absolutely   perfect,loved it!  "
re.sub(r'[^a-z\s]+', ' ', data, flags=re.IGNORECASE)  # every run of non-letter, non-space chars -> one space
re.sub(r'\s+', ' ', data)  # every run of whitespace -> one space

'Menu is   absolutely   perfect loved it   '

'Menu is 123absolutely perfect,loved it! '

In [43]:
# Scratch check of defaultdict semantics: merely reading a missing key inserts it with the
# default value 0, and a subsequent increment then works without any existence test.
arr = defaultdict(int)
arr[1]
arr[2]
list(arr.items())
arr[1] = arr[1] + 1

list(arr.items())

0

0

[(1, 0), (2, 0)]

[(1, 1), (2, 0)]

In [83]:
# Scratch check: flatten the single-column feature frame into a 1-D array of strings.
# `.iloc[0, 0]` reads the first cell positionally in one step; the previous `.iloc[0][0]`
# indexed a label-indexed Series with an integer key, a positional fallback that pandas
# has deprecated.
train_data.iloc[0, 0]
arr = np.array(train_data)
arr.shape
arr2 = arr.reshape(arr.shape[0],)
arr2.shape
arr2[0]

'a functional slip worn under a wedding gown that allows the wearer to use the restroom on their own . Bridal Buddy is a lightweight slip worn under the gown that lets brides go to the bathroom while wearing it. When nature calls, the bride can bag up her bustle to safely relieve herself without making a mess.'

(494, 1)

(494,)

'a functional slip worn under a wedding gown that allows the wearer to use the restroom on their own . Bridal Buddy is a lightweight slip worn under the gown that lets brides go to the bathroom while wearing it. When nature calls, the bride can bag up her bustle to safely relieve herself without making a mess.'

<img src="../images/naive_bayes_formula.png"/>
<img src="../images/likelihood_prob.png"/>

In [87]:
def preprocess_string(str_arg):
    """Normalize raw text so it can be split into bag-of-words tokens.

    Three steps are applied in order:

    1. every run of characters that are neither ASCII letters nor whitespace
       (digits, punctuation, ...) is replaced by a single space;
    2. every run of whitespace is collapsed to a single space;
    3. the result is lower-cased.

    Parameters
    ----------
    str_arg : str
        The raw text.

    Returns
    -------
    str
        The cleaned text. It is a plain string, not a token list: callers split
        it on whitespace themselves. It is not stripped, so it may begin or end
        with a single space.
    """
    # Raw string literals keep `\s` intact for the regex engine without Python
    # reporting an invalid string escape sequence.
    letters_only = re.sub(r'[^a-z\s]+', ' ', str_arg, flags=re.IGNORECASE)
    single_spaced = re.sub(r'\s+', ' ', letters_only)
    return single_spaced.lower()

In [88]:
class CategoryInfo:
    """Plain record bundling the per-class values stored after training.

    Attributes
    ----------
    bow_dict : mapping
        Word -> count mapping for the class.
    prob_class : float
        Prior probability of the class.
    denom : float
        Denominator used when turning a word count into a likelihood.
    """

    def __init__(self, bow_dict, prob_class, denom):
        # Store the three values unchanged; no validation or copying is done.
        self.bow_dict, self.prob_class, self.denom = bow_dict, prob_class, denom
        
        

In [89]:
class NaiveBayes:
    """Multinomial Naive Bayes text classifier built on per-class bag-of-words counts.

    For a cleaned example made of tokens w_1..w_n, the score of class c is

        log p(c) + sum_i log( (count(w_i, c) + 1) / (count(c) + |V| + 1) )

    where count(w, c) is how often w occurs in the training examples of c,
    count(c) is the total number of tokens seen for c, and |V| is the size of
    the vocabulary of the whole training set. The predicted class is the one
    with the highest score.

    Attributes set by ``train``: ``examples``, ``labels``, ``bow_dicts``,
    ``vocab``, ``vocab_length`` and ``cats_info``.
    """

    def __init__(self,unique_classes):
        """Store the distinct class labels.

        Parameters
        ----------
        unique_classes : array-like
            The distinct labels of the training set (e.g. ``np.unique(labels)``).
            Any array-like is accepted; an ndarray is stored as-is.
        """
        self.classes = np.asarray(unique_classes)

    @staticmethod
    def _as_1d(values):
        """Return ``values`` as a 1-D ndarray, flattening a single-column 2-D input.

        Raises
        ------
        ValueError
            If ``values`` is neither 1-D nor a 2-D structure with exactly one column.
        """
        arr = np.asarray(values)
        if arr.ndim == 2 and arr.shape[1] == 1:
            arr = arr[:, 0]
        if arr.ndim != 1:
            raise ValueError(
                "expected 1-D data or a single-column 2-D structure, got shape %s" % (arr.shape,)
            )
        return arr

    def addToBow(self, example, dict_index):
        """Add the tokens of one preprocessed example to a class's word counts.

        Parameters
        ----------
        example : str or numpy.ndarray
            The cleaned text. An ndarray is unwrapped to its first element.
        dict_index : int
            Index into ``self.bow_dicts`` of the class the example belongs to.
        """
        if isinstance(example, np.ndarray):
            example = example[0]

        counts = self.bow_dicts[dict_index]
        for token_word in example.split():
            counts[token_word] += 1

    def train(self, dataset, labels):
        """Build per-class word counts, class priors and likelihood denominators.

        Parameters
        ----------
        dataset : array-like of str
            Raw training texts: a 1-D sequence/array/Series or a single-column
            2-D array/DataFrame.
        labels : array-like
            Class label of each training text, in the same order.

        Raises
        ------
        ValueError
            If ``dataset`` is empty or cannot be interpreted as 1-D.
        """
        self.examples = self._as_1d(dataset)
        self.labels = self._as_1d(labels)
        if self.examples.shape[0] == 0:
            raise ValueError("cannot train on an empty dataset")

        n_classes = self.classes.shape[0]
        self.bow_dicts = np.array([defaultdict(int) for _ in range(n_classes)])

        # Count the words of every example into the bag-of-words of its class.
        for cat_index, cat in enumerate(self.classes):
            for raw_example in self.examples[self.labels == cat]:
                self.addToBow(preprocess_string(raw_example), cat_index)

        n_examples = float(self.labels.shape[0])
        prob_classes = np.empty(n_classes)
        cat_word_counts = np.empty(n_classes)
        vocabulary = set()
        for cat_index, cat in enumerate(self.classes):
            # Prior p(c): fraction of training examples labelled with this class.
            prob_classes[cat_index] = np.sum(self.labels == cat) / n_examples
            # count(c): total number of tokens seen for this class.
            cat_word_counts[cat_index] = sum(self.bow_dicts[cat_index].values())
            vocabulary.update(self.bow_dicts[cat_index])

        # Vocabulary V of the whole training set, as a sorted array of unique words.
        self.vocab = np.array(sorted(vocabulary))
        self.vocab_length = self.vocab.shape[0]

        # Denominator count(c) + |V| + 1. The 1 is added exactly once here; it was
        # previously added both to the class word count and to the denominator,
        # giving count(c) + |V| + 2 and contradicting the documented formula.
        denoms = cat_word_counts + self.vocab_length + 1

        self.cats_info = np.array(
            [
                CategoryInfo(self.bow_dicts[cat_index], prob_classes[cat_index], denoms[cat_index])
                for cat_index in range(n_classes)
            ]
        )

    def getExampleProb(self,test_example):
        """Score one preprocessed example against every class.

        Parameters
        ----------
        test_example : str
            Cleaned text; it is split on whitespace into tokens.

        Returns
        -------
        numpy.ndarray
            One value per class, in the order of ``self.classes``: the log prior
            plus the summed log likelihoods of the tokens. These are unnormalised
            log scores, not probabilities.
        """
        tokens = test_example.split()  # tokenise once, not once per class
        post_prob = np.empty(self.classes.shape[0])

        for cat_index, info in enumerate(self.cats_info):
            denom = float(info.denom)
            log_likelihood = 0.0
            for test_token in tokens:
                # (count(w|c) + 1) / (count(c) + |V| + 1); unseen words count as 0.
                # Logs are summed instead of multiplying probabilities to avoid underflow.
                log_likelihood += np.log((info.bow_dict.get(test_token, 0) + 1) / denom)
            post_prob[cat_index] = log_likelihood + np.log(info.prob_class)

        return post_prob

    def test(self,test_set):
        """Predict a class for every example of ``test_set``.

        Parameters
        ----------
        test_set : iterable of str
            Raw texts. Array-like inputs with a ``shape`` (ndarray, Series,
            single-column DataFrame) are flattened to 1-D first, so that a
            DataFrame is iterated by row rather than by column name.

        Returns
        -------
        numpy.ndarray
            The predicted label of each example, in input order.
        """
        if hasattr(test_set, "shape"):
            test_set = self._as_1d(test_set)

        predictions = []
        for example in test_set:
            # Clean the example exactly as the training examples were cleaned.
            cleaned_example = preprocess_string(example)
            post_prob = self.getExampleProb(cleaned_example)
            # The class with the highest score wins.
            predictions.append(self.classes[np.argmax(post_prob)])

        return np.array(predictions)

In [90]:
# The classifier is constructed from the distinct labels present in the training set.
unique_train_labels = np.unique(train_labels)
nb = NaiveBayes(unique_train_labels)

In [91]:


print("---------------- Training In Progress --------------------")

# Fit the classifier on the training descriptions and their deal labels.
nb.train(train_data, train_labels)

print('----------------- Training Completed ---------------------')

---------------- Training In Progress --------------------
----------------- Training Completed ---------------------


In [94]:
# Flatten the single-column validation frame into a 1-D array of description strings.
validate_examples = validate_data.to_numpy().reshape(len(validate_data))

validatePclasses = nb.test(validate_examples)

# Accuracy: fraction of predictions that agree with the true validation labels.
n_validate = validate_labels.shape[0]
validate_acc = np.sum(validatePclasses == validate_labels) / float(n_validate)

print("Validate Set Examples: ", n_validate)
print("Validate Set Accuracy: ", validate_acc*100, "%")

Validate Set Examples:  70
Validate Set Accuracy:  58.57142857142858 %


In [95]:
# Flatten the single-column test frame into a 1-D array of description strings.
test_examples = test_data.to_numpy().reshape(len(test_data))

pclasses = nb.test(test_examples)

# Accuracy: fraction of predictions that agree with the true test labels.
n_test = test_labels.shape[0]
test_acc = np.sum(pclasses == test_labels) / float(n_test)

print("Test Set Examples: ", n_test)
print("Test Set Accuracy: ", test_acc*100, "%")

Test Set Examples:  142
Test Set Accuracy:  47.88732394366197 %


In [96]:
# Flatten the single-column training frame into a 1-D array of description strings.
train_examples = train_data.to_numpy().reshape(len(train_data))

# Scoring the data the model was fitted on; this is expected to be close to 100%.
pclasses = nb.test(train_examples)

# Accuracy: fraction of predictions that agree with the true training labels.
n_train = train_labels.shape[0]
train_acc = np.sum(pclasses == train_labels) / float(n_train)

print("Train Set Examples: ", n_train)
print("Train Set Accuracy: ", train_acc*100, "%")

Train Set Examples:  494
Train Set Accuracy:  98.78542510121457 %
