Question 1

P(rainy) = 1/2

P(sunny) = 1/2

P(sunny|a cone of ice cream) ∝ P(sunny) * P(a|sunny) * P(cone|sunny) * P(of|sunny) * P(ice|sunny) * P(cream|sunny)   (proportional, not equal: the normalizing constant P(a cone of ice cream) is omitted)

P(rainy|a cup of hot coffee) ∝ P(rainy) * P(a|rainy) * P(cup|rainy) * P(of|rainy) * P(hot|rainy) * P(coffee|rainy)   (proportional, not equal: the normalizing constant P(a cup of hot coffee) is omitted)



In [1]:
import util
import classificationMethod
import math

In [2]:
class NaiveBayesClassifier(classificationMethod.ClassificationMethod):
    """Naive Bayes classifier over binary features with additive smoothing.

    Training estimates a prior over labels and, for every (feature, label)
    pair, the probability that the feature is active (value > 0) given the
    label. Classification picks the label with the highest log joint
    probability.
    """

    def __init__(self, legalLabels):
        """Store the set of labels the classifier may predict.

        Args:
            legalLabels: iterable of every label the classifier can output.
        """
        self.legalLabels = legalLabels
        self.type = "naivebayes"
        self.k = 1  # this is the smoothing parameter, ** use it in your train method **
        self.automaticTuning = False  # Look at this flag to decide whether to choose k automatically ** use this in your train method **

    def setSmoothing(self, k):
        """Set the smoothing parameter used by the next call to train()."""
        # This is used by the main method to change the smoothing parameter before training.
        # Do not modify this method.
        self.k = k

    def train(self, trainingData, trainingLabels, validationData, validationLabels):
        """Fit the model, optionally tuning the smoothing parameter.

        When ``self.automaticTuning`` is set, every value in a fixed grid is
        tried and the one with the best validation accuracy is kept;
        otherwise only ``self.k`` is used.

        Args:
            trainingData: sequence of feature mappings (feature -> value).
            trainingLabels: labels aligned with ``trainingData``.
            validationData: sequence of feature mappings used to score each k.
            validationLabels: labels aligned with ``validationData``.
        """
        # The feature set is taken from the first datum; all data are assumed
        # to share the same keys.
        self.features = list(trainingData[0].keys())
        if self.automaticTuning:
            kgrid = [0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 20, 50]
        else:
            kgrid = [self.k]
        self.trainAndTune(trainingData, trainingLabels, validationData, validationLabels, kgrid)

    def trainAndTune(self, trainingData, trainingLabels, validationData, validationLabels, kgrid):
        """Train once per smoothing value in ``kgrid`` and keep the best model.

        Raw counts are collected a single time; each candidate k only redoes
        the cheap smoothing and normalization steps. The candidate with the
        highest number of correct validation predictions wins, with ties
        going to the value that appears first in ``kgrid``. On return,
        ``self.prior``, ``self.conditionalProb`` and ``self.k`` hold the
        winning parameters.

        Args:
            trainingData: sequence of feature mappings (feature -> value).
            trainingLabels: labels aligned with ``trainingData``.
            validationData: sequence of feature mappings used for scoring.
            validationLabels: labels aligned with ``validationData``.
            kgrid: non-empty sequence of smoothing values to evaluate
                (values are expected to be > 0).

        Raises:
            ValueError: if ``kgrid`` is empty.
        """
        if not kgrid:
            raise ValueError("kgrid must contain at least one smoothing value")

        # Common training - get all counts from training data.
        # We only do it once - save computation in tuning smoothing parameter.
        commonPrior = util.Counter()  # number of training examples per label
        commonConditionalProb = util.Counter()  # times feat was active with label, keyed by (feat, label)
        commonCounts = util.Counter()  # times feat was seen with label, active or not
        for datum, label in zip(trainingData, trainingLabels):
            commonPrior[label] += 1
            for feat, value in datum.items():
                commonCounts[(feat, label)] += 1
                if value > 0:  # assume binary value
                    commonConditionalProb[(feat, label)] += 1

        validationSize = len(validationLabels)
        bestParams = None
        bestAccuracyCount = -1  # best accuracy so far on validation set
        for k in kgrid:  # Smoothing parameter tuning loop!
            # Start every candidate from a fresh copy of the shared raw counts.
            prior = util.Counter()
            conditionalProb = util.Counter()
            counts = util.Counter()
            for key, val in commonPrior.items():
                prior[key] += val
            for key, val in commonCounts.items():
                counts[key] += val
            for key, val in commonConditionalProb.items():
                conditionalProb[key] += val

            # smoothing:
            for label in self.legalLabels:
                for feat in self.features:
                    conditionalProb[(feat, label)] += k
                    counts[(feat, label)] += 2 * k  # 2 because both value 0 and 1 are smoothed

            # normalizing:
            prior.normalize()
            for key, count in list(conditionalProb.items()):
                conditionalProb[key] = count * 1.0 / counts[key]

            # classify() reads the model from self, so install the candidate first.
            self.prior = prior
            self.conditionalProb = conditionalProb

            # evaluating performance on validation set
            predictions = self.classify(validationData)
            accuracyCount = sum(
                1 for guess, truth in zip(predictions, validationLabels) if guess == truth
            )
            # Avoid a ZeroDivisionError when no validation data was supplied.
            accuracyPercent = 100.0 * accuracyCount / validationSize if validationSize else 0.0
            print("Performance on validation set for k=%f: (%.1f%%)" % (k, accuracyPercent))
            if accuracyCount > bestAccuracyCount:
                bestParams = (prior, conditionalProb, k)
                bestAccuracyCount = accuracyCount
        # end of automatic tuning loop
        self.prior, self.conditionalProb, self.k = bestParams

        #https://github.com/sabotuer99/edx_Artificial_Intelligence/blob/master/p5/classification/naiveBayes.py

    def classify(self, testData):
        """Predict a label for every datum in ``testData``.

        Args:
            testData: sequence of feature mappings (feature -> value).

        Returns:
            List of predicted labels, one per datum, in input order. The log
            joint distributions are also kept in ``self.posteriors``.
        """
        guesses = []
        self.posteriors = []  # Log posteriors are stored for later data analysis (autograder).
        for datum in testData:
            posterior = self.calculateLogJointProbabilities(datum)
            guesses.append(posterior.argMax())
            self.posteriors.append(posterior)
        return guesses

    @staticmethod
    def _safeLog(probability):
        """Return log(probability), or -inf when the probability is not positive."""
        return math.log(probability) if probability > 0 else float("-inf")

    def calculateLogJointProbabilities(self, datum):
        """Compute log P(label, datum) for every legal label.

        The log joint is log P(label) plus, for each feature in ``datum``,
        log P(feat active | label) when the value is > 0 and
        log(1 - P(feat active | label)) otherwise.

        Args:
            datum: feature mapping (feature -> value).

        Returns:
            util.Counter mapping each legal label to its log joint
            probability. A label with zero probability (for example one that
            never occurred in the training data) maps to -inf instead of
            raising a math domain error.
        """
        logJoint = util.Counter()
        for label in self.legalLabels:
            total = self._safeLog(self.prior[label])
            for feat, value in datum.items():
                activeProb = self.conditionalProb[feat, label]
                if value > 0:
                    total += self._safeLog(activeProb)
                else:
                    total += self._safeLog(1 - activeProb)
            logJoint[label] = total
        return logJoint

    #https://github.com/sabotuer99/edx_Artificial_Intelligence/blob/master/p5/classification/naiveBayes.py

    def findHighOddsFeatures(self, label1, label2):
        """Return the features that most favor ``label1`` over ``label2``.

        The odds ratio of a feature is
        P(feat active | label1) / P(feat active | label2).

        Args:
            label1: label in the numerator of the odds ratio.
            label2: label in the denominator of the odds ratio.

        Returns:
            List of at most 100 features with the highest odds ratios,
            ordered from lowest to highest ratio.
        """
        featuresOdds = []
        for feat in self.features:
            # The denominator is non-zero as long as training used k > 0.
            odds = self.conditionalProb[feat, label1] / self.conditionalProb[feat, label2]
            featuresOdds.append((odds, feat))
        # Sort and truncate once, after every feature has been scored.
        featuresOdds.sort()
        return [feat for _, feat in featuresOdds[-100:]]


In [3]:
# Baseline run with the script's defaults: mostFrequent classifier, digits data, 100 training examples.
!python dataClassifier.py

Doing classification
--------------------
data:		digits
classifier:		mostFrequent
training set size:	100
Extracting features...
Training...
Validating...
126 correct out of 1000 (12.6%).
Testing...
108 correct out of 1000 (10.8%).
Mistake on example 0
Predicted 1; truth is 9
Image: 
                            
                            
                            
                            
                            
                            
                            
             ++###+         
             ######+        
            +######+        
            ##+++##+        
           +#+  +##+        
           +##++###+        
           +#######+        
           +#######+        
            +##+###         
              ++##+         
              +##+          
              ###+          
            +###+           
            +##+            
           +##+             
          +##+              
         +##+               
         ##+        

In [4]:
# Print the command-line options accepted by dataClassifier.py.
!python dataClassifier.py -h 

Usage: 
  USAGE:      python dataClassifier.py <options>
  EXAMPLES:   (1) python dataClassifier.py
                  - trains the default mostFrequent classifier on the digit dataset
                  using the default 100 training examples and
                  then test the classifier on test data
                 

Options:
  -h, --help            show this help message and exit
  -c CLASSIFIER, --classifier=CLASSIFIER
                        The type of classifier [Default: mostFrequent]
  -d DATA, --data=DATA  Dataset to use [Default: digits]
  -t TRAINING, --training=TRAINING
                        The size of the training set [Default: 100]
  -a, --autotune        Whether to automatically tune hyperparameters
                        [Default: False]
  -i ITERATIONS, --iterations=ITERATIONS
                        Maximum iterations to run training [Default: 3]


In [5]:
# Train and evaluate the naive Bayes classifier with automatic hyperparameter tuning enabled.
!python dataClassifier.py -c naiveBayes --autotune

Doing classification
--------------------
data:		digits
classifier:		naiveBayes
training set size:	100
using automatic tuning for naivebayes
Extracting features...
Training...
Method not implemented: trainAndTune


In [None]:
# References

#https://github.com/sabotuer99/edx_Artificial_Intelligence/blob/master/p5/classification/naiveBayes.py
#https://www.youtube.com/watch?v=grHXPlwtNQY&list=PLiWNvnK7PSPE--36RIdeHg8Sgg02w9ch