# In [10]:
# Author: Blake Conrad
# Purpose: HW3 CSCI 48100
# File: Q1_bayes_train_write_bmconrad.py iris-shuffled.txt
# Constraints: Accepts training data as command line argument

from __future__ import division

# Import Libraries
import sys
import os
import numpy as np
import pandas as pd
import sys
import os
import math
from scipy import linalg
from scipy.stats import multivariate_normal
from scipy.stats import norm
from sklearn.metrics import confusion_matrix
import sklearn.metrics

def multivariate_normal_pdf(x, mu, sigma):
    """Evaluate the multivariate normal density at a single point.

    Parameters
    ----------
    x : array-like
        The point at which to evaluate the density (d values).
    mu : array-like
        The mean vector (d values).
    sigma : array-like
        The d x d covariance matrix. A plain list, ``np.ndarray`` or
        ``np.matrix`` is accepted.

    Returns
    -------
    float
        The density value at ``x``.

    Raises
    ------
    ZeroDivisionError
        If the determinant of ``sigma`` is exactly zero.
    ValueError
        If the determinant of ``sigma`` is negative.
    """
    # Flatten to 1-D so the dimension is the number of features even when the
    # caller passes a 1 x d matrix (len() of such a matrix would be 1).
    x_mu = np.asarray(x, dtype=float).ravel() - np.asarray(mu, dtype=float).ravel()
    cov = np.asarray(sigma, dtype=float)
    size = x_mu.size

    det = linalg.det(cov)
    norm_const = 1.0 / (math.pow(2 * math.pi, size / 2.0) * math.sqrt(det))

    # Solve cov * y = x_mu instead of forming the inverse explicitly; this
    # works for any array type and avoids relying on the matrix-only `.I`.
    mahalanobis_sq = float(np.dot(x_mu, linalg.solve(cov, x_mu)))
    return norm_const * math.exp(-0.5 * mahalanobis_sq)

def normal_pdf(x, mu, sigma):
    """Evaluate the univariate normal density at ``x``.

    ``sigma`` is used through its absolute value, so its sign does not
    affect the result.
    """
    spread = abs(sigma)
    z = (x - mu) / spread
    coefficient = 1 / (math.sqrt(2 * math.pi) * spread)
    return coefficient * math.exp(-z * z / 2)


#
# Bayes Classifier Class Object
#
# Accepts: A pandas-like data frame object
#     - For Details: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
# Returns: 3 lists, P(c_i), mu_i, and sigma_i
#     - Ex) P_c_i[1], mu_i[1], sigma_i[1] ,
#           P_c_i[2], mu_i[2], sigma_i[2] , and
#           P_c_i[3], mu_i[3], sigma_i[3] are enough information to satisfy our objective Function.
#
# Objective Function: y = argmax over c_i of { P(x|c_i) * P(c_i) }
#

class Bayes_Classifier:
    """Gaussian Bayes classifier over the first four columns of a data frame.

    The training frame must hold its features in the first four columns and
    its class labels in a column named "Flower Type". For every class the
    model stores a prior P(c_i), a mean vector and a (biased) covariance
    matrix, which can be written to and read back from "bayes_model.txt".
    """

    # Class Attributes
    # Progress flags; each is flipped to True on the instance once the
    # corresponding step has completed.
    constructed = False
    built = False
    predicted = False

    # Layout assumptions shared by every method.
    _LABEL_COLUMN = "Flower Type"
    _N_FEATURES = 4
    _MODEL_FILE = "bayes_model.txt"

    # Constructor
    def __init__(self, DF_TRAIN):
        """Store the training frame and prepare empty model containers.

        DF_TRAIN: pandas DataFrame with a "Flower Type" label column.
        """
        # Save training set
        self.DF_TRAIN = DF_TRAIN

        # Class labels
        self.classes = pd.unique(self.DF_TRAIN[self._LABEL_COLUMN])
        self.actual_labels = self.DF_TRAIN.loc[:, self._LABEL_COLUMN]

        # Constants
        self.k = len(self.classes)
        self.n = len(self.DF_TRAIN)

        # Containers
        self.D_i = list()
        self.n_i = list()
        self.P_c_i = list()
        self.mean_i = list()
        self.sigma_i = list()

        # Flag Appropriately
        self.constructed = True

    # Methods
    def build(self):
        """Estimate the per-class prior, mean and covariance.

        Returns the tuple (D_i, n_i, P_c_i, mean_i, sigma_i), each a list
        with one entry per class in the order of ``self.classes``.
        """
        # Start from empty containers so calling build() twice does not
        # append a second copy of every class.
        self.D_i = list()
        self.n_i = list()
        self.P_c_i = list()
        self.mean_i = list()
        self.sigma_i = list()

        # Algorithm 18.1
        labels = self.DF_TRAIN[self._LABEL_COLUMN]
        for label in self.classes:
            subset = self.DF_TRAIN.loc[labels == label]
            # Only the feature columns take part in the statistics; the
            # label column is text and cannot be averaged.
            features = subset.iloc[:, :self._N_FEATURES]

            self.D_i.append(subset)
            self.n_i.append(len(subset))
            self.P_c_i.append(len(subset) / float(self.n))
            self.mean_i.append(features.mean())
            self.sigma_i.append(np.cov(features.values.transpose(), bias=True))

        # Flag Appropriately
        self.built = True

        # Return the model
        return self.D_i, self.n_i, self.P_c_i, self.mean_i, self.sigma_i

    def writeModel(self):
        """Round the model to 2 decimals and write it to "bayes_model.txt".

        The in-memory P_c_i, mean_i and sigma_i are replaced by their
        rounded numpy-array versions. The file has three sections, each
        introduced by one header line: one prior per line, one
        comma-separated mean per line, and one covariance row per line
        (every value followed by a comma).

        Raises ValueError if the model is empty (build() was not called).
        """
        if len(self.P_c_i) == 0:
            raise ValueError("build() must be called before writeModel()")

        # Round to 2 decimals as requested
        self.P_c_i = np.around(self.P_c_i, decimals=2)
        self.mean_i = np.around(
            [np.asarray(mu, dtype=float) for mu in self.mean_i], decimals=2)
        self.sigma_i = np.around(self.sigma_i, decimals=2)

        lines = ["--- skip this line --- P(c_i) is each line per class~\n"]
        lines.extend(str(prior) + "\n" for prior in self.P_c_i)

        lines.append("--- skip this line --- mean_i is each line per class~\n")
        lines.extend(",".join(str(value) for value in mu.tolist()) + "\n"
                     for mu in self.mean_i)

        lines.append("--- skip this line --- sigma_i is each 4 lines per class~\n")
        for coV in self.sigma_i:          # one matrix per class
            for row in coV:               # one line per matrix row
                lines.append("".join(str(value) + "," for value in row) + "\n")

        # The context manager closes the file even if a write fails.
        with open(self._MODEL_FILE, "w") as target:
            target.writelines(lines)

    @staticmethod
    def _parse_float_row(line):
        """Return the comma-separated floats on *line*, or None if not numeric."""
        fields = line.strip().rstrip(",").split(",")
        try:
            return [float(field) for field in fields]
        except ValueError:
            return None

    @classmethod
    def readModel(cls):
        """Read "bayes_model.txt" and return (P_c_i, mean_i, sigma_i).

        Any non-numeric line is treated as a section header; the file must
        contain exactly three sections (priors, means, covariance rows).
        The parsed values are also stored as attributes on the class itself.
        Because this is a classmethod, attributes of the same name that
        __init__ created on an instance are NOT overwritten - use the
        return value when calling this through an instance.

        Returns
        -------
        P_c_i : np.ndarray of priors
        mean_i : list of 1-D np.ndarray, one per class
        sigma_i : list of np.matrix, one per class

        Raises ValueError if the file does not have the expected layout.
        """
        with open(cls._MODEL_FILE, "r") as target:
            raw_lines = target.readlines()

        sections = []
        for raw in raw_lines:
            if not raw.strip():
                continue
            row = cls._parse_float_row(raw)
            if row is None:
                # Header line: start a new section.
                sections.append([])
            elif sections:
                sections[-1].append(row)
            else:
                raise ValueError("model file must start with a header line")

        if len(sections) != 3:
            raise ValueError(
                "expected 3 sections in model file, found %d" % len(sections))
        prior_rows, mean_rows, cov_rows = sections

        # P(c_i)
        P_c_i_ls = np.array([value for row in prior_rows for value in row])

        # Mean_i
        mean_i_ls = [np.array(row) for row in mean_rows]
        n_classes = len(mean_i_ls)
        if n_classes == 0 or len(P_c_i_ls) != n_classes:
            raise ValueError("priors and means must have one entry per class")

        # Sigma_i: each class owns as many consecutive rows as it has features.
        n_features = len(mean_rows[0])
        if len(cov_rows) < n_classes * n_features:
            raise ValueError("not enough covariance rows in model file")
        sigma_i_ls = [
            np.matrix(cov_rows[i * n_features:(i + 1) * n_features], dtype=float)
            for i in range(n_classes)
        ]

        cls.P_c_i = P_c_i_ls
        cls.mean_i = mean_i_ls
        cls.sigma_i = sigma_i_ls
        return cls.P_c_i, cls.mean_i, cls.sigma_i

    def predict(self, DF_TEST):
        """Classify every row of DF_TEST and return the predicted labels.

        DF_TEST must have its features in the first four columns and the
        true label in the fifth column; the true labels are stored in
        ``self.actual_labels``. Each row is assigned the class maximising
        log p(x|c_i) + log P(c_i). Working in log space keeps the comparison
        meaningful when the densities would underflow to 0.
        """
        # Save the testing set
        self.DF_TEST = DF_TEST

        # Containers
        self.predicted_labels = list()

        features = self.DF_TEST.iloc[:, :self._N_FEATURES].values.astype(float)
        if len(features) > 0:
            scores = []
            for i in range(self.k):
                prior = float(self.P_c_i[i])
                log_prior = math.log(prior) if prior > 0 else float("-inf")
                log_density = multivariate_normal.logpdf(
                    features,
                    mean=np.asarray(self.mean_i[i], dtype=float),
                    cov=np.asarray(self.sigma_i[i], dtype=float))
                # logpdf returns a scalar for a single row; keep it 1-D.
                scores.append(np.atleast_1d(log_density) + log_prior)

            # argmax keeps the first class on ties, like a strict '>' scan.
            best = np.argmax(np.vstack(scores), axis=0)
            self.predicted_labels = [self.classes[int(i)] for i in best]

        self.actual_labels = self.DF_TEST.iloc[:, 4].tolist()

        # Flag Appropriately
        self.predicted = True

        # Return the predictions
        return self.predicted_labels

    def get_confusion(self, act, pred):
        """Return the confusion matrix of act vs pred.

        Returns None (after printing a notice) unless the classifier has
        been constructed, built and used for prediction.
        """
        # all(...) is required here: a bare non-empty list is always truthy.
        if all([self.constructed, self.built, self.predicted]):
            print("Safe to calculate.")
            return confusion_matrix(act, pred)

        print("Not safe to calculate. Consider building and predicting with your model first.")
        return None

    def perf_measure(self, y_true, y_pred):
        """Return (recall, precision, fscore) computed by sklearn.metrics.

        NOTE(review): the scores use sklearn's default averaging, which may
        not suit multi-class labels - confirm against the sklearn version
        in use.
        """
        recall = sklearn.metrics.recall_score(y_true, y_pred)
        precision = sklearn.metrics.precision_score(y_true, y_pred)
        fscore = sklearn.metrics.f1_score(y_true, y_pred)

        return (recall, precision, fscore)

def main(trainName="1stfold_train.txt"):
    """Train the Bayes classifier, write the model to disk and read it back.

    trainName: path of the comma-separated training file (four numeric
    feature columns followed by the flower type); defaults to
    "1stfold_train.txt" (e.g. iris-shuffled.txt can be passed instead).

    Prints the means, covariances and priors that were read back from the
    model file.
    """
    feature_columns = ["Septal Length",
                       "Septal Width",
                       "Pedal Length",
                       "Pedal Width"]

    # Format Data
    df_train = pd.read_csv(trainName,
                           sep=",",
                           names=feature_columns + ["Flower Type"],
                           dtype={name: np.float64 for name in feature_columns})

    bayes_classifier = Bayes_Classifier(df_train)
    bayes_classifier.build()

    bayes_classifier.writeModel()

    # readModel is a classmethod that stores its results on the class, while
    # the instance already carries attributes of the same names from
    # __init__. Use the returned values so the output really shows what was
    # read from the file rather than the in-memory model.
    P_c_i, mean_i, sigma_i = bayes_classifier.readModel()

    print("Mean brought in")
    print(mean_i)

    print("sigma brought in")
    print(sigma_i)

    print("p(c_i)a brought in")
    print(P_c_i)

if __name__ == "__main__":
    # Call-form print with a single argument behaves the same on Python 2
    # and no longer stops Python 3 from parsing the file.
    print("Starting Q1_bayes_train_write.py -- arg[0] name.py -- arg[1] train.txt")
    main()
    print("Finished training and writing.")

# Captured notebook output:
# Starting Q1_bayes_train_write.py -- arg[0] name.py -- arg[1] train.txt
# Finished training and writing.
