# Naive Bayes Classifiers


In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn import datasets
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import statistics
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [2]:
def preprocess_car(filename):
    
    # read in the data
    data = pd.read_csv('car.data',header=None, encoding = 'ISO-8859-1')
    # create a Dataframe 
    car_data = pd.DataFrame(data)
    # X_car is a dataframe containing features 
    X_car = car_data.loc[:, 0:car_data.shape[1]-2]
    # y_car is a series containing class labels
    y_car = car_data.loc[:, car_data.shape[1] - 1]
    # returns a data frame of the feature values and a series of the class labels
    return X_car, y_car

In [3]:
# this function is to change the ordinal car data to numeric 

def car_numeric_data(filename):
    
    # this block of code is modified from Practical week 4 Machine Learning, University of Melbourne
    X = []
    y = []
    with open('car.data', mode='r') as fin:
        for line in fin:
            atts = line.strip().split(",")
            # all features, excluding the class
            X.append(atts[:-1]) 
            # class labels 
            y.append(atts[-1])
            
    # need to hard code the possible feature values as there is no way to sort them into their orders
    feature_lst = [['low', 'med', 'high','vhigh'], ['low', 'med', 'high','vhigh'], ['2','3','4','5more'],\
                   ['2','4', 'more'], ['small', 'med', 'big'], ['low', 'med', 'high']]
    

    X_ordinal = []
    # go though each row in the data 
    for row in X:
        # stores the numeric values for each value in the row
        numericisedValues = []
        # for each value, add the index of it from feature_lst to the numericisedValues
        for i in range(len(row)):
            numericisedValues.append(feature_lst[i].index(row[i]))
        # add this list to the X_ordinal list (this will end up as a row in a data frame)
        X_ordinal.append(numericisedValues)

    #convert to dataframe 
    X_car_ordinal = pd.DataFrame(data=X_ordinal) 
    # convert to a series 
    y_car = pd.Series(data=y)

    # returns a data frame of the feature values and a series of the class labels                    
    return X_car_ordinal, y_car

In [8]:
def preprocess_wdbc(filename):
    
    # read in the data
    data = pd.read_csv('wdbc.data',header=None,encoding = 'ISO-8859-1')
    # create a Dataframe 
    wdbc_data = pd.DataFrame(data)
    #X_wdbc is a dataframe containing features 
    X_wdbc = wdbc_data.loc[:, 2:]
    # y_wdbc is a dataframe containing class labels
    y_wdbc = wdbc_data.loc[:, 1]
    
    # reset the indicies of the columns
    X_wdbc.columns = range(X_wdbc.shape[1])
    
    # change data to numeric 
    for col in range(len(X_wdbc.columns)):
        X_wdbc[col] = pd.to_numeric(X_wdbc[col])
        
    # returns a data frame of the feature values and a series of the class labels      
    return X_wdbc, y_wdbc
    

In [9]:
def preprocess_wine(filename):
    
    # read in the data
    data = pd.read_csv('wine.data',header=None,encoding = 'ISO-8859-1')
    # create a Dataframe 
    wine_data = pd.DataFrame(data)
    # X_wine is a dataframe containing features 
    X_wine = wine_data.loc[:, 1:]
    # y_wine is a dataframe containing class labels
    y_wine = wine_data.loc[:, 0]
    
    # reset the indicies of the columns
    X_wine.columns = range(X_wine.shape[1])
    
    # change data to numeric 
    for col in range(len(X_wine.columns)):
        X_wine[col] = pd.to_numeric(X_wine[col])
    
    # returns a data frame of the feature values and a series of the class labels        
    return X_wine, y_wine

In [10]:
def preprocess_mushroom(filename):
    
    # read in the data
    data = pd.read_csv('mushroom.data',header=None,encoding = 'ISO-8859-1')
    # create a Dataframe 
    mush_data = pd.DataFrame(data)
    
    # remove the instances which have a '?' as a value for the attribute at index 11
    mush_data_clean = mush_data[mush_data[11] != '?'].reset_index(drop = True) 
    
    # X_mush is a dataframe containing features 
    X_mush = mush_data_clean.loc[:, 1:mush_data.shape[1]-1]
    # y_mush is a series containing class labels
    y_mush = mush_data_clean.loc[:,0]
    # reset the indicies of the columns
    X_mush.columns = range(X_mush.shape[1])
    
    # returns a data frame of the feature values and a series of the class labels  
    return X_mush, y_mush, mush_data_clean, mush_data

In [11]:
def preprocess_bank(filename):
    
    # read in the data
    data = pd.read_csv('bank.data',header=None,encoding = 'ISO-8859-1')
    # create a Dataframe 
    bank_data = pd.DataFrame(data) 
    # X_bank is a dataframe containing features 
    X_bank = bank_data.loc[:, 0:bank_data.shape[1]-2]
    # y_bank is a series containing class labels
    y_bank = bank_data.loc[:,bank_data.shape[1]-1]
    # returns a data frame of the feature values and a series of the class labels  
    
    return X_bank, y_bank

# NB Classifier

In [13]:
# preprocesses selected data
def preprocess(name):
    
    if name == 'bank':
        X, y = preprocess_bank('bank.data')
        
    elif name == "car":
        X, y = preprocess_car('car.data')
        
    elif name == "carNumeric":
        X, y = car_numeric_data('car.data')
        
    elif name == "nursery":
        X, y = preprocess_nursery('nursery.data')
        
    elif name == "nurseryNumeric":
        X, y = nursery_numeric_data('nursery.data')
        
    elif name == "somervilleNumeric":
        X, y = somerville_numeric_data('somerville.data')
        
    elif name == "somerville":
        X, y = preprocess_somerville('somerville.data')
        
    elif name == "wdbc":
        X, y = preprocess_wdbc('wdbc.data')
        
    elif name == "wine":
        X, y = preprocess_wine('wine.data')
        
    elif name == "mushroom":
        X, y  = preprocess_mushroom('mushroom.data')
        
    else: 
        print("Invalid argument.")
        X, y = None, None
    
    # returns a preprocessed datafame X of the features and series y, the classes. 
    return X, y


# split the data into test and train datasets 
def test_train(X,y,rand_state):
    
    # initalie a list to save which features are numeric and if they are categorical 
    # save the possible values that each feature can take in a further nested list 
    unique_vals = []
    
    for index, col in X.iteritems():
        # if the first value of the column is either type float or int 
        if isinstance(col[0], float) or isinstance(col[0], (int, np.integer)):
            # add a 0 to the current index of the unique_vals list
            unique_vals.append(0)
        # if the first value of the column is type string 
        elif isinstance(col[0], str):
            # add a list of the unique values to the current index of the unique_vals list
            unique_vals.append(list(col.unique()))
    
    # returns X_train, X_test, y_train, y_test data frames and the list unique_vals
    return train_test_split(X, y, test_size=0.3333, random_state=rand_state), unique_vals

# trains the data 
def train(X, y, unique_vals):
    
    # find the unique classes and how many of each class 
    
    class_values = y.value_counts()
    
    priors = {}
    
    # make a dictionary to hold the prior associated with each class
    for index, class_count in class_values.iteritems():
        # take the log of the priors to prevent underflow error
        priors[index] = math.log(class_count/len(y)) 
        
    # initalise a dataframe to hold the parameters we will need for the testing phase 
    conditional_probs = pd.DataFrame(columns=class_values.index)

    
    # loop over every column in the training set
    for index, col in X.iteritems():
    
        # first is to check whether the column is numeric or nominal to can handle mixed dataframes 
        
        # This 'if' statement selects for numeric data that matches the int or float types.
        if isinstance(col.iloc[0], float) or isinstance(col.iloc[0], (int, np.integer)):
            
            # create a dictionary to store the values in a list assoicated with each class
            
            values_per_class = {}
            
            # inialise each key value (which are the possible classes) as an empty list
            for cls in class_values.index:
                values_per_class[cls] = []
                
            
            # Iterate through the rows of the currently examined column and split them into class lists.
            for row_index, row_value in col.iteritems():
                # itterate for how many classes the data set has
                for i in range(len(class_values.index)):
                    # add the value to the list for the correct class 
                    values_per_class[y[row_index]].append(row_value)
                    
            # create a dict to store mean and standard deviation of each class list 
            mu_sigma = {}
            
            # itterate for how many classes the data set has
            for cls in class_values.index:
                
                # add the mean as the first index and the standard deviation as the 2nd index for each mu_sigma value
                mu_sigma[cls] = [statistics.mean(values_per_class[cls]), statistics.stdev(values_per_class[cls])]
                
            # add the dict as a row in the conditional_probs dataframe
            conditional_probs = conditional_probs.append(mu_sigma, ignore_index=True)
        
        # This 'if' statement selects for nominal data that matches the str types.
        elif isinstance(col.iloc[0], str):

            # find all the possible values that the feature can take
            feature_vals = unique_vals[index]

            # create dictionaries, one for each class to hold the counts
            
            probs_per_class = {}
            
            for cls in class_values.index:
                # inialise each key value (which are the classes) as an empty list
                probs_per_class[cls] = {}
                 # Initialise values in dictionary. Begin at 1 for smoothing.
                for value in feature_vals:
                    probs_per_class[cls][value] = 1


            # Calculate counts for each unique value.
            for row_index, row_value in col.iteritems():
                # for each observed value, add one to the count value
                for i in range(len(class_values.index)):
                    probs_per_class[y[row_index]][row_value] += 1
                    
            # find the conditional probability of each value given its class and store in dictonary 
            for value in feature_vals:
                for cls in class_values.index:
                    # conditional probability equals the amount of times this value appears for each class (plus one for smoothing)
                    # divided by the sum of the amount of data points corresponding to this 
                    # class and how many possible unique values there are for this feature 
                    probs_per_class[cls][value] = probs_per_class[cls][value] / (class_values[cls] + len(feature_vals))
           
            # add the dictionary as a row in the dataframe   
            conditional_probs = conditional_probs.append(probs_per_class, ignore_index=True)
            
    # return a dataframe holding the mean and standard deviation values for numeric features 
    # or the conditional probabilities for nominal features, a dictionary of the prior values for each class
    # and a series, class_values of the unique classes and counts for each class
    return conditional_probs, priors, class_values


# This function should predict classes for new items in a test dataset

def predict(conditional_probs, priors, class_values, X_test, unique_vals):
    
    # initalise a list to hold the predicted class for each instance 
    predicted_class = []
    # for each instance (row in the test set)
    for index, row in X_test.iterrows():  
        
        # initalsie a dictonary to hold the probabilities for each class 
        probs = {}
        for cls in class_values.index:
            # initalise each key in the dictonary as one of the unique classes and make the corresponding
            # value the prior for that class 
            probs[cls] = priors[cls]
                
        # this will be the same number of times as looping through the row 
        for i in X_test.columns:
            
            # if numeric: as all nominal data is stored in a list 
            if unique_vals[i] == 0:
                
                # conditional_probs[col][i][0] is the value of mu for each class
                # conditional_probs[col][i][1] is the value of sigma for each class 
                
                for cls in class_values.index:
                    # have this if statment for when treating ordinal data as numeric, as the data is still discrete
                    # there were a couple of 0 standard deviations, where as this is impossible for 
                    # continuous data
                    if conditional_probs[cls][i][1] == 0:
                        # instead make the standard deviation very small to aviod math error 
                        conditional_probs[cls][i][1] = 1.0e-100
                        
                    # add the conditional probability to the total proabaility for each class 
                    probs[cls] += conditional_probability(conditional_probs[cls][i][0], conditional_probs[cls][i][1], row.iloc[i])
            # all nominal data is stored in a dictonary 
                       
            elif type(unique_vals[i]) == list:
                for cls in class_values.index:
                    probs[cls] += math.log(conditional_probs[cls][i][row.iloc[i]])
                    
        #Predict the outcome
                                           
        sorted_probs = sorted(probs.items(), key=lambda x: x[1])
        predicted_class.append(sorted_probs[-1][0])
        
    return predicted_class 

# this function is for implementing the conditional probailty formula for numeric attributes 
# it takes the mean, standard deviation for each feature and the observed value from the test set.
def conditional_probability(mu, sigma, obs):
    
    # log likelihood was taken 
    # simplified from: log(1/denominator) + log(exp(exponent))
    # to become:  exponent - log(denominator) using the properties of exponents and logarithims 
    # this was to handle the math error as log(0) is undefined 
    denominator = sigma*(math.sqrt(2*math.pi))
    exponent = -0.5*(((obs-mu)/sigma)**2)
    
    likelihood = exponent - math.log(denominator)
    # returns the likelihood (condtional probability)
    return likelihood


# This function should evaluate the prediction performance by comparing your model’s class outputs to ground
# truth labels
def evaluate(predicted_class, y_test):
    
    
    confusion_matrix = sklearn.metrics.confusion_matrix(y_test, predicted_class)
    
    # the proportion of the correctly labeled instances 
    accuracy = accuracy_score(y_test, predicted_class)
  
    return accuracy

# run on any data set

In [14]:
def run_model(data):
    
    X, y = preprocess(data)

    # split data into train and test datasets
    (X_train, X_test, y_train, y_test), unique_vals =  test_train(X, y, rand_state = 20) 

    # train data 
    conditional_probs, priors, class_values = train(X_train, y_train, unique_vals)

    # predict on unseen data
    predicted_class = predict(conditional_probs, priors, class_values, X_test, unique_vals)

    # evaluate prediction and return accuracy
    return evaluate(predicted_class, y_test)


run_model('bank')

0.8824739531488486