# Naive Bayes Classifier

In [645]:
# Importing all the necessary packages
import numpy as np
import pandas as pd
from sklearn import preprocessing
import scipy.stats as stats

In [878]:
def preprocess(dataset):
    data = dataset.values
    dataset = dataset.convert_objects(convert_numeric=True)
    columns_idx = list(dataset.iloc[:,:-1].columns)
    numeric_cols_idx = list(dataset.iloc[:,:-1]._get_numeric_data().columns)
    categorical_cols_idx = list(set(columns_idx) - set(numeric_cols_idx))
    
    X_numeric = pd.DataFrame(np.float_(data[1:,numeric_cols_idx]), columns=numeric_cols_idx) 
    X_categorical = pd.DataFrame(data[1:,categorical_cols_idx], columns=categorical_cols_idx)
    Y = data[1:,-1]
    
    lb = preprocessing.LabelBinarizer()
    Y = pd.DataFrame(lb.fit_transform(Y), columns=[-1])
    for i in range(X_categorical.shape[1]):
        X_categorical.iloc[:, i] = pd.Categorical(X_categorical.iloc[:, i])
        X_categorical.iloc[:, i] = X_categorical.iloc[:, i].cat.codes
    
    #X_categorical = X_categorical.values
    
    return X_numeric, X_categorical, Y

In [838]:
def load_data(path):
    dataset = pd.read_csv(path, header=None)
    X_numeric, X_categorical, Y = preprocess(dataset)
    
    X_numeric.reset_index(drop=True, inplace=True)
    X_categorical.reset_index(drop=True, inplace=True)
    Y.reset_index(drop=True, inplace=True)
    
    return X_numeric, X_categorical, Y

In [879]:
X_numeric, X_categorical, Y = load_data("credit-g.csv")
numeric_data = pd.concat([X_numeric, Y], axis=1)
categorical_data = pd.concat([X_categorical, Y], axis=1)

numerical_X_positive = numeric_data[numeric_data.iloc[:,-1] == 1]
numerical_X_negative = numeric_data[numeric_data.iloc[:,-1] == 0]

categorical_X_positive = categorical_data[categorical_data.iloc[:,-1] == 1]
categorical_X_negative = categorical_data[categorical_data.iloc[:,-1] == 0]

  This is separate from the ipykernel package so we can avoid doing imports until


In [747]:
def NaiveBayes(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, Observation):

    P_of_Y_positive = len(X_positive) / len(Y)
    P_of_Y_negative = len(X_negative) / len(Y)
    P_of_X_given_Y_positive = 1
    P_of_X_given_Y_negative = 1
    
    for i, x in enumerate(Observation):
        pos_mean = np.mean(X_positive[:,i])
        pos_std = np.std(X_positive[:,i])
        P_of_X_given_Y_positive *= stats.norm(pos_mean, pos_std).pdf(x)
        
        neg_mean = np.mean(X_negative[:,i])
        neg_std = np.std(X_negative[:,i])
        P_of_X_given_Y_negative *= stats.norm(neg_mean, neg_std).pdf(x)
        
    P_of_Y_given_X = 1 if P_of_X_given_Y_positive * P_of_Y_positive >= P_of_X_given_Y_negative * P_of_Y_negative else 0
    
    return P_of_Y_given_X

In [748]:
def Accuracy(X, Y):
    total, TP, FP, TN, FN, prediction, Accuracy = 0, 0, 0, 0, 0, 0, 0

    for i, row in enumerate(X):
        prediction = 1 if Y[i] == NaiveBayes(X_positive, X_negative, row) else 0
        if prediction:
            total += 1
            TP += 1 if Y[i] == 1 else 0
            TN += 1 if Y[i] == 0 else 0
        else:
            FP += 1 if Y[i] == 0 else 0
            FN += 1 if Y[i] == 1 else 0
            
    Accuracy = total/ len(Y) * 100

    return ("Accuracy; %.4f, TP: %s, FP: %s, TN: %s, FN: %s " % (Accuracy, TP, FP, TN, FN))

In [650]:
Accuracy(X, Y)

'Accuracy; 65.1042, TP: 0, FP: 0, TN: 500, FN: 268 '

In [875]:
def TransformDataSet(path):
    X_numeric, X_categorical, Y = load_data(path)
    trasformed_data = pd.concat([X_numeric, X_categorical, Y], axis=1)
    return trasformed_data

In [734]:
dataset = pd.read_csv("credit-g.csv", header=None)
col = dataset.values
columns = dataset.columns
num_col = dataset._get_numeric_data().columns

In [None]:
Observation = pd.Series(Observation)

In [749]:
NaiveBayes(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, training_data[0,:])

0    float64
dtype: object


  This is separate from the ipykernel package so we can avoid doing imports until
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  return (self.a <= x) & (x <= self.b)
  return (self.a <= x) & (x <= self.b)


0

In [881]:
TransformDataSet("diabetes.csv")

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,0,1,2,3,4,5,6,7,-1
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
5,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1
7,10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1
9,8.0,125.0,96.0,0.0,0.0,0.0,0.232,54.0,1


In [794]:
Categorical_data_with_Y = np.array(np.hstack((X_categorical, Y)), dtype='O')
training_data = np.hstack((X_numeric, p))
training_data[0][]

array([6.0, 1169.0, 4.0, 4.0, 67.0, 2.0, 1.0, 2, 1, 7, 0, 3, 3, 2, 2, 1, 1,
       3, 1, 1, 1], dtype=object)