# Naive Bayes Classifier

In [1]:
# Importing all the necessary packages
import numpy as np
import pandas as pd
from sklearn import preprocessing
import scipy.stats as stats

# CSV used to build the per-class feature tables (passed to load_data below).
path = "DataSets/breast-cancer.csv"
# CSV used for evaluation (passed to TransformDataSet below).
# NOTE(review): this is the same file as `path`, so the score reported at the
# end of the notebook is measured on the data the model was fitted on.
test_path = "DataSets/breast-cancer.csv"

In [2]:
def _coerce_numeric_columns(frame):
    """Return a copy of *frame* with number-like columns converted to numeric.

    Each non-numeric column is parsed with ``pd.to_numeric(errors="coerce")``.
    The parsed column replaces the original only when at least one value
    parsed; a column that would become entirely NaN is left untouched.  This
    reproduces what the removed ``DataFrame.convert_objects(
    convert_numeric=True)`` call did.
    """
    coerced = frame.copy()
    for label in frame.columns:
        column = frame[label]
        if pd.api.types.is_numeric_dtype(column):
            continue
        as_numbers = pd.to_numeric(column, errors="coerce")
        if not as_numbers.isna().all():
            coerced[label] = as_numbers
    return coerced


def preprocess(dataset):
    """Split a raw table into numeric features, encoded categorical features and labels.

    The first row of *dataset* is skipped (it is treated as a header row that
    was read in as data) and the last column is taken as the class label.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; every column except the last is a feature.

    Returns
    -------
    X_numeric : pandas.DataFrame
        Feature columns that contain at least one parseable number, as float64.
    X_categorical : pandas.DataFrame
        Remaining feature columns, each replaced by its integer category codes.
    Y : pandas.DataFrame
        Single column labelled ``-1`` holding the output of sklearn's
        ``LabelBinarizer``.  NOTE(review): ``columns=[-1]`` only fits a
        one-column result, i.e. this assumes a two-class (or one-class) label.

    Raises
    ------
    ValueError
        If a column detected as numeric holds a non-numeric value below the
        first row.
    """
    # Raw values are captured before coercion so the float conversion below
    # still fails loudly on a bad cell instead of silently producing NaN.
    data = dataset.values
    typed = _coerce_numeric_columns(dataset)

    feature_cols = list(typed.columns[:-1])
    numeric_cols = [
        label for label in feature_cols
        if pd.api.types.is_numeric_dtype(typed[label])
    ]
    numeric_set = set(numeric_cols)
    # A list comprehension (not a set difference) keeps the original column order.
    categorical_cols = [label for label in feature_cols if label not in numeric_set]

    # Index the raw array by position, so column labels need not be 0..n-1.
    position_of = {label: pos for pos, label in enumerate(typed.columns)}
    numeric_pos = [position_of[label] for label in numeric_cols]
    categorical_pos = [position_of[label] for label in categorical_cols]

    X_numeric = pd.DataFrame(
        data[1:, numeric_pos].astype(np.float64), columns=numeric_cols
    )
    X_categorical = pd.DataFrame(data[1:, categorical_pos], columns=categorical_cols)
    Y = data[1:, -1]

    lb = preprocessing.LabelBinarizer()
    Y = pd.DataFrame(lb.fit_transform(Y), columns=[-1])

    # Replace every categorical column by its integer category codes.
    for label in categorical_cols:
        X_categorical[label] = pd.Categorical(X_categorical[label]).codes

    return X_numeric, X_categorical, Y

In [3]:
def load_data(path):
    """Read the CSV at *path* and return its preprocessed parts.

    The file is read without a header row and handed to ``preprocess``; each
    of the three returned frames then gets a fresh 0..n-1 index.

    Returns
    -------
    tuple
        ``(X_numeric, X_categorical, Y)`` as produced by ``preprocess``.
    """
    raw_table = pd.read_csv(path, header=None)
    parts = preprocess(raw_table)

    # Re-index all three frames in place so they line up row for row.
    for part in parts:
        part.reset_index(drop=True, inplace=True)

    X_numeric, X_categorical, Y = parts
    return X_numeric, X_categorical, Y

In [4]:
# Load the fitting data and append the label column to each feature table.
X_numeric, X_categorical, Y = load_data(path)
numeric_data = pd.concat([X_numeric, Y], axis=1)
categorical_data = pd.concat([X_categorical, Y], axis=1)

# Partition each table on its last column (the label): 1 -> positive, 0 -> negative.
numerical_X_positive = numeric_data.loc[numeric_data.iloc[:, -1] == 1]
numerical_X_negative = numeric_data.loc[numeric_data.iloc[:, -1] == 0]

categorical_X_positive = categorical_data.loc[categorical_data.iloc[:, -1] == 1]
categorical_X_negative = categorical_data.loc[categorical_data.iloc[:, -1] == 0]

# [captured cell stderr, truncated] This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
def isInt(value):
    """Return True when ``int(value)`` succeeds, False otherwise.

    Besides unparsable strings (``ValueError``), values that ``int`` rejects
    by type (e.g. ``None``) or by magnitude (e.g. ``float("inf")``) also
    yield False instead of propagating ``TypeError`` / ``OverflowError``.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        return False
    return True

In [6]:
def _gaussian_likelihood(column, value):
    """Return the normal pdf at *value*, fitted to the samples in *column*.

    The mean and population standard deviation (ddof=0) are estimated from
    *column*.  The standard deviation is floored at a tiny positive number,
    because a zero scale makes ``scipy.stats.norm`` return NaN, which would
    poison the whole likelihood product.
    """
    min_std = 1e-9
    samples = np.asarray(column, dtype=float)
    mean = samples.mean()
    std = max(samples.std(), min_std)
    return stats.norm(mean, std).pdf(value)


def NaiveBayes(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, Observation):
    """Classify one observation as 1 (positive) or 0 (negative).

    Parameters
    ----------
    numerical_X_positive, numerical_X_negative : pandas.DataFrame
        Numeric feature columns of the positive / negative class rows.  Their
        row counts also define the class priors.
    categorical_X_positive, categorical_X_negative : pandas.DataFrame
        Integer-coded categorical feature columns of the same rows.
    Observation : pandas.Series
        One value per feature, indexed by the feature's column label.  Values
        may be numbers or their string form.

    Returns
    -------
    int
        1 when likelihood * prior of the positive class is greater than or
        equal to that of the negative class, otherwise 0.

    Notes
    -----
    A feature is treated as categorical when its label is a column of
    ``categorical_X_positive``; every other feature is treated as numeric and
    scored with a per-class Gaussian.  Categorical likelihoods are raw relative
    frequencies (no smoothing), so an unseen category yields probability 0.
    """
    n_positive = len(numerical_X_positive)
    n_negative = len(numerical_X_negative)
    n_total = n_positive + n_negative
    prior_positive = n_positive / n_total
    prior_negative = n_negative / n_total

    likelihood_positive = 1.0
    likelihood_negative = 1.0

    # Decide the feature kind from the training tables rather than from how
    # the value happens to print ("3" vs "3.0").
    categorical_features = set(categorical_X_positive.columns)

    # Series.items() replaces iteritems(), which pandas 2.0 removed.
    for feature, raw_value in Observation.items():
        if feature in categorical_features:
            # int(float(...)) accepts both "3" and "3.0".
            code = int(float(raw_value))
            likelihood_positive *= (
                (categorical_X_positive[feature] == code).sum()
                / len(categorical_X_positive)
            )
            likelihood_negative *= (
                (categorical_X_negative[feature] == code).sum()
                / len(categorical_X_negative)
            )
        else:
            value = float(raw_value)
            likelihood_positive *= _gaussian_likelihood(
                numerical_X_positive[feature], value
            )
            likelihood_negative *= _gaussian_likelihood(
                numerical_X_negative[feature], value
            )

    positive_score = likelihood_positive * prior_positive
    negative_score = likelihood_negative * prior_negative
    return 1 if positive_score >= negative_score else 0

In [7]:
def Accuracy(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, Observations, Y):
    """Score ``NaiveBayes`` on a set of observations and summarise the result.

    Parameters
    ----------
    numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative : pandas.DataFrame
        Per-class feature tables, forwarded unchanged to ``NaiveBayes``.
    Observations : pandas.DataFrame
        One row per observation; converted to strings before classification.
    Y : pandas.DataFrame
        True labels, read from the last column.  Rows are matched to
        *Observations* by position.

    Returns
    -------
    str
        Accuracy as a percentage of ``len(Y)`` followed by the TP / FP / TN /
        FN counts.  For an empty *Y* the accuracy is reported as 0.
    """
    Observations = Observations.astype('str')
    labels = Y.iloc[:, -1]
    correct = TP = FP = TN = FN = 0

    # Count by position: the index labels from iterrows() are not guaranteed
    # to be 0..n-1, so they must not be used with iloc.
    for position, (_, row) in enumerate(Observations.iterrows()):
        actual = labels.iloc[position]
        predicted = NaiveBayes(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, row)

        if predicted == actual:
            correct += 1
            if actual == 1:
                TP += 1
            elif actual == 0:
                TN += 1
        elif actual == 0:
            # True class 0 but the prediction differs -> false positive.
            FP += 1
        elif actual == 1:
            # True class 1 but the prediction differs -> false negative.
            FN += 1

    # Guard the empty case instead of raising ZeroDivisionError.
    accuracy = correct / len(Y) * 100 if len(Y) else 0.0

    return ("Accuracy; %.4f, TP: %s, FP: %s, TN: %s, FN: %s " % (accuracy, TP, FP, TN, FN))

In [8]:
def TransformDataSet(path):
    """Load the CSV at *path* and return ``(features, labels)``.

    The numeric and categorical feature frames returned by ``load_data`` are
    joined side by side into one frame; the label frame is passed through.
    """
    numeric_part, categorical_part, labels = load_data(path)
    features = pd.concat([numeric_part, categorical_part], axis=1)
    return features, labels

In [9]:
# Build the evaluation features and labels from the file at `test_path`.
Xtest, Ytest = TransformDataSet(test_path)

# [captured cell stderr, truncated] This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
# Classify every evaluation row against the per-class tables built above and
# show the accuracy / confusion-count summary string as the cell result.
Accuracy(numerical_X_positive, numerical_X_negative, categorical_X_positive, categorical_X_negative, Xtest, Ytest)

'Accuracy; 74.8252, TP: 44, FP: 31, TN: 170, FN: 41 '