In [262]:
# This function should open a data file in csv, and transform it into a usable format 
#def preprocess_supervised():
import pandas as pd
import math
from collections import defaultdict

#Convert CSV to dataframe; slice into two dataframes based on class

# Select from breast-cancer, mushroom, hypothyroid. Enter hold-out percentage.
filename = 'breast-cancer-dos.csv'
df = pd.read_csv(filename, header=None)
hold_out_percent = 0

num_rows = len(df)
df_len = len(df.columns) - 1
classes_list = df[df_len].unique()
num_class = len(classes_list)
hold_outs_index = math.ceil(len(df) - hold_out_percent*num_rows)
hold_outs = df.iloc[hold_outs_index:len(df),:]
df = df.iloc[0:hold_outs_index,:]
num_instances = len(df)
class1 = df[df[df_len] == str(classes_list[0])]
class2 = df[df[df_len] == str(classes_list[1])]
del class1[df_len]
del class2[df_len]

# This function should build a supervised NB model
#def train_supervised():

#Generate attribute instance frequencies for both classes
class1_freq = [dict() for x in class1]
class2_freq = [dict() for x in class2]

for i in class1:
    class1_freq[i] = defaultdict(int)
    for key in class1[i]:
        class1_freq[i][key] += 1

for i in class2:
    class2_freq[i] = defaultdict(int)
    for key in class2[i]:
        class2_freq[i][key] += 1

#Generate posterior probabilities and apply log-transformation - i.e. log(P(Xi|Cj))
class1_posterior = [dict() for x in class1_freq]
class2_posterior = [dict() for x in class2_freq]

for i in class1:
    total_class1 = sum(class1_freq[i].values(), 0.0)
    class1_posterior[i][i] = {k: math.log10(v/total_class1) for k, v in class1_freq[i].items()}

for i in class2:
    total_class2 = sum(class2_freq[i].values(), 0.0)
    class2_posterior[i][i]  = {k: math.log10(v/total_class2) for k, v in class2_freq[i].items()}

#Generate class probabilities and apply log-transformation - i.e. log(P(Cj))
class1_prob = math.log10(total_class1/num_instances)
class2_prob = math.log10(total_class2/num_instances)

# This function should predict the class for a set of instances, based on a trained model 
#def predict_supervised():
class1_predicted = defaultdict(int)
class2_predicted = defaultdict(int)
correct = 0

for i in range(num_rows-hold_outs_index, num_instances):
    for j in range(len(class1.columns)):
        try:
            class2_predicted[i] += class2_posterior[j][j][df[j][i]]
            class1_predicted[i] += class1_posterior[j][j][df[j][i]]

        #Allow for any attributes that are present in only one class dictionary
        except KeyError:
            class2_predicted[i] += 0
            class1_predicted[i] += 0
    class2_predicted[i] += class2_prob
    class1_predicted[i] += class1_prob

# This function should evaluate a set of predictions, in a supervised context 
#def evaluate_supervised():
for i in range(num_rows-hold_outs_index, num_instances):
    if(class1_predicted[i] >= class2_predicted[i]):
        if(df[df_len][i]==str(classes_list[0])):
            correct += 1
    elif(class1_predicted[i] < class2_predicted[i]):
        if(df[df_len][i]==str(classes_list[1])):
            correct += 1

print("Accuracy: " + str(round(correct*100/num_instances, 2)) + "%")

Accuracy: 73.43%
