# Part 1: Naive Bayes from scrath

In [1]:
import pandas as pd
import numpy as np
import math

In [4]:
df = pd.read_csv(r"data.csv")
df = df.iloc[:, :-1] # drop last column Unnamed: 32

In [5]:
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


## Split into train and test

In [6]:
train_portion = 0.8
train_size = int(train_portion * len(df))
test_size = len(df) - train_size

# shuffle data
df = df.sample(frac=1, random_state=42)
df_train = df[:train_size]
df_test = df[train_size:]
print(f"number of training data: {len(df_train)}, number of test data: {len(df_test)}")

number of training data: 455, number of test data: 114


## Calculate prior probabilities

In [7]:
prior_m = (df_train['diagnosis'] == 'M').sum() / train_size
prior_b = (df_train['diagnosis'] == 'B').sum() / train_size
prior = {'M': prior_m, 'B': prior_b}

print(f"malignant prior={prior_m}, benign prior={prior_b}")

malignant prior=0.3626373626373626, benign prior=0.6373626373626373


## Calculate Mean and Variance for each feature, knowing its class

In [8]:
def mean(values):
    return values.sum() / len(values)

def var(values):
    column_mean = mean(values)
    return sum((x - column_mean) ** 2 for x in values) / (len(values))

def gaussian(mean, var, x):
    return (1 / np.sqrt(2*np.pi*var)) * np.exp(-(x-mean)**2 / (2*var))

# save the mean and variance of each column, conditioned on class
features = df.columns.tolist()[2:] # features names
mean_dict = {'M':{}, 'B': {}}
var_dict = {'M':{}, 'B': {}}

df_m = df_train.loc[df_train['diagnosis'] == 'M']
df_b = df_train.loc[df_train['diagnosis'] == 'B']

for f in features:
    mean_dict['M'][f] = mean(df_m[f])
    var_dict['M'][f] = var(df_m[f])

    mean_dict['B'][f] = mean(df_b[f])
    var_dict['B'][f] = var(df_b[f])

## Calculate Likelihood

In [23]:
def likelihood(m, v, features, x):
    # in order to prevent underflow,
    # calculate log p
    p = 0

    for f in features:
        g = gaussian(m[f], v[f], x[f])
        p += np.log(g)
    return p

## Prediction

In [24]:
def predict_sample(x):

    prob = 0
    class_pred = 'M'
    for c in ['M', 'B']:
        # pass mean and variance conditioned on the class
        p = np.log(prior[c]) + likelihood(mean_dict[c], var_dict[c], features, x)
        if p > prob:
            prob = p
            class_pred = c

    return class_pred

In [25]:
def predict_batch(df):
    preds = []
    for index in range(len(df)):
        row = df.iloc[index]
        preds.append(predict_sample(row))

    return preds

In [None]:
preds = predict_batch(df_test)
preds_binary = [1 if i == "M" else 0 for i in preds]
true_labels = [1 if i == "M" else 0 for i in df_test['diagnosis']]

In [33]:
tp = 0
tn = 0
fp = 0
fn = 0

for p, t in zip(preds_binary, true_labels):
    # correct prediction
    if p == t:
        if p:
            tp += 1
        else: tn += 1
    # wrong prediction
    else:
        if p:
            fp += 1
        else: fn += 1


In [34]:
total = len(df_test)
accuracy = (tp + tn) / total
precision = tp / (tp + fp)
recall = tp / (tp + fn)
print(f"accuracy = {accuracy}, precision = {precision}, recall = {recall}")

accuracy = 0.8771929824561403, precision = 0.8367346938775511, recall = 0.8723404255319149


In [37]:
confusion = np.array([[tn, fp], [fn, tp]])
print(confusion)

[[59  8]
 [ 6 41]]


# Part 2

In [17]:
from sklearn.model_selection import train_test_split

x_train = df_train[features]
y_train = [1 if i == "M" else 0 for i in df_train["diagnosis"]]

x_test = df_test[features]
y_test = [1 if i == "M" else 0 for i in df_test["diagnosis"]]

In [18]:
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(x_train, y_train)

In [19]:
y_pred = nb.predict(x_test)

In [21]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("accuracy = ", accuracy_score(y_test, y_pred))
print()
print(classification_report(y_test, y_pred))

accuracy =  0.9210526315789473

              precision    recall  f1-score   support

           0       0.90      0.97      0.94        67
           1       0.95      0.85      0.90        47

    accuracy                           0.92       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.92      0.92      0.92       114



In [22]:
print(confusion_matrix(y_test, y_pred))

[[65  2]
 [ 7 40]]
