In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import combinations_with_replacement
from sklearn.model_selection import cross_val_score
import math

# Prepare the data for cross-validation and KNN

In [34]:
names = ['Sample_code_number', 'Clump_Thickness', 'Uniformity_of_Cell_Size', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses','Class']
data = pd.read_csv('breast-cancer-wisconsin.data', sep=',',header=None, names = names, na_values=['?'])
#print(data.shape)
#print(data.isnull().sum())
data = data.dropna() # no NA in the df
#print(data.shape)
np.random.seed(42)
indices = np.random.permutation(len(data))
n_training_samples = 400
# split into train and test datasets 
learnset_data = data.iloc[indices[-n_training_samples:]]
testset_data = data.iloc[indices[:-n_training_samples]]
# get the data and labels in train and train datasets 
learnset_data_ds = learnset_data.drop(['Class'],axis=1)
learnset_data_class = learnset_data['Class']
learnset_data_ds_array = np.array(learnset_data_ds)
learnset_data_class_array = np.array(learnset_data_class)
# get the data and labels in test and test datasets 
testset_data_ds = testset_data.drop(['Class'],axis=1)
testset_data_class = testset_data['Class']
testset_data_ds_array = np.array(testset_data_ds)
testset_data_class_array = np.array(testset_data_class)

# Write the function of KNN

In [3]:
def distance(instance1, instance2):
    instance1 = np.array(instance1) 
    instance2 = np.array(instance2)  
    return np.linalg.norm(instance1 - instance2)
# find new points distance to the all points in the training set 
# it needs to compute the euclidean distance between the “new” observation and all the data points in the training set. It must then select the K nearest ones and perform a majority vote. It then assigns the corresponding label to the observation.
def get_neighbors(training_set, 
                  labels, 
                  test_instance, 
                  k,
                  distance):
    distances = []
    for index in range(len(training_set)):
        dist = distance(test_instance, training_set[index])
        distances.append((training_set[index], dist, labels[index]))
    distances.sort(key=lambda x: x[1])# sort the distance for getting the neighbors
    neighbors = distances[:k]
    return neighbors

In [4]:
# Split the data into 10 parts
split = np.array_split(learnset_data, 10)
split_array = [np.array(x) for x in split]
for k in list(range(2,9))+[17,33]:
    ave_accuracy = []
    for i in range(len(split)):
        test_cv_data = split_array[i][:, 0:9]
        test_cv_labels = split_array[i][:, 9]
        train_cv_data = (np.concatenate(split_array[0:i] + split_array[i:10]))[:, 0:9]
        train_cv_labels = (np.concatenate(split_array[0:i] + split_array[i:10]))[:, 9]
        label_pred = []
        for i in range(len(test_cv_data)):
            #predictions = []
            neighbors = get_neighbors(train_cv_data, train_cv_labels, test_cv_data[i],k,distance = distance)
            raw = test_cv_labels[i].tolist()
            #print(predictions0,predictions1,predictions2,testset_data_class_array[i]) 
            #ls = np.array([predictions0]+[predictions1]+[predictions2]+[raw])
            ls = []
            for j in neighbors:
                ls.append(j[2])
            #print(neighbors)
            pred = list(set([i for i in ls if ls.count(i)>(k/2)]))
            count_TN = [0]
            count_FP = [0]
            count_FN = [0]
            count_TP = [0]
            new_ls = pred + [raw] + count_TN + count_FP + count_FN + count_TP
            #print(new_ls)
            label_pred.append(new_ls)
        # accuracy score
        for i in range(len(label_pred)):
            if (label_pred[i][0] == 2 and label_pred[i][1]==2):
                label_pred[i][2] = 1  # TN, raw=2, pred=2
            if (label_pred[i][0] == 2 and label_pred[i][1]==4):
                label_pred[i][3] = 1  # FP, raw=2, pred=4
            if (label_pred[i][0] == 4 and label_pred[i][1]==2):
                label_pred[i][4] = 1  # FN, raw=4, pred=2
            if (label_pred[i][0] == 4 and label_pred[i][1]==4):
                label_pred[i][5] = 1  # TP, raw=4 , pred=4

        df_pred = pd.DataFrame(label_pred)
        #print(df_pred)
        TN = df_pred.iloc[: ,2].sum()
        #print(TN)
        FP = df_pred.iloc[: ,3].sum()
        FN = df_pred.iloc[: ,4].sum()
        TP = df_pred.iloc[: ,5].sum()
        # confusion matrix
        # Accuracy = (TN + TP) / (TN + TP + FN + FP)
        Acuracy = (TN + TP) / (TN + TP + FN + FP)
        # TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
        TPR = TP / (TP + FN)
        # PPV (positive predictive value or precision) = TP / (TP + FP)
        PPV = TP / (TP + FP)
        # TNR (true negative rate or specificity) = TN / (TN + FP)
        TNR = TN / (TN + FP)
        # F1 Score = 2 × PPV × TPR / (PPV + TPR)
        F1_Score = 2 * PPV * TPR / (PPV + TPR)
        #print(Acuracy) #, TPR, PPV, TNR, F1_Score
        #print(TN,FP,FN,TP)
        ave_accuracy.append(Acuracy)
    print('acc score for k=%d is %.3f'%(k,sum(ave_accuracy)/10))

acc score for k=2 is 1.000
acc score for k=3 is 0.975
acc score for k=4 is 0.987
acc score for k=5 is 0.972
acc score for k=6 is 0.977
acc score for k=7 is 0.972
acc score for k=8 is 0.975
acc score for k=17 is 0.967
acc score for k=33 is 0.965


In [5]:
label_pred = []
for i in range(len(testset_data_ds_array)):
    #predictions = []
    neighbors = get_neighbors(learnset_data_ds_array, learnset_data_class_array, testset_data_ds_array[i],2,distance = distance)
    raw = testset_data_class_array[i].tolist()
    #print(predictions0,predictions1,predictions2,testset_data_class_array[i]) 
    #ls = np.array([predictions0]+[predictions1]+[predictions2]+[raw])
    ls = []
    for j in neighbors:
        ls.append(j[2])
    #print(neighbors)
    pred = list(set([i for i in ls if ls.count(i)>1]))
    count_TN = [0]
    count_FP = [0]
    count_FN = [0]
    count_TP = [0]
    new_ls = pred + [raw] + count_TN + count_FP + count_FN + count_TP
    #print(new_ls)
    label_pred.append(new_ls)
# accuracy score
for i in range(len(label_pred)):
    if (label_pred[i][0] == 2 and label_pred[i][1]==2):
        label_pred[i][2] = 1  # TN, raw=2, pred=2
    if (label_pred[i][0] == 2 and label_pred[i][1]==4):
        label_pred[i][3] = 1  # FP, raw=2, pred=4
    if (label_pred[i][0] == 4 and label_pred[i][1]==2):
        label_pred[i][4] = 1  # FN, raw=4, pred=2
    if (label_pred[i][0] == 4 and label_pred[i][1]==4):
        label_pred[i][5] = 1  # TP, raw=4 , pred=4

df_pred = pd.DataFrame(label_pred)
#print(df_pred)
TN = df_pred.iloc[: ,2].sum()
#print(TN)
FP = df_pred.iloc[: ,3].sum()
FN = df_pred.iloc[: ,4].sum()
TP = df_pred.iloc[: ,5].sum()
# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print(Acuracy, TPR, PPV, TNR, F1_Score) 
print(TN,FP,FN,TP)

0.9702602230483272 0.9883720930232558 0.9239130434782609 0.9617486338797814 0.9550561797752809
176 7 1 85.0


# Decision Tree

In [30]:
np.random.permutation(20)
# Split a dataset into k folds
def cross_validation_split(data, k_folds):
    data_split = []
    fold_size = int(len(data) / k_folds)
    indices = np.random.permutation(len(data))
    for i in range(k_folds):
        data_split.append(data[indices[i*fold_size:(i+1)*fold_size]])
    return data_split
## impurity index
# Calculate the Gini index for a split dataset
def gini_index(train_data, train_label):
    p, n = sum(train_label==4), sum(train_label==2)
    if p==0 or n==0:
        impurity = 0
    else:
        impurity = 1 - (p/(p+n))**2 - (n/(p+n))**2
    return impurity 

def entropy(train_data, train_label):
    p, n = sum(train_label==4), sum(train_label==2)
    if p==0 or n==0:
        impurity = 0
    else:
        impurity = -1*(p/(p+n)*math.log(p/(p+n),2) + n/(p+n)*math.log(n/(p+n),2))
    return impurity
def misclassification_error(train_data, train_label):
    p, n = sum(train_label==4), sum(train_label==2)
    if p==0 or n==0:
        return 0
    else:
        impurity = 1- max(p/(p+n), n/(p+n))
    return impurity

def decrease(train_data, train_label, metric, attr, thre):
    x1, y1 = train_data[train_data[:, attr] > thre], train_label[train_data[:, attr] > thre]
    x2, y2 = train_data[train_data[:, attr] < thre], train_label[train_data[:, attr] < thre]
    
    if metric == 'gini':
        impurity = gini_index(train_data, train_label)
        impurity_split = len(y1) / len(train_label) * gini_index(x1, y1) + len(y2) /len(train_label) * gini_index(x2, y2)
        
    elif metric == 'entropy':
        impurity = entropy(train_data, train_label)
        impurity_split = len(y1) / len(train_label) * entropy(x1, y1) + len(y2) /len(train_label) * entropy(x2, y2)
        
    else:
        impurity = misclassification_error(train_data, train_label)
        impurity_split = len(y1) / len(train_label) * misclassification_error(x1, y1) + len(y2) /len(train_label) * misclassification_error(x2, y2)

    # return impurity decrease
    return impurity - impurity_split
## chose threshold

def chose_thre(train_data, train_label, attr, metric):
    values = set(train_data[:, attr])    # unique values
    values_sort = sorted(list(values))
    attr_thre = 0
    max_im_dec = float('-inf')

    # try all thresholds values 
    for i in range(len(values_sort)-1):
        thre = (values_sort[i] + values_sort[i+1])/2
        im_dec = decrease(train_data, train_label, metric, attr, thre)
        if im_dec > max_im_dec:
            max_im_dec = im_dec
            attr_thre = thre
    # find the best attr threshold
    return attr_thre

# select the best attr based on the informaiton gain and threshold
def chose_attr(train_data, train_label, metric):
    max_im_dec = float('-inf')
    best_thre = None
    best_attr = None
    for attr in range(train_data.shape[1]):
        thre = chose_thre(train_data, train_label, attr, metric)
        im_dec  = decrease(train_data, train_label, metric, attr, thre)
        if im_dec > max_im_dec:
            max_im_dec = im_dec
            best_attr = attr
            best_thre = thre

    return best_attr, best_thre

class Node:
    def __init__(self, attribute, threshold):
        self.attr = attribute
        self.thre = threshold
        self.height = None
        self.left = None
        self.right = None
        self.leaf = False
        self.predict = None
        
## generate tree
def generate_tree(train_data, train_label, metric, max_depth, impurity_thre,depth=1):
    P, N = sum(train_label==4), sum(train_label==2)
    
    if P==0 or N==0:
        leaf = Node(None, None)
        leaf.leaf = True
        leaf.predict = 4 if P > N else 2
        return leaf       
    
    # max_depth    
    elif depth >= max_depth:
        leaf = Node(None, None)
        leaf.leaf = True
        leaf.predict = 4 if P > N else 2
        return leaf
    else:
        best_attr, best_thre = chose_attr(train_data, train_label, metric)
        impurity = decrease(train_data, train_label, metric, best_attr, best_thre)

        '''
           if metric == 'gini':
            best_attr, best_thre = chose_attr(train_data, train_label, metric)
            impurity = decrease(train_data, train_label, 'gini', best_attr, best_thre)
        elif metric == 'entropy':
            best_attr, best_thre = chose_attr(train_data, train_label)
            impurity = decrease(train_data, train_label, 'entropy', best_attr, best_thre)
        else:
            best_attr, best_thre = chose_attr(train_data, train_label)
            impurity = decrease(train_data, train_label, 'misclassification_error', best_attr, best_thre)        


        '''


    # impurity threshold
    if impurity < impurity_thre:
        # create a leaf none, since it is well separated
        leaf = Node(None, None)
        leaf.leaf = True
        leaf.predict = 4 if P > N else 2
        return leaf

    else:
        root = Node(best_attr, best_thre)

        # split data 
        x1 = train_data[train_data[:, best_attr] < best_thre]
        y1 = train_label[train_data[:, best_attr] < best_thre]
        x2 = train_data[train_data[:, best_attr] > best_thre]
        y2 = train_label[train_data[:, best_attr] > best_thre] 
    # recursively bulid the tree
        root.left = generate_tree(x1, y1,metric, max_depth, impurity_thre, depth+1)
        root.right = generate_tree(x2, y2,metric, max_depth, impurity_thre, depth+1)

        return root
def accuracy_score(pred_labels, labels):
    TP = sum((pred_labels==4) & (labels==4))
    TN = sum((pred_labels==2) & (labels==2))
    return (TP + TN) / len(labels)

## define a function to get the prediction for each obs
def predict_row(node, row):
    if node.leaf:
        return node.predict
    if row[node.attr] < node.thre:
        return predict_row(node.left, row)
    else:
        return predict_row(node.right, row)
    
# predict testing data
def predict(root, test_data):
    predictions = []
    for row in test_data:
        pred = predict_row(root, row)
        predictions.append(pred)
    return np.array(predictions)
# the height of a tree   
def depth(tree):
    depth_left = 0
    depth_right = 0
    if tree.left:
        depth_left = depth(tree.left)
    if tree.right:
        depth_right = depth(tree.right)
    if depth_left > depth_right:
        return depth_left + 1
    else:
        return depth_right + 1
# Evaluate an algorithm using a cross validation split
def evaluate_algorithm(dataset, n_folds, metric, max_depth, impurity_thre):
    X = cross_validation_split(dataset, n_folds)
    scores = []
    for i in range(len(X)):
        test_cv_data = X[i][:, 0:-1]
        test_cv_labels = X[i][:, -1]
        train_cv_data = (np.concatenate(X[:i] + X[i:]))[:, 0:-1]
        train_cv_labels = (np.concatenate(X[:i] + X[i:]))[:, -1]
        
        # run decision tree
        tree = generate_tree(train_cv_data, train_cv_labels, metric, max_depth, impurity_thre,depth=1)
        predictions = predict(tree, test_cv_data)
        scores.append(accuracy_score(predictions, test_cv_labels))
    return sum(scores) / len(scores)

# Entropy

In [7]:

# max_depth = 10, impurity decrease is 0
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_data_ds_array,learnset_data_class_array, 'entropy', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_data_ds_array)
labels = testset_data_class_array

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)

Accuracy is 0.933, TPR is 0.843, PPV is 0.966
TNR is 0.983, F1_Score is 0.901
178 3 16 86


# Gini

In [8]:

# max_depth = 10, impurity decrease is 0
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_data_ds_array,learnset_data_class_array, 'gini', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_data_ds_array)
labels = testset_data_class_array

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)

Accuracy is 0.919, TPR is 0.882, PPV is 0.891
TNR is 0.939, F1_Score is 0.887
170 11 12 90


In [9]:
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_data_ds_array,learnset_data_class_array, 'misclassification', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_data_ds_array)
labels = testset_data_class_array

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)

Accuracy is 0.940, TPR is 0.873, PPV is 0.957
TNR is 0.978, F1_Score is 0.913
177 4 13 89


# PCA analysis

In [11]:
X = np.array(learnset_data)[:,:-1]
Y = np.array(learnset_data)[:,-1]
X_mean = X-X.mean()
u, s, vh = np.linalg.svd(X_mean)
np.cumsum(s**2) / sum(s**2)  # 5 pcs
X_pca = X_mean @ vh.T[:,:5]
learnset_pca_data = np.column_stack([X_pca, Y])
testset_pca_data = (testset_data_ds_array - testset_data_ds_array.mean()) @ vh.T[:,:5]
### tune parameters
score_pca_matrix = np.zeros((9,6))

for d in range(2,11):  #max_depth
    for i in [x/100 for x in range(0,25,5)]:
        score = evaluate_algorithm(learnset_pca_data, 5, 'entropy', d, i)
        score_pca_matrix[d-2, int(i*20)] = score
        print('Accu score for depth=%d, impurity=%.3f is %.3f' %(d, i, score))

Accu score for depth=2, impurity=0.000 is 0.970
Accu score for depth=2, impurity=0.050 is 0.970
Accu score for depth=2, impurity=0.100 is 0.970
Accu score for depth=2, impurity=0.150 is 0.970
Accu score for depth=2, impurity=0.200 is 0.970
Accu score for depth=3, impurity=0.000 is 0.972
Accu score for depth=3, impurity=0.050 is 0.970
Accu score for depth=3, impurity=0.100 is 0.970
Accu score for depth=3, impurity=0.150 is 0.970
Accu score for depth=3, impurity=0.200 is 0.970
Accu score for depth=4, impurity=0.000 is 0.975
Accu score for depth=4, impurity=0.050 is 0.972
Accu score for depth=4, impurity=0.100 is 0.970
Accu score for depth=4, impurity=0.150 is 0.970
Accu score for depth=4, impurity=0.200 is 0.970
Accu score for depth=5, impurity=0.000 is 0.982
Accu score for depth=5, impurity=0.050 is 0.980
Accu score for depth=5, impurity=0.100 is 0.970
Accu score for depth=5, impurity=0.150 is 0.970
Accu score for depth=5, impurity=0.200 is 0.970
Accu score for depth=6, impurity=0.000 i

# Entropy

In [None]:
testset_pca_data = (testset_data_ds_array - testset_data_ds_array.mean()) @ vh.T[:,:5]

In [31]:
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_pca_data[:, :5],learnset_data_class_array, 'entropy', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_pca_data)
labels = testset_data_class_array
print('The depth of tree is %d' %depth(tree))

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)

The depth of tree is 10
Accuracy is 0.951, TPR is 0.912, PPV is 0.949
TNR is 0.972, F1_Score is 0.930
176 5 9 93


In [21]:
testset_pca_data.shape

(283, 5)

# Gini

In [32]:
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_pca_data[:, :5],learnset_data_class_array, 'gini', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_pca_data)
labels = testset_data_class_array
print('The depth of tree is %d' %depth(tree))

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)


The depth of tree is 10
Accuracy is 0.951, TPR is 0.892, PPV is 0.968
TNR is 0.983, F1_Score is 0.929
178 3 11 91


# Misclassification 

In [33]:
max_depth, impurity_thre = 10, 0
tree = generate_tree(learnset_pca_data[:, :5],learnset_data_class_array, 'misclassification', max_depth, impurity_thre,depth=1)
predicted_labels = predict(tree, testset_pca_data)
labels = testset_data_class_array
print('The depth of tree is %d' %depth(tree))

# performance metrics
FP = sum((labels==2) & (predicted_labels==4))
FN = sum((labels==4) & (predicted_labels==2))
TP = sum((labels==4) & (predicted_labels==4))
TN = sum((labels==2) & (predicted_labels==2))

# confusion matrix
# Accuracy = (TN + TP) / (TN + TP + FN + FP)
Acuracy = (TN + TP) / (TN + TP + FN + FP)
# TPR (true positive rate, recall, or sensitivity) = TP / (TP + FN)
TPR = TP / (TP + FN)
# PPV (positive predictive value or precision) = TP / (TP + FP)
PPV = TP / (TP + FP)
# TNR (true negative rate or specificity) = TN / (TN + FP)
TNR = TN / (TN + FP)
# F1 Score = 2 × PPV × TPR / (PPV + TPR)
F1_Score = 2 * PPV * TPR / (PPV + TPR)
print('Accuracy is %.3f, TPR is %.3f, PPV is %.3f\nTNR is %.3f, F1_Score is %.3f' %(Acuracy, TPR, PPV, TNR, F1_Score))
print(TN,FP,FN,TP)

The depth of tree is 10
Accuracy is 0.972, TPR is 0.961, PPV is 0.961
TNR is 0.978, F1_Score is 0.961
177 4 4 98
