In [15]:
### Packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import metrics as skm
import matplotlib.pyplot as plt

In [16]:
### Loading and cleaning dataset 1
nhanes = pd.read_csv("dataset1/NHANES_age_prediction.csv")

## Rename columns
nhanes = nhanes.rename(columns = {
    "SEQN":"Index",
    "RIDAGEYR":"Age",
    "RIAGENDR":"Gender",
    "PAQ605":"Fitness",
    "BMXBMI":"BMI",
    "LBXGLU":"Blood_glucose",
    "DIQ010":"Diabetic",
    "LBXGLT": "Oral",
    "LBXIN": "Insulin"
})

## Checking for missing values
# no missing values!
nhanes.isnull().sum()
nhanes.describe()

# Wonky value of 7 for 1 row in "Fitness"
nhanes["Fitness"].value_counts()
# Dropping the row:
nhanes = nhanes.drop(nhanes[nhanes["Fitness"] == 7].index)
# Verify:
nhanes["Fitness"].value_counts()

2.0    1868
1.0     409
Name: Fitness, dtype: int64

In [29]:
### Loading and cleaning dataset 2
bcw = pd.read_csv("dataset2/breast-cancer-wisconsin.csv")

# Removing the first column as it contains ids that we don't need
bcw = bcw.drop(bcw.columns[0], axis=1)

# Creating column names
column_names = ["clump_thickness","cell_uniformity","cell_shape",
                "marginal_adhesion","epithereal_cell_size","bare_nuclei",
                "bland_chromatin","normal_nucleoli","mitoses","class"]
bcw.columns = column_names

# Replacing all '?' characters with NaN
bcw.replace('?', np.nan, inplace=True)

# Converting all rows to numeric values, setting any rows that can't be converted to NaN
bcw = bcw.apply(pd.to_numeric, errors='coerce')

# Dropping all rows with NaN
bcw = bcw.dropna()

bcw["class"] = [0 if x == 2 else 1 for x in bcw["class"]]

bcw = bcw.rename(columns = {"class": "malignant"})

bcw.head()

Unnamed: 0,clump_thickness,cell_uniformity,cell_shape,marginal_adhesion,epithereal_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,5,4,4,5,7,10.0,3,2,1,0
1,3,1,1,1,2,2.0,3,1,1,0
2,6,8,8,1,3,4.0,3,7,1,0
3,4,1,1,3,2,1.0,3,1,1,0
4,8,10,10,8,7,10.0,9,7,1,1


In [18]:
### ----- Dataset 1 Summary stats ----- ###

## Gender
print("Gender proportions by age group:")
print(nhanes.groupby("age_group")["Gender"].value_counts(normalize = True),
      "\n\n")
# No apparent impact of gender upon age group

## Fitness
nhanes["Fitness"] = [0 if x == 2 else 1 for x in nhanes["Fitness"]]

print("Fitness levels by age group:")
print(nhanes.groupby("age_group")["Fitness"].value_counts(normalize = True),
      "\n\n")
# Fitness does appear to predict/depend upon age group

## BMI
print("BMI summary by age group:\n",
      nhanes.groupby("age_group")["BMI"].describe(),
      "\n\n")
# not crazy different, but may be signifcant

## Blood Glucose
print("Blood glucose summary by age group:\n",
      nhanes.groupby("age_group")["Blood_glucose"].describe(),
      "\n\n")
# the seniors have noticeably higher blood glucose levels

## Diabetic
# 1: Yes diabetes
# 2: No diabetes (to be -> 0)
# 3: Borderline (to be -> 1)

nhanes["Diabetic"] = [0 if x == 2 else 1 for x in nhanes["Diabetic"]]


print("Diabetic value counts by age group:\n",
      nhanes["Diabetic"].value_counts(),
      "\n\n")
# Values are 2, 3, 1. 2 means not-diabetic, don't know what 1 and 3 mean
print(nhanes.groupby("age_group")["Diabetic"].value_counts(normalize = True))
# Higher proportion of 1s and 3s among seniors -- prolly big indicator

## Oral
nhanes.groupby("age_group")["Oral"].describe()
# Much higher among seniors rather than adults

## Insulin
nhanes.groupby("age_group")["Insulin"].describe()
# Lower among seniors vs adults


## Variables to consider in KNN:
#
# - Fitness (categorical -- 2 levels)
# - BMI (cont.)
# - Blood glucose (cont.)
# - Diabetic (categorical -- 3 levels)
# - Oral (cont.)
# - Insulin (cont.)


Gender proportions by age group:
age_group  Gender
Adult      2.0       0.512284
           1.0       0.487716
Senior     2.0       0.508242
           1.0       0.491758
Name: Gender, dtype: float64 


Fitness levels by age group:
age_group  Fitness
Adult      0          0.803450
           1          0.196550
Senior     0          0.909341
           1          0.090659
Name: Fitness, dtype: float64 


BMI summary by age group:
             count       mean       std   min   25%   50%   75%   max
age_group                                                           
Adult      1913.0  27.971877  7.526883  14.5  22.6  26.8  31.4  70.1
Senior      364.0  27.886264  5.574166  16.8  24.2  27.2  30.6  52.2 


Blood glucose summary by age group:
             count        mean        std   min   25%    50%    75%    max
age_group                                                                
Adult      1913.0   98.638787  18.258651  63.0  91.0   96.0  103.0  405.0
Senior      364.0  104.3296

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
age_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adult,1913.0,12.110774,10.061061,0.14,5.99,9.2,14.8,102.29
Senior,364.0,10.405247,7.530538,1.02,5.2475,8.465,13.2125,52.89


In [19]:
## Distance functions

euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))
manhattan = lambda x1, x2: np.sum(np.abs(x1 - x2), axis=-1)


In [20]:
### Implementation of KNN

# This class will represent an instance of the KNN model with a static K
## Distance functions:
euclidean = lambda x1, x2: np.sqrt(np.sum((x1 - x2)**2, axis=-1))
manhattan = lambda x1, x2: np.sum(np.abs(x1 - x2), axis=-1)

## Defining the KNN class
class KNN:

    def __init__(self, K = 1, dist_fn = euclidean):
        self.dist_fn = dist_fn
        self.K = K
        return
    
    def fit(self, x, y):
        self.x = x
        self.y = y
        self.C = np.max(y) + 1
        return self
    
    def predict(self, x_test):
        num_test = x_test.shape[0]

        distances = self.dist_fn(self.x[None,:,:], x_test[:,None,:])
        #ith-row of knns stores the indices of k closest training samples to the ith-test sample 
        knns = np.zeros((num_test, self.K), dtype=int)
        #ith-row of y_prob has the probability distribution over C classes
        y_prob = np.zeros((num_test, self.C))
        for i in range(num_test):
            # print(i)
            knns[i,:] = np.argsort(distances[i])[:self.K]
            # print(knns[i,:])
            y_prob[i,:] = np.bincount(self.y[knns[i,:]], minlength=self.C) #counts the number of instances of each class in the K-closest training samples
        #y_prob /= np.sum(y_prob, axis=-1, keepdims=True)
        #simply divide by K to get a probability distribution
        y_prob /= self.K
        return y_prob, knns


In [None]:
### Implementation of DT

class Node:
    def __init__(self, data_indices, parent):
        self.data_indices = data_indices                    #stores the data indices which are in the region defined by this node
        self.left = None                                    #stores the left child of the node 
        self.right = None                                   #stores the right child of the node
        self.split_feature = None                           #the feature for split at this node
        self.split_value = None                             #the value of the feature for split at this node
        if parent:
            self.depth = parent.depth + 1                   #obtain the dept of the node by adding one to dept of the parent 
            self.num_classes = parent.num_classes           #copies the num classes from the parent 
            self.data = parent.data                         #copies the data from the parent
            self.labels = parent.labels                     #copies the labels from the parent
            try:
                class_prob = np.bincount(self.labels[data_indices], minlength=self.num_classes) #this is counting frequency of different labels in the region defined by this node
            except:
                intlabels = self.labels.astype(int)
                class_prob = np.bincount(intlabels[data_indices], minlength=self.num_classes)
            self.class_prob = class_prob / np.sum(class_prob)  #stores the class probability for the node
            #note that we'll use the class probabilites of the leaf nodes for making pr


def greedy_test(node, cost_fn):
    #initialize the best parameter values
    best_cost = np.inf
    best_feature, best_value = None, None
    num_instances, num_features = node.data.shape
    #sort the features to get the test value candidates by taking the average of consecutive sorted feature values 
    data_sorted = np.sort(node.data[node.data_indices],axis=0)
    test_candidates = (data_sorted[1:] + data_sorted[:-1]) / 2.
    for f in range(num_features):
        #stores the data corresponding to the f-th feature
        data_f = node.data[node.data_indices, f]
        for test in test_candidates[:,f]:
            #Split the indices using the test value of f-th feature
            left_indices = node.data_indices[data_f <= test]
            right_indices = node.data_indices[data_f > test]
            #we can't have a split where a child has zero element
            #if this is true over all the test features and their test values  then the function returns the best cost as infinity
            if len(left_indices) == 0 or len(right_indices) == 0:                
                continue
            #compute the left and right cost based on the current split                                                         
            left_cost = cost_fn(node.labels[left_indices])
            right_cost = cost_fn(node.labels[right_indices])
            num_left, num_right = left_indices.shape[0], right_indices.shape[0]
            #get the combined cost using the weighted sum of left and right cost
            cost = (num_left * left_cost + num_right * right_cost)/num_instances
            #update only when a lower cost is encountered
            if cost < best_cost:
                best_cost = cost
                best_feature = f
                best_value = test
    return best_cost, best_feature, best_value


#computes misclassification cost by subtracting the maximum probability of any class
def cost_misclassification(labels):
    try:
        counts = np.bincount(labels) 
    except:
        counts = np.bincount(labels.astype(int)) 
    class_probs = counts / np.sum(counts)
    #you could compress both the steps above by doing class_probs = np.bincount(labels) / len(labels)
    return 1 - np.max(class_probs)

#computes entropy of the labels by computing the class probabilities
def cost_entropy(labels):
    try:
        class_probs = np.bincount(labels) / len(labels)
    except:
        class_probs = np.bincount(labels.astype(int)) / len(labels)
    class_probs = class_probs[class_probs > 0]              #this steps is remove 0 probabilities for removing numerical issues while computing log
    return -np.sum(class_probs * np.log2(class_probs))       #expression for entropy -\sigma p(x)log[p(x)]

#computes the gini index cost
def cost_gini_index(labels):
    try:
        class_probs = np.bincount(labels) / len(labels)
    except:
        class_probs = np.bincount(labels.astype(int)) / len(labels)
    return 1 - np.sum(np.square(class_probs))               #expression for gini index 1-\sigma p(x)^2



class DecisionTree:
    def __init__(self, num_classes=None, max_depth=3, cost_fn="cost_entropy", min_leaf_instances=1):
        self.max_depth = max_depth      #maximum dept for termination 
        self.root = None                #stores the root of the decision tree 
        if cost_fn == "cost_entropy":
            self.cost_fn = cost_entropy
        elif cost_fn == "cost_gini_index":
            self.cost_fn = cost_gini_index
        elif cost_fn == "cost_misclassification":
            self.cost_fn = cost_misclassification
        else:
            self.cost_fn = cost_entropy
        self.num_classes = num_classes  #stores the total number of classes
        self.min_leaf_instances = min_leaf_instances  #minimum number of instances in a leaf for termination
        
    def fit(self, data, labels):
        self.data = data
        self.labels = labels
        if self.num_classes is None:
            self.num_classes = int(np.max(labels) + 1)
            #self.num_classes = len(np.unique(labels))
        #below are initialization of the root of the decision tree
        self.root = Node(np.arange(data.shape[0]), None)
        self.root.data = data
        self.root.labels = labels
        self.root.num_classes = self.num_classes
        self.root.depth = 0
        #to recursively build the rest of the tree
        self._fit_tree(self.root)
        return self

    def _fit_tree(self, node):
        #This gives the condition for termination of the recursion resulting in a leaf node
        if node.depth == self.max_depth or len(node.data_indices) <= self.min_leaf_instances:
            return
        #greedily select the best test by minimizing the cost
        cost, split_feature, split_value = greedy_test(node, self.cost_fn)
        #if the cost returned is infinity it means that it is not possible to split the node and hence terminate
        if np.isinf(cost):
            return
        #print(f'best feature: {split_feature}, value {split_value}, cost {cost}')
        #to get a boolean array suggesting which data indices corresponding to this node are in the left of the split
        test = node.data[node.data_indices,split_feature] <= split_value
        #store the split feature and value of the node
        node.split_feature = split_feature
        node.split_value = split_value
        #define new nodes which are going to be the left and right child of the present node
        left = Node(node.data_indices[test], node)
        right = Node(node.data_indices[np.logical_not(test)], node)
        #recursive call to the _fit_tree()
        self._fit_tree(left)
        self._fit_tree(right)
        #assign the left and right child to present child
        node.left = left
        node.right = right
    
    def predict(self, data_test):
        class_probs = np.zeros((data_test.shape[0], self.num_classes))
        #class_probs = []
        for n, x in enumerate(data_test):
            node = self.root
            #loop along the dept of the tree looking region where the present data sample fall in based on the split feature and value
            while node.left:
                if x[node.split_feature] <= node.split_value:
                    node = node.left
                else:
                    node = node.right
            #the loop terminates when you reach a leaf of the tree and the class probability of that node is taken for prediction
            class_probs[n,:] = node.class_prob
            #class_probs.append(node.class_prob)
        return class_probs
    
    # Gets the prediction accuracy of the KNN model with an integer threshold
    def evaluate_threshold_acc(self, probabilities: list[float], actual_labels: list[int], pos_threshold: float) -> (float, list[int]):
        correct_predictions = 0
        predictions = []

        # Looping through each prediction and comparing it to the threshold
        for i in range(0, len(probabilities)):

            # Comparing prediction using the threshold
            probability = probabilities[i]
            actual = actual_labels[i]
            label_prediction = 0 if probability < pos_threshold else 1

            # Tracking the labels that are guessed
            predictions.append(label_prediction)

            # Checking to see if the prediction was correct
            if actual == label_prediction:
                correct_predictions += 1

        # Returning the proportion of correct predictions
        predict_acc = correct_predictions / actual_labels.size
        return (predict_acc, predictions)

In [21]:
## Scaling for Dataset 1

def feature_normalization(dataset):
    for c in dataset.columns:
        mean = dataset[c].mean()
        st_dev = dataset[c].std()

        dataset[c].apply(lambda x: (x - mean)/st_dev)

In [24]:
### KNN model 1: Dataset 1 with certain variables

# Model 1: continuous variables only for simplicity

nhanes["age_group"] = [0 if x == "Adult" else 1 for x in nhanes["age_group"]]

nhanes_m1 = nhanes[["BMI", "Blood_glucose", "Oral", "Insulin"]]
nhanes_target = nhanes["age_group"]


## Step 1: splitting data into train, validation, test, roughly 50%, 25%, 25%

X_train, X_test, y_train, y_test = train_test_split(
    nhanes_m1, nhanes_target, test_size = 0.25, random_state = 21

)

X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train, test_size = 0.33, random_state=22
)

print("Training features array dimensions:", X_train.shape)
print("Training target array dimensions:", y_train.shape, "\n")

print("Validation features array dimensions:", X_valid.shape)
print("Validation target array dimensions:", y_valid.shape, "\n")

print("Test features array dimensions:", X_test.shape)
print("Test target array dimensions:", y_test.shape, "\n")

X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

Training features array dimensions: (1143, 4)
Training target array dimensions: (1143,) 

Validation features array dimensions: (564, 4)
Validation target array dimensions: (564,) 

Test features array dimensions: (570, 4)
Test target array dimensions: (570,) 



In [25]:
## Running KNN on dataset 1
model_1 = KNN(K = 5)

model_1.fit(X_train, y_train)

print(y_train.shape)

m1_preds, m1_probs = model_1.predict(X_test)

accuracy = np.sum(m1_preds == y_test)/y_test.shape[0]

print(accuracy)
    
print(m1_probs[:10])
print(m1_preds[:10])
print(y_test[:10])


(1143,)


InvalidIndexError: (None, slice(None, None, None), slice(None, None, None))

In [None]:
# Decision Tree Section

print("\n----- DECISION TREE SECTION -----\n")

cost_functions = ["cost_misclassification", "cost_gini_index", "cost_entropy"]
max_max_depth = 10

#util functions
def evaluate_acc(pred_y, real_y):
    accurate_preds = 0
    for i in range(len(pred_y)):
        if pred_y[i] == real_y[i]:
            accurate_preds += 1

    train_accuracy = accurate_preds / len(pred_y)
    return train_accuracy

In [None]:
#TRAINING, PARAMETRIZING AND EVALUATION DT ON DATASET ONE

print("\n----- TRAINING ON DATASET ONE -----\n")

dataset_size = nhanes.shape[0]
num_cols = nhanes.shape[1]
nhanes = nhanes.to_numpy()

#change float col to int
nhanes[:, 5] *= 10#.astype(int)
nhanes[:, -1] *= 100#.astype(int)

for col in range(num_cols):
    nhanes[:, col] = (nhanes[:, col]).astype(int)

inds = np.random.permutation(dataset_size)
test_proportion = 0.25
validate_proportion = 0.25
test_size = int(test_proportion*dataset_size)
validate_size = int(validate_proportion*dataset_size)
train_size = dataset_size-test_size-validate_size


want_to_select = [True for _ in range(num_cols)]
#remove ID and age labels from X features
want_to_select[0] = False
want_to_select[1] = False
want_to_select[2] = False
x, y = nhanes[:,np.array(want_to_select)], nhanes[:,1]

x_train, y_train = x[inds[:train_size]], y[inds[:train_size]]
x_validate, y_validate = x[inds[train_size:train_size+validate_size]], y[inds[train_size:train_size+validate_size]]
x_test, y_test = x[inds[train_size+validate_size:]], y[inds[train_size+validate_size:]]

max_accuracy_function = None
max_accuracy_max_depth = None
max_accuracy = None

for fn in cost_functions:
    for max_depth in range(1,max_max_depth+1):
        DTmodel = DecisionTree(max_depth=max_depth, cost_fn=fn)
        DTmodel.fit(x_train, y_train)
        train_predictedClassProbs = DTmodel.predict(x_test)
        train_predictedClasses = []
        for v in train_predictedClassProbs:
            maxp = -1
            maxIndex = -1
            for i in range(len(v)):
                if v[i] > maxp:
                    maxp = v[i]
                    maxIndex = i
            train_predictedClasses.append(maxIndex)
        
        train_accurate_preds = 0
        for i in range(len(train_predictedClasses)):
            if train_predictedClasses[i] == y_test[i]:
                train_accurate_preds += 1

        train_accuracy = train_accurate_preds / len(train_predictedClasses)
        #print(f'TRAIN ACCURACY ON DATASET ONE OF DECISION TREE WITH COST FUNCTION {fn} AND MAX DEPTH {max_depth} IS {train_accuracy}')

        val_predictedClassProbs = DTmodel.predict(x_validate)
        val_predictedClasses = []
        for v in val_predictedClassProbs:
            maxp = -1
            maxIndex = -1
            for i in range(len(v)):
                if v[i] > maxp:
                    maxp = v[i]
                    maxIndex = i
            val_predictedClasses.append(maxIndex)

        #print(val_predictedClassProbs)
        #print(y_validate)
        val_accuracy = evaluate_acc(val_predictedClasses, y_validate)
        print(f'VALIDATION ACCURACY ON DATASET ONE OF DECISION TREE WITH COST FUNCTION {fn} AND MAX DEPTH {max_depth} IS {val_accuracy}')

        if max_accuracy is None or val_accuracy > max_accuracy:
            max_accuracy_function = fn
            max_accuracy_max_depth = max_depth
            max_accuracy = val_accuracy

print(f'BEST DECISION TREE MODEL FOR DATASET ONE HAS COST FUNCTION {max_accuracy_function} AND MAX DEPTH {max_accuracy_max_depth} WITH ACCURACY {max_accuracy}')

# Testing with test data
model = DecisionTree(max_depth=max_accuracy_max_depth, cost_fn=max_accuracy_function)

# Training the model
model.fit(x_train, y_train)

# Predicting labels
probabilities = [x[1] for x in model.predict(x_test)]

# Checking the prediction accuracy with a threshold of 0.5
accuracy, _ = model.evaluate_threshold_acc(probabilities, y_test, 0.5)
print("Got accuracy on test data of " + str(round(accuracy, 2)))

# Computing AUROC
print("\nGetting the AUROC score.")
fpr, tpr, thresholds = skm.roc_curve(y_test, probabilities)
# Compute AUC
auc = skm.roc_auc_score(y_test, probabilities)

# Plotting
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='darkgrey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Example')
plt.legend(loc="lower right")
plt.show()

In [None]:
#TRAINING, PARAMETRIZING AND EVALUATION DT ON DATASET TWO

print("\n----- TRAINING ON DATASET TWO -----\n")

dataset_size = bcw.shape[0]
num_cols = bcw.shape[1]
bcw = bcw.to_numpy().astype(int)
'''for i in range(dataset_size):
    for j in range(num_cols):
        if bcw[i, j] == 2:
            bcw[i, j] = 0
        elif bcw[i, j] == 4:
            bcw[i,j] = 1'''

inds = np.random.permutation(dataset_size)

test_proportion = 0.25
validate_proportion = 0.25
test_size = int(test_proportion*dataset_size)
validate_size = int(validate_proportion*dataset_size)
train_size = dataset_size-test_size-validate_size

x, y = bcw[:,:-1], bcw[:,-1]

x_train, y_train = x[inds[:train_size]], y[inds[:train_size]]
x_validate, y_validate = x[inds[train_size:train_size+validate_size]], y[inds[train_size:train_size+validate_size]]
x_test, y_test = x[inds[train_size+validate_size:]], y[inds[train_size+validate_size:]]

max_accuracy_function = None
max_accuracy_max_depth = None
max_accuracy = None
for fn in cost_functions:
    for max_depth in range(1,max_max_depth+1):
        DTmodel = DecisionTree(max_depth=max_depth, cost_fn=fn)
        DTmodel.fit(x_train, y_train)
        train_predictedClassProbs = DTmodel.predict(x_test)
        train_predictedClasses = []
        for v in train_predictedClassProbs:
            maxp = -1
            maxIndex = -1
            for i in range(len(v)):
                if v[i] > maxp:
                    maxp = v[i]
                    maxIndex = i
            train_predictedClasses.append(maxIndex)

        train_accurate_preds = 0
        for i in range(len(train_predictedClasses)):
            if train_predictedClasses[i] == y_test[i]:
                train_accurate_preds += 1

        train_accuracy = train_accurate_preds / len(train_predictedClasses)
        #print(f'TRAIN ACCURACY ON DATASET TWO OF DECISION TREE WITH COST FUNCTION {fn} AND MAX DEPTH {max_depth} IS {train_accuracy}')

        val_predictedClassProbs = DTmodel.predict(x_validate)
        val_predictedClasses = []
        for v in val_predictedClassProbs:
            maxp = -1
            maxIndex = -1
            for i in range(len(v)):
                if v[i] > maxp:
                    maxp = v[i]
                    maxIndex = i
            val_predictedClasses.append(maxIndex)

        val_accuracy = evaluate_acc(val_predictedClasses, y_validate)
        print(f'VALIDATION ACCURACY ON DATASET TWO OF DECISION TREE WITH COST FUNCTION {fn} AND MAX DEPTH {max_depth} IS {val_accuracy}')

        if max_accuracy is None or val_accuracy > max_accuracy:
            max_accuracy_function = fn
            max_accuracy_max_depth = max_depth
            max_accuracy = val_accuracy

print(f'BEST DECISION TREE MODEL FOR DATASET TWO HAS COST FUNCTION {max_accuracy_function} AND MAX DEPTH {max_accuracy_max_depth} WITH ACCURACY {max_accuracy}')

# Testing with test data
model = DecisionTree(max_depth=max_accuracy_max_depth, cost_fn=max_accuracy_function)

# Training the model
model.fit(x_train, y_train)

# Predicting labels
probabilities = [x[1] for x in model.predict(x_test)]

# Checking the prediction accuracy with a threshold of 0.5
accuracy, _ = model.evaluate_threshold_acc(probabilities, y_test, 0.5)
print("Got accuracy on test data of " + str(round(accuracy, 2)))

# Computing AUROC
print("\nGetting the AUROC score.")
fpr, tpr, thresholds = skm.roc_curve(y_test, probabilities)

# Compute AUC
auc = skm.roc_auc_score(y_test, probabilities)

# Plotting
plt.figure()
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (area = %0.2f)' % auc)
plt.plot([0, 1], [0, 1], color='darkgrey', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Example')
plt.legend(loc="lower right")
plt.show()