# Assignment 3 Group no. [9]
### Project members: 
[Yuxia Wang yuxia@kth.se, Hansika Attanayake, ghat@kth.se, Sevket Melih Zenciroglu, smzen@kth.se]

### Declaration
By submitting this solution, it is hereby declared that all individuals listed above have contributed to the solution, either with code that appear in the final solution below, or with code that has been evaluated and compared to the final solution, but for some reason has been excluded. It is also declared that all project members fully understand all parts of the final solution and can explain it upon request.

It is furthermore declared that the code below is a contribution by the project members only, and specifically that no part of the solution has been copied from any other source (except for lecture slides at the course ID2214) and no part of the solution has been provided by someone not listed as project member above.

It is furthermore declared that it has been understood that no other library/package than the Python 3 standard library, NumPy, pandas and time may be used in the solution for this assignment.

### Instructions
All assignments starting with number 1 below are mandatory. Satisfactory solutions
will give 1 point (in total). If they in addition are good (all parts work more or less 
as they should), completed on time (submitted before the deadline in Canvas) and according
to the instructions, together with satisfactory solutions of assignments starting with 
number 2 below, then the assignment will receive 2 points (in total).

It is highly recommended that you do not develop the code directly within the notebook
but that you copy the comments and test cases to your regular development environment
and only when everything works as expected, that you paste your functions into this
notebook, do a final testing (all cells should succeed) and submit the whole notebook 
(a single file) in Canvas (do not forget to fill in your group number and names above).


## Load NumPy, pandas and time

In [3]:
import numpy as np
import pandas as pd
import time


## Reused functions from Assignment 1

In [4]:
# Copy and paste functions from Assignment 1 here that you need for this assignment

def create_imputation(dataframe):
    
    df = dataframe.copy()
    imputation = {}
    
    for col in df.columns:
        if col != "ID" and col != "CLASS":         
            if df[col].dtypes == "int" or df[col].dtypes == "float":           
                df[col].fillna(df[col].mean(),inplace=True)
                imputation[col] = (df[col].mean())            
            else:
                df[col].fillna(df[col].mode()[0],inplace=True)
                imputation[col] = (df["CLASS"].mode()[0])
            
    return df, imputation


def apply_imputation(dataframe,imputation):
    
    df = dataframe.copy()
    [df[col].fillna(imputation[col],inplace=True) for col in imputation]
    return df


def create_bins(dataframe,nobins=10,bintype="equal-width"):
    
    df = dataframe.copy()
    binning = {}
    for col in df.columns:
        if col != "CLASS" and col != "ID" and df[col].dtype in ["float64", "float32", "int64", "int32"]:
            if bintype == "equal-width":
                df[col], bins = pd.cut(df[col],nobins,retbins=True,duplicates="drop",labels=False)
                binning[col] = bins    
            elif bintype == "equal-size":
                df[col], bins = pd.qcut(df[col],q=nobins,retbins=True,duplicates="drop",labels=False)
                binning[col] = bins
            df[col] = df[col].astype("category")
            df[col] = df[col].cat.set_categories([str(i) for i in df[col].cat.categories], rename = True)
            binning[col][0] = -np.inf
            binning[col][-1] = np.inf
        else:
            df[col] = df[col].astype('category')
    
    return df, binning


def apply_bins(dataframe,binning):
    
    df = dataframe.copy()
    bin_labels = {}
    for col in binning:  
        bins = binning[col]
        df[col] = pd.cut(df[col],bins,labels=False)
        df[col] = df[col].astype("category")
        df[col] = df[col].cat.set_categories([str(i) for i in df[col].cat.categories], rename = True)        
    df = df.astype("category")
    return df


def split(dataframe, testfraction=0.5):
    
    df = dataframe.copy()
    df_random = df.reindex(np.random.permutation(df.index))
    trainingdf = df_random[0: int((1-testfraction)*df.shape[0])]
    testdf = df_random[int((1-testfraction)*df.shape[0])+1 : df.shape[0]]
    return trainingdf, testdf


def accuracy(dataframe, correctlabels):
    
    df = dataframe.copy()
    labels = df.idxmax(axis=1)
    truelabels = (labels == correctlabels).sum(axis=0)
    accuracy = truelabels/len(df)
    return accuracy


def folds(dataframe,nofolds=10):
    
    df = dataframe.copy()
    np.random.permutation(df.index) 
    folds = []
    for i in range(nofolds):
        folds.append(df[int(len(df)*i/nofolds) : int(len(df)*(i+1)/nofolds)])

    return folds


def brier_score(dataframe, corretlabels):
    
    df = dataframe.copy()
    correct_df = pd.get_dummies(corretlabels)
    brier_score = np.mean(np.sum((df - correct_df)**2, axis=1))
    
    return brier_score


# ROC_Henrik's way

def count_tp_fp(predictions_df, correctlabels):

    Score = predictions_df.iloc[:, 0]
    sorted_unique_score = np.unique(Score)[::-1]
    pos = np.zeros(len(sorted_unique_score))
    neg = np.zeros(len(sorted_unique_score))

    for s in range(len(sorted_unique_score)):
        for p in range(len(predictions_df)):
            if(sorted_unique_score[s] == predictions_df.iloc[p, 0]):
                if(predictions_df.columns[0] == correctlabels[p]):
                    pos[s] += 1
                else:
                    neg[s] += 1         
    
    return pos, neg

def calculate_AUC(pos, neg):
    # AUC = Area under ROC curve
    AUC = 0
    Cov_tp = 0
    n_tp = len(pos)
    Tot_tp = sum(pos)
    Tot_fp = sum(neg)
    
    for i in range(n_tp):    
        if(neg[i] == 0):
            Cov_tp += pos[i]
        elif(pos[i] == 0):
            AUC += (Cov_tp/Tot_tp)*(neg[i]/Tot_fp)
        else:
            AUC += (Cov_tp/Tot_tp)*(neg[i]/Tot_fp) + (pos[i]/Tot_tp)*(neg[i]/Tot_fp)/2
            Cov_tp += pos[i]
            #print('AUC_else = ', AUC)  
    return AUC

def auc(df, correctlabels):  
    
    class_frequency = dict(pd.Series(correctlabels).value_counts(normalize = True))   
    AUC = 0
    for col in df.columns:
        if(col in class_frequency.keys()):
            predictions_df = pd.DataFrame(df[col], columns=[col])
            pos, neg = count_tp_fp(predictions_df, correctlabels)
            area_col = calculate_AUC(pos, neg)            
            AUC += class_frequency[col] * area_col   
            
    return AUC

## 1. Define the class DecisionTree

In [5]:
# Define the class DecisionTree with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self: the object itself
#
# Output from __init__:
# nothing
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# binning, imputatiom, labels, model
#
# Input to fit:
# self: the object itself
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# nobins: no. of bins (default = 10)
# bintype: either "equal-width" (default) or "equal-size"
# min_samples_split: no. of instances required to allow a split (default = 5)
#
# Output from fit:
# nothing
#
# The result of applying this function should be:
#
# self.binning should be a discretization mapping (see Assignment 1) from df
# self.imputation should be an imputation mapping (see Assignment 1) from df
# self.labels should be the categories of the "CLASS" column of df, set to be of type "category" 
# self.model should be a decision tree (for details, see lecture slides), where the leafs return class probabilities
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Hint 1: First find the available features (excluding "CLASS" and "ID"), then find the class counts, e.g., using 
#         groupby, and calculate the default class probabilities (relative frequencies of the class labels)
# Hint 2: Define a function, e.g., called divide_and_conquer, that takes the above as input together with df 
#         and min_samples_split, and also a nodeno (starting with 0) to keep track of the generated nodes in the tree
# Hint 3: You may represent the tree under construction as a list of nodes (tuples), on the form:
#         (nodeno,"leaf",class_probabilities): corresponding to a leaf node where class_probabilities is a vector
#                                              with the relative class frequencies (ordered according to self.labels)
#         (nodeno,feature,node_dict): corresponding to an internal (non-leaf) node where node_dict is a mapping from
#                                     the possible values of feature to child nodes (their nodenos)
# Hint 4: You may evaluate each feature by a function information_content, which takes the group sizes
#         for each possible value of the feature together with the class counts of each group as input
# Hint 5: The best feature found (with lowest resulting information content) will be used to split the training
#         instances, and each sub-group is used for generating a sub-tree (recursively by divide_and_conquer,
#         see lecture slides for details)

# Hint 6: You may make divide_and_conquer return not only a list of nodes, but also a current_node_no; 
#         by this, each subsequent call to divide_and_conquer for each subset of instances, i.e. for each feature value, 
#         could use current_node_no as a starting point.
#         If you e.g. make the following call:
#
#         current_node_no, node_list = divide_and_conquer(current_node_no, ...)
#
#         then the returned value in current_node_no can be used in the next call to divide_and_conquer.
#         Node that node_list will contain an arbitrary number of tuples, each element corresponding to a node together 
#         with a node number. The first element in the list will have the same number as current_node_no when the call 
#         was made and the last element will have a number one less than current_node_no when returned, e.g., if there is
#         only one (leaf) node in the returned list, then current_node_no will only be incremented by one through the above call.
#         
# Hint 7: The list of nodes output by divide_and_conquer may finally be converted to an array, where each nodeno in the 
#         tuples corresponds to an index of the array 
#
# Input to predict:
# self: the object itself
# df: a dataframe
# 
# Output from predict:
# predictions: a dataframe with class labels as column names and the rows corresponding to
#              predictions with estimated class probabilities for each row in df, where the class probabilities
#              are the relative class frequencies in the leaves of the decision tree into which the instances in
#              df fall
#
# Hint 1: Drop any "CLASS" and "ID" columns first and then apply imputation and binning
# Hint 2: Iterate over the rows calling some sub-function, e.g., make_prediction(nodeno,row), which for a test row
#         finds a leaf node from which class probabilities are obtained
# Hint 3: This sub-function may recursively traverse the tree (represented by an array), starting with the nodeno
#         that corresponds to the root

class DecisionTree:
    
    def __init__(self):
        self.binning = None
        self.imputatiom = None
        self.labels = None
        self.model= None  
        self.node_list = None
        
    
    def select_feature(self, features, dataframe):
        
        def entropy(df):
            return(-np.sum(df*np.log(df/np.sum(df))))
        
        df = dataframe.copy()
        idx, minInf = 0, 1e5
        for i in range(len(features)):
            f = features[i]
            sub_df = df.groupby([f, 'CLASS'], observed=True, sort = False).size()
            inf = sub_df.groupby([f], sort = False).apply(entropy).sum()
            if(inf < minInf):
                minInf = inf
                idx = i
        curr_feature = features[idx]
        sub_features = features[:idx] + features[idx+1:]
        
        return curr_feature, sub_features
        

    def divide_and_conquer(self, dataframe, class_freq, min_samples_split, nodeno):
        current_node_no = nodeno
        df = dataframe.copy()
        df_temp = df.drop(['CLASS', 'ID'], axis=1)
        features = list(df_temp.columns)
        
        curr_feature, sub_features = self.select_feature(features,df)  
        values = df[curr_feature].unique()
        node_dict = {}
        # add the root node first and add the subtrees when self.divide_and_conquer is called again.
        node_list = (current_node_no, curr_feature, node_dict)
        self.model.append(node_list)
        
        for v in values:
            gb = df.groupby([curr_feature])
            sub_df = gb.get_group(v).drop([curr_feature],axis=1)
          #  print(sub_df)
            sub_class_freq =  sub_df["CLASS"].value_counts(normalize=True)  
           # print(sub_class_freq)
            current_node_no += 1
            node_dict[v] = current_node_no 
            
            if sub_df.shape[0] < min_samples_split or len(sub_features)==0:
                self.model.append((current_node_no, "leaf", sub_class_freq))
            else:
                current_node_no, node_list = self.divide_and_conquer(sub_df, sub_class_freq, min_samples_split, current_node_no)  
      
        return current_node_no, node_list
    
            
    def fit(self, dataframe, nobins=10, bintype="equal-width", min_samples_split=5):
        df = dataframe.copy()
        df, self.imputation = create_imputation(df)
        df, self.binning = create_bins(df,nobins,bintype)
        self.labels = list(df["CLASS"].cat.categories)
        class_freq = df["CLASS"].value_counts(normalize= True).sort_index() 
        self.model = []
        current_node_no, node_list = self.divide_and_conquer(df, class_freq, min_samples_split, 0)

        
    def make_prediction(self, nodeno, row):
        node, feature, dictionary = self.model[nodeno]

        if feature == "leaf":
            return(dictionary.sort_index())
        else:
            bin_value = row[feature]
            # If there is no node in the dictionary for the values of row
            if(bin_value not in dictionary.keys()):
                # average the predictions of the sub_nodes
                nodes = list(dictionary.values())
                m = np.zeros([len(nodes), len(self.labels)])
                for i in range(len(nodes)):
                     m[i] = self.make_prediction(nodes[i], row)
                m = np.mean(m, axis = 0)
                return(m)
            # else we just iterate in the good next node
            else:
                new_nodeno = dictionary[bin_value]
            return(self.make_prediction(new_nodeno, row))
                
        return prediction   


    def predict(self, dataframe): 
        
        df = dataframe.copy()
        df = df.drop(["CLASS","ID"], axis=1)
        df = apply_imputation(df, self.imputation)
        df = apply_bins(df, self.binning)
        num_rows = df.shape[0]
        predictions = np.zeros((num_rows,len(self.labels)),dtype=float)
        for i in range(num_rows):
            row = df.iloc[i]
            predictions[i] = self.make_prediction(0,row)
        probs = pd.DataFrame(predictions, columns=self.labels)
       # print(probs)
        return probs


In [6]:
# Test your code (leave this part unchanged, except for if auc is undefined)

glass_train_df = pd.read_csv("glass_train.txt")
#print(glass_train_df)

glass_test_df = pd.read_csv("glass_test.txt")

tree_model = DecisionTree()

test_labels = glass_test_df["CLASS"]

nobins_values = [5,10]
bintype_values = ["equal-width","equal-size"]
min_samples_split_values = [3,5,10]
parameters = [(nobins,bintype,min_samples_split) for nobins in nobins_values for bintype in bintype_values 
              for min_samples_split in min_samples_split_values]

results = np.empty((len(parameters),3))

for i in range(len(parameters)):
    t0 = time.perf_counter()
    tree_model.fit(glass_train_df,nobins=parameters[i][0],bintype=parameters[i][1],min_samples_split=parameters[i][2])
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
   
    t0 = time.perf_counter() 
    predictions = tree_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

results = pd.DataFrame(results,index=pd.MultiIndex.from_product([nobins_values,bintype_values,min_samples_split_values]),
                       columns=["Accuracy","Brier score","AUC"])

results



Training time (5, 'equal-width', 3): 1.69 s.
Testing time (5, 'equal-width', 3): 0.18 s.
Training time (5, 'equal-width', 5): 1.12 s.
Testing time (5, 'equal-width', 5): 0.16 s.
Training time (5, 'equal-width', 10): 0.62 s.
Testing time (5, 'equal-width', 10): 0.15 s.
Training time (5, 'equal-size', 3): 1.51 s.
Testing time (5, 'equal-size', 3): 0.10 s.
Training time (5, 'equal-size', 5): 0.79 s.
Testing time (5, 'equal-size', 5): 0.09 s.
Training time (5, 'equal-size', 10): 0.48 s.
Testing time (5, 'equal-size', 10): 0.13 s.
Training time (10, 'equal-width', 3): 1.45 s.
Testing time (10, 'equal-width', 3): 0.12 s.
Training time (10, 'equal-width', 5): 1.07 s.
Testing time (10, 'equal-width', 5): 0.12 s.
Training time (10, 'equal-width', 10): 0.55 s.
Testing time (10, 'equal-width', 10): 0.12 s.
Training time (10, 'equal-size', 3): 1.18 s.
Testing time (10, 'equal-size', 3): 0.13 s.
Training time (10, 'equal-size', 5): 0.89 s.
Testing time (10, 'equal-size', 5): 0.13 s.
Training time (

Unnamed: 0,Unnamed: 1,Unnamed: 2,Accuracy,Brier score,AUC
5,equal-width,3,0.626168,0.593127,0.777658
5,equal-width,5,0.635514,0.557402,0.794113
5,equal-width,10,0.570093,0.568765,0.789285
5,equal-size,3,0.616822,0.687435,0.756975
5,equal-size,5,0.635514,0.645004,0.76065
5,equal-size,10,0.598131,0.652672,0.743455
10,equal-width,3,0.635514,0.557164,0.817905
10,equal-width,5,0.626168,0.531397,0.833556
10,equal-width,10,0.588785,0.509364,0.83993
10,equal-size,3,0.485981,0.907561,0.653567


In [5]:
train_labels = glass_train_df["CLASS"]
tree_model.fit(glass_train_df,min_samples_split=1)
predictions = tree_model.predict(glass_train_df)
print("Accuracy on training set: {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set: {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set: {0:.2f}".format(brier_score(predictions,train_labels)))

Accuracy on training set: 0.97
AUC on training set: 1.00
Brier score on training set: 0.03


### Comment on assumptions, things that do not work properly, etc.


## 2. Define the class DecisionForest

In [23]:
# Define the class DecisionForest with three functions __init__, fit and predict (after the comments):
#
# Input to __init__: 
# self: the object itself
#
# Output from __init__:
# nothing
# 
# This function does not return anything but just initializes the following attributes of the object (self) to None:
# binning, imputatiom, labels, model
#
# Input to fit:
# self: the object itself
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# nobins: no. of bins (default = 10)
# bintype: either "equal-width" (default) or "equal-size"
# min_samples_split: no. of instances required to allow a split (default = 5)
# random_features: no. of features to evaluate at each split (default = 2), 0 means all features (no random sampling)
# notrees: no. of trees in the forest (default = 10)
#
# Output from fit:
# nothing
#
# The result of applying this function should be:
#
# self.binning should be a discretization mapping (see Assignment 1) from df
# self.imputation should be an imputation mapping (see Assignment 1) from df
# self.labels should be the categories of the "CLASS" column of df, set to be of type "category" 
# self.model should be a random forest (for details, see lecture slides)
# Note that the function does not return anything but just assigns values to the attributes of the object.
#
# Hint 1: Redefine divide_and_conquer to take one additional argument; random_features, and instead of
#         evaluating all features choose a random subset, e.g., by np.random.choice (without replacement)
# Hint 2: Generate each tree in the forest from a bootstrap replicate of df, e.g., by np.random.choice 
#         (with replacement) from the index values of df.
#
# Input to predict:
# self: the object itself
# df: a dataframe
# 
# Output from predict:
# predictions: a dataframe with class labels as column names and the rows corresponding to
#              predictions with estimated class probabilities for each row in df, where the class probabilities
#              are the mean of all relative class frequencies in the leaves of the forest into which the instances in
#              df fall
#
# Hint 1: Drop any "CLASS" and "ID" columns first and then apply imputation and binning
# Hint 2: Iterate over the rows calling some sub-function, e.g., make_prediction(row), which for a test row
#         finds all leaf nodes and calculates the average of their class probabilities

class DecisionForest:
    
    def __init__(self):
        
        self.binning = None
        self.imputatiom = None
        self.labels = None
        self.model= None  # should be a random forest
        self.notrees = None
        
            
    def select_feature(self, features, dataframe):
        
        def entropy(df):
            return(-np.sum(df*np.log(df/np.sum(df))))
        
        df = dataframe.copy()
        idx, minInf = 0, 1e5
        for i in range(len(features)):
            f = features[i]
            class_repart = df.groupby([f, 'CLASS'], observed=True, sort = False).size()
            inf = class_repart.groupby([f], sort = False).apply(entropy).sum()
            if(inf < minInf):
                minInf = inf
                idx = i
        curr_feature = features[idx]
        
        if idx == 0:
            sub_features = list(features[1:])
        elif idx == len(features)-1:
            sub_features = list(features[:-1])
        else:
            sub_features = list(features[:idx]) + list(features[idx+1:])
        
        return curr_feature, sub_features
        
    
    def divide_and_conquer(self, dataframe, class_freq, min_samples_split, nodeno, random_features_samples, tree_no):
        current_node_no = nodeno
        df = dataframe.copy()
        features = random_features_samples
        curr_feature, sub_features = self.select_feature(features,df)  
        values = df[curr_feature].unique()
        node_dict = {}
        # add the root node first and add the subtrees when self.divide_and_conquer is called again.
        node_list = (current_node_no, curr_feature, node_dict)
        self.model[tree_no].append(node_list)
        
        for v in values:
            gb = df.groupby([curr_feature])
            sub_df = gb.get_group(v).drop([curr_feature],axis=1)
            sub_class_freq =  sub_df["CLASS"].value_counts(normalize=True)  
            current_node_no += 1
            node_dict[v] = current_node_no 
            if sub_df.shape[0]<=min_samples_split or len(sub_features)==0:
                self.model[tree_no].append((current_node_no, "leaf", sub_class_freq))
            else:
                current_node_no, node_list = self.divide_and_conquer(sub_df, sub_class_freq, min_samples_split, current_node_no, sub_features, tree_no)  
        return current_node_no, node_list
    
    
    def fit(self, dataframe, nobins=10, bintype="equal-width", min_samples_split=5, random_features=2, notrees=10):
        df = dataframe.copy()
        df, self.imputation = create_imputation(df)
        df, self.binning = create_bins(df,nobins,bintype)
        self.labels = list(df["CLASS"].cat.categories)
        self.notrees = notrees
        features = list(df.columns)
        target = "CLASS"
        features.remove(target)
        features.remove("ID")
        num_rows = df.shape[0]
        orig_index = df.index
        self.model = [[] for i in range(notrees)]
        
        if(random_features==0):
            random_features = len(features)
     
        for i in range(notrees):
            bootstrap_index = np.random.choice(orig_index,num_rows,replace=True)
            feature_sample = np.random.choice(features,random_features,replace=False)
            mod_df = df.loc[bootstrap_index,np.append(feature_sample,target)]
            class_freq = mod_df["CLASS"].value_counts(normalize=True).sort_index()
            current_node_no, node_list = self.divide_and_conquer(mod_df, class_freq, min_samples_split, 0, feature_sample, i) 
        
            self.model[i] = sorted(self.model[i], key=lambda x:x[0])
        
       
    def make_prediction(self, nodeno, row, tree_index):
        
        node, feature, dictionary = self.model[tree_index][nodeno]

        if feature == "leaf":
            return(dictionary.sort_index())
        else:
            bin_value = row[feature]
            # If there is no node in the dictionary for the values of row
            if(bin_value not in dictionary.keys()):
                #  average the predictions of the sub_nodes
                nodes = list(dictionary.values())
                m = np.zeros([len(nodes), len(self.labels)])
                for i in range(len(nodes)):
                    m[i] = self.make_prediction(nodes[i], row, tree_index)
                m = np.mean(m, axis = 0)
                return(m)
            else:
                new_nodeno = dictionary[bin_value]
                
            return(self.make_prediction(new_nodeno, row, tree_index))
            


    def predict(self, dataframe): 
        df = dataframe.copy()
        df = df.drop(["CLASS","ID"], axis=1)
        df = apply_imputation(df, self.imputation)
        df = apply_bins(df, self.binning)
   
        num_rows = df.shape[0]
        notrees = self.notrees
        predictions = np.zeros((num_rows,len(self.labels)),dtype=float) 
        
        for i in range(num_rows):
            row = df.iloc[i]
            prob_trees = np.zeros((notrees, len(self.labels)), dtype=float)
            for j in range(notrees):
                prob_trees[j] = self.make_prediction(0, row, j)
            prob_avg = np.mean(prob_trees, axis = 0)
            predictions[i] = prob_avg
            
        probs = pd.DataFrame(predictions, columns=self.labels)
      #  print(probs)
        return probs
    
    

In [25]:
glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

forest_model = DecisionForest()

test_labels = glass_test_df["CLASS"]

min_samples_split_values = [1,2,5]
random_features_values = [1,2,5]

parameters = [(min_samples_split,random_features) for min_samples_split in min_samples_split_values 
              for random_features in random_features_values]

results = np.empty((len(parameters),3))

for i in range(len(parameters)):
    t0 = time.perf_counter()
    forest_model.fit(glass_train_df,min_samples_split=parameters[i][0],random_features=parameters[i][1])
    print("Training time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    t0 = time.perf_counter()
    predictions = forest_model.predict(glass_test_df)
    print("Testing time {0}: {1:.2f} s.".format(parameters[i],time.perf_counter()-t0))
    results[i] = [accuracy(predictions,test_labels),brier_score(predictions,test_labels),
                  auc(predictions,test_labels)] # Assuming that you have defined auc - remove otherwise

results = pd.DataFrame(results,index=pd.MultiIndex.from_product([min_samples_split_values,random_features_values]),
                       columns=["Accuracy","Brier score","AUC"])

results

Training time (1, 1): 0.30 s.
Testing time (1, 1): 0.34 s.
Training time (1, 2): 1.18 s.
Testing time (1, 2): 0.37 s.
Training time (1, 5): 8.86 s.
Testing time (1, 5): 0.55 s.
Training time (2, 1): 0.26 s.
Testing time (2, 1): 0.30 s.
Training time (2, 2): 1.07 s.
Testing time (2, 2): 0.37 s.
Training time (2, 5): 6.30 s.
Testing time (2, 5): 0.51 s.
Training time (5, 1): 0.24 s.
Testing time (5, 1): 0.29 s.
Training time (5, 2): 0.91 s.
Testing time (5, 2): 0.39 s.
Training time (5, 5): 3.35 s.
Testing time (5, 5): 0.42 s.


Unnamed: 0,Unnamed: 1,Accuracy,Brier score,AUC
1,1,0.551402,0.66685,0.714317
1,2,0.626168,0.545914,0.790422
1,5,0.691589,0.39314,0.910134
2,1,0.654206,0.595914,0.828412
2,2,0.663551,0.511113,0.832556
2,5,0.654206,0.425872,0.908995
5,1,0.64486,0.595951,0.796879
5,2,0.616822,0.52771,0.822614
5,5,0.635514,0.440345,0.880127


In [26]:
train_labels = glass_train_df["CLASS"]
forest_model.fit(glass_train_df,min_samples_split=1)
predictions = forest_model.predict(glass_train_df)
print("Accuracy on training set: {0:.2f}".format(accuracy(predictions,train_labels)))
print("AUC on training set: {0:.2f}".format(auc(predictions,train_labels)))
print("Brier score on training set: {0:.2f}".format(brier_score(predictions,train_labels)))

Accuracy on training set: 0.82
AUC on training set: 0.98
Brier score on training set: 0.35


### Comment on assumptions, things that do not work properly, etc.

In [None]:
#The randomnization process will lead to variation of the prediction results. 
#This implementation is faster but the accurency and AUC scores are a little lower than the answer.