In [1]:
import numpy as np
from random import shuffle
import copy
#from statistics import mode
from scipy.stats import mode
from sklearn.model_selection import KFold #To shuffle our data into K-folds
from sklearn.metrics import confusion_matrix #To calculate confusion maxtrix
import math
import csv

# Classification tree

In [2]:
#Load our data
def loadData(name):
    data = []
    with open(name) as openfile:
        readfile = csv.reader(openfile, delimiter=',')
        for line in readfile:
            if len(line) > 1:
                data.append(line)
    return(data)

In [3]:
class classificationTree():
    def __init__(self, n_min=5): #Create the tree with leaf size of n_min % of total data.
        self.n_min = n_min
        self.root = {}

    #Splits into two groups for discreet data.
    def __splitGroupDiscreet(self, data, axis, name):
        #Place data into respective groups based on name.
        group1 = []
        group2 = []
        for entry in data:
            if entry[axis] == name:
                group1.append(entry)
            else:
                group2.append(entry)
        return(group1, group2)
    
    #Information gain of splitting on data[axis] = name
    def __infoGainDiscreet(self, data, axis, name):
        group1, group2 = self.__splitGroupDiscreet(data, axis, name)
        group1_entropy = self.__entropy(group1) #Calculate the entropies
        group2_entropy = self.__entropy(group2) #Calculate the entropies
        resultEntropy = (len(group1)/len(data))*group1_entropy + (len(group2)/len(data))*group2_entropy #Ave entropy
        return(self.__entropy(data) - resultEntropy)
    
    #Splits into two groups for continuous data.
    def __splitGroupContinuous(self, data, axis, min_val):
        group1 = []
        group2 = []
        for entry in data:
            if len(min_val) > 0 and float(entry[axis]) > float(min_val):
                group1.append(entry)
            else:
                group2.append(entry)
                
        return(group1, group2)
    
    #Information gain of splitting on value > min_val
    def __infoGainContinuous(self, data, axis, min_val):
        group1, group2 = self.__splitGroupContinuous(data, axis, str(min_val))
        group1_entropy = self.__entropy(group1) #Calculate the entropies
        group2_entropy = self.__entropy(group2) #Calculate the entropies
        resultEntropy = (len(group1)/len(data))*group1_entropy + (len(group2)/len(data))*group2_entropy #Ave entropy
        return(self.__entropy(data) - resultEntropy)
    
    #Entropy of a group with respect to our targets (self.labels)
    def __entropy(self, group):
        totalcount = len(group) #Total number of inputs
        entropy = 0 #Total entropy.
        true = 0

        #Calculated the weighted entropy of each label.   
        for label in self.labels:
            true = [x for x in group if label in x[-1]]
            if len(true) != 0: #Log of 0.0 is undefined.
                entropy -= (len(true)/totalcount) * math.log((len(true)/totalcount), 2)
        return(entropy)
        
    
    #Finds the feature to split with the highest information gain.
    #For discreet data, we use the median as the splitting category.
    #For continuous data, we use the average
    #Returns the column(feature), value
    def __find_best_feature(self, data):
        highest_info_gain = 0.0 #This tracks the highest we found
        highest_column = 0 #This tracks the feature we should split by.
        splitby = "" #This tracks the name (discreet) or min_split (continuous)
        
        for i in range(len(data[0])-1): #For each feature (column) minus label.
            #Determine the type of feature.
            featuretype = ""
            if 'int' in str(type(data[0][i])) or 'float' in str(type(data[0][i])):
                featuretype = "continuous"
            elif 'str' in str(type(data[0][i])):
                featuretype = "discreet"
            
            #If our feature is discreet, we check info gain on each name(type)
            if featuretype == "discreet":
                #Get only unique categories.
                possibility = set()
                for j in range(len(data)):
                    possibility.add(data[j][i])
                #Now iterate over unique names
                for name in possibility:
                    info_gain = self.__infoGainDiscreet(data, i, name)
                    if(info_gain > highest_info_gain): #If we exceeded existin info_gain
                        highest_info_gain = info_gain #Update highest values.
                        highest_column = i
                        splitby = name

            #If the feature is continuous:
            #"For determining the optimal threshold for splitting you will need to search over
            #all possible thresholds for a given feature". This is slow. Esp for tasks 
            #such as spambase with 4k+ values over 20+ features
            #We will use the mean value instead rather than search over the range.
            elif featuretype == "continuous":
                mean = 0
                for j in range(len(data)):
                    mean += data[j][i]
                mean = (mean/len(data))
                info_gain = self.__infoGainContinuous(data, i, mean)
                if(info_gain > highest_info_gain): #If we exceeded existin info_gain
                    highest_info_gain = info_gain #Update highest values.
                    highest_column = i
                    splitby = str(mean)

        return(highest_column, splitby, featuretype)
        
    
    #Given a node, find it's correct splits, and assign values.
    def __splitNode(self, node):
        
        #Find the optimal splitting strategy.
        column, splitby, featuretype = self.__find_best_feature(node["data"])
        node["column"] = column
        node["splitby"] = splitby
        
        #Obtain the splits into group1 and group2
        group1 = None
        group2 = None
        if featuretype == "discreet":
            node["featuretype"] = "discreet"
            group1, group2 = self.__splitGroupDiscreet(node["data"], column, splitby)
        else:
            node["featuretype"] = "continuous"
            group1, group2 = self.__splitGroupContinuous(node["data"], column, splitby)
        
        #If one group has size <= self.n_min, we stop.
        #Also stop if all features have the same value.
        #Store the "mode" feature in the yes/no.
        if len(group1) == 0:
            node["yes"] = None
        elif len(group1) <= self.n_min: #If we have fewer than our minimum
            node["yes"] = mode([x[-1] for x in group1])[0][0] #The mode feature is our "yes value"
        elif len(set([x[-1] for x in group1])) == 1: #If the number of unique features is only 1
            node["yes"] = mode([x[-1] for x in group1])[0][0]
        else: #If we still need to continue, then we recurse.
            node["yes"] = {"data":group1} 
            self.__splitNode(node["yes"]) #Recurse on this side.

        if len(group2) == 0:
            node["no"] = None
        elif len(group2) <= self.n_min:
            node["no"] = mode([x[-1] for x in group2])[0][0]
        elif len(set([x[-1] for x in group2])) == 1:
            node["no"] = mode([x[-1] for x in group2])[0][0]
        else:
            node["no"] = {"data":group2}
            self.__splitNode(node["no"]) #Recurse on this side.
        return

        
    #Fit the data into our tree.
    #Continuous data should be numerical. Categorical should be string.
    def fit(self, data, label):
        data = copy.deepcopy(data)
        label = copy.deepcopy(label)
        #Attach the labels to the end of our data.
        for i in range(len(data)):
            data[i].append(label[i])
        self.labels = list(set(label))   #These are the unique labels.
        self.size = len(data) #size of our dataset
        self.n_min = (self.n_min * self.size)/100 #Minimum leaf size.
        
        #Use ID3 algorithm. 
        #1. Create a root node.
        self.root = {"data":data}
        #Recursively split this node by finding it's left and right.
        self.__splitNode(self.root)
        
    #Recursively querys the node until we get our answer.
    def __checkValue(self, node, query):
        answer = ""
        if(node["featuretype"] == "continuous"): #Get the featureType.
            if(query[node["column"]] > float(node["splitby"])): #Do the corresponding test.
                #Check if this node is the final.
                if "dict" in str(type(node["yes"])): 
                    answer = self.__checkValue(node["yes"], query)
                else:
                    answer = node["yes"]
            else:
                #Check if this node is the final.
                if "dict" in str(type(node["no"])): 
                    answer = self.__checkValue(node["no"], query)
                else:
                    answer = node["no"]
        elif(node["featuretype"] == "discreet"):
            if(query[node["column"]] == node["splitby"]):
                #Check if this node is the final.
                if "dict" in str(type(node["yes"])): 
                    answer = self.__checkValue(node["yes"], query)
                else:
                    answer = node["yes"]
            else:
                #Check if this node is the final.
                if "dict" in str(type(node["no"])): 
                    answer = self.__checkValue(node["no"], query)
                else:
                    answer = node["no"]
        return(answer)

    #The driver to call self.__checkValue with.
    def predict(self, query):
        return(self.__checkValue(self.root, query))
        
        

## Classification of flowers based on features (iris.csv dataset)

In [4]:
#Load the iris dataset.
data = loadData("iris.csv")
label = [x[-1] for x in data] #Take out the labels.
data = [x[:-1] for x in data]
#Set the correct data types.
for i in range(len(data)):
    data[i][0] = float(data[i][0])
    data[i][1] = float(data[i][1])
    data[i][2] = float(data[i][2])
    data[i][3] = float(data[i][3])


In [5]:
#Create and fit our classification tree.
tree = classificationTree(n_min = 5) #n_min = minimum percentage of data as leafnode size.
tree.fit(data,label)



In [6]:
#We can test on one value.
tree.predict([6.4, 2.8, 5.7, 2.2])

'Iris-virginica'

### Evaluating the accuracy over 10 folds. 

In [7]:
n_splits=10 #10 folds

n_min_tests = [5,10,15,20] #For these different n_min parameters
for n_min in n_min_tests:
    
    kf = KFold(n_splits) #Use Kfolds to generate the test folds.
    count = 0
    accuracy = []
    for train, test in kf.split(data):
        #Get the training data ready for each fold
        training_data = []
        training_label = []
        for i in train:
            training_data.append(data[i])
            training_label.append(label[i])
        #Create our model for each fold, 
        kfoldmodel = classificationTree(n_min)
        kfoldmodel.fit(training_data, training_label)

        #Predict on the test labels and collect results.
        test_label = []
        test_label_predicted = []
        for i in test:
            test_label.append(label[i])
            test_label_predicted.append(kfoldmodel.predict(data[i]))

        #Calculate accuracy
        total = len(test_label)
        correct = 0
        for i in range(total):
            if(test_label[i] == test_label_predicted[i]):
                correct += 1
        count += 1
        accuracy.append((correct*100)/total)
    accuracy = np.array(accuracy)
    
    print("Avg accuracy over 10 folds for n_min "+str(n_min)+" :", np.mean(accuracy))
    print("Avg std"+str(n_min)+" :", np.std(accuracy))
    


Avg accuracy over 10 folds for n_min 5 : 93.33333333333333
Avg std5 : 7.888106377466154
Avg accuracy over 10 folds for n_min 10 : 90.66666666666667
Avg std10 : 8.537498983243799
Avg accuracy over 10 folds for n_min 15 : 92.66666666666666
Avg std15 : 7.571877794400365
Avg accuracy over 10 folds for n_min 20 : 92.66666666666666
Avg std20 : 7.571877794400365


Unsuprisingly, the best performing model was the lowest n_min percentage of 5% (Since the tree can be more granular). 

#### Based on the best n_min value (5), create a class confusion matrix using ten-fold cross-validation

In [8]:
kf = KFold(n_splits)
confusion_array = np.zeros((3,3), dtype=int) #initialize our confusion_array
for train, test in kf.split(data):
    #Get the training data ready for each fold
    training_data = []
    training_label = []
    for i in train:
        training_data.append(data[i])
        training_label.append(label[i])
    #Create our model for each fold, 
    kfoldmodel = classificationTree(n_min)
    kfoldmodel.fit(training_data, training_label)

    #Predict on the test labels and collect results.
    test_label = []
    test_label_predicted = []
    for i in test:
        test_label.append(label[i])
        test_label_predicted.append(kfoldmodel.predict(data[i]))
    #Sum over the confusion arrays since we use a test set segment out of total
    confusion_array += confusion_matrix(test_label, test_label_predicted, labels=["Iris-virginica", "Iris-setosa", "Iris-versicolor"])

print(confusion_array)

[[46  0  4]
 [ 0 50  0]
 [ 1  6 43]]


This confusion matrix can be understood as:

for Iris-virginica we predicted 46 as Iris-virginica, 0 as Iris-setosa and 4 as Iris-versicolor.

for Iris-setosa we predicted 0 as Iris-virginica, 50 as Iris-setosa and 0 as Iris-versicolor.

for Iris-versicolor we predicted 1 as Iris-virginica, 6 as Iris-setosa and 43 as Iris-versicolor.

## Classification of spam (spambase.csv dataset)

In [9]:
#Load the spam dataset.
data = loadData("spambase.csv")
shuffle(data) #Shuffle our data
label = [x[-1] for x in data] #Take out the labels.
data = [x[:-1] for x in data]


#Set the correct data types.
for i in range(len(data)):
    for j in range(len(data[0])):
        data[i][j] = float(data[i][j])


In [10]:
#Create and fit our classification tree.
tree = classificationTree(n_min = 5) #n_min = minimum percentage of data as leafnode size.
tree.fit(data,label)

In [11]:
tree.predict([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.86, 0.0, 0.0, 0.0, 0.0, 0.0, 3.73, 0.0, 1.86, 0.0, 0.93, 3.73, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.93, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.173, 0.0, 0.0, 0.0, 0.0, 1.9, 5.0, 38.0])

'0'

### Evaluating the accuracy over 10 folds. 

In [12]:
n_splits=10 #10 folds

n_min_tests = [5,10,15,20,25] #For these different n_min parameters
for n_min in n_min_tests:
    
    kf = KFold(n_splits) #Use Kfolds to generate the test folds.
    count = 0
    accuracy = []
    for train, test in kf.split(data):
        #Get the training data ready for each fold
        training_data = []
        training_label = []
        for i in train:
            training_data.append(data[i])
            training_label.append(label[i])
        #Create our model for each fold, 
        kfoldmodel = classificationTree(n_min)
        kfoldmodel.fit(training_data, training_label)

        #Predict on the test labels and collect results.
        test_label = []
        test_label_predicted = []
        for i in test:
            test_label.append(label[i])
            test_label_predicted.append(kfoldmodel.predict(data[i]))

        #Calculate accuracy
        total = len(test_label)
        correct = 0
        for i in range(total):
            if(test_label[i] == test_label_predicted[i]):
                correct += 1
        count += 1
        accuracy.append((correct*100)/total)

    accuracy = np.array(accuracy)
    print("% Avg accuracy for n_min "+str(n_min)+" :", np.mean(accuracy))
    print("% Avg std of accuracy for n_min "+str(n_min)+" :", np.std(accuracy))

% Avg accuracy for n_min 5 : 89.89394510987458
% Avg std of accuracy for n_min 5 : 1.2828025913491792
% Avg accuracy for n_min 10 : 89.56771668395737
% Avg std of accuracy for n_min 10 : 1.2198388074607098
% Avg accuracy for n_min 15 : 85.95949259643496
% Avg std of accuracy for n_min 15 : 1.3832206029310827
% Avg accuracy for n_min 20 : 85.93775346600019
% Avg std of accuracy for n_min 20 : 1.3861682547753538
% Avg accuracy for n_min 25 : 85.30736583985666
% Avg std of accuracy for n_min 25 : 1.1367235446124628


# Regression tree

In [13]:
#Modify regression Tree from classification tree whereby:
#Early stopping criteria: if instance is < n_min, return the mean feature value.
#Use variance instead of information_gain.
class regressionTree():
    def __init__(self, n_min=5): #Create the tree with leaf size of n_min % of total data.
        self.n_min = n_min
        self.root = {}

    #Splits into two groups for discreet data.
    def __splitGroupDiscreet(self, data, axis, name):
        #Place data into respective groups based on name.
        group1 = []
        group2 = []
        for entry in data:
            if entry[axis] == name:
                group1.append(entry)
            else:
                group2.append(entry)
        return(group1, group2)
    
    #Information gain of splitting on data[axis] = name
    def __infoGainDiscreet(self, data, axis, name):
        group1, group2 = self.__splitGroupDiscreet(data, axis, name)
        group1_entropy = self.__entropy(group1) #Calculate the entropies
        group2_entropy = self.__entropy(group2) #Calculate the entropies
        resultEntropy = (len(group1)/len(data))*group1_entropy + (len(group2)/len(data))*group2_entropy #Ave entropy
        return(self.__entropy(data) - resultEntropy)
    
    #Splits into two groups for continuous data.
    def __splitGroupContinuous(self, data, axis, min_val):
        group1 = []
        group2 = []
        for entry in data:
            if len(min_val) > 0 and float(entry[axis]) > float(min_val):
                group1.append(entry)
            else:
                group2.append(entry)
                
        return(group1, group2)
    
    #Information gain of splitting on value > min_val
    def __infoGainContinuous(self, data, axis, min_val):
        group1, group2 = self.__splitGroupContinuous(data, axis, str(min_val))
        group1_entropy = self.__entropy(group1) #Calculate the entropies
        group2_entropy = self.__entropy(group2) #Calculate the entropies
        resultEntropy = (len(group1)/len(data))*group1_entropy + (len(group2)/len(data))*group2_entropy #Ave entropy
        return(self.__entropy(data) - resultEntropy)
    
    #For resuable code with classification, we refer to variance as entropy.
    def __entropy(self, group):
        #If there are no values, we simply don't have variance.
        if(len(group) == 0):
            return(0)
        group = np.array(group, dtype=float)
        return(np.var(group[:,-1])) #Return the varance of this group

    
    #Finds the feature to split with the highest information gain.
    #For discreet data, we use the median as the splitting category.
    #For continuous data, we use the average
    #Returns the column(feature), value
    def __find_best_feature(self, data):
        highest_info_gain = 0.0 #This tracks the highest we found
        highest_column = 0 #This tracks the feature we should split by.
        splitby = "" #This tracks the name (discreet) or min_split (continuous)
        
        for i in range(len(data[0])-1): #For each feature (column) minus label.
            #Determine the type of feature.
            featuretype = ""
            if 'int' in str(type(data[0][i])) or 'float' in str(type(data[0][i])):
                featuretype = "continuous"
            elif 'str' in str(type(data[0][i])):
                featuretype = "discreet"
            
            #If our feature is discreet, we check info gain on each name(type)
            if featuretype == "discreet":
                #Get only unique categories.
                possibility = set()
                for j in range(len(data)):
                    possibility.add(data[j][i])
                #Now iterate over unique names
                for name in possibility:
                    info_gain = self.__infoGainDiscreet(data, i, name)
                    if(info_gain > highest_info_gain): #If we exceeded existin info_gain
                        highest_info_gain = info_gain #Update highest values.
                        highest_column = i
                        splitby = name

            #If the feature is continuous:
            #"For determining the optimal threshold for splitting you will need to search over
            #all possible thresholds for a given feature". This is slow. Esp for tasks 
            #such as spambase with 4k+ values over 20+ features
            #We will use the mean value instead rather than search over the range.
            elif featuretype == "continuous":
                mean = 0
                for j in range(len(data)):
                    mean += data[j][i]
                mean = (mean/len(data))
                info_gain = self.__infoGainContinuous(data, i, mean)
                if(info_gain > highest_info_gain): #If we exceeded existin info_gain
                    highest_info_gain = info_gain #Update highest values.
                    highest_column = i
                    splitby = str(mean)

        return(highest_column, splitby, featuretype)
        
    
    #Given a node, find it's correct splits, and assign values.
    def __splitNode(self, node):
        
        #Find the optimal splitting strategy.
        column, splitby, featuretype = self.__find_best_feature(node["data"])
        node["column"] = column
        node["splitby"] = splitby
        
        #Obtain the splits into group1 and group2
        group1 = None
        group2 = None
        if featuretype == "discreet":
            node["featuretype"] = "discreet"
            group1, group2 = self.__splitGroupDiscreet(node["data"], column, splitby)
        else:
            node["featuretype"] = "continuous"
            group1, group2 = self.__splitGroupContinuous(node["data"], column, splitby)
        
        #If one group has size <= self.n_min, we stop.
        #Also stop if all features have the same value.
        #Store the "mode" feature in the yes/no.
        if len(group1) == 0:
            node["yes"] = None
        elif len(group1) <= self.n_min: #If we have fewer than our minimum
            l = [float(x[-1]) for x in group1]
            node["yes"] = sum(l)/float(len(l)) #The mean feature prediction is our regression value.
        else: #If we still need to continue, then we recurse.
            node["yes"] = {"data":group1} 
            self.__splitNode(node["yes"]) #Recurse on this side.

        if len(group2) == 0:
            node["no"] = None
        elif len(group2) <= self.n_min:
            l = [float(x[-1]) for x in group1]
            node["no"] = sum(l)/float(len(l))
        else:
            node["no"] = {"data":group2}
            self.__splitNode(node["no"]) #Recurse on this side.
        return

        
    #Fit the data into our tree.
    #Continuous data should be numerical. Categorical should be string.
    def fit(self, data, label):
        data = copy.deepcopy(data)
        label = copy.deepcopy(label)
        #Attach the labels to the end of our data.
        for i in range(len(data)):
            data[i].append(label[i])
        self.labels = list(set(label))   #These are the unique labels.
        self.size = len(data) #size of our dataset
        self.n_min = (self.n_min * self.size)/100 #Minimum leaf size.
        
        #Use ID3 algorithm. 
        #1. Create a root node.
        self.root = {"data":data}
        #Recursively split this node by finding it's left and right.
        self.__splitNode(self.root)
        
    #Recursively querys the node until we get our answer.
    def __checkValue(self, node, query):
        answer = ""
        if(node["featuretype"] == "continuous"): #Get the featureType.
            if(query[node["column"]] > float(node["splitby"])): #Do the corresponding test.
                #Check if this node is the final.
                if "dict" in str(type(node["yes"])): 
                    answer = self.__checkValue(node["yes"], query)
                else:
                    answer = node["yes"]
            else:
                #Check if this node is the final.
                if "dict" in str(type(node["no"])): 
                    answer = self.__checkValue(node["no"], query)
                else:
                    answer = node["no"]
        elif(node["featuretype"] == "discreet"):
            if(query[node["column"]] == node["splitby"]):
                #Check if this node is the final.
                if "dict" in str(type(node["yes"])): 
                    answer = self.__checkValue(node["yes"], query)
                else:
                    answer = node["yes"]
            else:
                #Check if this node is the final.
                if "dict" in str(type(node["no"])): 
                    answer = self.__checkValue(node["no"], query)
                else:
                    answer = node["no"]
        return(answer)

    #The driver to call self.__checkValue with.
    def predict(self, query):
        return(self.__checkValue(self.root, query))
        
        

## Predicing boston suburb house prices with a decision tree (housing.csv)

In [14]:
#Load the iris dataset.
data = loadData("housing.csv")
shuffle(data) #Shuffle our data
label = [x[-1] for x in data] #Take out the labels. #This is median price in thousands.
data = [x[:-1] for x in data]

#Set the correct data types.
for i in range(len(data)):
    for j in range(len(data[0])):
        data[i][j] = float(data[i][j])


In [15]:
#Create and fit our regression tree.
tree = regressionTree(n_min = 5) #n_min = minimum percentage of data as leafnode size.
tree.fit(data,label)

In [16]:
#Predict one of our values
#['0.02985', '0', '2.18', '0', '0.458', '6.43', '58.7', '6.0622', '3', '222', '18.7', '394.12', '5.21', '28.7']
tree.predict([0.02985, 0, 2.18, 0, 0.458, 6.43, 58.7, 6.0622, 3, 222, 18.7, 394.12, 5.21])

25.550000000000004

### Evaluate the MSE accuracy of the model using 10 folds for various n_min

In [17]:
n_splits=10 #10 folds

n_min_tests = [5,10,15,20,25] #For these different n_min parameters
for n_min in n_min_tests:
    kf = KFold(n_splits) #Use Kfolds to generate the test folds.
    count = 0
    accuracy = []
    for train, test in kf.split(data):
        #Get the training data ready for each fold
        training_data = []
        training_label = []
        for i in train:
            training_data.append(data[i])
            training_label.append(label[i])
        #Create our model for each fold, 
        kfoldmodel = classificationTree(n_min)
        kfoldmodel.fit(training_data, training_label)

        #Predict on the test labels and collect results.
        test_label = []
        test_label_predicted = []
        for i in test:
            test_label.append(label[i])
            test_label_predicted.append(kfoldmodel.predict(data[i]))

        #Calculate accuracy with MSR
        total = len(test_label)
        error = 0
        for i in range(total):
            error +=  (float(test_label[i]) - float(test_label_predicted[i])) ** 2 #Square error
        accuracy.append(error/total) #Accuracy is the mean of this.
        count += 1

    accuracy = np.array(accuracy)
    print("Avg mean squared error for n_min "+str(n_min)+" :", np.mean(accuracy))
    print("Avg std for this MSE for n_min"+str(n_min)+" :", np.std(accuracy))

Avg mean squared error for n_min 5 : 36.112191764705884
Avg std for this MSE for n_min5 : 8.315048012229902
Avg mean squared error for n_min 10 : 40.300452156862754
Avg std for this MSE for n_min10 : 12.404878974774986
Avg mean squared error for n_min 15 : 41.04351803921569
Avg std for this MSE for n_min15 : 12.907606399568813
Avg mean squared error for n_min 20 : 39.98316196078431
Avg std for this MSE for n_min20 : 12.021595019127043
Avg mean squared error for n_min 25 : 93.21851921568627
Avg std for this MSE for n_min25 : 33.11527494067931


It appears that MSE increases slightly as n_min increases. However, past a certain n_min, the MSE dramatically increases and the algorithm beings to perform poorly.