In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from HelperMethods import *

In [2]:
this = %pwd

In [3]:
PATH = (f'{this}/').replace('NoteBook/','')+'Data/'

In [4]:
column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
df = pd.read_csv(PATH + "abalone.data", names=column_names)

In [5]:
df.shape

(4177, 9)

In [6]:
df.head(50)

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [7]:
X = df.drop('rings', axis='columns')
y = df['rings']

In [8]:
d = {'M': 1, 'F': 2, 'I': 3}
X['sex'].replace(d,inplace = True)

X.head(10)

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055
5,3,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12
6,2,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33
7,2,0.545,0.425,0.125,0.768,0.294,0.1495,0.26
8,1,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165
9,2,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_test.values[:, 0])

[1. 1. 2. ... 1. 1. 2.]


In [10]:
def value_count(x, threshold):
    result = {}
    result[0] = 0
    result[1] = 0
    for value in x:
        if value <= threshold:
            result[0] += 1
        else:
            result[1]+=1
    return result

def target_value_count(y):
    result = {}
    for value in y:
        if value not in result:
            result[value] = 1
        else:
            result[value]+=1
    return result
    
def gini(y): 
        result = count_values(y)
        g = 1
        for val in result.values(): 
            p = float(val/len(y))
            g = g - (p)**2
        return g
    
def entropy(y, val_type, threshold = None):
    if val_type == "target":
        result = target_value_count(y)
        entropy = 0
        for value in result.values():
            p = value/len(y)
            entropy -= p*log2(p)
        return entropy
    else:
        result= value_count(y, threshold)
        entropy = 0
        for idx in range(2):
            p = result[idx]/len(y)
            entropy -= p*log2(p)
        return entropy
    

In [11]:
ja = X_train.index
print(ja.values)

[3823 3956 3623 ... 3092 3772  860]


# text

In [12]:
def information_gain(attribute, target, method):
    target_entropy = entropy(target, "target")
    #print(target_entropy)
    info_gains = []
    for attr in attribute:
        threshold = np.mean(attribute[attr])
        le_idx = np.where(attribute[attr] <= threshold)
        g_idx = np.where(attribute[attr] > threshold)
        y_le = target.values[le_idx]
        y_g = target.values[g_idx]
        # count hvor mange av hver ring som er i <= og >
        # ta count/len * log2 len/count
        le_entropy = entropy(y_le, "target")
        g_entropy = entropy(y_g, "target")
        
        
        attr_entropy = (len(y_le)/len(attribute[attr]))*le_entropy + (len(y_g)/len(attribute[attr]))*g_entropy
        information_gain = target_entropy - attr_entropy
        info_gains.append(information_gain)
    return np.argmax(info_gains)    

In [50]:
def make_tree(X, y, n, impurity_measure):
    if len(X) == 0:
        return
    else:
        x_copy = X.copy()
        y_copy = y.copy()
        
        top_ig = information_gain(x_copy, y_copy, "entropy")
        threshold = np.mean(x_copy[x_copy.columns[top_ig]])
        le_idx = np.where(x_copy[x_copy.columns[top_ig]] <= threshold)
        g_idx = np.where(x_copy[x_copy.columns[top_ig]] > threshold)
    
        n.category = top_ig
        n.data = threshold
    
        left_child = mnode()
        right_child = mnode()
        left_child.parent = n
        right_child.parent = n
        
        left_child.data = le_idx
        right_child.data = g_idx
        
        #node.children[0] = left_child
        #node.children[1] = right_child
        
        n.add_child(1, le_idx, left_child)
        n.add_child(2, g_idx, right_child)
        for child in n.children:
            X_copy = pd.DataFrame(x_copy.values[child.data])
            Y_copy = pd.Series(y_copy.values[child.data])
            
            if(len(X_copy) == 1):
                child.isLeaf = True
                child.data = Y_copy[X_copy.index]
            elif len(np.unique(Y_copy.values)) == 1:
                child.isLeaf = True
                child.data = Y_copy.sample(n = 1)
            elif len(target_value_count(X_copy)) == 1:
                child.isleaf = True
                child.data = Y_copy.value_counts().argmax()
            else:
                learn(X_copy, Y_copy, child, "entropy") 

In [51]:
class mnode(object):
    
    def __init__(self):
        self.data = None
        self.parent = None
        self.children = []
        self.category = None
        self.isLeaf = False
        self.category = None
    
    def add_child(self, name, threshold, child):
        child.data = threshold
        self.children.append(child)
    

In [52]:
n = mnode()
make_tree(X_train, y_train, n, "entropy")
print(n.data)

ValueError: operands could not be broadcast together with shapes (1,8) (1,3) 

In [16]:
def printer(n):
    print(n.data)
    for child in n.children:
        if child.children != None:
            print(child.data)
            print(child.category)
            printer(child)        

In [17]:
#printer(n)

NameError: name 'n' is not defined

In [18]:
def predict_row(x,node):
    while len(x) > 0:
        while node.isLeaf == False: 
            attr_var = x[node.category] 
            if attr_var <= node.data:
                child_node = node.children[0]
            elif attr_var > node.data:
                child_node = node.children[1]
            if child_node.isLeaf: 
                return child_node.data.values.item()
            node = child_node

In [19]:
def predict(X, node):
    counter = 0
    copy = X.copy()
    result = {}
    for i in X.values: 
        result[counter] = (predict_row(i,node))
        counter += 1
    return result

In [20]:
#pred = predict(X_test, n)
#print(pred)

NameError: name 'n' is not defined

In [21]:
def accuracy(y_true, y_pred): 
        result = 0
        for idx, y_ in enumerate(y_pred.values()): 
            if y_ == y_true.values[idx]: result+=1
        return (result/len(y_true))


In [22]:
#accuracy(y_test, pred)

NameError: name 'pred' is not defined

In [37]:
def learn(X,y,impurity_measure = 'entropy',  pruning=False): 
    '''
    Arguments:
        X: training data 
        y: true values for the training data 
        impurity_measure: default impurity measure is 'entropy'. Alternatives: 'gini'
        pruning: default False

    If the user want to use pruning on the three, the training set will be divided into two subsets
    Then we will create the three and use the pruning set to prune the tree, otherwise it will just create a tree. 
    Returns the tree
    '''
    X_pruning, y_pruning = [],[]
    if pruning: 
        X, X_pruning, y, y_pruning =  train_test_split(X,y, test_size = 0.25, random_state=42) #get pruning set the same was as test set, but this time we split the training set in two subsets. 
        
   
    root = mnode()

    #X.T so that each of the categories comes on a line
    make_tree(X,y, root,impurity_measure)
    if pruning: 
        while pruning: 
            pruning = False 
            pruning_pred = predict(X_pruning,root) #Vanlig predikt 
            pruning_accuracy = accuracy(y_pruning, pruning_pred)

            for children in root.children:
                leaf_node  = find_leaf(children)
                change_made = prune(leaf_node, pruning_accuracy,X_pruning,y_pruning, root)
                if (not pruning) and change_made: pruning = change_made
            
    return root

In [38]:
def find_leaf(node):
    '''
    Arguments:
        node: Node object. Example a child node of root. 
    
    return the leaf node for the node. 
    '''
    if node.isLeaf: 
        return (node)
    else: 
        for child in node.children:
            return find_leaf(child)

In [39]:
def find_parent_variabel(parent_node,grand_parent_node):
    '''
    Find the variabel value in grandparent node, so we can change the variables node to another node. In our case parent_variable_in_grand_parent

    returns the variabel value. 
    '''
    for node in grand_parent_node.children: 
        if node == parent_node: return node.data

In [48]:

def prune(leaf_node, pruning_accuracy, X_pruning, y_pruning, tree): 
    '''
    Arguments:
        leaf_node: leaf_node of a subtree
        pruning_accuracy: accurcy on the pruning set 
        X_pruning: pruning set
        y_pruning: true values for the pruning set
        tree: the root node 

    The method checks if the accurcy on the pruning set increases if we change the parent node of the leaf_node to leaf node with the output 0 and/or 1. 
    The parent node will be the new leaf if it imporves the accuracy and the children of the parent node will be removed from the three
    return variable says if there have been any changes in the three (True/False)
    '''
    prun_acc = pruning_accuracy
    changes = False 
    parent = leaf_node.parent
    
    child_values_to_check = []
    child_values_to_check.append(leaf_node.data.values)
    #if(len(parent.children) >1): 
        #child_values_to_check.append(not leaf_node.data)
    
    for i in child_values_to_check: 
        grand_parent = parent.parent

        dummy_node = mnode()
        dummy_node.category = parent.category
        dummy_node.isLeaf = True
        dummy_node.data = i
        dummy_node.parent = grand_parent

        if not (grand_parent == None): 
            parent_variable_in_grand_parent = find_parent_variabel(parent, grand_parent)
            grand_parent.children[parent_variable_in_grand_parent] = dummy_node
  
        #predict
        pred = predict(X_pruning, tree)
        pred_acc = accuracy(y_pruning,pred)
    
        if (prun_acc < pred_acc): 
            prun_acc = pred_acc
            parent = dummy_node
            changes = True
        elif not (grand_parent == None ): 
            grand_parent.children[parent_variable_in_grand_parent] = parent
    return changes

In [49]:
learn(X_train, y_train, 'entropy', True)

ValueError: operands could not be broadcast together with shapes (1,8) (1,3) 