In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from HelperMethods import *

In [2]:
this = %pwd

In [3]:
PATH = (f'{this}/').replace('NoteBook/','')+'Data/'

In [4]:
column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
df = pd.read_csv(PATH + "abalone.data", names=column_names)

In [5]:
df.shape

(4177, 9)

In [6]:
df.head(50)

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7
5,I,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12,8
6,F,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33,20
7,F,0.545,0.425,0.125,0.768,0.294,0.1495,0.26,16
8,M,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165,9
9,F,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32,19


In [7]:
X = df.drop('rings', axis='columns')
y = df['rings']

In [8]:
d = {'M': 1, 'F': 2, 'I': 3}
X['sex'].replace(d,inplace = True)

X.head(10)

Unnamed: 0,sex,length,diameter,height,whole weight,shucked weight,viscera weight,shell weight
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055
5,3,0.425,0.3,0.095,0.3515,0.141,0.0775,0.12
6,2,0.53,0.415,0.15,0.7775,0.237,0.1415,0.33
7,2,0.545,0.425,0.125,0.768,0.294,0.1495,0.26
8,1,0.475,0.37,0.125,0.5095,0.2165,0.1125,0.165
9,2,0.55,0.44,0.15,0.8945,0.3145,0.151,0.32


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
print(X_test.values[:, 0])

[1. 1. 2. ... 1. 1. 2.]


In [10]:
def value_count(x, threshold):
    result = {}
    result[0] = 0
    result[1] = 0
    for value in x:
        if value <= threshold:
            result[0] += 1
        else:
            result[1]+=1
    return result

def target_value_count(y):
    result = {}
    for value in y:
        if value not in result:
            result[value] = 1
        else:
            result[value]+=1
    return result
    
    
def entropy(y, val_type, threshold = None):
    if val_type == "target":
        result = target_value_count(y)
        entropy = 0
        for value in result.values():
            p = value/len(y)
            entropy -= p*log2(p)
        return entropy
    else:
        result= value_count(y, threshold)
        entropy = 0
        for idx in range(2):
            p = result[idx]/len(y)
            entropy -= p*log2(p)
        return entropy
    

In [11]:
ja = X_train.index
print(ja.values)

[3823 3956 3623 ... 3092 3772  860]


# text

In [12]:
def information_gain(attribute, target, method):
    target_entropy = entropy(target, "target")
    #print(target_entropy)
    info_gains = []
    for attr in attribute:
        threshold = np.mean(attribute[attr])
        le_idx = np.where(attribute[attr] <= threshold)
        g_idx = np.where(attribute[attr] > threshold)
        y_le = target.values[le_idx]
        y_g = target.values[g_idx]
        # count hvor mange av hver ring som er i <= og >
        # ta count/len * log2 len/count
        le_entropy = entropy(y_le, "target")
        g_entropy = entropy(y_g, "target")
        
        
        attr_entropy = (len(y_le)/len(attribute[attr]))*le_entropy + (len(y_g)/len(attribute[attr]))*g_entropy
        information_gain = target_entropy - attr_entropy
        info_gains.append(information_gain)
    return np.argmax(info_gains)    

In [13]:
def learn(X, y, n, impurity_measure):
    if len(X) == 0:
        return
    else:
        x_copy = X.copy()
        y_copy = y.copy()
        
        top_ig = information_gain(x_copy, y_copy, "entropy")
        threshold = np.mean(x_copy[x_copy.columns[top_ig]])
        le_idx = np.where(x_copy[x_copy.columns[top_ig]] <= threshold)
        g_idx = np.where(x_copy[x_copy.columns[top_ig]] > threshold)
    
        n.category = top_ig
        n.data = threshold
    
        left_child = mnode()
        right_child = mnode()
        
        left_child.data = le_idx
        right_child.data = g_idx
        
        #node.children[0] = left_child
        #node.children[1] = right_child
        
        n.add_child(1, le_idx, left_child)
        n.add_child(2, g_idx, right_child)
        for child in n.children:
            X_copy = pd.DataFrame(x_copy.values[child.data])
            Y_copy = pd.Series(y_copy.values[child.data])
            
            if(len(X_copy) == 1):
                child.isLeaf = True
                child.data = Y_copy[X_copy.index]
            elif len(np.unique(Y_copy.values)) == 1:
                child.isLeaf = True
                child.data = Y_copy.sample(n = 1)
            elif len(target_value_count(X_copy)) == 1:
                child.isleaf = True
                child.data = Y_copy.value_counts().argmax()
            else:
                learn(X_copy, Y_copy, child, "entropy") 

In [14]:
class mnode(object):
    
    def __init__(self):
        self.data = None
        self.parent = None
        self.children = []
        self.category = None
        self.isLeaf = False
        self.category = None
    
    def add_child(self, name, threshold, child):
        child.data = threshold
        self.children.append(child)
    

In [15]:
n = mnode()
learn(X_train, y_train, n, "entropy")
print(n.data)

0.24035089399744528


In [16]:
def printer(n):
    print(n.data)
    for child in n.children:
        if child.children != None:
            print(child.data)
            print(child.category)
            printer(child)        

In [17]:
printer(n)

0.24035089399744528
0.13220343137254903
7
0.13220343137254903
0.2765394242803504
2
0.2765394242803504
0.22159663865546197
2
0.22159663865546197
0.024089403973509907
7
0.024089403973509907
0.018020833333333323
5
0.018020833333333323
0.008675675675675681
7
0.008675675675675681
0.007176470588235295
5
0.007176470588235295
0.10222222222222223
2
0.10222222222222223
0.11833333333333335
1
0.11833333333333335
0    1
dtype: int64
None
0    1
dtype: int64
0.14
1
0.14
0    3
dtype: int64
None
0    3
dtype: int64
0    2
dtype: int64
None
0    2
dtype: int64
0.005916666666666666
5
0.005916666666666666
1    4
dtype: int64
None
1    4
dtype: int64
2.3333333333333335
0
2.3333333333333335
0    3
dtype: int64
None
0    3
dtype: int64
0.11499999999999999
2
0.11499999999999999
0    4
dtype: int64
None
0    4
dtype: int64
0    3
dtype: int64
None
0    3
dtype: int64
0.0065625
7
0.0065625
0    4
dtype: int64
None
0    4
dtype: int64
0.12375
2
0.12375
0    5
dtype: int64
None
0    5
dtype: int64
0.04500000000

0.3125
0    7
dtype: int64
None
0    7
dtype: int64
0    13
dtype: int64
None
0    13
dtype: int64
0    6
dtype: int64
None
0    6
dtype: int64
0.08131578947368422
3
0.08131578947368422
0.04195
6
0.04195
0.07549999999999998
3
0.07549999999999998
0.074375
3
0.074375
0    7
dtype: int64
None
0    7
dtype: int64
0.17614285714285713
4
0.17614285714285713
0.3333333333333333
1
0.3333333333333333
0.325
1
0.325
0    7
dtype: int64
None
0    7
dtype: int64
0    6
dtype: int64
None
0    6
dtype: int64
0    6
dtype: int64
None
0    6
dtype: int64
0.18175
4
0.18175
0.2525
2
0.2525
0    7
dtype: int64
None
0    7
dtype: int64
0    6
dtype: int64
None
0    6
dtype: int64
1    7
dtype: int64
None
1    7
dtype: int64
0    7
dtype: int64
None
0    7
dtype: int64
0.33
1
0.33
0.315
1
0.315
0    5
dtype: int64
None
0    5
dtype: int64
0.245
2
0.245
0    6
dtype: int64
None
0    6
dtype: int64
0    5
dtype: int64
None
0    5
dtype: int64
0.25166666666666665
2
0.25166666666666665
0.33999999999999997
1
0.339

0.42
0    9
dtype: int64
None
0    9
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.42333333333333334
1
0.42333333333333334
0    12
dtype: int64
None
0    12
dtype: int64
0.4325
1
0.4325
0    11
dtype: int64
None
0    11
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.07491666666666667
6
0.07491666666666667
1    9
dtype: int64
None
1    9
dtype: int64
0.4275
1
0.4275
0.4125
1
0.4125
0    10
dtype: int64
None
0    10
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
0.4425
1
0.4425
0    9
dtype: int64
None
0    9
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
0.11988709677419357
7
0.11988709677419357
0.4052083333333332
4
0.4052083333333332
0.10833333333333334
3
0.10833333333333334
0.4
1
0.4
0    16
dtype: int64
None
0    16
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0.1125
3
0.1125
1    8
dtype: int64
None
1    8
dtype: int64
1    11
dtype: int64
None
1    11
dtype: int64
0.19441666666666668
5
0.19441666666666668
0.44
1
0.

0.24483597285067873
2.1875
0
2.1875
0.12511538461538466
3
0.12511538461538466
0.4613924050632914
1
0.4613924050632914
1.5227272727272727
0
1.5227272727272727
0.1504285714285714
7
0.1504285714285714
0.09650000000000002
6
0.09650000000000002
0.4144
4
0.4144
0.435
1
0.435
0.4275
1
0.4275
0    9
dtype: int64
None
0    9
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.45166666666666666
4
0.45166666666666666
0.13975
7
0.13975
0.4525
1
0.4525
0    7
dtype: int64
None
0    7
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
1    11
dtype: int64
None
1    11
dtype: int64
0.4525
1
0.4525
0    8
dtype: int64
None
0    8
dtype: int64
0    15
dtype: int64
None
0    15
dtype: int64
0.16150000000000003
7
0.16150000000000003
0.4516666666666667
1
0.4516666666666667
2    10
dtype: int64
None
2    10
dtype: int64
0.4583333333333333
1
0.4583333333333333
0    11
dtype: int64
None
0    11
dtype:

dtype: int64
None
1    9
dtype: int64
0.24020000000000002
5
0.24020000000000002
2    8
dtype: int64
None
2    8
dtype: int64
0.5225
1
0.5225
0    9
dtype: int64
None
0    9
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.1646881188118812
7
0.1646881188118812
2.0454545454545454
0
2.0454545454545454
0.149625
7
0.149625
0.4877083333333334
1
0.4877083333333334
0.31675000000000003
5
0.31675000000000003
1.2
0
1.2
0.135875
6
0.135875
2    7
dtype: int64
None
2    7
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
3    8
dtype: int64
None
3    8
dtype: int64
0.12285714285714287
3
0.12285714285714287
0.5984999999999999
4
0.5984999999999999
0.3833333333333333
2
0.3833333333333333
2    7
dtype: int64
None
2    7
dtype: int64
0.5033333333333333
1
0.5033333333333333
0    9
dtype: int64
None
0    9
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0.14049999999999999
6
0.14049999999

dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.6948333333333333
4
0.6948333333333333
0    12
dtype: int64
None
0    12
dtype: int64
0    14
dtype: int64
None
0    14
dtype: int64
0.416923076923077
2
0.416923076923077
0.53125
1
0.53125
0.13125
3
0.13125
0    10
dtype: int64
None
0    10
dtype: int64
0.515
1
0.515
0    15
dtype: int64
None
0    15
dtype: int64
0    11
dtype: int64
None
0    11
dtype: int64
0.684625
4
0.684625
0.12
3
0.12
0    9
dtype: int64
None
0    9
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.54
1
0.54
0    9
dtype: int64
None
0    9
dtype: int64
0    11
dtype: int64
None
0    11
dtype: int64
0.518
1
0.518
1    9
dtype: int64
None
1    9
dtype: int64
1.5
0
1.5
0    12
dtype: int64
None
0    12
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.15447727272727274
6
0.15447727272727274
0.13959090909090907
6
0.13959090909090907
0.1284
6
0.1284
0    12
dtype: int64
None
0    12
dtype: int64
0.49833333333333335
1
0.49833333333333335

0    10
dtype: int64
0.4533333333333333
2
0.4533333333333333
1    11
dtype: int64
None
1    11
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
2.125
0
2.125
1.0135714285714286
4
1.0135714285714286
0.5708333333333333
1
0.5708333333333333
0.54
1
0.54
0    8
dtype: int64
None
0    8
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.12625
3
0.12625
0    12
dtype: int64
None
0    12
dtype: int64
1    8
dtype: int64
None
1    8
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.4443500000000001
5
0.4443500000000001
0.569655172413793
1
0.569655172413793
0.15269230769230768
3
0.15269230769230768
0.55
1
0.55
0.44875
2
0.44875
0.535
1
0.535
0    12
dtype: int64
None
0    12
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
1    8
dtype: int64
None
1    8
dtype: int64
0.8375
4
0.8375
2.0
0
2.0
0    8
dtype: int64
None
0    8
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0    8
dtype: int64
None

6
0.19569999999999999
1    11
dtype: int64
None
1    11
dtype: int64
0.5733333333333333
1
0.5733333333333333
0    11
dtype: int64
None
0    11
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0    15
dtype: int64
None
0    15
dtype: int64
0.168125
3
0.168125
0.305
7
0.305
0.56875
1
0.56875
0    13
dtype: int64
None
0    13
dtype: int64
0.5825
1
0.5825
0    17
dtype: int64
None
0    17
dtype: int64
0    13
dtype: int64
None
0    13
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.575
1
0.575
0.57
1
0.57
0    20
dtype: int64
None
0    20
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.2867352941176471
7
0.2867352941176471
0.19283333333333333
6
0.19283333333333333
0.571
1
0.571
0.8178333333333333
4
0.8178333333333333
0    12
dtype: int64
None
0    12
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0.5874999999999999
1
0.5874999999999999
0    12
dtype: int64
None
0    12
dtype: int64
0    11
dtype:

dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.615
1
0.615
0    10
dtype: int64
None
0    10
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
2.5
0
2.5
0    11
dtype: int64
None
0    11
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0.14765306122448982
3
0.14765306122448982
0.5596428571428571
5
0.5596428571428571
0.48181818181818187
2
0.48181818181818187
1.2
0
1.2
0.5428750000000001
5
0.5428750000000001
0    8
dtype: int64
None
0    8
dtype: int64
1    9
dtype: int64
None
1    9
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.6316666666666667
1
0.6316666666666667
1.75
0
1.75
0    10
dtype: int64
None
0    10
dtype: int64
0    9
dtype: int64
None
0    9
dtype: int64
0.6475
1
0.6475
0    11
dtype: int64
None
0    11
dtype: int64
0    12
dtype: int64
None
0    12
dtype: int64
0.20883333333333334
6
0.20883333333333334
1    8
dtype: int64
None
1    8
dtype: int64
0    14
dtype: int64
None
0    14
dtype: int64
0.2476428571428571
6
0.247642857142

0.4709999999999999
2
0.4709999999999999
0.6016666666666667
1
0.6016666666666667
0    11
dtype: int64
None
0    11
dtype: int64
0.6125
1
0.6125
0    10
dtype: int64
None
0    10
dtype: int64
0    11
dtype: int64
None
0    11
dtype: int64
1    10
dtype: int64
None
1    10
dtype: int64
0.6425
1
0.6425
0    14
dtype: int64
None
0    14
dtype: int64
0.6549999999999999
1
0.6549999999999999
0    9
dtype: int64
None
0    9
dtype: int64
0    11
dtype: int64
None
0    11
dtype: int64
2.1666666666666665
0
2.1666666666666665
0.31823333333333337
7
0.31823333333333337
0.6178571428571429
1
0.6178571428571429
0.5775
1
0.5775
0    9
dtype: int64
None
0    9
dtype: int64
0    13
dtype: int64
None
0    13
dtype: int64
4    9
dtype: int64
None
4    9
dtype: int64
1.2086875
4
1.2086875
0.6266666666666666
1
0.6266666666666666
0    10
dtype: int64
None
0    10
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
0.16299999999999998
3
0.16299999999999998
0.605
1
0.605
0.4825
2
0.4825
0    8
dtype: int64


dtype: int64
0.5116666666666667
2
0.5116666666666667
0.5025
2
0.5025
0    13
dtype: int64
None
0    13
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0    8
dtype: int64
None
0    8
dtype: int64
1    12
dtype: int64
None
1    12
dtype: int64
0.6117450980392156
5
0.6117450980392156
0.33934782608695657
6
0.33934782608695657
0.31300000000000006
6
0.31300000000000006
1.4415714285714285
4
1.4415714285714285
0.39449999999999996
7
0.39449999999999996
0    15
dtype: int64
None
0    15
dtype: int64
1    12
dtype: int64
None
1    12
dtype: int64
0.6383333333333333
1
0.6383333333333333
0.5025
2
0.5025
0    15
dtype: int64
None
0    15
dtype: int64
0    20
dtype: int64
None
0    20
dtype: int64
0    18
dtype: int64
None
0    18
dtype: int64
0.4170714285714286
7
0.4170714285714286
1.6666666666666667
0
1.6666666666666667
0    19
dtype: int64
None
0    19
dtype: int64
0.6525000000000001
1
0.6525000000000001
0    12
dtype: int64
None
0    12
dtype: int64
0    10
dtype: int64
None
0    10


0.46453125
0.7942142857142857
5
0.7942142857142857
0    11
dtype: int64
None
0    11
dtype: int64
1.3333333333333333
0
1.3333333333333333
0.72
1
0.72
0    9
dtype: int64
None
0    9
dtype: int64
0    10
dtype: int64
None
0    10
dtype: int64
0    11
dtype: int64
None
0    11
dtype: int64
0.5772222222222221
2
0.5772222222222221
1.9576250000000002
4
1.9576250000000002
0    17
dtype: int64
None
0    17
dtype: int64
2    12
dtype: int64
None
2    12
dtype: int64
0.19999999999999998
3
0.19999999999999998
1.6666666666666667
0
1.6666666666666667
0    12
dtype: int64
None
0    12
dtype: int64
0.695
1
0.695
0    10
dtype: int64
None
0    10
dtype: int64
0    29
dtype: int64
None
0    29
dtype: int64
1    11
dtype: int64
None
1    11
dtype: int64
0.569557142857143
7
0.569557142857143
0.42217187500000003
6
0.42217187500000003
0.5107916666666668
7
0.5107916666666668
0.698913043478261
1
0.698913043478261
1.77975
4
1.77975
0.5435714285714286
2
0.5435714285714286
0.6816666666666666
1
0.68166666666666

In [34]:
def predict_row(x,node):
    while len(x) > 0:
        while node.isLeaf == False: 
            attr_var = x[node.category] 
            if attr_var <= node.data:
                child_node = node.children[0]
            elif attr_var > node.data:
                child_node = node.children[1]
            if child_node.isLeaf: 
                return child_node.data.values.item()
            node = child_node

In [35]:
def predict(X, node):
    counter = 0
    copy = X.copy()
    result = {}
    for i in X.values: 
        result[counter] = (predict_row(i,node))
        counter += 1
    return result

In [36]:
pred = predict(X_test, n)

In [39]:
def accuracy(y_true, y_pred): 
        result = 0
        for idx, y_ in enumerate(y_pred.values()): 
            if y_ == y_true.values[idx]: result+=1
        return (result/len(y_true))


In [40]:
accuracy(y_test, pred)

0.19808612440191387

In [53]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()

In [54]:
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [55]:
y_ = clf.predict(X_test)

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_)

0.20287081339712917