In [37]:
import pandas as pd
import numpy as np
from math import log2

train = pd.read_csv('experiment-data/data/train.csv')

def entropy(p):
    """
    Return entropy for ratio p.
    :param p: double
    :return: double
    """
    if p > 1 or p < 0:
        raise ValueError('value not between 0 to 1')
    if p == 0 or p == 1:
        return 0
    return -p * log2(p) - (1-p) * log2(1-p)


def get_sorted_entropy_list(dataset):
    """
    Calculate entropy for each column in dataset. Final label should be in the first column.
    :param dataset: pandas.DataFrame
    :return: list
    """
    total_entropy = entropy(len(dataset.query('label == "e"'))/len(dataset))
    entropy_list = []

    for i in range(1, len(list(dataset))):
        df = dataset.iloc[:, [0, i]]
        vals = df.iloc[:, 1].unique().tolist()
        expected_entropy = 0
        for val in vals:
            example_percentage = len(df[df.iloc[:, 1] == val]) / len(df)
            p = len(df[(df.iloc[:, 0] == 'p') & (df.iloc[:, 1] == val)]) / len(df[df.iloc[:, 1] == val])
            ent = entropy(p)
            expected_entropy += ent * example_percentage
        information_gain = total_entropy - expected_entropy
        entropy_list.append([list(df)[1], information_gain])

    entropy_list = sorted(entropy_list, key=lambda k: -k[1])
    return entropy_list


class ID3Tree:

    def __init__(self, dataset, depth):
        self.isend = False
        self.depth = depth
        self.next_v = {}
        if len(set(dataset.iloc[:, 0])) > 1:
            self.A = get_sorted_entropy_list(dataset)[0][0]
            self.vs = list(dataset[self.A].unique())
            for v in self.vs:
                new_dataset = dataset[dataset.loc[:, self.A] == v]
                self.next_v[v] = ID3Tree(new_dataset, depth+1)
        else:
            self.isend = True
            self.ans = dataset.iloc[0][0]

    def predict(self, data):
        """

        :param data: pandas.core.series.Series
        :return: string
        """
        if self.isend:
            return self.ans
        #print(self.depth, self.A)
        next_v = data[self.A]
        return self.next_v[next_v].predict(data)

    def get_depth(self):
        if self.isend:
            return self.depth
        else:
            sub_tree_max_depth = self.depth
            for v in self.next_v:
                sub_tree_max_depth = max(sub_tree_max_depth, self.next_v[v].get_depth())
            return sub_tree_max_depth




In [38]:
entropies

NameError: name 'entropies' is not defined

In [39]:
df = train

In [40]:
len(set(df['label']))

2

In [41]:
df['spore-print-color'].head()

0    k
1    n
2    k
3    n
4    k
Name: spore-print-color, dtype: object

In [42]:
vals = list(df['spore-print-color'].unique())

In [43]:
vals

['k', 'n', 'u', 'h', 'w', 'r', 'o', 'y', 'b']

In [44]:
df[df.loc[:, 'spore-print-color'] == 'k'].head()

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
2,p,x,y,w,t,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,e,x,y,y,t,f,c,b,n,e,...,s,w,w,p,w,o,p,k,n,g
5,e,b,s,w,t,f,c,b,g,e,...,s,w,w,p,w,o,p,k,n,m
7,p,x,y,w,t,f,c,n,p,e,...,s,w,w,p,w,o,p,k,v,g


In [45]:
get_sorted_entropy_list(df)[0][0]

'spore-print-color'

In [46]:
class ID3Tree:
    
    def __init__(self, dataset, depth):
        self.A = get_sorted_entropy_list(dataset)[0][0]
        self.vs = list(dataset['spore-print-color'].unique())
        self.next_of = {}
        self.depth = depth + 1
        
    def __str__(self):
        return ''
    
    def predict(data):
        return

In [47]:
df_k = df[df.loc[:, 'spore-print-color'] == 'k']
len(df_k)
df_n = df[df.loc[:, 'spore-print-color'] == 'n']
len(df_n)


1559

In [48]:
s = 0
for a in ['k', 'n', 'u', 'h', 'w', 'r', 'o', 'y', 'b']:
    df_x = df[df.loc[:, 'spore-print-color'] == a]
    s += len(df_x)
print(s)

6530


In [49]:
df_k = df[df.loc[:, 'spore-print-color'] == 'k']
df_k

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
2,p,x,y,w,t,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,e,x,y,y,t,f,c,b,n,e,...,s,w,w,p,w,o,p,k,n,g
5,e,b,s,w,t,f,c,b,g,e,...,s,w,w,p,w,o,p,k,n,m
7,p,x,y,w,t,f,c,n,p,e,...,s,w,w,p,w,o,p,k,v,g
8,e,b,s,y,t,f,c,b,g,e,...,s,w,w,p,w,o,p,k,s,m
10,e,x,y,y,t,f,c,b,n,e,...,s,w,w,p,w,o,p,k,s,m
13,e,x,f,n,f,f,w,b,n,t,...,f,w,w,p,w,o,e,k,a,g
22,e,b,s,w,t,f,c,b,g,e,...,s,w,w,p,w,o,p,k,s,m
30,e,s,f,g,f,f,c,n,k,e,...,s,w,w,p,w,o,p,k,v,u


In [50]:
if len(set(df_k.iloc[:, 0])) > 1:
    A_k = get_sorted_entropy_list(df_k)[0][0]
    print(A_k)

gill-size


In [51]:
list(df_k[A_k].unique())

['n', 'b']

In [52]:
df_k[df_k.loc[:, 'gill-size'] == 'n']

Unnamed: 0,label,cap-shape,cap-surface,cap-color,bruises,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
2,p,x,y,w,t,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
7,p,x,y,w,t,f,c,n,p,e,...,s,w,w,p,w,o,p,k,v,g
30,e,s,f,g,f,f,c,n,k,e,...,s,w,w,p,w,o,p,k,v,u
44,p,x,s,w,t,f,c,n,k,e,...,s,w,w,p,w,o,p,k,v,u
65,p,x,y,w,t,f,c,n,w,e,...,s,w,w,p,w,o,p,k,s,g
92,e,s,f,g,f,f,c,n,g,e,...,s,w,w,p,w,o,p,k,y,u
100,p,x,s,w,t,f,c,n,k,e,...,s,w,w,p,w,o,p,k,v,g
101,p,f,y,n,t,f,c,n,p,e,...,s,w,w,p,w,o,p,k,v,g
128,e,s,f,n,f,f,c,n,k,e,...,s,w,w,p,w,o,p,k,v,u


In [53]:
df_kn = df_k[df_k.loc[:, 'gill-size'] == 'n']
if len(set(df_kn.iloc[:, 0])) > 1:
    A_kn = get_sorted_entropy_list(df_kn)[0][0]
    print(A_kn)

population


In [54]:
for i, d in train.iterrows():
    print(type(d))
    break
data = train.loc[35]

<class 'pandas.core.series.Series'>


In [55]:
data["population"]

'v'

In [56]:
for i, d in train.iterrows():
    print(type(d))
    break

<class 'pandas.core.series.Series'>


In [57]:
class B:
    def __init__(self, i):
        print(i)dataset.iloc[0][0]
        if i < 10:
            B(i+1)
B(1)

SyntaxError: invalid syntax (<ipython-input-57-0948d8772e5f>, line 3)

In [58]:
dataset.iloc[0][0]

NameError: name 'dataset' is not defined

In [59]:
class ID3Tree:

    def __init__(self, dataset, depth):
        self.isend = False
        self.depth = depth
        self.next_v = {}
        if len(set(dataset.iloc[:, 0])) > 1:
            self.A = get_sorted_entropy_list(dataset)[0][0]
            self.vs = list(dataset[self.A].unique())
            for v in self.vs:
                new_dataset = dataset[dataset.loc[:, self.A] == v]
                self.next_v[v] = ID3Tree(new_dataset, depth+1)
        else:
            self.isend = True
            self.ans = dataset.iloc[0][0]

    def predict(self, data):
        """

        :param data: pandas.core.series.Series
        :return: string
        """
        if self.isend:
            return self.ans
        #print(self.depth, self.A)
        next_v = data[self.A]
        return self.next_v[next_v].predict(data)

In [60]:
root = ID3Tree(train, 1)

In [61]:
1

1

In [62]:
for _, row in train.iterrows():
    if root.predict(row) != row.iloc[0]:
        print('error')
        break
print('success')

success


In [63]:
test = pd.read_csv('experiment-data/data/test.csv')

In [64]:
for _, row in test.iterrows():
    if root.predict(row) != row.iloc[0]:
        print('error')
        break
print('success')

success


In [65]:
cv = pd.read_csv('experiment-data/data/CVfolds/fold1.csv')
root = ID3Tree(cv, 0)

In [66]:
root.get_depth()

AttributeError: 'ID3Tree' object has no attribute 'get_depth'