In [251]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [252]:
# Load the Iris dataset from 'Iris.csv' (path is relative to the working directory).
data = pd.read_csv('Iris.csv')

In [253]:
# Preview the first rows to check the columns were parsed as expected.
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [254]:
# Count missing values in every column.
data.isnull().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [270]:
# Show the per-row missing-value mask for the sepal length column.
print(data['SepalLengthCm'].isnull())

0      False
1      False
2      False
3      False
4      False
       ...  
145    False
146    False
147    False
148    False
149    False
Name: SepalLengthCm, Length: 150, dtype: bool


In [285]:
# Replace missing sepal lengths with the column mean (Series.mean skips NaN).
missing_mask = data['SepalLengthCm'].isnull()
if missing_mask.any():
    print("null value")
    # Assign through .loc on the frame itself: chained indexing
    # (data[col][mask] = ...) can write to a temporary copy and leave `data` unchanged.
    data.loc[missing_mask, 'SepalLengthCm'] = data['SepalLengthCm'].mean()
else:
    print("no null")

no null


In [255]:
# Inspect the species column.
print(data['Species'])

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object


In [256]:
# Feature matrix: the four measurements, in a fixed column order, as a NumPy array.
X = data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].values
# Integer class codes for the target. factorize(sort=True) numbers the sorted unique
# values 0..k-1, so y is integer-coded whether 'Species' currently holds the species
# names or has already been label-encoded (setosa=0, versicolor=1, virginica=2 either way).
# Integer codes keep y usable with integer-only routines such as np.bincount.
y = pd.factorize(data['Species'], sort=True)[0]

In [257]:
# Show the species column again (same display as the earlier cell).
print(data['Species'])

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: Species, Length: 150, dtype: object


In [258]:
# Encode the species names as integer class codes.
species_codes = {
    'Iris-setosa': 0,
    'Iris-versicolor': 1,
    'Iris-virginica': 2
}
# Series.map turns every value that is not a key of the dict into NaN, so re-running
# this cell on the already-encoded column would wipe it out. Only encode while the
# column is still non-numeric, which makes the cell safe to run more than once.
if not pd.api.types.is_numeric_dtype(data['Species']):
    data['Species'] = data['Species'].map(species_codes)

In [259]:
# Check the species column after the encoding step.
print(data['Species'])

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: Species, Length: 150, dtype: int64


In [29]:
# Sanity check: X and y should have the same number of rows.
print(X.shape)
print(y.shape)

(150, 4)
(150,)


In [31]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [35]:
# Number of training rows, then number of test rows.
print(X_train.shape[0])
print(X_test.shape[0])

120
30


In [47]:
def build_tree(X, y, max_depth = 5, depth = 0):
    """Recursively grow a decision tree represented as nested dicts.

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix, indexed as ``X[:, feature]``.
    y : array-like, 1-D
        Class labels aligned with the rows of ``X``. Any label type that
        ``np.unique`` can sort is accepted (integers, negative integers, strings).
    max_depth : int or None
        Maximum depth of the tree; ``None`` disables the limit.
    depth : int
        Depth of the node being built (used by the recursion).

    Returns
    -------
    dict or None
        ``{'type': 'leaf', 'class': label}`` for a leaf, or
        ``{'type': 'node', 'feature', 'threshold', 'left', 'right'}`` for an
        internal node. ``None`` when ``y`` is empty.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Handle empty input before anything else: a majority vote over zero
    # samples is undefined and would raise.
    if len(y) == 0:
        return None

    labels, counts = np.unique(y, return_counts=True)

    # Pure node: nothing left to separate.
    if len(labels) == 1:
        return {'type': 'leaf', 'class': y[0]}

    # Majority label. np.unique returns labels sorted, so ties resolve to the
    # smallest label, the same tie-break np.bincount(y).argmax() gives for
    # non-negative integer labels.
    majority = labels[counts.argmax()]

    if max_depth is not None and depth >= max_depth:
        return {'type': 'leaf', 'class': majority}

    best_feature, best_threshold = find_best_split(X, y)

    # No threshold separates the samples into two non-empty groups.
    if best_feature is None:
        return {'type': 'leaf', 'class': majority}

    left_indices = X[:, best_feature] <= best_threshold
    right_indices = X[:, best_feature] > best_threshold

    left = build_tree(X[left_indices], y[left_indices], max_depth, depth+1)
    right = build_tree(X[right_indices], y[right_indices], max_depth, depth+1)

    return {
        'type': 'node',
        'feature': best_feature,
        'threshold': best_threshold,
        'left': left,
        'right': right
    }

In [48]:
def find_best_split(X, y):
    """Search every (feature, threshold) pair for the split with the highest gain.

    Candidate thresholds are the distinct values of each feature column. Rows
    with a value <= threshold go left, rows with a value > threshold go right,
    and candidates that leave either side empty are skipped. Because the
    comparison is strict, the first candidate reaching the best gain wins ties.

    Returns
    -------
    tuple
        ``(feature_index, threshold)`` of the best split, or ``(None, None)``
        when no candidate produces two non-empty sides.
    """
    winner = (None, None)
    top_gain = -1

    for col in range(X.shape[1]):
        column = X[:, col]
        for cut in np.unique(column):
            goes_left = column <= cut
            goes_right = column > cut

            y_left, y_right = y[goes_left], y[goes_right]
            if len(y_left) == 0 or len(y_right) == 0:
                continue

            score = information_gain(y, y_left, y_right)
            if score > top_gain:
                top_gain = score
                winner = (col, cut)

    return winner

In [49]:
def information_gain(y, left, right):
    """Return the drop in Gini impurity achieved by splitting y into left and right.

    The children's impurities are weighted by their share of the parent's
    samples; the right-hand weight is taken as the complement of the left one.
    """
    weight_left = len(left)/len(y)
    weight_right = 1 - weight_left
    children_impurity = weight_left * gini(left) + weight_right * gini(right)
    return gini(y) - children_impurity

In [50]:
def gini(y):
    """Return the Gini impurity of a collection of class labels.

    Computed as ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of samples
    carrying label ``k``.

    Parameters
    ----------
    y : array-like
        Class labels. Any label type ``np.unique`` can sort is accepted
        (integers, negative integers, strings).

    Returns
    -------
    float
        0.0 for a pure or empty collection, approaching 1 as the labels spread
        evenly over many classes.
    """
    y = np.asarray(y)
    # An empty collection has nothing to mix; returning early also avoids 0/0.
    if y.size == 0:
        return 0.0
    # Count via np.unique rather than np.bincount, which only accepts
    # non-negative integer labels.
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / y.size
    return 1 - np.sum(proportions ** 2)

In [182]:
def predict_sample(sample, tree):
    """Classify a single sample by walking the tree from the root down to a leaf.

    At each internal node the sample's value for the node's feature is compared
    with the node's threshold: <= descends into 'left', otherwise into 'right'.
    The 'class' stored in the leaf that is reached is returned.
    """
    node = tree
    while node['type'] != 'leaf':
        go_left = sample[node['feature']] <= node['threshold']
        node = node['left'] if go_left else node['right']
    return node['class']

In [230]:
def predict(X, tree):
    """Predict class labels with a fitted tree for one sample or a batch.

    Parameters
    ----------
    X : array-like
        Either a single sample (1-D: one value per feature) or a batch of
        samples (2-D: one row per sample).
    tree : dict
        Tree in the nested-dict form consumed by ``predict_sample``.

    Returns
    -------
    The predicted label when ``X`` is 1-D (same result as calling
    ``predict_sample`` directly), or a NumPy array with one label per row
    when ``X`` is 2-D.
    """
    X = np.asarray(X)
    if X.ndim == 1:
        return predict_sample(X, tree)
    # A 2-D input cannot be fed to predict_sample in one go: comparing a whole
    # column slice with a threshold yields an array, not a single True/False.
    return np.array([predict_sample(row, tree) for row in X])

In [231]:
def print_tree(tree, depth = 0):
    """Print the tree as an indented outline, one line per node.

    Each level of depth adds two spaces of indentation. Internal nodes show
    the feature index and threshold of their test, followed by the left and
    then the right subtree; leaves show the class they predict.
    """
    pad = '  ' * depth
    if tree['type'] != 'leaf':
        print(f"{pad} Node Feature[{tree['feature']}] <= {tree['threshold']}")
        for branch in ('left', 'right'):
            print_tree(tree[branch], depth+1)
        return
    print(f"{pad} Leaf Predict {tree['class']}")

In [232]:
# Fit the tree on the training split, limiting it to a depth of 5.
tree = build_tree(X_train, y_train, max_depth = 5)

In [233]:
# Display the learned splits as an indented outline.
print_tree(tree)

 Node Feature[2] <= 1.9
   Leaf Predict 0
   Node Feature[2] <= 4.7
     Node Feature[3] <= 1.6
       Leaf Predict 1
       Leaf Predict 2
     Node Feature[3] <= 1.7
       Node Feature[2] <= 4.9
         Leaf Predict 1
         Node Feature[3] <= 1.5
           Leaf Predict 2
           Leaf Predict 1
       Node Feature[2] <= 4.8
         Node Feature[0] <= 5.9
           Leaf Predict 1
           Leaf Predict 2
         Leaf Predict 2


In [266]:
# Classify each held-out sample in turn, collecting the predictions in test-set order.
y_pred = []
for row in X_test:
    y_pred.append(predict(row, tree))

In [267]:
# True labels of the held-out samples.
print(y_test)

[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [268]:
# Predicted labels, for side-by-side comparison with y_test.
print(y_pred)

[1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]


In [265]:
# Test-set accuracy as a percentage. scikit-learn metrics take (y_true, y_pred) in that
# order; accuracy happens to be symmetric, but keeping the documented order avoids
# surprises if this line is adapted to an asymmetric metric.
print(accuracy_score(y_test, y_pred) * 100)

100.0


0.0
