In [4]:
import matplotlib.pyplot as plt
import numpy as np


# Veri seti
Veri setinde 10 mantar örneği var.

Özellikler (her biri 0 veya 1 değerini alır):
 * Brown Cap
 * Tapering Stalk Shape
 * Solitary

Çıkış olarak da:
 * Yenilebilir (1)
 * Zehirli (0) 

In [5]:
# One row per mushroom; columns are [brown cap, tapering stalk shape, solitary].
X_train = np.array([
    [1, 1, 1],
    [1, 0, 1],
    [1, 0, 0],
    [1, 0, 0],
    [1, 1, 1],
    [0, 1, 1],
    [0, 0, 0],
    [1, 0, 1],
    [0, 1, 0],
    [1, 0, 0],
])
# Labels aligned with the rows above: 1 = edible, 0 = poisonous.
y_train = np.array([1, 1, 0, 0, 1, 0, 0, 1, 1, 0])

In [6]:
# Show the first five feature rows and confirm the container type of X_train.
print("First few elements of X_train:\n", X_train[:5])
print("Type of X_train:",type(X_train))

First few elements of X_train:
 [[1 1 1]
 [1 0 1]
 [1 0 0]
 [1 0 0]
 [1 1 1]]
Type of X_train: <class 'numpy.ndarray'>


In [8]:
# Show the first five labels and confirm the container type of y_train.
print("First few elements of y_train:", y_train[:5])
print("Type of y_train:",type(y_train))

First few elements of y_train: [1 1 0 0 1]
Type of y_train: <class 'numpy.ndarray'>


In [9]:
# Report array shapes; len(X_train) is the number of rows, i.e. training examples.
print ('The shape of X_train is:', X_train.shape)
print ('The shape of y_train is: ', y_train.shape)
print ('Number of training examples (m):', len(X_train))

The shape of X_train is: (10, 3)
The shape of y_train is:  (10,)
Number of training examples (m): 10


# entropi hesaplama

![image.png](attachment:0ce02c58-3e75-407b-b072-b7bf63f80b67.png)

In [10]:
def compute_entropy(y):
    """Return the binary entropy, in bits, of the labels at a node.

    Args:
        y: Array-like of labels for the examples at one node, where 1 marks
            the positive (edible) class and 0 the negative class.

    Returns:
        0 when ``y`` is empty or the node is pure (fraction of ones is exactly
        0 or 1); otherwise ``-p1*log2(p1) - (1-p1)*log2(1-p1)`` with ``p1``
        the fraction of ones in ``y``.
    """
    count = len(y)
    if count == 0:
        return 0

    # Fraction of positive (edible) examples among all examples at the node.
    p1 = np.sum(y) / count

    # A pure node carries no uncertainty; returning early also avoids log2(0).
    if p1 in (0, 1):
        return 0

    p0 = 1 - p1
    return -p1 * np.log2(p1) - p0 * np.log2(p0)

In [11]:
# Entropy of the full label vector, i.e. the impurity at the root before any split.
print("Entropy at root node: ", compute_entropy(y_train))

Entropy at root node:  1.0


# veri setini sağ ve sol olarak bölme

In [12]:
def split_dataset(X, node_indices, feature):
    """Partition the examples at a node by the value of one feature.

    Args:
        X: Indexable feature matrix; ``X[i][feature]`` is read for every
            index ``i`` in ``node_indices``.
        node_indices: Iterable of row indices that belong to the node.
        feature: Column index of the feature to split on.

    Returns:
        ``(left_indices, right_indices)``: two lists that keep the order of
        ``node_indices``. Rows whose feature value equals 1 go left; every
        other row goes right.
    """
    left_indices, right_indices = [], []
    for row in node_indices:
        # Feature value 1 -> left branch, anything else -> right branch.
        bucket = left_indices if X[row][feature] == 1 else right_indices
        bucket.append(row)
    return left_indices, right_indices

In [15]:
# Demonstrate split_dataset on feature 0 (brown cap) for two index sets:
# every training example (CASE 1) and the even-numbered rows only (CASE 2).
feature = 0
root_indices = list(range(10))
root_indices_subset = [0, 2, 4, 6, 8]

for case_label, case_indices in (("CASE 1:", root_indices), ("CASE 2:", root_indices_subset)):
    left_indices, right_indices = split_dataset(X_train, case_indices, feature)

    print(case_label)
    print("Left indices: ", left_indices)
    print("Right indices: ", right_indices)


CASE 1:
Left indices:  [0, 1, 2, 3, 4, 7, 9]
Right indices:  [5, 6, 8]
CASE 2:
Left indices:  [0, 2, 4]
Right indices:  [6, 8]


# Information Gain 
![image.png](attachment:83c36aa6-5faf-4187-bace-48e19de86d93.png)

In [16]:
def compute_information_gain(X, y, node_indices, feature):
    """Return the information gain from splitting a node on one feature.

    The gain is the node's entropy minus the size-weighted average of the
    entropies of the left and right children produced by ``split_dataset``.

    Args:
        X: Feature matrix passed through to ``split_dataset``.
        y: NumPy array of labels, indexed with lists of row indices.
        node_indices: Row indices of the examples at the node.
        feature: Column index of the feature to evaluate.

    Returns:
        The information gain as a float. An empty node returns 0.0 instead
        of raising ``ZeroDivisionError`` when the branch weights are formed.
    """
    # An empty node has nothing to split; without this guard the weights
    # below would divide by zero.
    if len(node_indices) == 0:
        return 0.0

    # Split the node's examples on the chosen feature.
    left_indices, right_indices = split_dataset(X, node_indices, feature)

    # Only the labels are needed to measure entropy.
    y_node = y[node_indices]
    y_left = y[left_indices]
    y_right = y[right_indices]

    # Fraction of the node's examples that falls into each branch.
    w_left = len(y_left) / len(y_node)
    w_right = len(y_right) / len(y_node)

    weighted_child_entropy = (
        w_left * compute_entropy(y_left) + w_right * compute_entropy(y_right)
    )
    return compute_entropy(y_node) - weighted_child_entropy
    

In [17]:
# Information gain at the root for each of the three features, in column order.
info_gain0, info_gain1, info_gain2 = (
    compute_information_gain(X_train, y_train, root_indices, feature=column)
    for column in range(3)
)

print("Information Gain from splitting the root on brown cap: ", info_gain0)

print("Information Gain from splitting the root on tapering stalk shape: ", info_gain1)

print("Information Gain from splitting the root on solitary: ", info_gain2)

Information Gain from splitting the root on brown cap:  0.034851554559677034
Information Gain from splitting the root on tapering stalk shape:  0.12451124978365313
Information Gain from splitting the root on solitary:  0.2780719051126377


Sonuca göre en yüksek bilgi kazancını (information gain) solitary özelliği veriyor; bu yüzden karar ağacının kök düğümünü bu özelliğe göre bölmek mantıklı.

# information gain bilgisine göre veri setini en iyi şekilde bölmek

In [23]:
def get_best_split(X, y, node_indices):
    """Pick the feature whose split gives the largest information gain.

    Args:
        X: Feature matrix; ``X.shape[1]`` is taken as the number of features.
        y: Label array aligned with the rows of ``X``.
        node_indices: Row indices of the examples at the node.

    Returns:
        The index of the feature with the highest strictly positive
        information gain (the lowest index wins ties), or -1 when no
        feature has a gain above 0.
    """
    gains = [
        compute_information_gain(X, y, node_indices, column)
        for column in range(X.shape[1])
    ]

    winner, top_gain = -1, 0
    for column, gain in enumerate(gains):
        # Strict comparison: a zero-gain feature never replaces the -1 sentinel.
        if gain > top_gain:
            winner, top_gain = column, gain
    return winner

In [24]:
# Choose the feature to split the root node on and report its column index.
best_feature = get_best_split(X_train, y_train, root_indices)
print("Best feature to split on: %d" % best_feature)

Best feature to split on: 2


# Ağacı oluşturmak

In [25]:
# Module-level record of every split made by build_tree_recursive. It is only
# ever appended to, so entries accumulate across repeated calls.
tree = []

def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """Recursively build a decision tree, printing it and recording splits.

    Each internal node appends ``(left_indices, right_indices, best_feature)``
    to the module-level ``tree`` list and then recurses into both children.

    A node is printed as a leaf, and recursion stops, when any of these holds:
      * ``current_depth`` has reached (or already exceeds) ``max_depth``;
      * ``node_indices`` is empty;
      * ``get_best_split`` returns -1, i.e. no feature has positive
        information gain (for example, the node is already pure).

    Args:
        X: Feature matrix.
        y: Label array aligned with the rows of ``X``.
        node_indices: Row indices of the examples at this node.
        branch_name: Label used in the printed output ("Root", "Left", "Right").
        max_depth: Maximum depth at which nodes may still be split.
        current_depth: Depth of this node; the root call passes 0.

    Returns:
        None. Results are printed and appended to ``tree``.
    """
    # -1 doubles as the "do not split" marker. The best split is only searched
    # for when the depth budget allows it and the node has examples.
    best_feature = -1
    if current_depth < max_depth and len(node_indices) > 0:
        best_feature = get_best_split(X, y, node_indices)

    if best_feature == -1:
        # Without this check, -1 would be used as a column index and silently
        # split on the last feature, and empty children would be recursed into.
        formatting = " "*current_depth + "-"*current_depth
        print(formatting, "%s leaf node with indices" % branch_name, node_indices)
        return

    formatting = "-"*current_depth
    print("%s Depth %d, %s: Split on feature: %d" % (formatting, current_depth, branch_name, best_feature))

    left_indices, right_indices = split_dataset(X, node_indices, best_feature)
    tree.append((left_indices, right_indices, best_feature))

    build_tree_recursive(X, y, left_indices, "Left", max_depth, current_depth+1)
    build_tree_recursive(X, y, right_indices, "Right", max_depth, current_depth+1)
    

In [28]:
# Build and print a depth-2 tree starting from all training examples at the root.
build_tree_recursive(X_train, y_train, root_indices, "Root", max_depth=2, current_depth=0)


 Depth 0, Root: Split on feature: 2
- Depth 1, Left: Split on feature: 0
  -- Left leaf node with indices [0, 1, 4, 7]
  -- Right leaf node with indices [5]
- Depth 1, Right: Split on feature: 1
  -- Left leaf node with indices [8]
  -- Right leaf node with indices [2, 3, 6, 9]
