In [92]:
class DecisionTreeClassifier1(object):
    """Entropy-based decision-tree classifier for numeric features.

    A fitted tree is a nested dict of the form
    ``{"<feature> <= <threshold>": [yes_branch, no_branch]}`` in which every
    branch is either another such dict or a class label (a leaf).  A data set
    that needs no split at all is represented by a bare label.

    The label is always read from the LAST column of the training data; every
    other column is treated as a numeric feature.

    Attributes:
        max_depth: Maximum number of questions on any root-to-leaf path
            (``None`` disables the limit).
        counter: Depth assigned to the root node (0 by default).  It is never
            modified, so one instance can build any number of trees.
        column_headers: Column names captured by the most recent call to
            ``decision_tree_algorithm`` that received a DataFrame; ``None``
            before that.
    """

    def __init__(self, max_depth, counter=0):
        """Store the depth limit and the depth at which the root starts."""
        self.max_depth = max_depth
        self.counter = counter
        self.column_headers = None

    def check_purity(self, data):
        """Return True when every row of ``data`` carries the same label."""
        label_column = data[:, -1]  # targets live in the last column
        return len(np.unique(label_column)) == 1

    def decision_tree_algorithm(self, df, min_samples=2):
        """Build a decision tree from ``df`` and return it.

        Args:
            df: Training data with the label in the last column.  Normally a
                DataFrame (its column names become the feature names used in
                the questions).  A plain 2-D array is accepted only after the
                column names have been captured from an earlier DataFrame.
            min_samples: A node holding fewer rows than this becomes a leaf.

        Returns:
            The nested-dict tree, or a bare label when no split is needed.

        Raises:
            ValueError: if ``df`` has no rows, or is an array and no column
                names are known yet.
        """
        if hasattr(df, "columns"):
            # Keep the headers on the instance, not in a module-level global,
            # so that several models cannot overwrite each other's names.
            self.column_headers = df.columns
            data = df.values
        else:
            data = np.asarray(df)
            if self.column_headers is None:
                raise ValueError(
                    "column names are unknown: pass a DataFrame instead of an array"
                )

        if len(data) == 0:
            raise ValueError("cannot build a decision tree from empty data")

        return self._grow_tree(data, min_samples, self.counter)

    def _grow_tree(self, data, min_samples, depth):
        """Recursively build the sub-tree for ``data`` located at ``depth``."""
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth

        # Base cases: pure node, too few rows, or the depth limit was reached.
        if self.check_purity(data) or len(data) < min_samples or depth_exhausted:
            return self.classify_data(data)  # most frequent label

        potential_splits = self.get_potential_splits(data)
        split_column, split_value = self.determine_best_split(data, potential_splits)
        if split_column is None:
            # All feature rows are identical, so nothing can separate the labels.
            return self.classify_data(data)

        data_below, data_above = self.split_data(data, split_column, split_value)
        if len(data_below) == 0 or len(data_above) == 0:
            # A midpoint rounded onto one of its neighbours; splitting is useless.
            return self.classify_data(data)

        feature_name = self.column_headers[split_column]
        question = "{} <= {}".format(feature_name, split_value)

        # Depth is passed down explicitly: sibling branches get the same budget.
        yes_answer = self._grow_tree(data_below, min_samples, depth + 1)
        no_answer = self._grow_tree(data_above, min_samples, depth + 1)

        if yes_answer == no_answer:
            # Both branches agree, so the question adds nothing.
            return yes_answer
        return {question: [yes_answer, no_answer]}

    def classify_data(self, data):
        """Return the label that occurs most often in ``data``.

        Ties go to the label that sorts first, because ``np.unique`` returns
        sorted values and ``argmax`` picks the first maximum.
        """
        label_column = data[:, -1]
        unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)
        return unique_classes[counts_unique_classes.argmax()]

    def get_potential_splits(self, data):
        """Return ``{feature_index: [candidate thresholds]}``.

        The candidates of a feature are the midpoints between each pair of
        adjacent distinct values; the last column (the label) is skipped.
        """
        potential_splits = {}
        n_columns = data.shape[-1]

        for column_index in range(n_columns - 1):
            unique_values = np.unique(data[:, column_index])  # sorted, distinct
            potential_splits[column_index] = [
                (current_value + previous_value) / 2
                for previous_value, current_value in zip(unique_values[:-1], unique_values[1:])
            ]

        return potential_splits

    def split_data(self, data, split_column, split_value):
        """Split ``data`` into rows with feature <= threshold and rows above it."""
        split_column_values = data[:, split_column]

        data_below = data[split_column_values <= split_value]
        data_above = data[split_column_values > split_value]

        return data_below, data_above

    def calculate_entropy(self, data):
        """Return the Shannon entropy (in bits) of the labels in ``data``."""
        label_column = data[:, -1]
        counts = np.unique(label_column, return_counts=True)[1]
        probabilities = counts / counts.sum()
        return sum(probabilities * -np.log2(probabilities))

    def calculate_overall_entropy(self, data_below, data_above):
        """Return the size-weighted mean entropy of the two partitions."""
        n = len(data_below) + len(data_above)
        if n == 0:
            return 0.0  # nothing to weigh; avoids a division by zero

        p_data_below = len(data_below) / n
        p_data_above = len(data_above) / n

        return (p_data_below * self.calculate_entropy(data_below)
                + p_data_above * self.calculate_entropy(data_above))

    def determine_best_split(self, data, potential_splits):
        """Pick the candidate split with the lowest weighted entropy.

        Returns:
            ``(column_index, threshold)``, or ``(None, None)`` when
            ``potential_splits`` holds no candidate at all.  Among equally
            good candidates the one examined last wins.
        """
        best_split_column = None
        best_split_value = None
        overall_entropy = float("inf")

        for column_index, candidates in potential_splits.items():
            for value in candidates:
                data_below, data_above = self.split_data(
                    data, split_column=column_index, split_value=value
                )
                current_overall_entropy = self.calculate_overall_entropy(data_below, data_above)

                if current_overall_entropy <= overall_entropy:
                    overall_entropy = current_overall_entropy
                    best_split_column = column_index
                    best_split_value = value

        return best_split_column, best_split_value

    def classify_example(self, example, tree):
        """Walk ``tree`` for one example and return the predicted label.

        Args:
            example: Mapping-like row (e.g. a Series) indexed by feature name.
            tree: Tree returned by ``decision_tree_algorithm``; a bare label
                is returned unchanged.
        """
        node = tree
        while isinstance(node, dict):
            question = next(iter(node))
            # Split on the LAST " <= " so feature names may contain spaces.
            feature_name, threshold = question.rsplit(" <= ", 1)
            yes_answer, no_answer = node[question]
            node = yes_answer if example[feature_name] <= float(threshold) else no_answer
        return node

    def calculate_accuracy(self, df, tree, label_column="label"):
        """Return the fraction of rows of ``df`` that ``tree`` classifies correctly.

        ``df`` is left untouched: predictions are kept in a temporary Series
        rather than written back as new columns.

        Args:
            df: DataFrame holding the features and the true labels.
            tree: Tree returned by ``decision_tree_algorithm``.
            label_column: Name of the column with the true labels.

        Returns:
            Accuracy in ``[0, 1]``; NaN when ``df`` has no rows.
        """
        if len(df) == 0:
            return float("nan")

        predictions = df.apply(self.classify_example, axis=1, args=(tree,))
        return (predictions == df[label_column]).mean()
    
    
    
    
    

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import random 
from pprint import pprint

In [3]:
# Load the Iris data from a CSV file located in the working directory.
df = pd.read_csv('Iris.csv')

In [4]:
# Remove the row-identifier column so it is not used as a feature.
# errors='ignore' keeps the cell idempotent: re-running it after 'Id' is
# already gone no longer raises KeyError.
df = df.drop(columns='Id', errors='ignore')

In [5]:
# Preview the first four rows.
df.head(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa


In [6]:
# Rename the target column to 'label', the column name that
# DecisionTreeClassifier1.calculate_accuracy compares predictions against.
df=df.rename(columns={"species": 'label' })
df.head(4)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa


In [7]:
def train_test_split(df, test_size):
    """Randomly partition ``df`` into a training frame and a test frame.

    Args:
        df: DataFrame to split; rows are selected by index label.
        test_size: Number of test rows, or, when given as a float, the
            fraction of rows to hold out (rounded to a whole row count).

    Returns:
        ``(train_df, test_df)``: the test frame holds the sampled rows in
        sampling order, the training frame holds all remaining rows.
    """
    # A float is a proportion of the frame; anything else is a row count.
    n_test_rows = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # Draw the held-out index labels with the global ``random`` generator.
    held_out_labels = random.sample(population=df.index.tolist(), k=n_test_rows)

    held_out_rows = df.loc[held_out_labels]
    remaining_rows = df.drop(held_out_labels)
    return remaining_rows, held_out_rows

In [134]:
# Seed Python's `random` module (train_test_split samples with random.sample)
# so the split is reproducible, then hold out 30% of the rows for testing.
random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.3)

In [135]:
# Summarise the training frame: row count, columns, non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 105 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  105 non-null    float64
 1   sepal_width   105 non-null    float64
 2   petal_length  105 non-null    float64
 3   petal_width   105 non-null    float64
 4   label         105 non-null    object 
dtypes: float64(4), object(1)
memory usage: 4.9+ KB


In [136]:
# Create a classifier whose tree growth is limited by max_depth=2.
model = DecisionTreeClassifier1(max_depth=2)

In [137]:
# Build the tree from the training frame; the last column ('label') is the target.
treee = model.decision_tree_algorithm(train_df)

In [138]:
# Pretty-print the nested {question: [yes_branch, no_branch]} structure.
pprint(treee)

{'petal_width <= 0.8': ['Iris-setosa',
                        {'petal_width <= 1.65': ['Iris-versicolor',
                                                 'Iris-virginica']}]}


In [139]:
# Take the third row of the test frame as a single example to classify.
example = test_df.iloc[2]
print(example)

sepal_length            5.4
sepal_width             3.7
petal_length            1.5
petal_width             0.2
label           Iris-setosa
Name: 10, dtype: object


In [140]:
# Walk the tree for that single example and show the predicted label.
model.classify_example(example, treee)

'Iris-setosa'

In [141]:
# Fraction of test rows whose predicted class equals their 'label' value.
model.calculate_accuracy(test_df, treee)

0.9333333333333333

In [164]:
# Repeat the experiment with a 25% hold-out and a fresh model using max_depth=3.
# Re-seeding makes the sampled test rows reproducible.
random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.25)
model = DecisionTreeClassifier1(max_depth=3)
treee = model.decision_tree_algorithm(train_df)
model.calculate_accuracy(test_df, treee)

0.9473684210526315