In [1]:
import pandas as pd
import math
# Toy training table: 14 rows, four categorical feature columns
# ('level', 'lang', 'tweets', 'phd') and a 0/1 'labels' column.
# Built row-wise so each record can be read at a glance.
data = pd.DataFrame(
    [
        ('Senior', 'Java',   'no',  'no',  0),
        ('Senior', 'Java',   'no',  'yes', 0),
        ('Mid',    'Python', 'no',  'yes', 1),
        ('Junior', 'Python', 'no',  'no',  1),
        ('Junior', 'R',      'yes', 'no',  1),
        ('Junior', 'R',      'yes', 'yes', 0),
        ('Mid',    'R',      'yes', 'yes', 1),
        ('Senior', 'Python', 'no',  'no',  0),
        ('Senior', 'R',      'yes', 'no',  1),
        ('Junior', 'Python', 'yes', 'no',  1),
        ('Senior', 'Python', 'yes', 'yes', 1),
        ('Mid',    'Python', 'no',  'yes', 1),
        ('Mid',    'Java',   'yes', 'no',  1),
        ('Junior', 'Python', 'no',  'yes', 0),
    ],
    columns=['level', 'lang', 'tweets', 'phd', 'labels'],
)

In [2]:
def gini_index(subset, features, cl_names, cl_values):
    """Pick the feature whose split has the lowest weighted Gini impurity.

    Parameters
    ----------
    subset : pandas.DataFrame
        Rows to evaluate.
    features : list of str
        Candidate column names to split on.
    cl_names : list of str
        Column names that define the "in class" condition.
    cl_values : list
        Values paired positionally with ``cl_names``; a row is "in class"
        when every ``subset[name] == value`` condition holds.

    Returns
    -------
    int
        Position in ``features`` of the feature with the smallest split
        impurity (rounded to 5 decimals; the first one wins on ties).

    Raises
    ------
    ValueError
        If ``features`` is empty (there is nothing to take the minimum of).
    """
    total = len(subset)

    # Combine all class conditions into one positional boolean mask.
    # Values are compared with their real types (no string quoting), and
    # plain arrays are used so a non-unique index cannot disturb alignment.
    in_class = pd.Series(True, index=subset.index, dtype=bool).to_numpy()
    for column, wanted in zip(cl_names, cl_values):
        condition = (subset[column] == wanted).to_numpy(dtype=bool, na_value=False)
        in_class = in_class & condition

    gini_split_list = []
    for feature in features:
        feature_column = subset[feature]
        gini_split = 0

        # value_counts() yields (feature value, number of rows with that value).
        for value_key, n in feature_column.value_counts().items():
            has_value = (feature_column == value_key).to_numpy(dtype=bool, na_value=False)

            # Rows with this feature value that are in / not in the class.
            r_1 = int((has_value & in_class).sum())
            r_2 = n - r_1

            gini = 1 - math.pow(r_1 / n, 2) - math.pow(r_2 / n, 2)
            # Weight each branch by its share of the rows.
            gini_split += n / total * gini

        gini_split_list.append(round(gini_split, 5))

    return gini_split_list.index(min(gini_split_list))

In [3]:
# Splits a node's rows into one branch per distinct value of a feature.
def split(dataset, feature):
    """Partition ``dataset`` by the distinct values of column ``feature``.

    Returns a list of ``(feature, value, rows)`` tuples, one per distinct
    value, in the order produced by ``value_counts()``. ``rows`` is an
    independent DataFrame copy of the matching rows.
    """
    column = dataset[feature]
    return [
        (feature, branch_value, pd.DataFrame(dataset[column == branch_value].copy()))
        for branch_value in column.value_counts().index
    ]

In [4]:
def build_tree(inputs, split_features, class_name, class_value):
    """Recursively build a decision tree using the lowest-Gini feature at each node.

    Parameters
    ----------
    inputs : pandas.DataFrame
        Rows that reach this node.
    split_features : list of str
        Columns still available for splitting.
    class_name : list of str
        ``class_name[0]`` is the label column. Any further entries are the
        feature columns already fixed on the path to this node; the whole
        list is forwarded to ``gini_index``.
    class_value : list
        Values paired positionally with ``class_name``; ``class_value[0]``
        is the label value counted as the positive class.

    Returns
    -------
    bool or tuple
        A leaf is ``True`` / ``False``: the node is pure, or, when no
        features remain, whether positives strictly outnumber negatives.
        An inner node is ``(best_feature, {feature_value: subtree})``.

    The ``class_name`` and ``class_value`` arguments are never modified.
    """
    num_inputs = len(inputs)
    num_trues = len(inputs[inputs[class_name[0]] == class_value[0]])
    num_falses = num_inputs - num_trues

    # Pure nodes become leaves immediately.
    if num_trues == 0:
        return False
    if num_falses == 0:
        return True

    # Nothing left to split on: fall back to a majority vote.
    if not split_features:
        return num_trues > num_falses

    index = gini_index(inputs, split_features, class_name, class_value)
    best_feature = split_features[index]
    new_features = [a for a in split_features if a != best_feature]

    # Build one subtree per value of the chosen feature.
    subtrees = {}
    for feature_type, feature_value, subset in split(inputs, best_feature):
        # Each branch gets its own path lists. Sharing and mutating one list
        # across siblings let conditions from an already-finished branch leak
        # into the next one whenever that branch had split more than once.
        branch_names = list(class_name) + [feature_type]
        branch_values = list(class_value) + [feature_value]
        subtrees[feature_value] = build_tree(subset, new_features, branch_names, branch_values)

    return best_feature, subtrees

In [5]:
# Fit the tree on the full table: column 'labels' is the target and the
# value 1 is treated as the positive class.
tree = build_tree(
    inputs=data,
    split_features=['level', 'lang', 'tweets', 'phd'],
    class_name=['labels'],
    class_value=[1],
)

print(tree)

('level', {'Senior': ('tweets', {'no': False, 'yes': True}), 'Junior': ('phd', {'no': True, 'yes': False}), 'Mid': True})
