In [71]:
import pandas as pd 
import numpy as np
 
pd_data = pd.read_csv('https://raw.githubusercontent.com/AugustLONG/ML01/master/01decisiontree/AllElectronics.csv')
pd_data.drop("RID",axis=1, inplace = True) #RID는 그냥 순서라서 삭제
pd_data

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes


In [72]:
def get_gini(df, label):
    aa = df[label].value_counts()  #label 분포 
    a_class = aa.index   #label 클래스 
    a_counts = aa.values #클래스 각각의 개수 
    mother = a_counts.sum()  #전체 개수
    
    gini = 1
    for value in a_counts:  #gini계수 공식
        gini -= (value/mother)**2
    return gini   
 
get_gini(pd_data, "age")  #0.663265306122449
get_gini(pd_data,'class_buys_computer')  #0.4591836734693877 

0.4591836734693877

In [73]:
def get_power_set(s):
    power_set=[[]]
    for elem in s:  #하위 집합 루프
        for sub_set in power_set:  #나머지 하위집합 포함하는 새로운 집합 추가
              power_set=power_set+[list(sub_set)+[elem]]   
    return power_set
 
get_power_set(pd_data.age.unique())

[[],
 ['youth'],
 ['middle_aged'],
 ['youth', 'middle_aged'],
 ['senior'],
 ['youth', 'senior'],
 ['middle_aged', 'senior'],
 ['youth', 'middle_aged', 'senior']]

In [74]:
def get_binary_split(df, attribute):
    result = []
    cl = get_power_set(df[attribute].unique())  #feature들에 대해서 부분집합 만들기
    cl.pop()  #처음과 끝 집합(공집합, 전체집합) 삭제
    cl.pop(0)
    result = cl
    return result
 
get_binary_split(pd_data,"age")


[['youth'],
 ['middle_aged'],
 ['youth', 'middle_aged'],
 ['senior'],
 ['youth', 'senior'],
 ['middle_aged', 'senior']]

In [75]:
def get_attribute_gini_index(df, attribute, label):
    result = {}
    binarysplit = get_binary_split(df, attribute) #이진분류
    gini_1 = gini_2 = None
    for i in range(len(binarysplit)):
        if len(binarysplit[i]) == 1 :  #분류한 집합 중 원소가 하나 있을 경우 ex) 4 -> (2,2) (1,3) 에서 1
            gini_1 = get_gini(df[df[attribute] == str(binarysplit[i][0])], label)  #분류한 집합 원소에 해당하는 데이터 gini계수 구하기
            gini_2 = get_gini(df[df[attribute] != str(binarysplit[i][0])], label)  #나머지 데이터 gini 계수 구하기
            nrow = df[df[attribute] == binarysplit[i][0]].shape[0]  #아래 result를 구할 때 사용하기 위해 나뉜 후 데이터 개수 저장! 
        elif len(binarysplit[i]) > 1:  #분류한 집합 중 원소가 하나 이상 있을 경우 ex) 4 -> (2,2) (1,3) 에서 2나 3
            a = binarysplit[i]
            d1 = pd.DataFrame()
            for k,j in enumerate(a):  #집합 원소들을 클래스로 가지는 데이터 분류 (원소가 2개 이상이기 때문에 데이터 아래로 붙히기)
                d = df[df[attribute] == j]
                d1 = pd.concat([d1,d])
            d2 = pd_data.drop(d1.index)  #나머지 데이터
            gini_1 = get_gini(d1, label)  #위에서 만든 데이터로 gini계수 구하기
            gini_2 = get_gini(d2, label)  
            nrow = d1.shape[0]   #역시 result를 구할 때 사용하기 위해 나뉜 후 데이터 개수 저장!
            binarysplit[i] = (binarysplit[i][0] + "_" + binarysplit[i][1]).split()  #원소가 2개 이상('youth','senior')일 때 "youth_senior"처럼 한 단어로 이름 변경
        result[binarysplit[i][0]] = (nrow/df.shape[0])*gini_1 + (1-(nrow/df.shape[0]))*gini_2  #최종 gini계수 dictionary형태로 저장 
    return result
 
get_attribute_gini_index(pd_data, "age", "class_buys_computer")
min(get_attribute_gini_index(pd_data, "age", "class_buys_computer").items())  #가장 작은 값으로 분류해야 함!

('middle_aged', 0.35714285714285715)

In [76]:
get_binary_split(pd_data, "income")

[['high'],
 ['medium'],
 ['high', 'medium'],
 ['low'],
 ['high', 'low'],
 ['medium', 'low']]

In [79]:
ginis = {'age':min(get_attribute_gini_index(pd_data, "age", "class_buys_computer").values()),  #분류하는 데 가장 중요한 변수 선정
         "income":min(get_attribute_gini_index(pd_data, "income", "class_buys_computer").values()),
         'student': min(get_attribute_gini_index(pd_data, "student", "class_buys_computer").values()),
        "credit_rating": min(get_attribute_gini_index(pd_data, "credit_rating", "class_buys_computer").values())}
print(min(ginis),ginis[min(ginis)])

age 0.35714285714285715


In [82]:
data1 = pd_data[pd_data["age"]=="middle_aged"]  #데이터 분류
data2 = pd_data[pd_data["age"]!="middle_aged"]
 
ginis1 = {"income":min(get_attribute_gini_index(data1, "income", "class_buys_computer").values()),  #가장 작은 gini계수와 해당 feature 찾기
         'student': min(get_attribute_gini_index(data1, "student", "class_buys_computer").values()),
        "credit_rating": min(get_attribute_gini_index(data1, "credit_rating", "class_buys_computer").values())}
ginis2 = {'age':min(get_attribute_gini_index(data2, "age", "class_buys_computer").values()),
         "income":min(get_attribute_gini_index(data2, "income", "class_buys_computer").values()),
         'student': min(get_attribute_gini_index(data2, "student", "class_buys_computer").values()),
        "credit_rating": min(get_attribute_gini_index(data2, "credit_rating", "class_buys_computer").values())}
 
print(min(ginis1),ginis[min(ginis1)])
print(min(ginis2),ginis[min(ginis2)])

credit_rating 0.42857142857142855
age 0.35714285714285715


In [170]:
data1 = pd_data[pd_data["age"]=="youth"]
data2 = pd_data[pd_data["age"]=="middle_aged"]#데이터 분류
data3 = pd_data.drop(data1.index).drop(data2.index)

In [167]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
10,youth,medium,yes,excellent,yes


In [168]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
2,middle_aged,high,no,fair,yes
6,middle_aged,low,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
12,middle_aged,high,yes,fair,yes


In [171]:
data3

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
9,senior,medium,yes,fair,yes
13,senior,medium,no,excellent,no


In [172]:
import math
d = - (9/14 * math.log(9/14, 2)) - (5/14 * math.log(5/14, 2))
d

0.9402859586706309

In [177]:
a = (- (2/5 * math.log(2/5, 2)) - (3/5 * math.log(3/5, 2))) * 5/14
a

0.34676806944809596

In [178]:
b =0

In [179]:
c = (- (3/5 * math.log(3/5, 2)) - (2/5 * math.log(2/5, 2))) * 5/14
c

0.34676806944809596

In [180]:
a+b+c, d - (a+b+c)

(0.6935361388961919, 0.246749819774439)

In [116]:
data1 = pd_data[pd_data["age"]=="middle_aged"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [117]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
2,middle_aged,high,no,fair,yes
6,middle_aged,low,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
12,middle_aged,high,yes,fair,yes


In [118]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
13,senior,medium,no,excellent,no


In [119]:
a = (- (5/10 * math.log(5/10, 2)) - (5/10 * math.log(5/10, 2))) * 10/14
a

0.7142857142857143

In [120]:
10 / 14

0.7142857142857143

In [121]:
d-a

0.22600024438491662

In [122]:
data1 = pd_data[pd_data["age"]=="senior"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [123]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
9,senior,medium,yes,fair,yes
13,senior,medium,no,excellent,no


In [124]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
6,middle_aged,low,yes,excellent,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
12,middle_aged,high,yes,fair,yes


In [125]:
a = (- (2/5 * math.log(2/5, 2)) - (3/5 * math.log(3/5, 2))) * 5/14
a

0.34676806944809596

In [126]:
b = (- (6/9 * math.log(6/9, 2)) - (3/9 * math.log(3/9, 2))) * 9/14
b

0.5903330361778861

In [127]:
a+b

0.9371011056259821

In [128]:
d-(a+b)

0.003184853044648772

In [189]:
data1 = pd_data[pd_data["income"]=="high"]
data2 = pd_data[pd_data["income"]=="medium"]#데이터 분류
data3 = pd_data.drop(data1.index).drop(data2.index)

In [190]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
12,middle_aged,high,yes,fair,yes


In [191]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


In [192]:
data3

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
8,youth,low,yes,fair,yes


In [132]:
a = 4/14
4/14

0.2857142857142857

In [193]:
b = (- (4/6 * math.log(4/6, 2)) - (4/6 * math.log(4/6, 2))) * 6/14
b

0.33426428612637504

In [194]:
c = (- (3/4 * math.log(3/4, 2)) - (1/4 * math.log(1/4, 2))) * 4/14
c

0.23179374984546652

In [195]:
a+b+c

0.9128261054199376

In [196]:
d-(a+b+c)

0.027459853250693333

In [138]:
data1 = pd_data[pd_data["income"]=="medium"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [139]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


In [140]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
8,youth,low,yes,fair,yes
12,middle_aged,high,yes,fair,yes


In [141]:
a = (- (4/6 * math.log(4/6, 2)) - (2/6 * math.log(2/6, 2))) * 6/14
a

0.3935553574519241

In [142]:
b = (- (5/8 * math.log(5/8, 2)) - (3/8 * math.log(3/8, 2))) * 8/14
b

0.5453908588142656

In [143]:
a+b

0.9389462162661897

In [144]:
d-(a+b)

0.0013397424044412354

In [145]:
data1 = pd_data[pd_data["income"]=="low"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [146]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
8,youth,low,yes,fair,yes


In [147]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
12,middle_aged,high,yes,fair,yes
13,senior,medium,no,excellent,no


In [148]:
a = (- (3/4 * math.log(3/4, 2)) - (1/4 * math.log(1/4, 2))) * 4/14
a

0.23179374984546652

In [149]:
b = (- (6/10 * math.log(6/10, 2)) - (4/10 * math.log(4/10, 2))) * 10/14
b

0.6935361388961919

In [150]:
a+b

0.9253298887416584

In [151]:
d-(a+b)

0.01495606992897247

In [152]:
data1 = pd_data[pd_data["student"]=="yes"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [153]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
4,senior,low,yes,fair,yes
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
10,youth,medium,yes,excellent,yes
12,middle_aged,high,yes,fair,yes


In [154]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
1,youth,high,no,excellent,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
7,youth,medium,no,fair,no
11,middle_aged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


In [155]:
a = (- (6/7 * math.log(6/7, 2)) - (1/7 * math.log(1/7, 2))) * 7/14
a

0.29583638929116374

In [156]:
b = (- (3/7 * math.log(3/7, 2)) - (4/7 * math.log(4/7, 2))) * 7/14
b

0.4926140680171258

In [157]:
a+b

0.7884504573082896

In [158]:
d-(a+b)

0.15183550136234136

In [159]:
data1 = pd_data[pd_data["credit_rating"]=="fair"]  #데이터 분류
data2 = pd_data.drop(data1.index)

In [160]:
data1

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
0,youth,high,no,fair,no
2,middle_aged,high,no,fair,yes
3,senior,medium,no,fair,yes
4,senior,low,yes,fair,yes
7,youth,medium,no,fair,no
8,youth,low,yes,fair,yes
9,senior,medium,yes,fair,yes
12,middle_aged,high,yes,fair,yes


In [161]:
data2

Unnamed: 0,age,income,student,credit_rating,class_buys_computer
1,youth,high,no,excellent,no
5,senior,low,yes,excellent,no
6,middle_aged,low,yes,excellent,yes
10,youth,medium,yes,excellent,yes
11,middle_aged,medium,no,excellent,yes
13,senior,medium,no,excellent,no


In [162]:
a = (- (6/8 * math.log(6/8, 2)) - (2/8 * math.log(2/8, 2))) * 8/14
a

0.46358749969093305

In [163]:
b = 6/14
b

0.42857142857142855

In [164]:
a+b

0.8921589282623617

In [165]:
d-(a+b)

0.04812703040826927