# Ensemble Learning - Adaboost


### 알고리즘은 강의 자료 참고 
Input : Required ensemble size T 

Input : Training set S = {(x1,y1), (x2,y2), ... , (xn, yn)}, where yi $\in$ {-1,+1} 

1. Define a uniform distribution D1(i) over elements of S 
2. for t =1 to T do 
- Train a model ht using distribution Dt 
- Calculate $\epsilon_t = P_{D_t}(h_t(x) \neq y)$ 

- if $\epsilon_t \geq 0.5$ break 

- Set $\alpha_t = \frac{1}{2}\ln\left(\frac{1-\epsilon_t}{\epsilon_t}\right)$
- Update $D_{t+1}(i) = \frac{D_t(i)\exp(-\alpha_t y_i h_t(x_i))}{Z_t}$ 
- where $Z_t$ is a normalization factor so that $D_{t+1}$ is a valid distribution 
- end for 

3. For a new testing point(x', y') 
4. H(x') = sign($\sum_{t=1}^T \alpha_th_t(x')$) 



In [5]:
import numpy as np
import pandas as pd
import random as rand

from sklearn.datasets import load_iris
# Feature matrix: keep only the first 100 rows of the iris data.
# NOTE(review): presumably these 100 rows contain exactly two classes
# (targets 0 and 1) — verify against the dataset documentation.
X = load_iris()['data'][:100]

# Relabel y so that every value is either +1 or -1.
y = load_iris()["target"][:100]
# The first 50 labels become -1; the remaining 50 are left untouched
# (assumed to already be 1 — TODO confirm).
y[:50] = -1
# Turn y into a column vector so it can be appended to X as the last column.
y= y.reshape(-1,1)
# S holds one sample per row: the feature columns followed by the label.
S = np.concatenate((X,y), axis=1)

import matplotlib.pyplot as plt
import scipy as sc
from scipy.stats import norm
from sys import maxsize

**구현해야 하는 것**
- Stump tree 
- $\epsilon$ 
- $\alpha_t$ 
- $Z_t$ 
- $H(x')$ 

**필요한 것**
- T
- data : y의 값이 +1, -1으로 정의되어 있을 것 

**함수의 형태** 
- def adaboost(T, data, new_vector) 

<아래는 외부 함수로 구현> 
- def stump_tree(data): 랜덤한 한 변수와 특성을 기준으로 분류
- def epsilon(data) : > data의 output 결과와 h 함수로 도출된 결과가 다른 정도 반환 
- def alpha(epsilon) : > 입실론 값 입력시 alpha 값 반환


In [181]:
# Decision stump (depth-1 tree) used as the weak learner.
def stump_tree(data):
    """Fit a randomly placed decision stump to ``data``.

    A random row and a random feature column are drawn; the value of that
    row in that column becomes the threshold. Samples whose feature value is
    strictly greater than the threshold fall on the "right" side, all others
    on the "left" side. The side that receives the +1 label is chosen so
    that the stump agrees with at least half of the labels in ``data``.

    Args:
        data: 2-D array-like with one sample per row. The last column holds
            the label (+1 or -1); every other column is a feature.

    Returns:
        A tuple ``(result, chose_att, crit, direction)``:
            result: float array with the stump's prediction (+1/-1) for
                every row of ``data``.
            chose_att: index of the feature column used for the split.
            crit: threshold value.
            direction: ``"right"`` if values above the threshold are
                predicted as +1, ``"left"`` if they are predicted as -1.

    Raises:
        ValueError: raised by ``np.random.choice`` when ``data`` has no rows
            or no feature columns.
    """
    data = np.asarray(data)

    # Draw the threshold row first and the feature column second; keeping
    # this order keeps results reproducible for a given NumPy seed.
    pivot_row = data[np.random.choice(range(len(data)))]
    chose_att = np.random.choice(range(np.shape(data)[1] - 1))
    crit = pivot_row[chose_att]

    labels = data[:, -1]
    goes_right = data[:, chose_att] > crit

    # Number of samples classified correctly by the "right means +1" stump.
    # Boolean masks make this a single pass instead of re-slicing the data
    # once per sample.
    n_correct = (
        np.count_nonzero(goes_right & (labels == 1))
        + np.count_nonzero(~goes_right & (labels == -1))
    )

    if n_correct > len(data) / 2:
        direction = "right"
        result = np.where(goes_right, 1.0, -1.0)
    else:
        # Flipping the orientation turns every mistake into a hit.
        direction = "left"
        result = np.where(goes_right, -1.0, 1.0)
    return result, chose_att, crit, direction

def cal_stump_tree(vector, chose_att, crit, direction):
    """Classify a single sample with an already fitted decision stump.

    Args:
        vector: indexable sample; only ``vector[chose_att]`` is read.
        chose_att: index of the feature the stump splits on.
        crit: threshold the feature value is compared against.
        direction: ``"right"`` when values strictly above ``crit`` are
            labelled +1; any other value labels them -1.

    Returns:
        1 or -1.
    """
    above_threshold = vector[chose_att] > crit
    positive_is_right = direction == "right"

    if above_threshold:
        return 1 if positive_is_right else -1
    return -1 if positive_is_right else 1

In [176]:
# Fit one random stump on S and show its predictions, feature index,
# threshold and direction.
a,b,c,d = stump_tree(S)
print(a,b,c,d)
# Reuse `a` for a single sample laid out like a row of S
# (four feature values followed by the label).
a = [ 5.1,  3.5,  1.4,  0.2, -1. ]

# Classify that sample with the stump fitted above.
cal_stump_tree(a,b,c,d)

[-1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1. -1. -1.
  1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.
 -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1. -1.  1.  1.  1. -1.
  1.  1.  1. -1.  1. -1. -1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.  1.
  1.  1.  1.  1.  1.  1.  1.  1. -1. -1.  1.  1. -1.  1.  1.  1.  1. -1.
 -1.  1.  1. -1.  1.  1.  1.  1. -1.  1.] 0 5.5 right


-1

In [164]:
import math

# Error rate of a weak hypothesis h_t on a data set.
def epsilon(data, h_result=None):
    """Return the fraction of samples whose label differs from the prediction.

    Args:
        data: 2-D array-like with one sample per row; the last column holds
            the label.
        h_result: optional sequence with one prediction per row of ``data``.
            When omitted, a new random stump is fitted with ``stump_tree``
            and its predictions are used, so the value then describes that
            freshly drawn stump and not any stump fitted elsewhere.

    Returns:
        The misclassification rate as a float in ``[0, 1]``.

    Raises:
        ValueError: if ``h_result`` does not have one entry per row.
        ZeroDivisionError: if ``h_result`` is given and ``data`` has no rows.
    """
    data = np.asarray(data)
    if h_result is None:
        h_result, _, _, _ = stump_tree(data)

    labels = data[:, -1]
    predictions = np.asarray(h_result)
    if predictions.shape != labels.shape:
        raise ValueError("h_result must contain one prediction per row of data")

    return np.count_nonzero(labels != predictions) / np.shape(data)[0]
                
def alpha(epsilon):
    """Return the AdaBoost learner weight ``0.5 * ln((1 - eps) / eps)``.

    Both ends of the range are smoothed with a tiny constant so that a
    perfect learner (``epsilon == 0``) and a completely wrong one
    (``epsilon == 1``) yield large finite weights of opposite sign instead
    of a division by zero or ``log(0)``.

    Args:
        epsilon: error rate of the weak learner, expected in ``[0, 1]``.

    Returns:
        The weight as a float; positive for ``epsilon < 0.5``, zero at
        ``0.5`` and negative above.

    Raises:
        ValueError: from ``math.log`` when ``epsilon`` lies outside
            ``[0, 1]`` and the ratio becomes negative.
    """
    smoothing = 1e-20
    numerator = 1 - epsilon
    if numerator == 0:
        # log(0) is undefined; mirror the smoothing used in the denominator.
        numerator = smoothing
    return math.log(numerator / (epsilon + smoothing)) / 2


In [183]:
#ht 를 수식으로 남기는 방법이 애매함. 

def adaboost(T, data, new_vector):
    """Train an AdaBoost ensemble of random stumps and classify one sample.

    Each round draws a bootstrap sample of ``data`` according to the current
    distribution ``D``, fits a stump on it with ``stump_tree`` and then
    evaluates that same stump on the original samples. Measuring the error
    and updating the weights against the original rows keeps ``D[i]``, the
    label and the prediction all referring to the same sample ``i``.

    Args:
        T: maximum number of boosting rounds (ensemble size).
        data: 2-D array-like with one sample per row; the last column holds
            the label (+1 or -1), the other columns are features.
        new_vector: indexable sample to classify; feature values are read by
            column index, matching the feature columns of ``data``.

    Returns:
        1 if the weighted vote of the ensemble is strictly positive,
        otherwise -1 (this includes an empty ensemble).
    """
    data = np.asarray(data)
    n_samples = len(data)
    labels = data[:, -1]

    # D_1: uniform distribution over the training samples.
    D = np.ones(n_samples) / n_samples

    alpha_list = []
    h_list = []  # each h_t is stored as (feature index, threshold, direction)

    for _ in range(T):
        # Train h_t "using D_t" by resampling the data with probabilities D.
        new_index = np.random.choice(range(n_samples), n_samples, p=D)
        _, chose_att, crit, direction = stump_tree(data[new_index])

        # Predictions of h_t for every original sample.
        h = np.array(
            [cal_stump_tree(row, chose_att, crit, direction) for row in data]
        )

        # epsilon_t = P_{D_t}(h_t(x) != y): total weight of the mistakes.
        epsil = float(np.sum(D[h != labels]))
        if epsil >= 0.5:
            # The weak learner is no better than chance; stop boosting.
            break

        a = alpha(epsil)
        h_list.append((chose_att, crit, direction))
        alpha_list.append(a)

        # D_{t+1}(i) = D_t(i) * exp(-alpha_t * y_i * h_t(x_i)) / Z_t
        D = D * np.exp(-a * labels * h)
        D = D / np.sum(D)

    # H(x') = sign(sum_t alpha_t * h_t(x')) over the learners actually kept.
    result = sum(
        a * cal_stump_tree(new_vector, chose_att, crit, direction)
        for a, (chose_att, crit, direction) in zip(alpha_list, h_list)
    )

    return 1 if result > 0 else -1


In [140]:
# Scratch check: function objects can be stored in a list.
a = []
a.append(cal_stump_tree)
print(a)

[<function cal_stump_tree at 0x000001FBC1093940>]


In [209]:
# A single sample laid out like a row of S
# (four feature values followed by the label).
a = [ 5.1,  3.5,  1.4,  0.2, -1. ]

# Train on S with T = 10 rounds and classify the sample above.
adaboost(10, S, a)

-1

SyntaxError: invalid syntax (<ipython-input-158-57e2a59d8f90>, line 1)