## Implementing binary decision trees

In [1]:
import graphlab

# Load the titanic dataset

In [5]:
train = graphlab.SFrame('train_sf.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,int,str,int,int,int,int,str,float,str,str,int,int,int,int,int,int,int,int,int]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [6]:
len(train)

714

In [7]:
train

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket
1,0,3,"Braund, Mr. Owen Harris",0,1,1,0,A/5 21171
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs ...",1,1,1,0,PC 17599
3,1,3,"Heikkinen, Miss. Laina",1,1,0,0,STON/O2. 3101282
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel) ...",1,1,1,0,113803
5,0,3,"Allen, Mr. William Henry",0,1,0,0,373450
7,0,1,"McCarthy, Mr. Timothy J",0,1,0,0,17463
8,0,3,"Palsson, Master. Gosta Leonard ...",0,0,3,1,349909
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina ...",1,1,0,2,347742
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem) ...",1,0,1,0,237736
11,1,3,"Sandstrom, Miss. Marguerite Rut ...",1,0,1,1,PP 9549

Fare,Cabin,Embarked,Has_Cabin,FamilySize,IsAlone,1,2,3,C,Q,S
7.25,,S,0,2,0,0,0,1,0,0,1
71.2833,C85,C,1,2,0,1,0,0,1,0,0
7.925,,S,0,1,1,0,0,1,0,0,1
53.1,C123,S,1,2,0,1,0,0,0,0,1
8.05,,S,0,1,1,0,0,1,0,0,1
51.8625,E46,S,1,1,1,1,0,0,0,0,1
21.075,,S,0,5,0,0,0,1,0,0,1
11.1333,,S,0,3,0,0,0,1,0,0,1
30.0708,,C,0,2,0,0,1,0,1,0,0
16.7,G6,S,1,3,0,0,0,1,0,0,1


In [8]:
train['Survived'] = train['Survived'].apply(lambda x : +1 if x==1 else -1)


In [51]:
train_data, test_data = train.random_split(0.8, seed=1)

In [52]:
features = ['Sex',
            'Age',            
            'Has_Cabin',     
            'IsAlone',
            '1', #1 classe
            '2', #2 classe
            '3', #3 classe
            'C',
            'Q',
            'S'
           ]
target = 'Survived'

titanic = train_data[features + [target]]

Let's explore what the dataset looks like.

In [54]:
len(titanic)

572

# Decision tree 

In [174]:
def node_num_mistakes(labels):
    # Se labels_in node è empty ritorna zero
    if len(labels_in_node) == 0:
        return 0
    
    # Conta il numero labels = -1
    negative = sum([x == -1 for x in labels])
    
    # Conta il numero di labels = +1
    positive = sum([x == 1 for x in labels])

    #Siccome stiamo calcolando il majority  class prediction, tutti i datapoint che non sono 
    #nel majority class sono errori
    # Ritorna il numero di errori del majority classifier. 
    return min(positive, negative)
    

In [56]:
# Test case 1
example_labels = graphlab.SArray([-1, -1, 1, 1, 1])
if node_num_mistakes(example_labels) == 2:
    print 'ok!'
else:
    print 'non corretto'

# Test case 2
example_labels = graphlab.SArray([-1, -1, 1, 1, 1, 1, 1])
if node_num_mistakes(example_labels) == 2:
    print 'ok!'
else:
    print 'non corretto'
    
# Test case 3
example_labels = graphlab.SArray([-1, -1, -1, -1, -1, 1, 1])
if node_num_mistakes(example_labels) == 2:
    print 'ok!'
else:
    print 'non corretto'

Test passed!
Test passed!
Test passed!


## Trovare la migliore feature su cui fare split

In [164]:
def best_splitting_feature(data, features, target):
    
    best_feature = None #seleziona la best feature
    #errore è sempre <=1 quindi possiamo inizializzarlo con qualcosa più grande di 1
    best_error = 10         
    # Iteriamo su ogni feature per considerare lo split su quella feature 
    for feature in features:
        #  Nel left_split memorizziamo tutti i data point dove il valore della feature è 0
        left_split = data[data[feature] == 0]
        #Nel right split memorizziamo tutti i data point dove il valore della feature è 1
        right_split =  data[data[feature] == 1]
        #Calcoliamo il numero di errori nel left_split utilizzando la funzione node_num_mistakes
        left_mistakes = node_num_mistakes(left_split['Survived'])            
        #Calcoliamo il numero di errori nel right_split utilizzando la funzione node_num_mistakes
        right_mistakes = node_num_mistakes(right_split['Survived'])     
        # Calcola il classification error
        # Error = (#  errori (left) + # errori (right)) / (# of data points)
        error = float(left_mistakes + right_mistakes) / len(data)
        # Se è il miglior errore, fai aggiornamento.
        if error < best_error:
            best_feature, best_error = feature, error
    return best_feature # Ritorna la best feature

## Building the tree

Costruiamo il decision tree. Ogni nodo del decision tree è rappresentato da un dizionario che contiene 

    { 
       'is_leaf'            : True/False.
       'prediction'         : Predizione al nodo foglia.
       'left'               : (dizionario corrispondente all'albero sinistro).
       'right'              : (dizionario corrispondente all'albero destro).
       'splitting_feature'  : La feature che questo nodo splitta.
    }



In [175]:
def create_leaf(target_values):
    
    # Crea un nodo foglia
    leaf = {'splitting_feature' : None,
            'left' : None,
            'right' : None,
            'is_leaf': True    }   
    
    # Conta il numero di data point che sono +1 o -1 
    num_positive = len(target_values[target_values == +1])
    num_negative = len(target_values[target_values == -1])
    
    #  Per il nodo foglia imposta la predizione al majority class
    if num_positive > num_negative:
        leaf['prediction'] = 1       
    else:
        leaf['prediction'] = -1                 
    return leaf 

In [166]:
def decision_tree_create(data, features, target, current_depth = 0, max_depth = 10):
    remaining_features = features[:]  #Copia le feature
    
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Sottoalbero, profondità = %s (%s data points)." % (current_depth, len(target_values))
    

    # Stopping condition 1
    #Controlla se ci sono errori al nodo corrente

    if node_num_mistakes(target_values) == 0:  
        print "Stopping condition 1 reached."     
        # Se non ci sono errori al nodo corrente, rendilo una foglia
        return create_leaf(target_values)
    
    # Stopping condition 2 (Controlla se ci sono feature su cui splittare )
    if remaining_features == []:   
        print "Stopping condition 2 reached."    
        # il nodo corrente è un nodo foglia
        return create_leaf(target_values)    
    
    # max_depth
    if current_depth >= max_depth:  
        print "Raggiunta max depth. Stopping for now."
        # Se è stata raggiunta la massima profondità, stopping condition
        return create_leaf(target_values)

    #  Trova la migliore feature su cui fare split
    splitting_feature = best_splitting_feature(data, remaining_features, 'Survived')
    # Fai lo split sulla best feature trovata
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]       
    remaining_features.remove(splitting_feature)
    print "Split su %s. (%s, %s)" % (\
                      splitting_feature, len(left_split), len(right_split))
    
    # Crea un nodo foglia se lo split è perfetto 
    if len(left_split) == len(data):
        print "Crea foglia."
        return create_leaf(left_split[target])
    if len(right_split) == len(data):
        print "Crea foglia"
        return create_leaf(right_split[target])        
    # Ripeti sui sottoalberi destro e sinistro. 
    left_tree = decision_tree_create(left_split, remaining_features, target, current_depth + 1, max_depth)        
    right_tree = decision_tree_create(right_split, remaining_features, target, current_depth + 1, max_depth)        

    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

Funzione ricorsiva per contare i nodi nel tuo albero

In [167]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

In [168]:
my_decision_tree = decision_tree_create(titanic, features, 'Survived', max_depth = 3)


--------------------------------------------------------------------
Subtree, depth = 0 (572 data points).
Split on feature Sex. (369, 203)
--------------------------------------------------------------------
Subtree, depth = 1 (369 data points).
Split on feature Age. (42, 327)
--------------------------------------------------------------------
Subtree, depth = 2 (42 data points).
Split on feature 3. (11, 31)
--------------------------------------------------------------------
Subtree, depth = 3 (11 data points).
Stopping condition 1 reached.
--------------------------------------------------------------------
Subtree, depth = 3 (31 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (327 data points).
Split on feature Has_Cabin. (251, 76)
--------------------------------------------------------------------
Subtree, depth = 3 (251 data points).
Reached maximum depth. Stopping for now.
----------

## Fare predictions


In [84]:
def classify(tree, x, annotate = False):   
    # se il nodo è una foglia
    if tree['is_leaf']:
        if annotate: 
            print "Alla foglia, predici %s" % tree['prediction']
        return tree['prediction'] 
    else:
        # split on feature.
        split_feature_value = x[tree['splitting_feature']]
        if annotate: 
            print "Split su %s = %s" % (tree['splitting_feature'], split_feature_value)
        if split_feature_value == 0:
            if annotate:
                print 'left split'
            return classify(tree['left'], x, annotate)
        else:
            if annotate:
                print 'right split'
            return classify(tree['right'], x, annotate)

In [169]:
test_data[0]

{'1': 0,
 '2': 1,
 '3': 0,
 'Age': 0,
 'C': 1,
 'Cabin': 'nan',
 'Embarked': 'C',
 'FamilySize': 2,
 'Fare': 30.0708,
 'Has_Cabin': 0,
 'IsAlone': 0,
 'Name': 'Nasser, Mrs. Nicholas (Adele Achem)',
 'Parch': 0,
 'PassengerId': 10,
 'Pclass': 2,
 'Q': 0,
 'S': 0,
 'Sex': 1,
 'SibSp': 1,
 'Survived': 1,
 'Ticket': '237736'}

In [170]:
print 'Classe predetta: %s ' % classify(my_decision_tree, test_data[0])

Predicted class: 1 


Let's add some annotations to our prediction to see what the prediction path was that lead to this predicted class:

In [92]:
classify(my_decision_tree, test_data[0], annotate=True)

Split su Sex = 1
right split
Split su 3 = 0
left split
Split su Age = 0
left split
Alla foglia, predici 1


1

## Evaluation

In [190]:
def evaluate_classification_error(tree, data):
    # Fai prediction per ogni riga
    prediction = data.apply(lambda x: classify(tree, x))    
#Calcola il classification error   
    return (prediction != data['Survived']).sum() / float(len(data))

In [191]:
evaluate_classification_error(my_decision_tree, test_data)

0.18309859154929578

##  Stampa un decision stump

In [176]:
def print_stump(tree, name = 'root'):
    split_name = tree['splitting_feature'] # split_name è ad esempio age
    if split_name is None:
        print "(leaf, label: %s)" % tree['prediction']
        return None
    print '                       %s' % name
    print '         |---------------|----------------|'
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '  [{0} == 0]               [{0} == 1]    '.format(split_name)
    print '         |                                |'
    print '         |                                |'
    print '         |                                |'
    print '    (%s)                         (%s)' \
        % (('leaf, label: ' + str(tree['left']['prediction']) if tree['left']['is_leaf'] else 'subtree'),
           ('leaf, label: ' + str(tree['right']['prediction']) if tree['right']['is_leaf'] else 'subtree'))

In [177]:
print_stump(my_decision_tree)

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [Sex == 0]               [Sex == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)



### Exploring the intermediate left subtree

* `my_decision_tree['left']` per andare a sinistra 
* `my_decision_tree['right']` per andare a destra

In [98]:
print_stump(my_decision_tree['left'], my_decision_tree['splitting_feature'])

                       Sex
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [Age == 0]               [Age == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


### Left->Left


In [99]:
print_stump(my_decision_tree['left']['left'], my_decision_tree['left']['splitting_feature'])

                       Age
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [3 == 0]               [3 == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: 1)                         (leaf, label: -1)


In [103]:
print_stump(my_decision_tree['left']['right'], my_decision_tree['left']['splitting_feature'])

                       Age
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [Has_Cabin == 0]               [Has_Cabin == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: -1)                         (leaf, label: -1)


In [100]:
print_stump(my_decision_tree['right'], my_decision_tree['splitting_feature'])

                       Sex
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [3 == 0]               [3 == 1]    
         |                                |
         |                                |
         |                                |
    (subtree)                         (subtree)


In [102]:
## Right->Right

In [101]:
print_stump(my_decision_tree['right']['right'], my_decision_tree['right']['splitting_feature'])

                       3
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [C == 0]               [C == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: -1)                         (leaf, label: 1)


In [104]:
print_stump(my_decision_tree['right']['left'], my_decision_tree['right']['splitting_feature'])

                       3
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [Age == 0]               [Age == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: 1)                         (leaf, label: 1)


In [178]:
#ADA BOOST

In [107]:
def node_weighted_mistakes(labels_in_node, data_weights):
    # Somma tutti i pesi delle label positive
    total_weight_positive = sum(data_weights[labels_in_node == +1])
    
    # Peso errore per -1 è uguale alla somma dei pesi dei positivi
    weighted_mistakes_all_negative = total_weight_positive
    
    # Somma tutti i pesi delle label negative
    total_weight_negative = sum(data_weights[labels_in_node == -1])
    # Peso errore per +1 è uguale alla somma dei pesi dei negativi
    weighted_mistakes_all_positive = total_weight_negative
    #Ritorna il peso minore tra i due pesi. 
    if weighted_mistakes_all_positive <= weighted_mistakes_all_negative:
        return weighted_mistakes_all_positive, +1
    else:
        return weighted_mistakes_all_negative, -1

In [179]:
def best_splitting_feature_boost(data, features, target, data_weights):
    print data_weights
    
    # Feature che tengono traccia della migliore feature e del migliore errore
    best_feature = None
    best_error = float('+inf') 
    # Loop su ogni feature
    for feature in features:
        # Left, feature =0
        # Right feature= 1
        left_split = data[data[feature] == 0]
        right_split = data[data[feature] == 1]
        # Applico  filtro a  data_weights per creare left_data_weights, right_data_weights
        left_data_weights = data_weights[data[feature] == 0]
        right_data_weights = data_weights[data[feature] == 1]
            
        # Calcola il peso degli errori per left e right 
        left_weighted_mistakes, left_class = node_weighted_mistakes(left_split[target], left_data_weights)
        right_weighted_mistakes, right_class = node_weighted_mistakes(right_split[target], right_data_weights)
        
        # Calcola errore pesato
        #  ( [weight of mistakes (left)] + [weight of mistakes (right)] ) / [total weight of all data points]
        error = (left_weighted_mistakes + right_weighted_mistakes) / (sum(left_data_weights) + sum(right_data_weights))
        
        # Se è il miglior errore, salva la feature e l'errore
        if error < best_error:
            best_feature = feature
            best_error = error
    # Ritorna la migliore feature
    return best_feature

In [182]:
def create_leaf(target_values, data_weights):
    
    # Crea un nodo foglia
    leaf = {'splitting_feature' : None,
            'is_leaf': True}
    # Calcola il peso degli errori
    weighted_error, best_class = node_weighted_mistakes(target_values, data_weights)
    # Memorizza la classe predetta
    leaf['prediction'] = best_class 
    return leaf 

In [126]:
def weighted_decision_tree_create(data, features, target, data_weights, current_depth = 1, max_depth = 3):
    remaining_features = features[:] # Fai una copia delle feature
    target_values = data[target]
    print "--------------------------------------------------------------------"
    print "Sottoalbero, profondità = %s (%s data points)." % (current_depth, len(target_values))
    
    # Stopping condition 1. Attenzione, sono pesi, abbiamo bisogno di una soglia!!
    if node_weighted_mistakes(target_values, data_weights)[0] <= 1e-15:
        print "Stopping condition 1 ."                
        return create_leaf(target_values, data_weights)
    
    # Stopping condition 2. Non ci sono più features.
    if remaining_features == []:
        print "Stopping condition 2."                
        return create_leaf(target_values, data_weights)    
    
    # Max_depth
    if current_depth > max_depth:
        print "Raggiunta massima profondità."
        return create_leaf(target_values, data_weights)
    
    # Se tutti i datapoint appartengono alla stessa classe, crea una foglia
    splitting_feature = best_splitting_feature_boost(data, features, target, data_weights)
    remaining_features.remove(splitting_feature)
        
    left_split = data[data[splitting_feature] == 0]
    right_split = data[data[splitting_feature] == 1]
    
    left_data_weights = data_weights[data[splitting_feature] == 0]
    right_data_weights = data_weights[data[splitting_feature] == 1]
    
    print "Split su feature %s. (%s, %s)" % (\
              splitting_feature, len(left_split), len(right_split))
    
    # Crea una foglia se lo split è perfettol
    if len(left_split) == len(data):
        print "Crea nodo foglia."
        return create_leaf(left_split[target], data_weights)
    if len(right_split) == len(data):
        print "Crea nodo foglia."
        return create_leaf(right_split[target], data_weights)
    
    # Fai ricorsione sui sub_trees
    left_tree = weighted_decision_tree_create(
        left_split, remaining_features, target, left_data_weights, current_depth + 1, max_depth)
    right_tree = weighted_decision_tree_create(
        right_split, remaining_features, target, right_data_weights, current_depth + 1, max_depth)
    
    return {'is_leaf'          : False, 
            'prediction'       : None,
            'splitting_feature': splitting_feature,
            'left'             : left_tree, 
            'right'            : right_tree}

In [120]:
def count_nodes(tree):
    if tree['is_leaf']:
        return 1
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

In [196]:
import numpy as np

In [199]:
sample_random = np.random.uniform(low=0.0, high=1, size=(572,))
random_data_decision_tree = weighted_decision_tree_create(train_data, features, target,
                                        sample_random, max_depth=3)

--------------------------------------------------------------------
Subtree, depth = 1 (572 data points).
[ 0.72467147  0.42471509  0.37703988  0.63775391  0.37776725  0.79872089
  0.27369329  0.4703735   0.52427341  0.04669385  0.81443818  0.92632238
  0.11366561  0.03967887  0.08654353  0.95033428  0.02552969  0.9979868
  0.43061829  0.45886651  0.77229004  0.02145871  0.43772729  0.47343555
  0.72514099  0.26778839  0.64487751  0.8187038   0.86402442  0.83450133
  0.98011175  0.74119745  0.94608064  0.13882146  0.3281539   0.03763578
  0.52365205  0.52256546  0.58535752  0.89538643  0.47550451  0.05519968
  0.84777816  0.92818268  0.34415312  0.8436549   0.75557451  0.11250183
  0.72143619  0.1491032   0.77655732  0.18774885  0.30413586  0.32597063
  0.03803729  0.25455503  0.16709128  0.87995648  0.86387658  0.48267653
  0.98386388  0.17716998  0.10797871  0.82352857  0.96427417  0.398303
  0.46471141  0.90657038  0.73793512  0.32986128  0.29254479  0.14106917
  0.19255682  0.8460

In [194]:
example_data_weights = graphlab.SArray([1.0 for i in range(len(train_data))])
print len(example_data_weights)
small_data_decision_tree = weighted_decision_tree_create(train_data, features, target,
                                        example_data_weights, max_depth=3)

572
--------------------------------------------------------------------
Subtree, depth = 1 (572 data points).
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ... ]
Split on feature Sex. (369, 203)
--------------------------------------------------------------------
Subtree, depth = 2 (369 data points).
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1

In [128]:
def evaluate_classification_error(tree, data):
    # Applica classify (tree, x) a ogni riga
    prediction = data.apply(lambda x: classify(tree, x))
    # Una volta che hai fatto le prediction, calcola il classification error. 
    return (prediction != data[target]).sum() / float(len(data))

In [192]:
evaluate_classification_error(small_data_decision_tree, test_data)

0.18309859154929578

In [200]:
evaluate_classification_error(random_data_decision_tree, test_data)

0.5845070422535211

In [130]:
# Implementare ADA Boost

In [184]:
from math import log
from math import exp

def adaboost_with_tree_stumps(data, features, target, num_tree_stumps):
    # inizialmente tutti gli alpha valgono 1
    alpha = graphlab.SArray([1.]*len(data))
    weights = []
    tree_stumps = []
    target_values = data[target]
    for t in range(num_tree_stumps):
        print '====================================================='
        print 'Adaboost Iteration %d' % t
        print '====================================================='        
        #  Apprendi un tree stump. Usa max_depth=1
        tree_stump = weighted_decision_tree_create(data, features, target, data_weights=alpha, max_depth=1)
        tree_stumps.append(tree_stump)
        
        # Fai la prediction
        predictions = data.apply(lambda x: classify(tree_stump, x))
        
        # Valuta se ogni valore è predetto correttamente o no
        is_correct = predictions == target_values
        is_wrong   = predictions != target_values
        # Calcola weighted error
        weighted_error = sum(alpha * is_wrong) / sum(alpha)
        # Calcola il coefficiente del modello usando  weighted error
        weight = .5 * log((1 - weighted_error) / weighted_error)
        weights.append(weight)
        # Modifica i pesi dei datapoint
        a_c = is_correct.apply(lambda is_correct : exp(-weight) if is_correct else exp(weight))
        # Scala alpha * a_c
        ## Normalizza
        alpha *= a_c
        alpha /= sum(alpha)
    
    return weights, tree_stumps

In [185]:
stump_weights, tree_stumps = adaboost_with_tree_stumps(train_data, features, target, num_tree_stumps=10)

Adaboost Iteration 0
--------------------------------------------------------------------
Subtree, depth = 1 (572 data points).
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ... ]
Split on feature Sex. (369, 203)
--------------------------------------------------------------------
Subtree, depth = 2 (369 data points).
Reached maximum depth. Stopping for now.
--------------------------------------------------------------------
Subtree, depth = 2 (203 data points).
Reached maximum depth. Stopping for now.
Adaboost Iteration 1
---------------

In [186]:
print_stump(tree_stumps[0])

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [Sex == 0]               [Sex == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: -1)                         (leaf, label: 1)


In [187]:
print_stump(tree_stumps[1])

                       root
         |---------------|----------------|
         |                                |
         |                                |
         |                                |
  [3 == 0]               [3 == 1]    
         |                                |
         |                                |
         |                                |
    (leaf, label: 1)                         (leaf, label: -1)


In [188]:
#Quanto è importante ogni tree_stump nel momento in cui si fanno predizioni 
print stump_weights

[0.6218971492563038, 0.3726164285952097, 0.1434322630168566, 0.12116534914748711, 0.14548377922363728, 0.06813340444625805, 0.08875044694752335, 0.05528385856192642, 0.04253431318677091, 0.04001943453649553]


In [189]:
def predict_adaboost(stump_weights, tree_stumps, data):
    #creo un array di scores
    scores = graphlab.SArray([0.]*len(data))
    
    for i, tree_stump in enumerate(tree_stumps):
        predictions = data.apply(lambda x: classify(tree_stump, x))
        scores += stump_weights[i] * predictions
        
    return scores.apply(lambda score : +1 if score > 0 else -1)

In [155]:
predictions = predict_adaboost(stump_weights, tree_stumps, test_data)
accuracy = graphlab.evaluation.accuracy(test_data[target], predictions)
print 'Accuracy of 10-component ensemble = %s' % accuracy 

Accuracy of 10-component ensemble = 0.802816901408


In [156]:
#Calcola il training error alla fine di ogni iterazione
error_all = []
for n in xrange(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], train_data)
    error = 1.0 - graphlab.evaluation.accuracy(train_data[target], predictions)
    error_all.append(error)
    print "Iteration %s, training error = %s" % (n, error_all[n-1])

Iteration 1, training error = 0.223776223776
Iteration 2, training error = 0.223776223776
Iteration 3, training error = 0.223776223776
Iteration 4, training error = 0.243006993007
Iteration 5, training error = 0.218531468531
Iteration 6, training error = 0.218531468531
Iteration 7, training error = 0.222027972028
Iteration 8, training error = 0.222027972028
Iteration 9, training error = 0.222027972028
Iteration 10, training error = 0.218531468531
Iteration 11, training error = 0.218531468531
Iteration 12, training error = 0.218531468531
Iteration 13, training error = 0.218531468531
Iteration 14, training error = 0.218531468531
Iteration 15, training error = 0.218531468531
Iteration 16, training error = 0.218531468531
Iteration 17, training error = 0.218531468531
Iteration 18, training error = 0.218531468531
Iteration 19, training error = 0.218531468531
Iteration 20, training error = 0.218531468531
Iteration 21, training error = 0.218531468531
Iteration 22, training error = 0.2185314685

In [162]:
#Calcola il training error alla fine di ogni iterazione
test_error_all = []
for n in xrange(1, 31):
    predictions = predict_adaboost(stump_weights[:n], tree_stumps[:n], test_data)
    error = 1.0 - graphlab.evaluation.accuracy(test_data[target], predictions)
    test_error_all.append(error)
    print "Iteration %s, training error = %s" % (n, test_error_all[n-1])

Iteration 1, training error = 0.204225352113
Iteration 2, training error = 0.204225352113
Iteration 3, training error = 0.204225352113
Iteration 4, training error = 0.225352112676
Iteration 5, training error = 0.204225352113
Iteration 6, training error = 0.204225352113
Iteration 7, training error = 0.19014084507
Iteration 8, training error = 0.19014084507
Iteration 9, training error = 0.19014084507
Iteration 10, training error = 0.197183098592
Iteration 11, training error = 0.197183098592
Iteration 12, training error = 0.197183098592
Iteration 13, training error = 0.197183098592
Iteration 14, training error = 0.197183098592
Iteration 15, training error = 0.197183098592
Iteration 16, training error = 0.197183098592
Iteration 17, training error = 0.197183098592
Iteration 18, training error = 0.197183098592
Iteration 19, training error = 0.197183098592
Iteration 20, training error = 0.197183098592
Iteration 21, training error = 0.197183098592
Iteration 22, training error = 0.197183098592
