# Ensemble methods. Exercises


In this section we have only one exercise:

1. Find the best three classifier in the stacking method using the classifiers from scikit-learn package, such as:


* Linear regression,
* Nearest Neighbors,
* Linear SVM,
* Decision Tree,
* Naive Bayes,
* QDA.

In [1]:
%store -r data_set
%store -r labels
%store -r test_data_set
%store -r test_labels
%store -r unique_labels

## Exercise 1: Find the best three classifier in the stacking method

In [2]:
import numpy as np
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [3]:
def build_classifiers():
    """
    Creates objects of models from Sklearn library: 
        - Linear regression
        - Nearest Neighbors
        - Linear SVM
        - Decision Tree
        - Naive Bayes
        - QDA
        
    Returns
    -------
    dict(str: BaseEstimator)
        Dictionary containing all models
    
    """
    
    linear_regression_model = LinearRegression()
    linear_regression_model.fit(data_set, labels)
    
    k_neighbors_model = KNeighborsClassifier()
    k_neighbors_model.fit(data_set, labels)
    
    svc_model = SVC()
    svc_model.fit(data_set, labels)
    
    decision_tree_model = DecisionTreeClassifier()
    decision_tree_model.fit(data_set, labels)
    
    naive_bayes_model = GaussianNB()
    naive_bayes_model.fit(data_set, labels)
    
    quadratic_discriminant_model = QuadraticDiscriminantAnalysis()
    quadratic_discriminant_model.fit(data_set, labels)
    
    return dict(linear_regression_model = linear_regression_model,
                k_neighbors_model = k_neighbors_model,
                svc_model = svc_model,
                decision_tree_model = decision_tree_model,
                naive_bayes_model = naive_bayes_model, 
                quadratic_discriminant_model = quadratic_discriminant_model)

In [4]:
def build_stacked_classifier(classifiers):
    output = []
    for classifier in classifiers:
        output.append(classifier.predict(data_set))
    output = np.array(output).reshape((130,3))
    
    # stacked classifier part:
    stacked_classifier = DecisionTreeClassifier()
    stacked_classifier.fit(output.reshape((130,3)), labels.reshape((130,)))
    test_set = []
    for classifier in classifiers:
        test_set.append(classifier.predict(test_data_set))
    test_set = np.array(test_set).reshape((len(test_set[0]),3))
    predicted = stacked_classifier.predict(test_set)
    return predicted

In [5]:
import itertools

def predict_all():
    """
    Creates all listed classifiers, for each combination creates stacked classifier
    and saves the accuracy.
    
    Returns
    -------
    classifiers_sets: List[ dict( str: BaseEstimator, float) ]
        each dictionary contains 3 values of BaseEstimator 
        and float value for 'accuracy' key
    
    """
    all_classifiers = build_classifiers()
    classifiers_sets = []
    combinations = itertools.combinations(all_classifiers, 3)
    for c in combinations:
        # if labels are unique
        if len(c) == len(set(c)):
            c_set = {}
            for i in c:
                c_set[i] = all_classifiers[i]
            classifiers_sets.append(c_set)

    for c_set in classifiers_sets:
        predicted = build_stacked_classifier(c_set.values())
        accuracy = accuracy_score(test_labels, predicted)
        c_set['accuracy'] = accuracy
        
    return classifiers_sets

In [6]:
classifiers_sets = predict_all()
best_set = max(classifiers_sets, key = lambda x: x['accuracy'])
accuracy = best_set.pop('accuracy')

print("Best set : {} , accuracy : {} ".format(best_set.keys(),accuracy ))

Best set : dict_keys(['k_neighbors_model', 'decision_tree_model', 'quadratic_discriminant_model']) , accuracy : 0.95 




## Summary

The best accuracy of stacked classifier that was generated is 0,95.
The stacked classifier was created basen on 3 classifiers : Nearest Neighbors, Decision Tree and QDA. 


## Exercise 2: 

Use the boosting method and change the code to fullfill the following requirements:

* the weights should be calculated as:
$w_{n}^{(t+1)}=\frac{1+ I(y_{n}\neq h_{t}(x_{n})}{\sum_{i=1}^{N}1+I(y_{n}\neq h_{t}(x_{n})}$,
* the prediction is done with a voting method.

In [46]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# prepare data set

def generate_data(sample_number, feature_number, label_number):
    data_set = np.random.random_sample((sample_number, feature_number))
    labels = np.random.choice(label_number, sample_number)
    return data_set, labels

labels = 2
dimension = 2
test_set_size = 1000
train_set_size = 5000
train_set, train_labels = generate_data(train_set_size, dimension, labels)
test_set, test_labels = generate_data(test_set_size, dimension, labels)

# init weights
number_of_iterations = 10
weights = np.ones((test_set_size,)) / test_set_size


def train_model(classifier, weights):
    return classifier.fit(X=test_set, y=test_labels, sample_weight=weights)

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0

def calculate_accuracy_vector(predicted, labels):
    result = []
    for i in range(len(predicted)):
        if predicted[i] == labels[i]:
            result.append(0)
        else:
            result.append(1)
    return result

Fill the two functions below:

In [47]:
def get_new_weights(model):
    predicted = model.predict(test_set)
    I = calculate_accuracy_vector(predicted, test_labels)
    N = len(I)
    denominator = np.sum(I) + N
    return [(1 + I[i]) / denominator for i in range(N)]

def calculate_error(model):
    predicted = model.predict(test_set)
    I=calculate_accuracy_vector(predicted, test_labels)
    Z=np.sum(I)
    return (1+Z)/1.0



Train the classifier with the code below:

In [48]:
classifier = DecisionTreeClassifier(max_depth=1, random_state=1)
classifier.fit(X=train_set, y=train_labels)
classifiers = []
for iteration in range(number_of_iterations):
    model = train_model(classifier, weights)
    weights = get_new_weights(model)
    classifiers.append(model)

print(weights)


validate_x, validate_label = generate_data(1, dimension, labels)

[0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.0013271400132714001, 0.0006635700066357001, 0.0013271400132714001, 0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0006635700066357001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0006635700066357001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.0013271400132714001, 0.0006635700066357001, 0.0013271400132714001, 0.0013271400132714001, 0.0013271400132714001, 0.0006635700066357001, 0.0013271400132714001, 0.0013271400132714001, 0.0006635700066357001, 0.0006635700066357001, 0.00066357

Set the validation data set:

In [49]:
# dataset   label
validate_x, validate_label = generate_data(1, dimension, labels)


Fill the prediction code:

In [50]:
def get_prediction(x):
    predictions = []
    for i in range(len(classifiers)):
        predicted = classifiers[i].predict(x)
        predictions.append(predicted)
    #print(predictions)
    N = len(predictions[0])
    voting = [ np.sum([j[i] for j in predictions ]) for i in range(N)]
    final_prediction = [ 1 if voting[i] > len(predictions) / 2 else 0 for i in range (N)  ]
    return final_prediction

Test it:

In [51]:
prediction = get_prediction(validate_x)[0]

print(prediction)

0
