In [1]:
# Render the notebook menu through jyquickhelper's helper
# (presumably a table of contents built from the markdown headers — confirm).
from jyquickhelper import add_notebook_menu
add_notebook_menu()

In [2]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides any deprecation or data-conversion warnings
# that the libraries used further down may emit.
warnings.filterwarnings('ignore')

## Let's try to do our best with a few ML models

In [3]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

In [4]:
# Show the scikit-learn version available in this kernel.
sklearn.__version__

'0.18.1'

In [None]:
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                              GradientBoostingClassifier, ExtraTreesClassifier)

from sklearn.cross_validation import KFold

## Data Preparation

We use the data preparation explained in the notebook called "Data Preparation for our studies".


In [None]:
from transplant.tools.learningset import Learningset

# Learningset comes from the project's transplant.tools package.
learningset = Learningset()

# The loader returns five values: the train/test features, the train/test
# targets, and X_col (used further down as the list of feature names).
# NOTE(review): the meaning of the "cls" and False arguments is not visible
# here — presumably "cls" selects classification targets; confirm in Learningset.
X_train, X_test, y_train, y_test, X_col = learningset.get_data_merged_dynamic_flatten_full("cls", False)

In [None]:
# Inspect the shapes of the features and targets as returned by the loader.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

We need to reshape `y` into a single-column array so that our models work.

In [None]:
# Turn each target array into a single-column 2-D array of shape (n, 1).
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

In [None]:
# Shapes after the reshape: both targets now have a second dimension of size 1.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## ML Tuning

In [None]:
cols = X_col  # The column names are needed later on
# We look at the big X (the matrix), not the little x ...

ntrain = X_train.shape[0]  # number of rows in the training matrix

ntest = X_test.shape[0]  # number of rows in the test matrix

seed_fix = 5                   # To reproduce the results

In [None]:
# Display the number of training rows and test rows.
ntrain, ntest

In [None]:
class SklearnHelper(object):
    """Thin wrapper giving scikit-learn style estimators a uniform interface.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Value passed to the estimator as its ``random_state`` keyword.
    params : dict or None, default None
        Keyword arguments for ``clf``. ``None`` means no extra arguments.
    """

    def __init__(self, clf, seed=0, params=None):
        # Work on a copy: the caller's dict must not gain a 'random_state'
        # entry as a side effect, and the params=None default must be usable.
        kwargs = dict(params) if params is not None else {}
        kwargs['random_state'] = seed
        self.clf = clf(**kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns nothing."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Re-fit the estimator on ``(x, y)`` and return its ``feature_importances_``.

        Note that this triggers a new ``fit`` call on every invocation.
        """
        return self.clf.fit(x, y).feature_importances_

In [None]:
# Number of folds used for the out-of-fold predictions.
NFOLDS = 5
# KFold is the class imported from sklearn.cross_validation earlier in this
# notebook; it is built over the ntrain training rows and is iterated by
# get_oof to obtain (train_index, test_index) pairs.
# NOTE(review): no shuffle argument is passed, so random_state presumably has
# no effect on the splits — confirm against the installed scikit-learn.
kf = KFold(ntrain, n_folds=NFOLDS, random_state=seed_fix)


def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold predictions for one model.

    For every fold the model is trained on the fold's training rows, then
    used to predict the fold's held-out rows and the whole test set.

    Parameters
    ----------
    clf : object
        Model exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : array-like
        Training features and targets. A target with a single column
        (shape ``(n, 1)``) is flattened to 1-D before training.
    x_test : array-like
        Test features.
    folds : iterable of (train_index, test_index), splitter, or None
        Folds to use. An object with a ``split`` method is asked for
        ``split(x_train)``; anything else is iterated directly. ``None``
        falls back to the notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : tuple of ndarray
        Column vectors of shape ``(n_train, 1)`` and ``(n_test, 1)``;
        ``oof_test`` is the mean of the per-fold test predictions.

    Raises
    ------
    ValueError
        If ``folds`` yields no split at all.
    """
    if folds is None:
        folds = kf  # notebook-level default

    x_train = np.asarray(x_train)
    y_train = np.asarray(y_train)
    x_test = np.asarray(x_test)

    # Splitter objects expose .split(); index-pair iterables are used as-is.
    if hasattr(folds, 'split'):
        folds = folds.split(x_train)
    folds = list(folds)
    if not folds:
        raise ValueError("get_oof needs at least one (train_index, test_index) fold")

    # Estimators expect a 1-D target; a lone column carries the same data.
    if y_train.ndim == 2 and y_train.shape[1] == 1:
        y_train = y_train.ravel()

    # Sizes come from the arguments, not from notebook globals, so the
    # function stays correct when called with other arrays.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((len(folds), n_test))

    for i, (train_index, test_index) in enumerate(folds):
        clf.train(x_train[train_index], y_train[train_index])

        oof_train[test_index] = clf.predict(x_train[test_index])
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# Random forest hyper-parameters.
# warm_start must stay off: get_oof and feature_importances call fit on the
# same estimator several times, and a warm-started forest whose n_estimators
# is unchanged keeps its existing trees instead of training on the new data.
# With warm_start on, only the first fold would ever be learned, and held-out
# rows would leak into the out-of-fold predictions.
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
    'warm_start': False,
    # 'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features': 'sqrt',
    'verbose': 0
}

In [None]:
# Extra-trees hyper-parameters (a 'max_features': 0.5 entry is left disabled).
et_params = dict(
    n_jobs=-1,
    n_estimators=500,
    max_depth=8,
    min_samples_leaf=2,
    verbose=0,
)

In [None]:
# AdaBoost hyper-parameters.
ada_params = dict(n_estimators=500, learning_rate=0.75)

In [None]:
# Gradient boosting hyper-parameters (a 'max_features': 0.2 entry is left disabled).
gb_params = dict(
    n_estimators=500,
    max_depth=5,
    min_samples_leaf=2,
    verbose=0,
)

In [None]:
# One wrapped model per ensemble family, all sharing the same seed.
# Positional order follows SklearnHelper(clf, seed, params).
rf = SklearnHelper(RandomForestClassifier, seed_fix, rf_params)
et = SklearnHelper(ExtraTreesClassifier, seed_fix, et_params)
ada = SklearnHelper(AdaBoostClassifier, seed_fix, ada_params)
gb = SklearnHelper(GradientBoostingClassifier, seed_fix, gb_params)

In [None]:
# Final look at the array shapes that are fed to get_oof below.
X_train.shape, y_train.shape, X_test.shape

## Learning zone

In [None]:
%%time
# Out-of-fold predictions for each model, in this order:
# Extra Trees, Random Forest, AdaBoost, Gradient Boost.
et_oof_train, et_oof_test = get_oof(et, X_train, y_train, X_test)
rf_oof_train, rf_oof_test = get_oof(rf, X_train, y_train, X_test)
ada_oof_train, ada_oof_test = get_oof(ada, X_train, y_train, X_test)
gb_oof_train, gb_oof_test = get_oof(gb, X_train, y_train, X_test)

print("I am done learning :D")

In [None]:
# Each call re-fits the wrapped estimator on the full training set
# (SklearnHelper.feature_importances calls fit) before reading the importances.
rf_feature, et_feature, ada_feature, gb_feature = (
    model.feature_importances(X_train, y_train)
    for model in (rf, et, ada, gb)
)

# Gather everything in a DataFrame instead: one row per feature.
feature_dataframe = pd.DataFrame({
    'features': cols,
    'Random Forest Importance des features': rf_feature,
    'Extra Trees Importance des features': et_feature,
    'AdaBoost Importance des features': ada_feature,
    'Gradient Boost Importance des features': gb_feature,
})

In [None]:
# Predictions of every fitted model on the test set.
rf_pred, et_pred, ada_pred, gb_pred = (
    model.predict(X_test) for model in (rf, et, ada, gb)
)

## Classification Reports

As a reminder:

Compute the precision

The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.

The best value is 1 and the worst value is 0.

Compute the recall

The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.

The best value is 1 and the worst value is 0.

The F1 score can be interpreted as the harmonic mean of the precision and recall; it reaches its best value at 1 and its worst value at 0. Precision and recall contribute equally to the F1 score. The formula for the F1 score is:

$$F_1 = 2 \cdot \frac{\text{precision} \cdot \text{recall}}{\text{precision} + \text{recall}}$$

The support is the number of occurrences of each class in y_true

In [None]:
from sklearn import metrics
# classification_report expects (y_true, y_pred): the ground truth goes
# first, otherwise precision and recall are swapped and the support column
# counts predictions instead of true labels.
print(metrics.classification_report(y_test, rf_pred))

In [None]:
# Ground truth first, predictions second (y_true, y_pred).
print(metrics.classification_report(y_test, et_pred))

In [None]:
# Ground truth first, predictions second (y_true, y_pred).
print(metrics.classification_report(y_test, ada_pred))

In [None]:
# Ground truth first, predictions second (y_true, y_pred).
print(metrics.classification_report(y_test, gb_pred))

## Feature Importance Visualisation 

In [None]:
# Scatter of the random-forest importance of every feature; the marker
# colour encodes the same importance value.
trace = go.Scatter(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['Random Forest Importance des features'].values,
    text=feature_dataframe['features'].values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 25,
        'color': feature_dataframe['Random Forest Importance des features'].values,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

layout = go.Layout(
    title='Random Forest Importance des features',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    yaxis={'title': 'Importance des features', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

In [None]:
# Scatter of the extra-trees importance of every feature; the marker
# colour encodes the same importance value.
trace = go.Scatter(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['Extra Trees Importance des features'].values,
    text=feature_dataframe['features'].values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 25,
        'color': feature_dataframe['Extra Trees Importance des features'].values,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

layout = go.Layout(
    title='Extra Trees Importance des features',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    yaxis={'title': 'Importance des features', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

In [None]:
# Scatter of the AdaBoost importance of every feature; the marker
# colour encodes the same importance value.
trace = go.Scatter(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['AdaBoost Importance des features'].values,
    text=feature_dataframe['features'].values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 25,
        'color': feature_dataframe['AdaBoost Importance des features'].values,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

layout = go.Layout(
    title='AdaBoost Importance des features',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    yaxis={'title': 'Importance des features', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')

In [None]:
# Scatter of the gradient-boosting importance of every feature; the marker
# colour encodes the same importance value.
trace = go.Scatter(
    x=feature_dataframe['features'].values,
    y=feature_dataframe['Gradient Boost Importance des features'].values,
    text=feature_dataframe['features'].values,
    mode='markers',
    marker={
        'sizemode': 'diameter',
        'sizeref': 1,
        'size': 25,
        'color': feature_dataframe['Gradient Boost Importance des features'].values,
        'colorscale': 'Portland',
        'showscale': True,
    },
)
data = [trace]

layout = go.Layout(
    title='Gradient Boosting Importance des features',
    autosize=True,
    hovermode='closest',
    showlegend=False,
    yaxis={'title': 'Importance des features', 'ticklen': 5, 'gridwidth': 2},
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='scatter2010')