# Modeles de Gradient boosting 

In [91]:
import plotly.offline as py
import plotly.graph_objs as go
import pandas as pd
import cufflinks as cf
import pandas as pd
import numpy as np
py.init_notebook_mode(connected=True)
cf.set_config_file(offline=False, world_readable=True, theme='ggplot')

In [133]:
def plot_optimisation(train, test, parameter, parameter_name):
    train_trace = go.Scatter(
        x = train,
        y = parameter,
        mode = 'lines',
        name = 'train_scores'
    )
    test_trace = go.Scatter(
        x = test,
        y = parameter,
        mode = 'lines',
        name = 'test_scores'
    )

    data = [train_trace, test_trace]

    layout= go.Layout(
        title= 'Train et Test scores en fonction du '+parameter_name,
        xaxis= dict(title= 'Scores'),
        yaxis=dict(title= parameter_name),
        showlegend= True
    )
    fig= go.Figure(data=data, layout=layout)
    py.iplot(fig)    

In [151]:
plot_optimisation([0.85, 0.95, 1], [0.8, 0.9, 1], [1,2,3], 'test2')

In [8]:
import pandas as pd
raw = pd.read_csv('data/kc_house_data.csv')
raw[['floors', 'zipcode']] = pd.DataFrame(
    {
        'floors': [floor.replace("'","") for floor in raw.floors.values],
        'zipcode': [code.replace("'","") for code in raw.zipcode.values]
    }, dtype='float32')
raw['date'] = pd.to_datetime(raw['date'])
raw['month'] = pd.DatetimeIndex(raw.loc[:,'date']).month
raw['year'] = pd.DatetimeIndex(raw.loc[:,'date']).year
raw['day'] = pd.DatetimeIndex(raw.loc[:,'date']).day
raw = raw[['id',
 'price',
 'date',
 'year',
 'month',
 'day',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15',
]]
data = {
    feature: raw[feature].values for feature in raw.columns[3:]
}

X = pd.DataFrame(
    data=data,
    index=raw['id'].values, 
)
y = pd.DataFrame(
    raw.price.values,
    columns=['price'],
    index=raw['id'].values, 
)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.33, random_state=0)

## AdaBoost

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

# Create the dataset
rng = np.random.RandomState(1)

reg = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4),
                          n_estimators=300, random_state=rng)
reg.fit(X_train, y_train)

train_score = reg.score(X_train, y_train)
test_score = reg.score(X_test, y_test)

print('Train score : {}\tTest score : {}'
      .format(round(train_score,5), round(test_score,5)))

  y = column_or_1d(y, warn=True)


Train score : 0.39465	Test score : 0.38879


## Gradient Tree Boosting

### Selection de la meilleure fonction cout

In [155]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import ensemble
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error

# Fit regression model
loss_functions = ['ls', 'lad', 'huber', 'quantile']
train_scores =[]
test_scores=[]

for fct in loss_functions:
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': fct}

    reg = ensemble.GradientBoostingRegressor(**params)

    reg.fit(X_train, y_train.price.values)

    train_score = reg.score(X_train, y_train.price.values)
    test_score = reg.score(X_test, y_test.price.values)
    train_scores.append(round(train_score, 6))        
    test_scores.append(round(test_score, 6))        

plot_optimisation(train_scores, test_scores, n_estimators, 'loss_function') 

### Selection du meilleur nombre d'iteration

In [154]:
n_estimators =[]
train_scores =[]
test_scores=[]

for n in [10, 50, 100, 500, 1000]:
    params = {'n_estimators': n, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}

    reg = ensemble.GradientBoostingRegressor(**params)

    reg.fit(X_train, y_train.price.values)

    train_score = reg.score(X_train, y_train.price.values)
    test_score = reg.score(X_test, y_test.price.values)
    n_estimators.append(n)
    train_scores.append(round(train_score, 6))        
    test_scores.append(round(test_score, 6))        

plot_optimisation(train_scores, test_scores, n_estimators, 'n_estimators') 

In [159]:
min_samples_splits =[]
train_scores =[]
test_scores=[]

for n in np.arange(2, 10, 1):
    params = {'n_estimators': 200, 'max_depth': 4, 'min_samples_split': n,
              'learning_rate': 0.01, 'loss': 'ls'}

    reg = ensemble.GradientBoostingRegressor(**params)

    reg.fit(X_train, y_train.price.values)

    train_score = reg.score(X_train, y_train.price.values)
    test_score = reg.score(X_test, y_test.price.values)
    min_samples_splits.append(n)
    train_scores.append(train_score)        
    test_scores.append(test_score)    
    
plot_optimisation(train_scores, test_scores, min_samples_splits, 'min_samples_split') 

In [158]:
min_samples_splits

[2, 3]

In [162]:
rates =[]
train_scores =[]
test_scores=[]

for rate in np.arange(0.01, 1, 0.02):
    params = {'n_estimators': 50, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': rate, 'loss': 'ls'}

    reg = ensemble.GradientBoostingRegressor(**params)

    reg.fit(X_train, y_train.price.values)

    train_score = reg.score(X_train, y_train.price.values)
    test_score = reg.score(X_test, y_test.price.values)
    rates.append(rate)
    train_scores.append(train_score)        
    test_scores.append(test_score)    
    
plot_optimisation(train_scores, test_scores, min_samples_splits, 'Learning_rate')     


In [163]:
max_depths =[]
train_scores =[]
test_scores=[]

for rate in np.arange(3, 10, 1):
    params = {'n_estimators': 100, 'max_depth': n, 'min_samples_split': 2,
              'learning_rate': 9, 'loss': 'ls'}

    reg = ensemble.GradientBoostingRegressor(**params)

    reg.fit(X_train, y_train.price.values)

    train_score = reg.score(X_train, y_train.price.values)
    test_score = reg.score(X_test, y_test.price.values)
    max_depth.append(rate)
    train_scores.append(train_score)        
    test_scores.append(test_score)    
    
plot_optimisation(train_scores, test_scores, max_depths, 'max_depth')     


NameError: name 'max_depth' is not defined