In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Import des données

In [3]:
df = pd.read_csv('../data.csv')

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


## Modélisation

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
X = df.drop('charges', axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, shuffle=False,test_size=0.2, random_state=42)

# Construction du pipeline

In [9]:
numeric_features = ["age","bmi", "children"]
categorial_features = [ "sex", "region", "smoker"]

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

categorical_transformer = OneHotEncoder()

In [11]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorial_features)
    ]
   , remainder="passthrough" 
)

In [12]:
from sklearn.tree import DecisionTreeRegressor
tree = DecisionTreeRegressor()

In [13]:
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline

pipe = Pipeline([
     ('preprocessor', preprocessor),
     ('decisiontree', tree)
])


# Training

In [14]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np
def run_experiment(model):
    model.fit(X_train, y_train)
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
#    if model est un grid_search:
    print('Best Hyperparameters: %s' % model.best_params_)
    print("######## R^2 : ")
    print("TRAIN :",r2_score(y_train, y_pred_train))
    print("TEST :",r2_score(y_test, y_pred_test))
    print("######## MAE : ")
    print("TRAIN :",mean_absolute_error(y_train, y_pred_train))
    print("TEST :",mean_absolute_error(y_test, y_pred_test))
    print("######## MSE : ")
    print("TRAIN :",mean_squared_error(y_train, y_pred_train))
    print("TEST :",mean_squared_error(y_test, y_pred_test))
    return model

# Justification Hyperparamètres - DecisionTreeRegressor

In [15]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()

# gérer les valeurs abbérantes
space['decisiontree__criterion'] = ["squared_error", "absolute_error"] # absolute error traite mieux les valeurs abbérantes

# Gérer la profondeur de l'arbre
space['decisiontree__max_depth'] = np.arange(2,10,1) # plus c'est profond, plus ça over-fit (Un seul arbre, il faut un peu de profondeur pour ne pas avoir trop de biais)
# space['decisiontree__max_leaf_nodes'] = np.arange(10,100,5) # plus c'est grand, plus ça overfit 

# Gérer la taille des feuilles
# space['decisiontree__min_samples_split'] = np.arange(60,400,20) # plus c'est petit, plus ça overfit
space['decisiontree__min_samples_leaf'] = np.arange(30,150,10)  # plus c'est petit, plus ça overfit / on vise la loi des grands nombre autour de 50
# space['decisiontree__min_weight_fraction_leaf'] = np.arange(60,400,20) # plus c'est petit, plus ça overfit 

# Limiter les splits sans interets
space['decisiontree__min_impurity_decrease'] = np.linspace(0,1000,100) # plus c'est petit, plus ça overfit 
# space['decisiontree__ccp_alpha'] = np.linspace(0,1,100) # plus c'est petit, plus ça overfit 
# define search

random_search = RandomizedSearchCV(pipe, space, n_iter=1000, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv, random_state=1,verbose=2)



In [16]:
# run_experiment(random_search)

Fitting 30 folds for each of 1000 candidates, totalling 30000 fits
[CV] END decisiontree__criterion=absolute_error, decisiontree__max_depth=2, decisiontree__min_impurity_decrease=656.5656565656565, decisiontree__min_samples_leaf=100; total time=   0.0s
[CV] END decisiontree__criterion=absolute_error, decisiontree__max_depth=2, decisiontree__min_impurity_decrease=656.5656565656565, decisiontree__min_samples_leaf=100; total time=   0.1s
[CV] END decisiontree__criterion=absolute_error, decisiontree__max_depth=2, decisiontree__min_impurity_decrease=656.5656565656565, decisiontree__min_samples_leaf=100; total time=   0.0s
[CV] END decisiontree__criterion=absolute_error, decisiontree__max_depth=2, decisiontree__min_impurity_decrease=656.5656565656565, decisiontree__min_samples_leaf=100; total time=   0.0s
[CV] END decisiontree__criterion=absolute_error, decisiontree__max_depth=2, decisiontree__min_impurity_decrease=656.5656565656565, decisiontree__min_samples_leaf=100; total time=   0.0s
[CV

In [17]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
import numpy as np

# define evaluation
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()

# gérer les valeurs abbérantes
space['decisiontree__criterion'] = ["squared_error", "absolute_error"] # absolute error traite mieux les valeurs abbérantes

# Gérer la profondeur de l'arbre
space['decisiontree__max_depth'] = np.arange(6,9,1) # plus c'est profond, plus ça over-fit

# Gérer la taille des feuilles
space['decisiontree__min_samples_split'] = np.arange(30,50,5) # plus c'est petit, plus ça overfit

# Limiter les splits sans interets
space['decisiontree__min_impurity_decrease'] = np.linspace(0,30,2) # plus c'est petit, plus ça overfit 

# define search
grid_search = GridSearchCV(pipe, space, scoring='neg_mean_absolute_error', n_jobs=-1, cv=cv)

In [18]:
grid_search_fit = run_experiment(grid_search)

Best Hyperparameters: {'decisiontree__criterion': 'absolute_error', 'decisiontree__max_depth': 7, 'decisiontree__min_impurity_decrease': 0.0, 'decisiontree__min_samples_split': 30}
######## R^2 : 
TRAIN : 0.8601105027263687
TEST : 0.8537802986897045
######## MAE : 
TRAIN : 1687.802174985033
TEST : 2047.5025273880594
######## MSE : 
TRAIN : 20240205.123421565
TEST : 22507298.38493529


# Affichage de l'arbre

In [160]:
my_model = grid_search_fit.best_estimator_.named_steps["decisiontree"]

In [129]:
import graphviz 
from sklearn import tree
dot_data =tree.export_graphviz(my_model, out_file=None) 
graph = graphviz.Source(dot_data) 
graph.render("best_decision_tree") 

'iris.pdf'

# Feature Importance

In [None]:
my_model.feature_importances_

array([0.36658101, 0.19407224, 0.02969388, 0.0013703 , 0.00146453,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.40681804])

## Shap (non fini)

In [140]:
import shap

#set the tree explainer as the model of the pipeline
explainer = shap.TreeExplainer(grid_search_fit.best_estimator_['decisiontree'])

#apply the preprocessing to x_test
observations = grid_search_fit.best_estimator_['preprocessor'].transform(X_test)

In [147]:
#get Shap values from preprocessed data
shap_values = explainer.shap_values(observations[0])

In [148]:
#plot the feature importance
shap.summary_plot(shap_values, X_test, plot_type="bar")

AssertionError: Summary plots need a matrix of shap_values, not a vector.