# Import modules

In [None]:
import time

import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from scipy.stats import loguniform
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingRegressor, RandomForestClassifier, RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Import the pickle

In [12]:
df_arbres = pd.read_pickle('./data/arbres.pkl')

In [15]:
#df_arbres = df_arbres.drop('code_parent', axis=1)

In [16]:
df_arbres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29526 entries, 0 to 31666
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sous_categorie        29526 non-null  object 
 1   code_parent           29526 non-null  object 
 2   adr_secteur           29526 non-null  int64  
 3   genre_bota            29486 non-null  object 
 4   espece                28313 non-null  object 
 5   stadededeveloppement  27018 non-null  object 
 6   anneedeplantation     29526 non-null  float64
 7   collectivite          29432 non-null  object 
 8   hauteurarbre          23738 non-null  object 
 9   portarbre             18336 non-null  object 
 10  latitude              29526 non-null  float64
 11  longitude             29526 non-null  float64
dtypes: float64(3), int64(1), object(8)
memory usage: 2.9+ MB


# Split the data frame into testing and training sets

In [17]:
pred_col = 'anneedeplantation'

In [18]:
X_train, X_test, y_train, y_test = train_test_split(df_arbres.drop(pred_col, axis=1),
                                                    df_arbres[pred_col], 
                                                    test_size=0.25, 
                                                    random_state=42)

# Create pipeline

In [None]:
# Colonnes à traiter
num_cols = ['adr_secteur', 'latitude', 'longitude']
cat_cols = ['sous_categorie', 'genre_bota', 'espece', 
           'stadededeveloppement', 'hauteurarbre']

# Pipelines pour chaque type
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Fusion dans un seul transformateur
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

In [23]:
model_fc= Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0))
])

In [None]:
#feature_names = [f"feature {i}" for i in range(X_train.shape[1])]
feature_names = X_train.columns
model_fc.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessing', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,280
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [35]:
X_train

Unnamed: 0,sous_categorie,code_parent,adr_secteur,genre_bota,espece,stadededeveloppement,collectivite,hauteurarbre,portarbre,latitude,longitude
3406,ESP174,ESP546,6,Metasequoia,glyptostroboides,Arbre jeune,Grenoble Alpes Métropole,Moins de 10 m,Libre,45.163906,5.727471
6046,ESP065,ESP511,4,Betula,verrucosa,Arbre adulte,Ville de Grenoble,de 10 m à 20 m,Libre,45.174690,5.740281
24971,ESP065,ESP1107,6,Tilia,tomentosa,Arbre adulte,Ville de Grenoble,de 10 m à 20 m,,45.164132,5.738750
19120,ESP174,ESP755,6,Fraxinus,excelsior,Arbre adulte,Ville de Grenoble,de 10 m à 20 m,Libre,45.164290,5.733986
8962,ESP174,ESP273,1,Betula,utilis,Arbre adulte,Ville de Grenoble,Moins de 10 m,Libre,45.183025,5.713802
...,...,...,...,...,...,...,...,...,...,...,...
21775,ESP151,ESP1177,1,Platanus,acerifolia,Arbre adulte,Grenoble Alpes Métropole,de 10 m à 20 m,Semi-libre,45.185533,5.718070
5589,ESP151,ESP1354,5,Tilia,platyphyllos,Arbre adulte,Grenoble Alpes Métropole,,,45.183191,5.744201
1059,ESP151,ESP268,1,Magnolia,grandiflora,Arbre adulte,Grenoble Alpes Métropole,Moins de 10 m,Libre,45.190950,5.709426
15995,ESP174,ESP998,5,Tilia,tomentosa,Arbre adulte,Ville de Grenoble,de 10 m à 20 m,Libre,45.183101,5.744414


In [34]:
feature_names

['feature 0',
 'feature 1',
 'feature 2',
 'feature 3',
 'feature 4',
 'feature 5',
 'feature 6',
 'feature 7',
 'feature 8',
 'feature 9',
 'feature 10']

In [33]:
importances

array([0.06548416, 0.28741145, 0.27908575, 0.07150977, 0.11425164,
       0.12679136, 0.02529624, 0.03016961])

In [32]:
start_time = time.time()
importances = model_fc.steps[1][1].feature_importances_
std = np.std([tree.feature_importances_ for tree in model_fc.steps[1][1].estimators_], axis=0)
elapsed_time = time.time() - start_time

print(f"Elapsed time to compute the importances: {elapsed_time:.3f} seconds")

forest_importances = pd.Series(importances, index=feature_names)

fig, ax = plt.subplots()
forest_importances.plot.bar(yerr=std, ax=ax)
ax.set_title("Feature importances using MDI")
ax.set_ylabel("Mean decrease in impurity")
fig.tight_layout()


Elapsed time to compute the importances: 0.044 seconds


ValueError: Length of values (8) does not match length of index (11)

## Linear Regression

In [141]:
model1 = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LinearRegression())
])

In [142]:
model1.fit(X_train, y_train)
y_pred_test = model1.predict(X_test)
y_pred_train = model1.predict(X_train)

print(f"Mean squared error: {mean_squared_error(y_test, y_pred_test):.2f}")
print(f"Coefficient of determination: {r2_score(y_test, y_pred_test):.2f}")

Mean squared error: 279.66
Coefficient of determination: 0.15


In [143]:
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

16.722973479153573
16.747899901769333


In [144]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model1, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
print("Accuracy moyenne :", scores.mean())


Accuracy moyenne : -280.56709809749947


## GridSearchCV lasso

In [147]:
model_lasso = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', Lasso(max_iter=10000))
])

In [162]:
param_grid = {"classifier__alpha": loguniform.rvs(0.0001, 10000, size=100, random_state=42)}

gs_lasso = GridSearchCV(model_lasso, param_grid, cv=10, scoring='neg_mean_squared_error')
gs_lasso.fit(X_train, y_train)

gs_lasso_results = pd.DataFrame(gs_lasso.cv_results_)
gs_lasso_results = gs_lasso_results.sort_values('param_classifier__alpha')

print("Meilleurs paramètres :", gs_lasso.best_params_)

Meilleurs paramètres : {'classifier__alpha': np.float64(0.0023130924416844114)}


In [None]:
gs_lasso_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__alpha,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
72,0.050731,0.007099,0.007995,0.001038,0.000111,{'classifier__alpha': 0.00011070747281639221},-283.222755,-274.459267,-283.074140,-272.516158,-283.801893,-281.302081,-276.164687,-288.912126,-289.105129,-273.123580,-280.568182,5.863359,28
10,0.045909,0.002546,0.007982,0.000771,0.000146,{'classifier__alpha': 0.00014610865886287216},-283.222710,-274.459226,-283.074113,-272.516251,-283.801989,-281.302020,-276.164676,-288.912082,-289.105076,-273.123659,-280.568180,5.863328,27
98,0.046843,0.001643,0.008098,0.001291,0.000160,{'classifier__alpha': 0.00015971768764426244},-283.222692,-274.459210,-283.074103,-272.516287,-283.802026,-281.301997,-276.164672,-288.912066,-289.105056,-273.123689,-280.568180,5.863317,26
42,0.110141,0.020918,0.017338,0.005015,0.000188,{'classifier__alpha': 0.00018841183049085134},-283.222656,-274.459177,-283.074081,-272.516363,-283.802103,-281.301948,-276.164663,-288.912031,-289.105013,-273.123753,-280.568179,5.863292,25
58,0.048126,0.002564,0.008437,0.001375,0.000230,{'classifier__alpha': 0.0002300479202014584},-283.222602,-274.459128,-283.074050,-272.516473,-283.802216,-281.301877,-276.164650,-288.911980,-289.104951,-273.123845,-280.568177,5.863257,24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,0.046092,0.001327,0.008023,0.000549,4033.800833,{'classifier__alpha': 4033.8008326003933},-335.050027,-324.699428,-331.551261,-325.480978,-332.966335,-329.710161,-323.800778,-338.317735,-339.273113,-330.016956,-331.086677,5.168242,83
34,0.047988,0.002716,0.008008,0.000897,5309.532269,{'classifier__alpha': 5309.5322690092235},-335.050027,-324.699428,-331.551261,-325.480978,-332.966335,-329.710161,-323.800778,-338.317735,-339.273113,-330.016956,-331.086677,5.168242,83
50,0.051465,0.007997,0.008404,0.001569,5710.537951,{'classifier__alpha': 5710.5379511268075},-335.050027,-324.699428,-331.551261,-325.480978,-332.966335,-329.710161,-323.800778,-338.317735,-339.273113,-330.016956,-331.086677,5.168242,83
11,0.047257,0.003271,0.008128,0.000536,5744.851636,{'classifier__alpha': 5744.851636320435},-335.050027,-324.699428,-331.551261,-325.480978,-332.966335,-329.710161,-323.800778,-338.317735,-339.273113,-330.016956,-331.086677,5.168242,83


In [163]:
y_pred_test = gs_lasso.predict(X_test)
y_pred_train = gs_lasso.predict(X_train)

print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

16.722990633838396
16.747901333682996


## GridSearchCV ridge

In [107]:
param_grid = {"alpha": loguniform.rvs(0.0001, 10000, size=100, random_state=42)}

model_grid_ridge = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', GridSearchCV(Ridge(max_iter=10000), 
                                param_grid, 
                                cv=5, 
                                scoring='neg_mean_squared_error',
                                refit=True))
])

In [None]:
model_grid_ridge.fit(X_train, y_train)

y_pred_test = model_grid_ridge.predict(X_test)
y_pred_train = model_grid_ridge.predict(X_train)

print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

16.722208650218498
16.74708774275367


## Decision Tree

In [166]:
model_dt = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', DecisionTreeRegressor())
])

In [None]:
param_grid = {"classifier__criterion": ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
              "classifier__splitter" : ['best', 'random'],
              'classifier__random_state' : [42]}

gs_dt = GridSearchCV(model_dt, param_grid, cv=10, scoring='neg_mean_squared_error')
gs_dt.fit(X_train, y_train)

gs_dt_results = pd.DataFrame(gs_dt.cv_results_)

print("Meilleurs paramètres :", gs_dt.best_params_)

KeyError: 'param_classifier__alpha'

In [168]:
gs_dt_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__criterion,param_classifier__random_state,param_classifier__splitter,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.138683,0.022736,0.008813,0.001324,squared_error,42,best,"{'classifier__criterion': 'squared_error', 'cl...",-102.942212,-100.67088,...,-93.566591,-87.892954,-88.49187,-107.33514,-96.709575,-110.677507,-95.344625,-98.548418,7.156731,6
1,0.06245,0.00152,0.008583,0.001145,squared_error,42,random,"{'classifier__criterion': 'squared_error', 'cl...",-92.490293,-93.13544,...,-93.793228,-89.037489,-92.67028,-110.729901,-109.995032,-98.140018,-87.422313,-97.193499,7.964342,2
2,0.138993,0.017133,0.008575,0.000842,friedman_mse,42,best,"{'classifier__criterion': 'friedman_mse', 'cla...",-102.942212,-100.67088,...,-93.566591,-87.892954,-88.49187,-107.33514,-96.709575,-110.677507,-95.344625,-98.548418,7.156731,6
3,0.073788,0.011814,0.009178,0.001619,friedman_mse,42,random,"{'classifier__criterion': 'friedman_mse', 'cla...",-97.490293,-103.790068,...,-98.125959,-94.451671,-97.147245,-103.545167,-106.038392,-91.427733,-88.520777,-97.602038,5.294191,4
4,9.255598,0.333542,0.008114,0.000849,absolute_error,42,best,"{'classifier__criterion': 'absolute_error', 'c...",-101.169752,-102.185553,...,-101.214447,-101.881662,-92.123758,-104.989612,-109.344173,-91.775519,-103.554652,-101.585493,5.463933,8
5,3.928208,0.253822,0.008333,0.000734,absolute_error,42,random,"{'classifier__criterion': 'absolute_error', 'c...",-101.602257,-104.461851,...,-88.59684,-85.826558,-92.329268,-109.771454,-100.736676,-94.514905,-92.62963,-97.897418,7.879015,5
6,0.144861,0.002629,0.008296,0.000381,poisson,42,best,"{'classifier__criterion': 'poisson', 'classifi...",-103.809932,-99.744018,...,-98.288939,-88.372177,-83.727642,-103.564589,-98.411021,-109.925926,-91.405149,-97.565797,7.405475,3
7,0.068399,0.001676,0.008172,0.000701,poisson,42,random,"{'classifier__criterion': 'poisson', 'classifi...",-96.175169,-95.269526,...,-96.492099,-86.81346,-75.786811,-107.929088,-85.073622,-104.366305,-82.385727,-92.460151,9.425265,1


In [169]:
print("Meilleurs paramètres :", gs_dt.best_params_)

Meilleurs paramètres : {'classifier__criterion': 'poisson', 'classifier__random_state': 42, 'classifier__splitter': 'random'}


In [None]:
y_pred_test = gs_dt.best_estimator_.predict(X_test)
y_pred_train = gs_dt.best_estimator_.predict(X_train)

print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

9.819675977573013
0.0


## Random Forest

In [172]:
model_rf = Pipeline([
    ('preprocessing', preprocessor),
    ('rf', RandomForestRegressor())
])

In [184]:
param_grid = [
    {'rf__bootstrap': [True, False], 
     'rf__n_estimators': np.arange(1,100), 
     'rf__max_features': ['sqrt', 'log', 'None'] + list(np.arange(2,10)), 
     'rf__criterion' : ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']},
]

param_grid = [
    {'rf__n_estimators': [3, 10, 30], 'rf__max_features': [2, 4, 6, 8]},
    {'rf__bootstrap': [False], 'rf__n_estimators': [3, 10], 'rf__max_features': [2, 3, 4]},
]

gs_rf = RandomizedSearchCV(model_rf, param_grid, cv=10, n_iter=10,
                           scoring='neg_mean_squared_error',
                           return_train_score=True, n_jobs=-1)

gs_rf.fit(X_train, y_train)

gs_rf_results = pd.DataFrame(gs_rf.cv_results_)

print("Meilleurs paramètres :", gs_rf.best_params_)

KeyboardInterrupt: 

In [None]:
gs_rf_results

In [None]:
# Get the best hyperparameters and model
best_params = gs_rf.best_params_
best_model = gs_rf.best_estimator_


# Evaluate the best model
y_pred_test = gs_rf.best_estimator_.predict(X_test)
y_pred_train = gs_rf.best_estimator_.predict(X_train)

print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))
print(f"Best Hyperparameters: {best_params}")

In [174]:
y_pred_test = gs_rf.predict(X_test)
y_pred_train = gs_rf.predict(X_train)

print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

7.442996682835416
0.0510429077549559


## Dummy classifier

In [93]:
model_d = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', DummyClassifier(strategy='most_frequent'))
])

In [113]:
model_d.fit(X_train, y_train)
y_pred_test = model_d.predict(X_test)
y_pred_train = model_d.predict(X_train)

# Calculate and print the model's accuracy on the test data
print(f"Accuracy: {accuracy_score(y_test, y_pred_test)*100:.4f}%")
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))

Accuracy: 11.4061%
23.84445383873398
23.852532894973237


Scratch work

In [18]:
# le = LabelEncoder()
# for col in col_cat:
#     df_explore[col] = le.fit_transform(df_explore[col])

In [19]:
# enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# test = enc.fit(df_explore[col_num])

# # Create an encoder 
# encoder= OneHotEncoder(sparse_output=False, handle_unknown='ignore') 
 
# # Train encoder using multiple columns
# encoder.fit(df_explore[col_num]) 
 
# # Get encoded column names 
# encoded_columns=encoder.get_feature_names_out() 
 
# # Transform data using the trained encoder 
# encoded_data = encoder.transform(df_explore[col_num]) 
 
# # Create a dataframe using the encoded data 
# encoded_df=pd.DataFrame(encoded_data, columns=encoded_columns) 
# encoded_df