In [113]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [12]:
data = pd.read_csv('./data/arbres_grenoble_epsg4326(1).csv')

In [261]:
class Adder(BaseEstimator, TransformerMixin):
    def __init__(self, fill_value=0):
        """
        Initialize
        """
        self.fill_value = fill_value

    def fit(self, X, y=None):

        return self

    def transform(self, X):

        # Make a copy of the DataFrame to avoid modifying the original
        data = X.copy()

        # Remove columns with more than 50% missing values
        for col in data.columns:
            if data[col].isnull().sum() >= len(data) / 1.5:
                data = data.drop(col, axis=1)

        # Remove columns with only one unique value
        for col in data.columns:
            if len(data[col].unique()) == 1:
                data = data.drop(col, axis=1)

        # Filter outliers for numerical columns
        for col in data.select_dtypes(include=['int64', 'float64']).columns:
            Q1 = data[col].quantile(0.25)
            Q3 = data[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR

            # Replace outliers with NaN
            # data[col] = data[col].apply(lambda x: x if lower_bound <= x <= upper_bound else np.nan)

        # data['typenature'] = X['typenature']
        # data['forme'] = X['forme']
        data['lat'] = X['geo_point_2d'].map(lambda x: x.split(',')[0])
        data['lon'] = X['geo_point_2d'].map(lambda x: x.split(',')[1])
        data['lat'] = data['lat'].replace('',np.nan)
        data['lon'] = data['lon'].replace('',np.nan)
        data['lat'] = data['lat'].astype('float64')
        data['lon'] = data['lon'].astype('float64')
        data = data.drop('geo_point_2d',axis=1)
        data = data.dropna(subset=['anneedeplantation'])
        data = data.drop(['code','sous_categorie_desc','code_parent_desc','bien_reference','nom'],axis=1)
#         # Preparing categorical values
#         le = LabelEncoder()
        
#         for i in data.select_dtypes(include=['object']).columns:
#             data[i] = le.fit_transform(data[i])

        return pd.DataFrame(data)

In [262]:
clean_pipe = Pipeline([
    ('clean',Adder())
])

In [263]:
X_clean = clean_pipe.fit_transform(data)
y_clean = X_clean.pop('anneedeplantation')

In [264]:
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=100)

In [265]:
col_cat = [col for col in X_train.columns if X_train[col].dtype == 'object']
col_num = [col for col in X_train.columns if X_train[col].dtype != 'object']

In [272]:
col_ord = ['hauteurarbre','stadededeveloppement']
col_nord = X_train.drop(columns=col_num + col_ord).columns

In [273]:
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

nord_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('encode', OneHotEncoder(handle_unknown='ignore',sparse_output=True))
    ])

ord1_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ord_1', OrdinalEncoder(categories=[['Moins de 10 m','de 10 m à 20 m','Plus de 20 m']])),
])

ord2_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('ord_2', OrdinalEncoder(categories=[['Arbre jeune','Arbre adulte','Arbre vieillissant']]))
])

full_pipeline = ColumnTransformer([
        ("num", num_pipeline, col_num),
        ('ord1', ord1_pipeline,['hauteurarbre']),
        ('ord2', ord2_pipeline, ['stadededeveloppement']),
        ('nord', nord_pipeline, col_nord)
    ])

In [278]:
X_train_pip = full_pipeline.fit_transform(X_train)
X_test_pip = full_pipeline.transform(X_test)

In [279]:
X_train_pip

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 322155 stored elements and shape (23853, 1856)>

### Linear

In [288]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

lin_reg = LinearRegression()
lin_reg.fit(X_train_pip, y_train)
y_pred = lin_reg.predict(X_test_pip)


mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 62.88
R^2 Score: 0.80


### Tree

In [284]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=100)
tree_reg.fit(X_train_pip, y_train)

y_pred = tree_reg.predict(X_test_pip)

tree_mse = mean_squared_error(y_test, y_pred)
tree_rmse = np.sqrt(tree_mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {tree_rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 6.47
R^2 Score: 0.87


### Forest

In [289]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(n_estimators=10, random_state=42)
forest_reg.fit(X_train_pip, y_train)

y_pred = forest_reg.predict(X_test_pip)
forest_mse = root_mean_squared_error(y_test, y_pred)
forest_rmse = np.sqrt(forest_mse)
r2 = r2_score(y_test, y_pred)
print(f"ROOT Mean Squared Error: {forest_rmse:.2f}")
print(f"R^2 Score: {r2:.2f}")

Mean Squared Error: 2.36
R^2 Score: 0.90


In [287]:
scores = cross_val_score(forest_reg, X_train_pip, y_train,
                         scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-scores)

print(f'min: {min(forest_rmse_scores)}')
print(f'mean: {forest_rmse_scores.mean()}')
print(f'std: {forest_rmse_scores.std()}')

min: 4.556054326635457
mean: 5.1764360728063945
std: 0.2881092101058571


In [338]:
param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {'min_samples_split': [2,5,10,15], 'max_features': [2, 4, 6, 14, 20], 'max_depth': [None,2,5,10,50]},
    # then try 6 (2×3) combinations with bootstrap set as False
    # {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]

forest_reg = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True,
                           n_jobs=-1)
grid_search.fit(X_train_pip, y_train)
grid_search.best_params_

KeyboardInterrupt: 

In [332]:
grid_search.best_estimator_

In [333]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

5.854261926408778 {'max_depth': None, 'max_features': 2, 'n_estimators': 10}
5.394327253969635 {'max_depth': None, 'max_features': 2, 'n_estimators': 50}
5.348011735352094 {'max_depth': None, 'max_features': 2, 'n_estimators': 100}
5.7090658313121025 {'max_depth': None, 'max_features': 4, 'n_estimators': 10}
5.355099654813399 {'max_depth': None, 'max_features': 4, 'n_estimators': 50}
5.29751750008417 {'max_depth': None, 'max_features': 4, 'n_estimators': 100}
5.830090559843015 {'max_depth': None, 'max_features': 6, 'n_estimators': 10}
5.347033172881813 {'max_depth': None, 'max_features': 6, 'n_estimators': 50}
5.28547081631759 {'max_depth': None, 'max_features': 6, 'n_estimators': 100}
5.64681790766771 {'max_depth': None, 'max_features': 14, 'n_estimators': 10}
5.253874110344973 {'max_depth': None, 'max_features': 14, 'n_estimators': 50}
5.204760194786394 {'max_depth': None, 'max_features': 14, 'n_estimators': 100}
5.597910317521502 {'max_depth': None, 'max_features': 20, 'n_estimators

### random search

In [293]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

In [316]:
param_distribs = {
        'n_estimators': randint(low=1, high=30),
        'max_features': randint(low=1, high=14),
    }

forest_reg = RandomForestRegressor(random_state=42)
rnd_search = RandomizedSearchCV(forest_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)
rnd_search.fit(X_train_pip, y_train)
cvres = rnd_search.cv_results_

In [319]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

6.899051945899707 {'max_features': 2, 'n_estimators': 3}
5.854261926408778 {'max_features': 2, 'n_estimators': 10}
5.453807180135 {'max_features': 2, 'n_estimators': 30}
6.849189597126166 {'max_features': 4, 'n_estimators': 3}
5.7090658313121025 {'max_features': 4, 'n_estimators': 10}
5.437030100220561 {'max_features': 4, 'n_estimators': 30}
6.990158448296535 {'max_features': 6, 'n_estimators': 3}
5.830090559843015 {'max_features': 6, 'n_estimators': 10}
5.437709806366928 {'max_features': 6, 'n_estimators': 30}
6.6169940735108055 {'max_features': 8, 'n_estimators': 3}
5.731641992863562 {'max_features': 8, 'n_estimators': 10}
5.395625213687249 {'max_features': 8, 'n_estimators': 30}
6.45322955231337 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
5.548851718151228 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
6.342244234125692 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
5.465018671921323 {'bootstrap': False, 'max_features': 3, 'n_estimators': 10}


In [308]:
feature_importances = grid_search.best_estimator_.feature_importances_
pd.DataFrame(feature_importances).head(10)

Unnamed: 0,0
0,0.131452
1,0.029909
2,0.06238
3,0.067254
4,0.049515
5,0.045717
6,0.002373
7,0.008599
8,0.009718
9,0.002913


In [310]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23853 entries, 5715 to 6277
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   elem_point_id         23853 non-null  int64  
 1   sous_categorie        23853 non-null  object 
 2   code_parent           23853 non-null  object 
 3   adr_secteur           23853 non-null  int64  
 4   genre_bota            23830 non-null  object 
 5   espece                22766 non-null  object 
 6   stadededeveloppement  22042 non-null  object 
 7   collectivite          23775 non-null  object 
 8   hauteurarbre          15864 non-null  object 
 9   portarbre             11013 non-null  object 
 10  structure             23671 non-null  object 
 11  typenature            11013 non-null  object 
 12  lat                   23853 non-null  float64
 13  lon                   23853 non-null  float64
dtypes: float64(2), int64(2), object(10)
memory usage: 2.7+ MB


In [None]:
RandomForestRegressor(max_features=8, n_estimators=30, random_state=42)

In [334]:
final_model = grid_search.best_estimator_

In [335]:
final_predictions = final_model.predict(X_test_pip)
final_rmse = root_mean_squared_error(y_test, final_predictions)

In [336]:
print(f'Final RMSE: {final_rmse}')

Final RMSE: 5.159838780846912


#### 95% confidence interval

In [315]:
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
loc=squared_errors.mean(),
scale=stats.sem(squared_errors)))

array([4.95845872, 5.88103473])