In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Ridge, Lasso,LinearRegression
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
from sklearn.model_selection import GridSearchCV


# chargement des données

In [142]:
# Load the cleaned Walmart weekly-sales dataset and preview its first rows.
# Forward slashes keep the relative path valid on Linux/macOS as well as on
# Windows; the previous "..\\data\\..." literal only resolved on Windows.
df = pd.read_csv("../data/Walmart_Store_sales_cleaned.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,0,6,1572117.54,0,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0
1,1,13,1807545.43,0,42.38,3.435,128.616064,7.47,2011.0,3.0,25.0
2,4,6,1644470.66,0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0
3,5,4,1857533.7,0,,2.756,126.160226,7.896,2010.0,5.0,28.0
4,6,15,695396.19,0,69.8,4.069,134.855161,7.658,2011.0,6.0,3.0


## stratify
Vu l'importance des "store", il est nécessaire de tester un stratify sur le store.

In [143]:
# Separate the regression target from the explanatory variables.
target = "Weekly_Sales"

Y = df[target]
X = df.drop(columns=[target])

# Render the feature frame first, then the target series.
display(X, Y)

Unnamed: 0.1,Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day
0,0,6,0,59.61,3.045,214.777523,6.858,2011.0,2.0,18.0
1,1,13,0,42.38,3.435,128.616064,7.470,2011.0,3.0,25.0
2,4,6,0,78.89,2.759,212.412888,7.092,2010.0,5.0,28.0
3,5,4,0,,2.756,126.160226,7.896,2010.0,5.0,28.0
4,6,15,0,69.80,4.069,134.855161,7.658,2011.0,6.0,3.0
...,...,...,...,...,...,...,...,...,...,...
118,144,3,0,73.44,3.594,226.968844,6.034,2012.0,10.0,19.0
119,145,14,0,72.62,2.780,182.442420,8.899,2010.0,6.0,18.0
120,147,17,0,57.14,2.841,126.111903,,2010.0,6.0,11.0
121,148,8,0,86.05,3.638,219.007525,,2011.0,8.0,12.0


0      1572117.54
1      1807545.43
2      1644470.66
3      1857533.70
4       695396.19
          ...    
118     424513.08
119    2248645.59
120     845252.21
121     856796.10
122    1255087.26
Name: Weekly_Sales, Length: 123, dtype: float64

In [144]:
# Hold out 20% of the rows for testing. Stratifying on "Store" asks the
# splitter to keep the store proportions similar in both partitions.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=20,
    stratify=X["Store"],
)

# Report the (rows, columns) shape of each feature partition, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(98, 10)
(25, 10)


On voit une amélioration sur le test. Il existe toujours du sur-apprentissage mais bien moindre.

## Pré processing
- Pour les valeurs numériques
  - On impute la valeur moyenne aux valeurs absentes
  - On effectue un StandardScaler()  => Cela permet de mettre à l'échelle les données numériques en fonction de la moyenne et de l'écart type
- Pour les variables catégorielles
  - Pas d'imputation de valeur manquante car Store et Holiday_Flag sont tous renseignés
  - OneHotEncoder permet de convertir les catégories en vecteurs binaires ; on supprime la première modalité (drop="first") afin d'éviter la colinéarité

In [145]:
# Feature groups: each group is routed to its own preprocessing branch.
# Columns listed in neither group are not forwarded to the model
# (ColumnTransformer is left on its default `remainder` setting).
numerical_columns = [
    "Temperature",
    "Fuel_Price",
    "CPI",
    "Unemployment",
    "Year",
    "Month",
    "Day",
]
categorical_columns = ["Store", "Holiday_Flag"]

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: one-hot encode and drop the first level of each column
# to avoid perfectly collinear dummies.
# NOTE(review): with handle_unknown="ignore" a category unseen during fit is
# not rejected at transform time -- confirm this is the intended behaviour
# together with drop="first".
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(drop="first", handle_unknown="ignore")),
    ]
)

feature_encoder = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# Learn the preprocessing on the training partition only, then reuse it on test.
# NOTE: X_train / X_test are rebound to the encoded matrices here, so this cell
# cannot be re-run on its own without re-running the split cell first.
X_train = feature_encoder.fit_transform(X_train)
X_test = feature_encoder.transform(X_test)

In [146]:
# Baseline: ordinary least squares on the encoded training matrix.
print("Train model...")
regressor = LinearRegression().fit(X_train, Y_train)
print("...Done.")

# R2 as reported by the estimator itself.
print("R2 score training :", regressor.score(X_train, Y_train))
print("R2 score test :", regressor.score(X_test, Y_test))

# Predictions on both partitions (reused for the metrics below).
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")

print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")

# Model evaluation: error metrics on the test partition, R2 on both.
r2_train = r2_score(Y_train, Y_train_pred)
r2_test = r2_score(Y_test, Y_test_pred)
mae = mean_absolute_error(Y_test, Y_test_pred)
mse = mean_squared_error(Y_test, Y_test_pred)
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R² train: {r2_train}")
print(f"R² test: {r2_test}")

Train model...
...Done.
R2 score training : 0.9741158719630514
R2 score test : 0.9392390364631339
Predictions on training set...
...Done.
Predictions on test set...
...Done.
RMSE: 162779.36287075552
MAE: 125838.88804553183
R² train: 0.9741158719630514
R² test: 0.9392390364631339


In [147]:
# Load the shared score table and append the stratified linear-regression row.
# The path uses forward slashes so it resolves on every OS and matches the
# path used when the table is written back at the end of the notebook.
scores_df = pd.read_csv("../data/Walmart_Scores.csv")
new_rows = [{
    "model": "linear_regression_stratify",
    "R2_train": r2_train,
    "R2_test": r2_test,
    "RMSE": rmse,
    "MAE": mae,
}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)

# Ridge

Ridge : Permet de réduire les coefficients des variables

Ridge sans alpha dans un premier temps (alpha = 1 par défaut)

In [148]:
# Ridge regression with its default regularisation strength.
ridge = Ridge()

# Train the model.
ridge.fit(X_train, Y_train)

print("Predictions sur le train...")
Y_train_pred = ridge.predict(X_train)
print("...Done.")

print("Predictions sur le test...")
Y_test_pred = ridge.predict(X_test)
print("...Done.")

# R2 on both partitions, computed once and reused for printing and logging.
r2_train_ridge = r2_score(Y_train, Y_train_pred)
r2_test_ridge = r2_score(Y_test, Y_test_pred)
print("R2 score sur le train : ", r2_train_ridge)
print("R2 score sur le test : ", r2_test_ridge)

# Model evaluation: error metrics on the test partition.
mae_ridge = mean_absolute_error(Y_test, Y_test_pred)
mse_ridge = mean_squared_error(Y_test, Y_test_pred)
rmse_ridge = np.sqrt(mse_ridge)

print(f"MSE: {mse_ridge}")
print(f"RMSE: {rmse_ridge}")
print(f"MAE: {mae_ridge}")
print(f"R²: {r2_test_ridge}")

Predictions sur le train...
...Done.
Predictions sur le test...
...Done.
R2 score sur le train :  0.9363315109588681
R2 score sur le test :  0.8889917645330846
MSE: 48409348261.603905
RMSE: 220021.24502330204
MAE: 184005.404252432
R²: 0.8889917645330846


L'écart entre le test et le train reste important et les résultats sont moins bons.

In [149]:
# Append the default-alpha Ridge metrics to the comparison table.
# "MAE" stores mae_ridge; it was previously filled with rmse_ridge by mistake,
# which made the MAE column a copy of the RMSE column for this model.
new_rows = [{
    "model": "ridge",
    "R2_train": r2_train_ridge,
    "R2_test": r2_test_ridge,
    "RMSE": rmse_ridge,
    "MAE": mae_ridge,
}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)

J'ajoute un hyper paramètre alpha

In [150]:
# Candidate regularisation strengths: 50 values log-spaced from 1e-4 to 1e2.
alphas = np.logspace(-4, 2, 50)
params = {"alpha": alphas}

# 3-fold cross-validated search; cv is the number of folds used for CV.
gridsearch = GridSearchCV(
    estimator=ridge,
    param_grid=params,
    cv=3,
    scoring="neg_mean_squared_error",
)
gridsearch.fit(X_train, Y_train)

# Show the hyper-parameters of the best model found.
gridsearch.best_estimator_.get_params()

{'alpha': np.float64(0.08685113737513521),
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': None,
 'solver': 'auto',
 'tol': 0.0001}

* La valeur alpha qui ressort est 0.087.

In [151]:

# Evaluate the best Ridge model selected by the grid search.
best_ridge = gridsearch.best_estimator_
Y_train_pred = best_ridge.predict(X_train)
Y_test_pred = best_ridge.predict(X_test)

# R2 on both partitions, computed once and reused for printing and logging.
r2_train_ridge2 = r2_score(Y_train, Y_train_pred)
r2_test_ridge2 = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", r2_train_ridge2)
print("R2 score on test set : ", r2_test_ridge2)

# Error metrics on the test partition.
mae_ridge2 = mean_absolute_error(Y_test, Y_test_pred)
mse_ridge2 = mean_squared_error(Y_test, Y_test_pred)
rmse_ridge2 = np.sqrt(mse_ridge2)

print(f"MSE: {mse_ridge2}")
print(f"RMSE: {rmse_ridge2}")
print(f"MAE: {mae_ridge2}")
print(f"R²: {r2_test_ridge2}")

R2 score on training set :  0.9726903072495336
R2 score on test set :  0.937312491328953
MSE: 27337263997.9803
RMSE: 165339.84395172357
MAE: 126511.53386205136
R²: 0.937312491328953


In [152]:
# Append the tuned-alpha Ridge metrics to the comparison table.
# "MAE" stores mae_ridge2; it was previously filled with rmse_ridge2 by mistake,
# which made the MAE column a copy of the RMSE column for this model.
new_rows = [{
    "model": "ridge_alpha",
    "R2_train": r2_train_ridge2,
    "R2_test": r2_test_ridge2,
    "RMSE": rmse_ridge2,
    "MAE": mae_ridge2,
}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)

La valeur en test est grandement améliorée ; cela réduit le sur-apprentissage, même si l'écart entre train et test reste de presque 0.04.

# Lasso

Lasso : Lasso permet de pénaliser des variables jusqu'à les supprimer.

- Comme pour ridge, je commence sans valeur alpha.

In [153]:
# Lasso regression with its default regularisation strength.
lasso = Lasso()

# Train the model.
lasso.fit(X_train, Y_train)

print("Predictions on training set...")
Y_train_pred = lasso.predict(X_train)
print("...Done.")

print("Predictions on test set...")
Y_test_pred = lasso.predict(X_test)
print("...Done.")

# R2 on both partitions, computed once and reused for printing and logging.
r2_train_lasso = r2_score(Y_train, Y_train_pred)
r2_test_lasso = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", r2_train_lasso)
print("R2 score on test set : ", r2_test_lasso)

# Model evaluation: error metrics on the test partition.
mae_lasso = mean_absolute_error(Y_test, Y_test_pred)
mse_lasso = mean_squared_error(Y_test, Y_test_pred)
rmse_lasso = np.sqrt(mse_lasso)

print(f"MSE: {mse_lasso}")
print(f"RMSE: {rmse_lasso}")
print(f"MAE: {mae_lasso}")
print(f"R²: {r2_test_lasso}")

Predictions on training set...
...Done.
Predictions on test set...
...Done.
R2 score on training set :  0.9741158686933935
R2 score on test set :  0.9392507913890294
MSE: 26491994795.66342
RMSE: 162763.61631416102
MAE: 125828.49726796118
R²: 0.9392507913890294


Le score sur le test semble légèrement meilleur, même comparé au ridge avec hyper-paramètre.

In [154]:
# Append the default-alpha Lasso metrics to the comparison table.
new_rows = [
    {
        "model": "lasso",
        "R2_train": r2_train_lasso,
        "R2_test": r2_test_lasso,
        "RMSE": rmse_lasso,
        "MAE": mae_lasso,
    }
]
lasso_row = pd.DataFrame(new_rows)
scores_df = pd.concat([scores_df, lasso_row], ignore_index=True)

- Je teste ensuite des valeurs de alpha.

In [155]:
# Candidate regularisation strengths: 40 values log-spaced from 1e-2 to 1e4.
alphas = np.logspace(-2, 4, 40)
params = {"alpha": alphas}

# 3-fold cross-validated search scored on negative mean squared error.
gridsearch = GridSearchCV(
    estimator=lasso,
    param_grid=params,
    cv=3,
    scoring="neg_mean_squared_error",
)
gridsearch.fit(X_train, Y_train)

# Show the hyper-parameters of the best model found.
gridsearch.best_estimator_.get_params()


{'alpha': np.float64(1701.2542798525892),
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': 1000,
 'positive': False,
 'precompute': False,
 'random_state': None,
 'selection': 'cyclic',
 'tol': 0.0001,
 'warm_start': False}

L'alpha 1701 ressort comme le meilleur. Il y a donc une forte pénalisation ce qui peut s'expliquer par le fait que la valeur cible est importante.

In [156]:
# Evaluate the best Lasso model selected by the grid search.
best_lasso = gridsearch.best_estimator_
Y_train_pred = best_lasso.predict(X_train)
Y_test_pred = best_lasso.predict(X_test)

# R2 on both partitions, computed once and reused for printing and logging.
r2_train_lasso2 = r2_score(Y_train, Y_train_pred)
r2_test_lasso2 = r2_score(Y_test, Y_test_pred)
print("R2 score on training set : ", r2_train_lasso2)
print("R2 score on test set : ", r2_test_lasso2)

# Error metrics on the test partition.
mae_lasso2 = mean_absolute_error(Y_test, Y_test_pred)
mse_lasso2 = mean_squared_error(Y_test, Y_test_pred)
rmse_lasso2 = np.sqrt(mse_lasso2)

print(f"MSE: {mse_lasso2}")
print(f"RMSE: {rmse_lasso2}")
print(f"MAE: {mae_lasso2}")
print(f"R²: {r2_test_lasso2}")

R2 score on training set :  0.9702199102586673
R2 score on test set :  0.9444801297516998
MSE: 24211543611.915665
RMSE: 155600.59001146385
MAE: 120524.4380396757
R²: 0.9444801297516998


In [157]:
# Append the tuned-alpha Lasso metrics, then persist the comparison table.
# (The model name used to be written as "lasso_alpha" followed by a stray
# empty string literal on the next line; the resulting value is unchanged.)
new_rows = [{
    "model": "lasso_alpha",
    "R2_train": r2_train_lasso2,
    "R2_test": r2_test_lasso2,
    "RMSE": rmse_lasso2,
    "MAE": mae_lasso2,
}]
scores_df = pd.concat([scores_df, pd.DataFrame(new_rows)], ignore_index=True)

# The table is read from and written back to the same CSV, so without this
# step every full re-run of the notebook appends another copy of each row.
# Keep only the most recent entry per model name.
scores_df = (
    scores_df
    .drop_duplicates(subset="model", keep="last")
    .reset_index(drop=True)
)
scores_df.to_csv("../data/Walmart_Scores.csv", mode="w", index=False)