## REGRESSION

### Librairies utilisées

In [49]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor


### Fonctions utilisés dans le notebook

In [4]:
def alerteCombinaison(dictionary_param):
    """
    Cette fonction est utilisée pour calculer le nombre de combinaison
    défini des GridSearch pour optimiser les paramtètres des modèles.
    Elle prend en entrée un dictionnaire et retourne le nombre de combinaison
    """
    from numpy import prod
    lengths = [len(v) for v in dictionary_param.values()]
    count = prod(lengths)
    if count < 500:
        print("C'est bon tu peux envoyer l'apprentissage !")
    elif count < 1000:
        print("Tu peux lire tes mails pendant l'apprentissage !")
    elif count < 2000:
        print("Tu peux répondre à tes mails pendant l'apprentissage !")
    else:
        print("Tu peux mettre ton PC en veille et revenir demain !")

    print("Nombre de combinaison :")
    return(count)

### Importation des données

In [5]:
with open('dataframe.pkl', 'rb') as file:
    df = pickle.load(file)

# Maintenant, df contient la DataFrame importée depuis le fichier
df

Unnamed: 0,No disposition,Date mutation,Nature mutation,Valeur fonciere,Type de voie,Voie,Code postal,Commune,Code commune,Section,No plan,1er lot,Surface Carrez du 1er lot,Nombre de lots,Type local,Surface reelle bati,Nombre pieces principales,Nature culture,Surface terrain,col_concat
3,1,04/01/2021,Vente,204332.0,ALL,DES ECUREUILS,1310.0,BUELLAS,65,B,1325,,,0,Maison,88.0,4.0,S,866.0,04/01/2021ALL7.00276DES ECUREUILS1310.0BUELLAS
14,2,04/01/2021,Vente,226700.0,CHE,DU MOULIN DE POLAIZE,1310.0,POLLIAT,301,AA,289,,,0,Maison,96.0,3.0,,,04/01/2021CHE173.00164DU MOULIN DE POLAIZE1310...
15,1,08/01/2021,Vente,185000.0,RUE,DES GRANGES BONNET,1960.0,PERONNAS,289,AD,31,,,0,Maison,100.0,4.0,S,703.0,08/01/2021RUE46.00161DES GRANGES BONNET1960.0P...
16,1,07/01/2021,Vente,114500.0,RUE,DE LA MAIRIE,1340.0,FOISSIAT,163,AB,302,,,0,Maison,85.0,2.0,S,87.0,07/01/2021RUE179.00110DE LA MAIRIE1340.0FOISSIAT
19,1,08/01/2021,Vente,145000.0,IMP,DE CHAMANDRE,1340.0,FOISSIAT,163,WC,215,,,0,Maison,92.0,1.0,S,2480.0,08/01/2021IMP8.00255DE CHAMANDRE1340.0FOISSIAT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15125094,1,27/12/2018,Vente,1800.0,PL,DES VOSGES,75004.0,PARIS 04,104,AO,7,109.0,,1,Appartement,20.0,2.0,,,27/12/2018PL9.09917DES VOSGES75004.0PARIS 04
15125095,1,28/12/2018,Vente,405000.0,RUE,BEAUTREILLIS,75004.0,PARIS 04,104,AQ,16,16.0,33.87,2,Appartement,34.0,1.0,,,28/12/2018RUE13.00797BEAUTREILLIS75004.0PARIS 04
15125096,1,26/12/2018,Vente,220000.0,RUE,DES LIONS SAINT PAUL,75004.0,PARIS 04,104,AQ,127,126.0,,2,Appartement,29.0,1.0,,,26/12/2018RUE14.05702DES LIONS SAINT PAUL75004...
15125098,1,03/12/2018,Vente,383000.0,RUE,POISSONNIERE,75002.0,PARIS 02,102,AO,85,9.0,34.78,1,Appartement,34.0,1.0,,,03/12/2018RUE12.07561POISSONNIERE75002.0PARIS 02


### Ajout d'une variable openData - taux de chomage

In [17]:
# dataframe du taux de chomage par département
df_chomage = pd.read_excel('./donnees/txChomage_par_dep.xls', sheet_name='Département')
df_chomage.keys()


Index(['Code', 'Libellé', 'T2_2023'], dtype='object')

In [38]:
def count_digits(n):
    # Utilisez la fonction str() pour convertir l'entier en une chaîne de caractères
    num_str = str(n)
    
    # Utilisez len() pour compter le nombre de caractères dans la chaîne
    return len(num_str)

In [54]:
# Utilisez .notna() pour vérifier que les valeurs ne sont pas NaN
mask = df['Code postal'].notna()

# Appliquez la conversion uniquement aux lignes où la valeur n'est pas NaN
codepostaux_int = df.loc[mask, 'Code postal'].astype(int).astype(str) 

In [55]:
codepostaux_int

3      1340
14     1000
15     1250
16     1290
19     1560
       ... 
442    1310
443    1340
444    1000
445    1000
446    1000
Name: Code postal, Length: 2765769, dtype: object

In [56]:
# Nombre de code postaux à 4 ou 5 chiffres 
nb_digits = []
for code in codepostaux_int:
    nb_digits.append(count_digits(code))
pd.Series(nb_digits).value_counts()

5    2584384
4     181385
Name: count, dtype: int64

In [67]:
list_tx_chomage = []
# parcourt nos code postaux pour ensuite récupérer les départements et pouvoir créer notre liste de taux de chomage
# par département 
for code in codepostaux_int:
    if 1000 <= int(code) <= 9999:
        dep = "0"+code[0]
    else:
        dep = code[:2]
    tx_chomage = df_chomage[df_chomage["Code"]==dep].T2_2023
    list_tx_chomage.append(tx_chomage)
list_tx_chomage

### Echantillonage

In [33]:
# variables_explicatives = ['Type local', 'Nombre pieces principales', 'Surface reelle bati', 'Surface terrain',
#                           'Nombre de lots', 'Code commune', 'Nature mutation']
variables_explicatives = ['Type local', 'Nombre pieces principales', 'Surface reelle bati', 'Surface terrain', 'Nombre de lots']


X = df[variables_explicatives]
X = pd.get_dummies(data=X, columns=['Type local'])
X

Unnamed: 0,Nombre pieces principales,Surface reelle bati,Surface terrain,Nombre de lots,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
3,4.0,88.0,866.0,0,False,False,False,True
14,3.0,96.0,,0,False,False,False,True
15,4.0,100.0,703.0,0,False,False,False,True
16,2.0,85.0,87.0,0,False,False,False,True
19,1.0,92.0,2480.0,0,False,False,False,True
...,...,...,...,...,...,...,...,...
15125094,2.0,20.0,,1,True,False,False,False
15125095,1.0,34.0,,2,True,False,False,False
15125096,1.0,29.0,,2,True,False,False,False
15125098,1.0,34.0,,1,True,False,False,False


In [7]:
# notre target
Y = df['Valeur fonciere']
Y

3           204332.0
14          226700.0
15          185000.0
16          114500.0
19          145000.0
              ...   
15125094      1800.0
15125095    405000.0
15125096    220000.0
15125098    383000.0
15125101     45000.0
Name: Valeur fonciere, Length: 2765530, dtype: float64

In [34]:
# on remplace les NA dans la colonne 'Nombre pieces principales' et dans 'Surface reelle bati'
# et dans 'Surface terrain' par leurs moyennes
X['Nombre pieces principales'].fillna(X['Nombre pieces principales'].mean(), inplace=True)

X['Surface reelle bati'].fillna(X['Surface reelle bati'].mean(), inplace=True)

X['Surface terrain'].fillna(X['Surface terrain'].mean(), inplace=True)

Surface terrain
False    2765530
Name: count, dtype: int64

In [6]:
pourcentage_manquant = (X.isna().sum() / len(X)) * 100
pourcentage_manquant

Nombre pieces principales                              0.0
Surface reelle bati                                    0.0
Surface terrain                                        0.0
Nombre de lots                                         0.0
Code commune                                           0.0
Type local_Appartement                                 0.0
Type local_Dépendance                                  0.0
Type local_Local industriel. commercial ou assimilé    0.0
Type local_Maison                                      0.0
Nature mutation_Adjudication                           0.0
Nature mutation_Echange                                0.0
Nature mutation_Expropriation                          0.0
Nature mutation_Vente                                  0.0
Nature mutation_Vente en l'état futur d'achèvement     0.0
Nature mutation_Vente terrain à bâtir                  0.0
dtype: float64

In [35]:
from sklearn.model_selection import train_test_split
#70% des données pour l’apprentissage
#30% des données pour l'échantillon test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.30, random_state = 42)

Pour ne pas donner plus d'importance aux variables explicatives à forte variance, il est essentiel de centrer et réduire les données en amont. On centre et réduit également afin de les ramener à la même échelle 

On centre et réduit les données d'apprentissage

In [36]:
# Appliquez la standardisation aux données de X
scaler = StandardScaler()
X_train_CR = scaler.fit_transform(X_train)
X_test_CR = scaler.fit_transform(X_test)
pd.DataFrame(X_test_CR, columns=X_test.columns).head(3) 

Unnamed: 0,Nombre pieces principales,Surface reelle bati,Surface terrain,Nombre de lots,Type local_Appartement,Type local_Dépendance,Type local_Local industriel. commercial ou assimilé,Type local_Maison
0,0.06535,-0.036909,-0.199286,-0.692241,-0.608861,-0.403737,-0.229779,0.924372
1,0.579426,0.028268,-0.35865,-0.692241,-0.608861,-0.403737,-0.229779,0.924372
2,0.579426,0.105665,-0.099505,-0.692241,-0.608861,-0.403737,-0.229779,0.924372


### Régression linéaire multiple

#### Apprentissage

On lance l'apprentissage du modèle sur l'échantillon d'entrainement

In [10]:
lm = LinearRegression()
model_LinearRegression = lm.fit(X_train_CR,y_train)

### Test

In [11]:
y_pred = model_LinearRegression.predict(X_test_CR)
y_pred

array([ 2.87033067e+10,  2.87033295e+10,  2.87033304e+10, ...,
       -2.76069744e+10,  2.87033533e+10, -2.76070289e+10])

In [12]:
#on multiplie par 0.3 pour avoir un résultat en metre
mean_squared_error(y_test, y_pred, squared=False)

60598593738.452576

On s'interesse aux coefficients de la régression

In [13]:
coef = pd.DataFrame(lm.coef_ ,index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = lm.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44674.7
Nombre de lots,7175.25
Surface reelle bati,3514.173
Surface terrain,2146.642
Type local_Appartement,59048790000000.0
Type local_Dépendance,46168030000000.0
Type local_Local industriel. commercial ou assimilé,28967320000000.0
Type local_Maison,66290500000000.0
Constante,164101.3


### Régression Ridge

#### Sans GridSearch

In [59]:
ridge_model = Ridge(alpha=10)
ridge_model = ridge_model.fit(X_train_CR,y_train)

y_pred = ridge_model.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 100896.70759142876


#### Avec GridSearch

In [41]:
parameters = {'alpha': np.arange(start = 0, stop = 10, step = 1)}
print(alerteCombinaison(parameters))

ridge_model = Ridge()
grid_ridge = GridSearchCV(ridge_model, parameters, scoring = 'neg_mean_squared_error', verbose = 1)
grid_ridge.fit(X_train_CR, y_train)

C'est bon tu peux envoyer l'apprentissage !
Nombre de combinaison :
10
Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [42]:
print(pd.DataFrame(grid_ridge.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", grid_ridge.best_params_)
print("Meilleur score :", grid_ridge.best_score_)
y_pred = grid_ridge.best_estimator_.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

         params  mean_test_score
0  {'alpha': 0}    -1.021218e+10
1  {'alpha': 1}    -1.021218e+10
2  {'alpha': 2}    -1.021218e+10
3  {'alpha': 3}    -1.021218e+10
4  {'alpha': 4}    -1.021218e+10
5  {'alpha': 5}    -1.021218e+10
6  {'alpha': 6}    -1.021218e+10
7  {'alpha': 7}    -1.021218e+10
8  {'alpha': 8}    -1.021218e+10
9  {'alpha': 9}    -1.021218e+10
Meilleur paramètre : {'alpha': 1}
Meilleur score : -10212175876.177395
RMSE : 100896.70665757051


Coefficients du modèle

In [67]:
coef = pd.DataFrame(ridge_model.best_estimator_.coef_ ,
                    index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = ridge_model.best_estimator_.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44685.529907
Nombre de lots,7259.775261
Surface reelle bati,3547.56684
Surface terrain,1939.908889
Type local_Appartement,-2941.588973
Type local_Dépendance,2948.759634
Type local_Local industriel. commercial ou assimilé,10912.957287
Type local_Maison,-4202.114504
Constante,164101.304014


### Régression Lasso

In [58]:
lasso_model = Lasso(alpha=5)
lasso_model = lasso_model.fit(X_train_CR,y_train)

y_pred = lasso_model.predict(X_test_CR)

print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 100896.75555293108


In [16]:
coef = pd.DataFrame(lasso_model.coef_ ,
                    index = X_train.columns, columns=['Coef'])
coef.loc['Constante'] = lasso_model.intercept_
coef

Unnamed: 0,Coef
Nombre pieces principales,44632.33924
Nombre de lots,7128.452419
Surface reelle bati,3518.184656
Surface terrain,2140.86676
Type local_Appartement,-0.0
Type local_Dépendance,5221.526143
Type local_Local industriel. commercial ou assimilé,12364.257885
Type local_Maison,-641.661277
Constante,164101.304014


### Avec GridSearch

In [19]:
parameters = {'alpha': np.arange(start = 0, stop = 10, step = 1)}
print(alerteCombinaison(parameters))
lasso_model = Lasso()
lasso_model = GridSearchCV(lasso_model, parameters, scoring = 'r2', verbose = 2)
lasso_model.fit(X_train_CR, y_train)

C'est bon tu peux envoyer l'apprentissage !
Nombre de combinaison :
10
Fitting 5 folds for each of 10 candidates, totalling 50 fits


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[CV] END ............................................alpha=0; total time= 1.7min


  return fit_method(estimator, *args, **kwargs)
  model = cd_fast.enet_coordinate_descent(


In [None]:
print("Meilleur paramètre :", ridge_model.best_params_)
print("Meilleur score :", ridge_model.best_score_)

y_pred = ridge_model.best_estimator_.predict(X_test_CR)

print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

### Régression Elasticnet

#### Sans GridsearchCV

In [19]:
from sklearn.linear_model import ElasticNet
elastic_model = ElasticNet(alpha=5, l1_ratio=0.2)
elastic_model = elastic_model.fit(X_train_CR,y_train)

y_pred = elastic_model.predict(X_test_CR)

print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

RMSE : 104905.91176840823


### Arbre de décision

In [46]:
from sklearn.tree import DecisionTreeRegressor
tree_regressor = DecisionTreeRegressor(random_state=42)
tree_regressor.fit(X_train_CR, y_train)
y_pred = tree_regressor.predict(X_test_CR)
mse = mean_squared_error(y_test, y_pred)
# Affichez le MSE pour évaluer la performance du modèle
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 12777387809.924845


In [47]:
tree_regressor = DecisionTreeRegressor(max_depth=10, min_samples_split=2,min_samples_leaf=1, random_state=42)
tree_regressor.fit(X_train_CR, y_train)
y_pred = tree_regressor.predict(X_test_CR)
mse = mean_squared_error(y_test, y_pred)
# Affichez le MSE pour évaluer la performance du modèle
print("Mean Squared Error (MSE):", mse)

Mean Squared Error (MSE): 9488751985.915815


#### Avec GridSearch

In [48]:
tree_regressor = DecisionTreeRegressor()
param_grid = {
    'max_depth': [None, 10, 20, 30],  # Profondeur maximale de l'arbre
    'min_samples_split': [2, 5, 10],  # Nombre minimal d'échantillons requis pour diviser un nœud
    'min_samples_leaf': [1, 2, 4]  # Nombre minimal d'échantillons requis dans une feuille
}
tree_regressor = GridSearchCV(tree_regressor, param_grid=param_grid)

tree_regressor.fit(X_train_CR, y_train)

In [53]:
# print(pd.DataFrame(tree_regressor.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", tree_regressor.best_params_)
print("Meilleur score :", tree_regressor.best_score_)

y_pred = ridge_model.predict(X_test_CR)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))


Meilleur paramètre : {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10}
Meilleur score : 0.1874560876831884
RMSE : 72615672858.32788


### Random forest

In [56]:
random_forest_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_regressor.fit(X_train, y_train)
y_pred = random_forest_regressor.predict(X_test)

print(pd.DataFrame(tree_regressor.cv_results_).loc[:,['params','mean_test_score']])
print("Meilleur paramètre :", tree_regressor.best_params_)
print("Meilleur score :", tree_regressor.best_score_)
print("RMSE : " + str(mean_squared_error(y_test, y_pred, squared= False)))

                                               params  mean_test_score
0   {'max_depth': None, 'min_samples_leaf': 1, 'mi...        -0.054894
1   {'max_depth': None, 'min_samples_leaf': 1, 'mi...        -0.009273
2   {'max_depth': None, 'min_samples_leaf': 1, 'mi...         0.046871
3   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.022037
4   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.028152
5   {'max_depth': None, 'min_samples_leaf': 2, 'mi...         0.064082
6   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.083782
7   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.083764
8   {'max_depth': None, 'min_samples_leaf': 4, 'mi...         0.088451
9   {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187218
10  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187263
11  {'max_depth': 10, 'min_samples_leaf': 1, 'min_...         0.187304
12  {'max_depth': 10, 'min_samples_leaf': 2, 'min_...         0.187289
13  {'

# Kaggle

In [20]:
!kaggle datasets list

ref                                                        title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
---------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
iamsouravbanerjee/customer-shopping-trends-dataset         Customer Shopping Trends Dataset                  146KB  2023-10-05 06:45:37           4688        116  1.0              
nelgiriyewithana/top-spotify-songs-2023                    Most Streamed Spotify Songs 2023                   47KB  2023-08-26 11:04:57          31838        961  1.0              
nelgiriyewithana/credit-card-fraud-detection-dataset-2023  Credit Card Fraud Detection Dataset 2023          143MB  2023-09-18 10:00:19           5456        185  1.0              
nelgiriyewithana/billionaires-statistics-dataset           Billionaires Statistics Dataset (202

In [21]:
!kaggle competitions download -c m2-sise-2023

Downloading m2-sise-2023.zip to c:\Users\bourh\Cours\CoursM2SISE\Machine_Learning_Python\ProjetPython




  0%|          | 0.00/295M [00:00<?, ?B/s]
  0%|          | 1.00M/295M [00:00<03:25, 1.50MB/s]
  1%|          | 3.00M/295M [00:00<01:05, 4.69MB/s]
  2%|▏         | 6.00M/295M [00:00<00:32, 9.33MB/s]
  3%|▎         | 10.0M/295M [00:01<00:19, 15.7MB/s]
  5%|▍         | 14.0M/295M [00:01<00:15, 19.0MB/s]
  6%|▌         | 18.0M/295M [00:01<00:12, 23.7MB/s]
  7%|▋         | 21.0M/295M [00:01<00:12, 22.8MB/s]
  8%|▊         | 24.0M/295M [00:01<00:11, 24.5MB/s]
  9%|▉         | 27.0M/295M [00:01<00:12, 22.0MB/s]
 11%|█         | 31.0M/295M [00:01<00:13, 20.6MB/s]
 12%|█▏        | 35.0M/295M [00:02<00:11, 24.1MB/s]
 13%|█▎        | 38.0M/295M [00:02<00:12, 21.4MB/s]
 14%|█▍        | 41.0M/295M [00:02<00:16, 16.2MB/s]
 15%|█▍        | 43.0M/295M [00:02<00:21, 12.6MB/s]
 15%|█▌        | 45.0M/295M [00:03<00:23, 11.0MB/s]
 17%|█▋        | 49.0M/295M [00:03<00:16, 15.5MB/s]
 18%|█▊        | 52.0M/295M [00:03<00:15, 16.5MB/s]
 19%|█▊        | 55.0M/295M [00:03<00:13, 19.0MB/s]
 20%|█▉        | 59.

In [28]:
import zipfile
with zipfile.ZipFile("./content/m2-sise-2023.zip", 'r') as zip_ref:
    zip_ref.extractall("./content")
    