In [1]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# - Models -
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Other imports

In [2]:
# Load the datasets
train_name = "train_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard" # nom du fichier csv à utiliser pour train
pred_name = "test_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard" # nom du fichier csv pour prédire

train_df = pd.read_csv('CSV DATA/'+ train_name +'.csv').drop(columns=['prix_m2_interieur', 'prix_m2_total'])  # fichier csv à utiliser pour train
pred_df = pd.read_csv('CSV DATA/'+ pred_name +'.csv')  # fichier csv pour prédire

target_columns = ['prix'] # colonnes à prédire

#little printy print to see the data
print("Train shape: ", train_df.shape)
print("Prediction file shape: ", pred_df.shape)

# Définir l'ordre désiré pour train_df : toutes les colonnes sauf 'prix' puis 'prix'
desired_order = [col for col in train_df.columns if col != 'prix'] 

# Réordonner train_df si nécessaire (ici, on force que 'prix' soit en dernière position)
train_df = train_df[desired_order + ['prix']]

# Pour le fichier de prédiction (pred_df), on souhaite les mêmes colonnes que desired_order,
# mais sans 'prix' (qui n'existe pas encore dans pred_df)
pred_order = [col for col in desired_order if col in pred_df.columns]
pred_df = pred_df[pred_order]



Train shape:  (17147, 30)
Prediction file shape:  (4287, 29)


In [3]:
train_df.columns, pred_df.columns

(Index(['id', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
        'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
        'design_note', 'annee_construction', 'annee_renovation',
        'm2_interieur_15voisins', 'm2_jardin_15voisins', 'cluster_tres_bas',
        'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
        'sin2_month', 'sin_dayofyear', 'jardin', 'etage', 'soussol',
        'm2_outside', 'm2_total', 'zipcode_average_price', 'prix'],
       dtype='object'),
 Index(['id', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
        'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
        'design_note', 'annee_construction', 'annee_renovation',
        'm2_interieur_15voisins', 'm2_jardin_15voisins', 'cluster_tres_bas',
        'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
        'sin2_month', 'sin_dayofyear', 'jardin', 'etage', 'soussol',
        'm2_outside', 'm2_

In [4]:
from sklearn.preprocessing import StandardScaler

# Définition des colonnes à scaler
cols = ['prix']

# Création et ajustement du scaler sur ces colonnes, puis transformation inplace
scaler = StandardScaler()
train_df[cols] = scaler.fit_transform(train_df[cols])



In [5]:
from sklearn.preprocessing import StandardScaler
def standard_scaler(X_train, X_test, scaler=None):
    if scaler:
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        # Standardisation des données
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, X_test, scaler

In [6]:
train_df.set_index('id', inplace=True)
pred_df.set_index('id', inplace=True)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(train_df.drop(columns=target_columns), train_df[target_columns], test_size=0.2, random_state=42)

In [8]:
descaled_y_test = scaler.inverse_transform(y_test)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13717, 28), (3430, 28), (13717, 1), (3430, 1))

In [10]:
def cross_val_func(model, X, y, cv=5):
    # Perform cross-validation
    scores = cross_val_score(model, X, y, cv=cv)  # 5-fold cross-validation

    # Print the cross-validation scores
    print(f"Cross-validation scores: {scores}")
    print(f"Mean cross-validation score: {scores.mean()}")
    print(f"Standard deviation of scores: {scores.std()}")

    return scores

In [11]:
def RMSE_calc(x1, x2):
    RMSE = np.sqrt(mean_squared_error(x1, x2))
    print("RMSE: ", RMSE)
    return RMSE

In [12]:
from sklearn.linear_model import Lasso
def lasso_reg(X_train, y_train, X_test, lasso = None):
    if lasso :
        lasso = lasso
    else :
        # Initialisation du modèle Lasso
        lasso = Lasso(alpha=0.1)

    # Entraînement du modèle
    lasso.fit(X_train, y_train)

    # Prédiction sur l'ensemble de test
    y_pred = lasso.predict(X_test)

    return y_pred, lasso

In [13]:
lasso = Lasso(alpha=0.1)
y_pred_lasso = lasso_reg(X_train, y_train, X_test, lasso)[0]
scores_crossval_lasso = cross_val_func(lasso, X_train, y_train, cv = 5)

y_pred_lasso_descaled = scaler.inverse_transform(y_pred_lasso.reshape(-1, 1))
RMSE_lasso = RMSE_calc(descaled_y_test, y_pred_lasso_descaled)


# Création du DataFrame de prédiction avec 2 colonnes : 'id' et 'prix'
# Ici, on récupère l'index (les id) et on l'associe aux prédictions
# pred_df = pd.DataFrame({'id': X_test.index, 'prix': predictions})

Cross-validation scores: [0.76644564 0.72453762 0.75559273 0.74730896 0.70529051]
Mean cross-validation score: 0.7398350912906559
Standard deviation of scores: 0.0220890212500048
RMSE:  186708.90977939824


In [14]:
descaled_y_test

array([[746300.],
       [257000.],
       [252000.],
       ...,
       [345000.],
       [310000.],
       [160000.]], shape=(3430, 1))

In [15]:
y_pred_lasso_descaled

array([[687019.74179112],
       [208969.05381211],
       [369391.80027441],
       ...,
       [392384.21950297],
       [438598.21040476],
       [239025.78756855]], shape=(3430, 1))

In [16]:
def randomforest_reg(X_train, y_train, X_test):
    randomforest = RandomForestRegressor(n_estimators=100, random_state=42)

    # Entraînement du modèle
    randomforest.fit(X_train, y_train)

    # Prédiction sur l'ensemble de test
    y_pred =    randomforest.predict(X_test)

    return y_pred,  randomforest

In [17]:
X_train

Unnamed: 0_level_0,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,vue_note,etat_note,...,cluster_eleve,cluster_tres_eleve,sin2_month,sin_dayofyear,jardin,etage,soussol,m2_outside,m2_total,zipcode_average_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5711200170,-0.398785,0.487771,0.135326,-0.181514,0.500080,-0.656952,0.926869,-0.090074,-0.306657,-0.630227,...,1,0,-0.634753,0.942761,0.0,0.0,-0.800930,-0.181514,-0.177809,0.430030
7896300070,0.674174,-1.453014,-0.863328,-0.219499,-0.893869,-0.116034,-0.918153,-0.090074,-0.306657,-0.630227,...,0,0,-0.634753,0.863142,0.0,0.0,1.248548,-0.219499,-0.237587,-0.530585
6813600605,0.674174,3.075485,1.448772,-0.185825,1.954113,-0.656952,2.771890,-0.090074,-0.306657,-0.630227,...,0,1,-1.366523,0.238673,0.0,0.0,-0.800930,-0.185825,-0.153240,0.169303
4083802195,-1.471744,-0.159157,-1.112991,-0.268214,-0.953953,-0.521723,-0.918153,-0.090074,-0.306657,-2.169835,...,1,0,-1.113308,0.622047,0.0,0.0,1.248548,-0.268214,-0.291593,0.169303
7340600845,0.674174,-1.453014,-0.765633,-0.203548,-0.725634,-0.228725,-0.918153,-0.090074,-0.306657,-0.630227,...,0,0,0.384946,0.953681,0.0,0.0,1.248548,-0.203548,-0.219554,-1.274625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7207900030,0.674174,1.781628,0.309005,-0.275591,0.692349,-0.656952,1.849379,-0.090074,-0.306657,-0.630227,...,0,0,-0.326610,0.981306,0.0,0.0,-0.800930,-0.275591,-0.267691,-0.561971
6788201015,-1.471744,-0.482621,-0.526825,-0.268214,-1.134205,1.033417,-0.918153,-0.090074,-0.306657,2.448988,...,1,0,-0.326610,0.991114,0.0,0.0,1.248548,-0.268214,-0.278712,2.384167
3343301385,-0.398785,0.487771,0.786621,-0.179598,1.221088,-0.656952,0.926869,-0.090074,-0.306657,-0.630227,...,1,0,-0.634753,0.953681,0.0,0.0,-0.800930,-0.179598,-0.161589,1.354327
1775801090,0.674174,0.164307,-0.288016,0.123350,-0.545382,0.424884,-0.918153,-0.090074,-0.306657,2.448988,...,0,0,-0.634753,0.961130,0.0,0.0,1.248548,0.123350,0.116525,0.158312


In [18]:
randomforest = RandomForestRegressor(bootstrap= True, max_depth= 20, min_samples_leaf= 1, min_samples_split= 2, n_estimators= 200)
y_pred_randomforest = randomforest_reg(X_train, y_train, X_test)[0]
scores_crossval_randomforest = cross_val_func(randomforest, X_train, y_train, cv = 5)
y_pred_randomforest_descaled = scaler.inverse_transform(y_pred_randomforest.reshape(-1, 1))
RMSE_randomforest = RMSE_calc(descaled_y_test, y_pred_randomforest_descaled)

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Cross-validation scores: [0.89874048 0.88966252 0.91685225 0.90437832 0.88201947]
Mean cross-validation score: 0.898330608175683
Standard deviation of scores: 0.012008186934240371
RMSE:  122725.0981765127


In [19]:
pred_df

Unnamed: 0_level_0,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,vue_note,etat_note,...,cluster_eleve,cluster_tres_eleve,sin2_month,sin_dayofyear,jardin,etage,soussol,m2_outside,m2_total,zipcode_average_price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3600072,0.674174,0.811236,0.146181,-0.236839,-0.749667,1.709564,-0.918153,-0.090074,-0.306657,2.448988,...,0,0,-1.113308,-0.036431,0.0,0.0,1.248548,-0.236839,-0.232674,0.230150
6200017,-0.398785,-1.453014,-0.809053,0.146989,-0.545382,-0.656952,0.004358,-0.090074,-0.306657,0.909381,...,0,0,1.538983,-1.055894,0.0,0.0,-0.800930,0.146989,0.128619,-1.221135
7600136,-1.471744,-0.159157,-1.037006,-0.336521,-1.194289,0.086810,0.926869,-0.090074,-0.306657,-0.630227,...,0,0,0.016459,1.028726,0.0,0.0,1.248548,-0.336521,-0.357955,0.366722
11200400,-0.398785,0.487771,-0.190322,-0.256526,0.139576,-0.656952,0.926869,-0.090074,-0.306657,-0.630227,...,1,0,0.768639,0.176799,0.0,0.0,-0.800930,-0.256526,-0.259676,0.343172
11500890,-0.398785,0.487771,1.133979,-0.154450,1.605625,-0.656952,0.926869,-0.090074,-0.306657,-0.630227,...,1,0,-1.113308,-0.465540,0.0,0.0,-0.800930,-0.154450,-0.128909,0.430030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9834200975,-0.398785,1.134700,-0.613664,-0.266298,-0.329079,-0.656952,0.926869,-0.090074,-0.306657,2.448988,...,0,0,-1.270457,-1.284773,0.0,0.0,-0.800930,-0.266298,-0.278712,0.230150
9834201370,-0.398785,0.164307,-0.743923,-0.338197,-1.026054,0.379807,0.926869,-0.090074,-0.306657,-0.630227,...,0,0,-0.899431,0.333048,0.0,0.0,1.248548,-0.338197,-0.353184,0.230150
9835801000,-0.398785,0.164307,-0.483405,-0.162833,-0.737650,0.379807,-0.918153,-0.090074,-0.306657,-0.630227,...,0,0,-0.326610,1.059257,0.0,0.0,1.248548,-0.162833,-0.172800,-1.221135
9839300125,0.674174,-0.159157,-0.298871,-0.258634,-0.112777,-0.409031,0.926869,-0.090074,-0.306657,-0.630227,...,0,1,-1.366523,-2.309527,0.0,0.0,1.248548,-0.258634,-0.264161,0.366722


In [20]:
# Supposons que train_df et test_df soient déjà chargés à partir des fichiers CSV
# Par exemple :
# train_df = pd.read_csv("train_with_clusters_with_sin_ratios_with_m2_price.csv")
# test_df  = pd.read_csv("test_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard.csv")

# Sauvegarder les identifiants du test
test_ids = pred_df.index

# Récupérer l'ordre des colonnes utilisées pour l'entraînement (features)
# Ici, on suppose que target_columns contient la/les colonnes cibles (par exemple ['prix'])
feature_columns = train_df.drop(columns=['prix']).columns

# Préparer le DataFrame de test pour la prédiction :
# On retire la colonne 'id' car elle ne fait pas partie des features, 
# puis on réordonne les colonnes pour qu'elles correspondent à l'ordre du train.
pred_features = pred_df.reindex(columns=feature_columns)

# Entraînement du modèle sur les features et la/les cibles
randomforest.fit(train_df[feature_columns], train_df[target_columns])

# Réaliser la prédiction
final_prediction = randomforest.predict(pred_features)

# Appliquer l'inverse du scaling pour revenir aux valeurs d'origine
final_prediction_descaled = scaler.inverse_transform(final_prediction.reshape(-1, 1))

# Création du DataFrame final de prédiction avec 2 colonnes : 'id' et 'prix'
pred_df_final = pd.DataFrame({
    'id': test_ids.astype('int64'),  # S'assurer que les id sont au format Int64
    'prix': final_prediction_descaled[:, 0]
})

# Sauvegarder le fichier de soumission au bon format
pred_df_final.to_csv('predictions/randomforest_prediction_flo.csv', index=False)


  return fit_method(estimator, *args, **kwargs)


## Test du modèle XGBoost

In [21]:
from xgboost import XGBRegressor
def xgb_reg(X_train, y_train, X_test, xgb_model):
    # xgb_model = XGBRegressor(objective='reg:squarederror', n_estimators=100, learning_rate=0.1, random_state=42, verbosity = 2, scale_pos_weight=0)

    # Entraînement du modèle
    xgb_model.fit(X_train, y_train)

    # Prédiction sur l'ensemble de test
    y_pred = xgb_model.predict(X_test)

    return y_pred, xgb_model

In [22]:
xgb_model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
y_pred_xgb = xgb_reg(X_train, y_train, X_test, xgb_model)[0]
scores_crossval_xgbt = cross_val_func(xgb_model, X_train, y_train, cv = 5)
y_pred_xgb_descaled = scaler.inverse_transform(y_pred_xgb.reshape(-1,1))
RMSE_xgb = RMSE_calc(descaled_y_test, y_pred_xgb_descaled)

Cross-validation scores: [0.91092181 0.90613276 0.90989029 0.90420407 0.84165889]
Mean cross-validation score: 0.8945615649223327
Standard deviation of scores: 0.02656385148322523
RMSE:  117920.50769010748


In [23]:
descaled_y_test

array([[746300.],
       [257000.],
       [252000.],
       ...,
       [345000.],
       [310000.],
       [160000.]], shape=(3430, 1))

In [24]:
y_pred_xgb_descaled

array([[833202.  ],
       [248228.98],
       [269237.7 ],
       ...,
       [350082.5 ],
       [339940.3 ],
       [217876.73]], shape=(3430, 1), dtype=float32)

In [25]:
# Supposons que train_df et test_df soient déjà chargés à partir des fichiers CSV
# Par exemple :
# train_df = pd.read_csv("train_with_clusters_with_sin_ratios_with_m2_price.csv")
# test_df  = pd.read_csv("test_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard.csv")

# Sauvegarder les identifiants du test
# test_ids = pred_df.index

# # Récupérer l'ordre des colonnes utilisées pour l'entraînement (features)
# # Ici, on suppose que target_columns contient la/les colonnes cibles (par exemple ['prix'])
# feature_columns = train_df.drop(columns=target_columns).columns

# # Préparer le DataFrame de test pour la prédiction :
# # On retire la colonne 'id' car elle ne fait pas partie des features, 
# # puis on réordonne les colonnes pour qu'elles correspondent à l'ordre du train.
# pred_features = pred_df.reindex(columns=feature_columns)

pred_cols = ['nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
       'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
       'design_note', 'annee_construction', 'annee_renovation',
       'm2_interieur_15voisins', 'm2_jardin_15voisins', 'cluster_tres_bas',
       'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
       'sin2_month', 'sin_dayofyear', 'jardin', 'etage', 'soussol',
       'm2_outside', 'm2_total', 'prix_m2_interieur', 'prix_m2_total',
       'zipcode_average_price']

# Entraînement du modèle sur les features et la/les cibles
xgb_model.fit(train_df[feature_columns], train_df[target_columns])

# Réaliser la prédiction
final_prediction_xgb = xgb_model.predict(pred_df[feature_columns])

# Appliquer l'inverse du scaling pour revenir aux valeurs d'origine
final_prediction_descaled_xgb = scaler.inverse_transform(final_prediction_xgb.reshape(-1, 1))

# Création du DataFrame final de prédiction avec 2 colonnes : 'id' et 'prix'
pred_df_final_xgb = pd.DataFrame({
    'id': test_ids.astype('int64'),  # S'assurer que les id sont au format Int64
    'prix': final_prediction_descaled_xgb[:, 0]
})

# Sauvegarder le fichier de soumission au bon format
pred_df_final_xgb.to_csv('predictions/xgb_prediction_flo.csv', index=False)

In [26]:
feature_columns

Index(['nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
       'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
       'design_note', 'annee_construction', 'annee_renovation',
       'm2_interieur_15voisins', 'm2_jardin_15voisins', 'cluster_tres_bas',
       'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
       'sin2_month', 'sin_dayofyear', 'jardin', 'etage', 'soussol',
       'm2_outside', 'm2_total', 'zipcode_average_price'],
      dtype='object')

In [27]:
pred_features.columns

Index(['nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
       'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
       'design_note', 'annee_construction', 'annee_renovation',
       'm2_interieur_15voisins', 'm2_jardin_15voisins', 'cluster_tres_bas',
       'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
       'sin2_month', 'sin_dayofyear', 'jardin', 'etage', 'soussol',
       'm2_outside', 'm2_total', 'zipcode_average_price'],
      dtype='object')

In [28]:
final_prediction_xgb

array([-0.17532092, -0.91315657, -0.2255556 , ..., -0.79459286,
        0.685218  ,  0.10228465], shape=(4287,), dtype=float32)

# MLP