In [1]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# - Models -
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Other imports

In [2]:
# Load the datasets
# File names (without the .csv extension) of the training set and of the set to predict on
train_name = "train_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard"
pred_name = "test_with_clusters_with_sin_ratios_with_m2_price_with_zipcode_average_price_scaled_standard"

# Columns the models have to predict
target_columns = ['prix', 'prix_m2_interieur', 'prix_m2_total']

# Both files are read from the 'CSV DATA' folder, relative to the working directory
train_df = pd.read_csv(f'CSV DATA/{train_name}.csv')
pred_df = pd.read_csv(f'CSV DATA/{pred_name}.csv')

# Quick look at the dimensions of both frames
print("Train shape: ", train_df.shape)
print("Prediction file shape: ", pred_df.shape)

Train shape:  (17147, 32)
Prediction file shape:  (4287, 29)


In [3]:
from sklearn.preprocessing import StandardScaler

# Target columns to standardise
cols = ['prix', 'prix_m2_interieur', 'prix_m2_total']

# Learn the scaling statistics from these columns, then overwrite the columns
# with their standardised values. The fitted scaler is kept so predictions can
# later be mapped back with inverse_transform.
scaler = StandardScaler().fit(train_df[cols])
train_df[cols] = scaler.transform(train_df[cols])



In [4]:
from sklearn.preprocessing import StandardScaler
def standard_scaler(X_train, X_test, scaler=None):
    """Scale a train/test pair with one shared scaler.

    Parameters
    ----------
    X_train, X_test : array-like
        Data to scale; both are passed through the same scaler.
    scaler : object with a ``transform`` method, optional
        Scaler to reuse. When given, it is applied as-is to both sets (it is
        not refitted). When ``None``, a new ``StandardScaler`` is fitted on
        ``X_train`` only and then applied to ``X_test``.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` where ``scaler`` is the
        object that was actually used.
    """
    # Compare against None explicitly: a truthiness test would discard any
    # supplied scaler that happens to evaluate as falsy (e.g. one defining
    # __len__) and silently fit a new one instead.
    if scaler is None:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
    else:
        X_train = scaler.transform(X_train)

    X_test = scaler.transform(X_test)

    return X_train, X_test, scaler

In [5]:
# Move the 'id' column into the index of both frames, so it is no longer one of
# the columns and the ids remain available through .index.
# NOTE: done in place — re-running this cell on the same kernel fails once 'id'
# is no longer a column.
train_df.set_index('id', inplace=True)
pred_df.set_index('id', inplace=True)

In [6]:
# Hold out 20% of the training rows for evaluation; the fixed seed keeps the
# split reproducible. Features are every column except the targets.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=target_columns),
    train_df[target_columns],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Undo the target scaling on the held-out targets so that errors can be
# computed on the scale the targets had before the scaler was applied.
descaled_y_test = scaler.inverse_transform(y_test)

In [8]:
# Sanity check: shapes of the feature and target splits
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13717, 28), (3430, 28), (13717, 3), (3430, 3))

In [9]:
def cross_val_func(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and print a short summary.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X, y : array-like
        Features and targets to cross-validate on.
    cv : int, default 5
        Forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    The scores returned by ``cross_val_score``, one per fold.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv)

    # Summarise before printing so the three report lines stay together
    mean_score = fold_scores.mean()
    score_spread = fold_scores.std()

    print(f"Cross-validation scores: {fold_scores}")
    print(f"Mean cross-validation score: {mean_score}")
    print(f"Standard deviation of scores: {score_spread}")

    return fold_scores

In [10]:
def RMSE_calc(x1, x2):
    """Print and return the root mean squared error between ``x1`` and ``x2``.

    The value is the square root of ``mean_squared_error(x1, x2)``.
    """
    mse = mean_squared_error(x1, x2)
    rmse = np.sqrt(mse)
    print("RMSE: ", rmse)
    return rmse

In [11]:
from sklearn.linear_model import Lasso
def lasso_reg(X_train, y_train, X_test, lasso = None):
    """Fit a Lasso model on the training data and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets, passed to ``fit``.
    X_test : array-like
        Features to predict on.
    lasso : estimator, optional
        Model to fit. When ``None``, a ``Lasso(alpha=0.1)`` is created.
        A supplied model is fitted in place.

    Returns
    -------
    tuple
        ``(y_pred, lasso)``: the predictions for ``X_test`` and the fitted
        model.
    """
    # Explicit None check: a truthiness test would replace any supplied model
    # that evaluates as falsy (e.g. one defining __len__) with the default.
    if lasso is None:
        lasso = Lasso(alpha=0.1)

    # Train the model
    lasso.fit(X_train, y_train)

    # Predict on the test set
    y_pred = lasso.predict(X_test)

    return y_pred, lasso

In [24]:
# Fit a Lasso (alpha=0.1) on the training split and predict the held-out split
lasso = Lasso(alpha=0.1)
y_pred_lasso = lasso_reg(X_train, y_train, X_test, lasso)[0]
# Cross-validate the same estimator on the training split (5 folds)
scores_crossval_lasso = cross_val_func(lasso, X_train, y_train, cv = 5)
# Undo the target scaling on the predictions before computing the error
y_pred_lasso_descaled = scaler.inverse_transform(y_pred_lasso)
# NOTE(review): both arguments hold all three target columns, so this RMSE is
# not the error on 'prix' alone — confirm that is the intended metric.
RMSE_lasso = RMSE_calc(descaled_y_test, y_pred_lasso_descaled)


# Build the prediction DataFrame with 2 columns: 'id' and 'prix'
# Here the index (the ids) is retrieved and paired with the predictions
# pred_df = pd.DataFrame({'id': X_test.index, 'prix': predictions})

Cross-validation scores: [0.63381258 0.6088097  0.62985224 0.61608058 0.59812298]
Mean cross-validation score: 0.6173356168272852
Standard deviation of scores: 0.013202638285559109
RMSE:  103376.52742272695


In [21]:
def randomforest_reg(X_train, y_train, X_test, randomforest=None):
    """Fit a random forest on the training data and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and targets, passed to ``fit``.
    X_test : array-like
        Features to predict on.
    randomforest : estimator, optional
        Model to fit, mirroring the optional model argument of ``lasso_reg``.
        When ``None`` (the default, and the only behaviour before this
        parameter existed), a
        ``RandomForestRegressor(n_estimators=100, random_state=42)`` is
        created. A supplied model is fitted in place.

    Returns
    -------
    tuple
        ``(y_pred, randomforest)``: the predictions for ``X_test`` and the
        fitted model.
    """
    if randomforest is None:
        randomforest = RandomForestRegressor(n_estimators=100, random_state=42)

    # Train the model
    randomforest.fit(X_train, y_train)

    # Predict on the test set
    y_pred = randomforest.predict(X_test)

    return y_pred, randomforest

In [25]:
# This instance is the one handed to cross_val_func below; randomforest_reg is
# called without it, so the hold-out predictions come from a separate forest
# that the function fits itself.
randomforest = RandomForestRegressor(n_estimators=100, random_state=42)
y_pred_randomforest = randomforest_reg(X_train, y_train, X_test)[0]
# Cross-validate on the training split (5 folds)
scores_crossval_randomforest = cross_val_func(randomforest, X_train, y_train, cv = 5)
# Undo the target scaling on the predictions before computing the error
y_pred_randomforest_descaled = scaler.inverse_transform(y_pred_randomforest)
# NOTE(review): both arguments hold all three target columns, so this RMSE is
# not the error on 'prix' alone — confirm that is the intended metric.
RMSE_randomforest = RMSE_calc(descaled_y_test, y_pred_randomforest_descaled)

Cross-validation scores: [0.89985821 0.89086989 0.91266927 0.89544703 0.88321807]
Mean cross-validation score: 0.8964124928653222
Standard deviation of scores: 0.009817343387495802
RMSE:  68406.47582002312


In [32]:
# Feature columns, in the order used for training
feature_columns = train_df.drop(columns=target_columns).columns

# Fail loudly if the prediction file lacks a training feature: reindex below
# would otherwise create an all-NaN column for it without any warning.
missing_features = feature_columns.difference(pred_df.columns)
if len(missing_features) > 0:
    raise KeyError(f"Prediction file is missing feature columns: {list(missing_features)}")

# Refit the model on the whole training set (features -> targets)
randomforest.fit(train_df[feature_columns], train_df[target_columns])

# Align the prediction frame on exactly the training columns, in the same order
pred_df = pred_df.reindex(columns=feature_columns)

# Predict
final_prediction = randomforest.predict(pred_df)

# Undo the target scaling to get back to the original values
final_prediction_descaled = scaler.inverse_transform(final_prediction)

# Locate the 'prix' output by name rather than relying on it being column 0
prix_position = target_columns.index('prix')

# Final prediction DataFrame with 2 columns: 'id' and 'prix'
pred_df_final = pd.DataFrame({'id': pred_df.index, 'prix': final_prediction_descaled[:, prix_position]})
pred_df_final.to_csv('predictions/randomforest_prediction_flo.csv', index=False)
