# Imports

## Libs

In [1]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# - Models -
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Other imports
import tqdm

## Datasets

In [4]:
# Base names (without the .csv extension) of the files stored under 'CSV DATA/'.
train_name = "train_with_clusters_with_sin_ratios_with_m2_price_scaled"  # CSV used for training
pred_name = "test_with_clusters_with_sin_ratios_with_m2_price_scaled"  # CSV to predict on

# Read both CSV files into DataFrames.
train_df = pd.read_csv(f"CSV DATA/{train_name}.csv")
pred_df = pd.read_csv(f"CSV DATA/{pred_name}.csv")

# Columns the models have to predict.
target_columns = ["prix", "prix_m2_interieur", "prix_m2_total"]

# Quick look at the size of what was loaded.
print("Train shape: ", train_df.shape)
print("Prediction file shape: ", pred_df.shape)

Train shape:  (17147, 32)
Prediction file shape:  (4287, 29)


In [6]:
# Hold out 20% of the training rows for local evaluation; the fixed seed keeps
# the split reproducible.
# NOTE(review): only the target columns are removed here, so an identifier
# column in train_df (later cells drop an 'id' column before fitting) would
# stay in the features -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=target_columns),
    train_df[target_columns],
    test_size=0.2,
    random_state=42,
)

# Cross Validation

In [7]:
def cross_val_func(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and print a short summary.

    Parameters
    ----------
    model : estimator
        Passed unchanged to ``cross_val_score``.
    X, y : features and target(s)
        Passed unchanged to ``cross_val_score``.
    cv : int, default 5
        Cross-validation setting forwarded to ``cross_val_score``.

    Returns
    -------
    The scores object returned by ``cross_val_score`` (one score per fold).
    """
    # Forward the caller's ``cv``; it used to be ignored in favour of a
    # hard-coded 5, so cross_val_func(..., cv=10) silently ran 5 folds.
    scores = cross_val_score(model, X, y, cv=cv)

    # Print the individual scores and their mean / spread.
    print(f"Cross-validation scores: {scores}")
    print(f"Mean cross-validation score: {scores.mean()}")
    print(f"Standard deviation of scores: {scores.std()}")

    return scores

In [8]:
def RMSE_calc(x1, x2):
    """Compute, print and return the root-mean-squared error between ``x1`` and ``x2``.

    Both arguments are handed to ``mean_squared_error`` in the given order; the
    square root of that value is printed with an "RMSE: " prefix and returned.
    """
    mse = mean_squared_error(x1, x2)
    rmse = np.sqrt(mse)
    print("RMSE: ", rmse)
    return rmse

# Train - régression pénalisée

In [9]:
from sklearn.linear_model import Lasso

In [10]:
def lasso_reg(X_train, y_train, X_test):
    """Fit a Lasso regression (alpha=0.1) and return its predictions for ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and target(s) used to fit the model.
    X_test : features to predict on.

    Returns
    -------
    Whatever the fitted model's ``predict`` returns for ``X_test``.
    """
    # Build, fit and predict in one chain (``fit`` returns the estimator itself).
    return Lasso(alpha=0.1).fit(X_train, y_train).predict(X_test)

## Comparaison RMSE du lasso

In [None]:
# Fit the Lasso baseline on the training split, predict the held-out split,
# then print and keep the RMSE between the held-out targets and the predictions.
y_pred_lasso = lasso_reg(X_train, y_train, X_test)
RMSE_lasso = RMSE_calc(y_test, y_pred_lasso)

RMSE:  0.11499759963766087


In [None]:
# NOTE(review): `q` is not defined anywhere in the visible notebook, so this cell
# raises NameError and halts "Run All" here -- presumably a deliberate stop
# before the expensive cells below; confirm, or delete the cell.
q

# First train model - Random Forest

In [None]:
from datetime import datetime

def current_time_filename():
    """Return a ``predictions_<month>_<day>_<hour>_<minute>`` file-name stem.

    The stamp is taken from ``datetime.now()``; putting it in the name keeps
    successive Kaggle submission files from overwriting each other.
    """
    stamp = format(datetime.now(), "%m_%d_%H_%M")
    return f"predictions_{stamp}"

In [None]:
# Base random forest handed to the grid search as its estimator; the fixed seed
# keeps its fits repeatable.
rf = RandomForestRegressor(random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored by the grid search (3 * 3 * 3 * 3 * 2 = 162 combinations).
param_grid = {
    'n_estimators': [50, 100, 200],  # Number of trees in the forest
    'max_depth': [10, 20, 30],  # Maximum tree depth (the None option is currently disabled)
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],    # Minimum number of samples per leaf
    'bootstrap': [True, False]        # Whether to use bootstrap sampling, as suggested by the instructor
}

In [None]:
# Shared grid search over `rf` and `param_grid`; the train_with_* functions
# defined in this cell all fit this same module-level object.
# 5-fold CV, n_jobs=-1, candidates scored with negated mean absolute error.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')

def train_with_total_m2_price(train_df, test_df):
    """Grid-search a model that predicts both per-m2 prices at once.

    Features are every column of ``train_df`` except 'id' and the three price
    columns; the targets are 'prix_m2_interieur' and 'prix_m2_total'. The fit
    runs on the module-level ``grid_search`` object. ``test_df`` is accepted
    but not used.

    Returns
    -------
    ``grid_search.best_estimator_`` after the fit.
    """
    excluded = ['id', 'prix', 'prix_m2_interieur', 'prix_m2_total']
    features = train_df.drop(columns=excluded)
    targets = train_df[['prix_m2_interieur', 'prix_m2_total']]

    # Report the shapes handed to the search.
    for label, frame in (("X_train shape: ", features), ("y_train shape: ", targets)):
        print(label, frame.shape)

    grid_search.fit(features, targets)
    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_

def train_with_m2_price(train_df, test_df):
    """Grid-search a model that predicts the interior price per m2.

    Features are every column of ``train_df`` except 'id' and the three price
    columns; the target is the 'prix_m2_interieur' column. The fit runs on the
    module-level ``grid_search`` object. ``test_df`` is accepted but not used.

    Returns
    -------
    ``grid_search.best_estimator_`` after the fit.
    """
    excluded = ['id', 'prix', 'prix_m2_interieur', 'prix_m2_total']
    features = train_df.drop(columns=excluded)
    target = train_df['prix_m2_interieur']

    # Report the shapes handed to the search.
    for label, data in (("X_train shape: ", features), ("y_train shape: ", target)):
        print(label, data.shape)

    grid_search.fit(features, target)
    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_

def train_with_price(train_df, test_df):
    """Grid-search a model that predicts the full price ('prix').

    Features are every column of ``train_df`` except 'id', 'prix' and, when
    present, the per-m2 price columns. The fit runs on the module-level
    ``grid_search`` object. ``test_df`` is accepted but not used.

    Returns
    -------
    ``grid_search.best_estimator_`` after the fit.

    Raises
    ------
    KeyError
        If ``train_df`` has no 'id' or 'prix' column.
    """
    X_train = train_df.drop(columns=['id', 'prix'])
    # 'prix_m2_interieur' and 'prix_m2_total' are prediction targets themselves
    # and are not available in the file to predict on, so leaving them in the
    # features leaks the answer and breaks prediction later. They are dropped
    # only when present so older training files without them still work.
    X_train = X_train.drop(columns=['prix_m2_interieur', 'prix_m2_total'], errors='ignore')
    y_train = train_df['prix']

    grid_search.fit(X_train, y_train)

    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_
    

## On cherche les meilleurs paramètres avec grid search

Meilleurs paramètres trouvés :
Sans zip code, sans clustering des zip codes, étude du prix
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Clustering des zip codes, étude du prix (52min13.9s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}

Clustering, étude du prix au m2 intérieur (24min14.4s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

Clustering, prix m2 total, sin et scale (18min58.1s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [None]:
# best_rf_m2_price = train_with_m2_price(train_new, test_new)

In [None]:
# best_rf_full_price = train_with_price(train_df, test_df)

In [None]:
# best_rf_total_m2_price = train_with_total_m2_price(train_scaled, test_scaled)

In [None]:
def predict_for_m2(train_df, test_df):
    """Fit a random forest on the interior price per m2 and write a price submission CSV.

    The model is trained on every column of ``train_df`` except 'id' and the
    three price columns, with 'prix_m2_interieur' as target. It then predicts
    a price per m2 for ``test_df`` (all columns except 'id'), multiplies it by
    ``test_df['m2_interieur']`` and writes the 'id' and 'prix' columns to
    ``<current_time_filename()>-cluster-prix-m2-1.csv``.

    ``test_df`` is left untouched, so the function can be called repeatedly on
    the same frame.

    Returns
    -------
    None
    """
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df['prix_m2_interieur']

    model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=2, n_jobs=-1)

    model.fit(X_train, y_train)

    X_test = test_df.drop(columns=['id'])

    y_pred_m2 = model.predict(X_test)  # these predictions are prices per m2
    print(len(y_pred_m2))  # debug: number of predictions

    date_filename = current_time_filename()
    print(date_filename)

    filename = f"{date_filename}-cluster-prix-m2-1.csv"

    # Build the submission in a separate frame. Writing 'prix_m2' / 'prix' into
    # test_df itself would add columns that a later call passes to
    # model.predict as features.
    submission = test_df[['id']].copy()
    submission['prix'] = y_pred_m2 * test_df['m2_interieur'].to_numpy()
    submission.to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

def predict_for_total_m2(train_df, test_df, estimator = False, is_scaled = False, m2_price_col = None, id_col = None):
    """Fit a two-target forest on the per-m2 prices and write a price submission CSV.

    The model is trained on every column of ``train_df`` except 'id' and the
    three price columns, with 'prix_m2_interieur' and 'prix_m2_total' as
    targets. The predicted total price per m2 is multiplied by
    ``test_df['m2_total']`` to obtain 'prix', and the 'id' and 'prix' columns
    are written to a CSV named after ``current_time_filename()``.

    ``test_df`` is left untouched, so the function can be called repeatedly on
    the same frame.

    Parameters
    ----------
    train_df, test_df : DataFrames as described above; ``test_df`` needs the
        'id' and 'm2_total' columns.
    estimator : estimator, ``False`` or ``None``
        Model to (re)fit on ``train_df``. With ``False``/``None`` a default
        ``RandomForestRegressor`` is created.
    is_scaled : bool
        When true, the computed prices are passed through the inverse of a
        ``MinMaxScaler`` fitted on ``m2_price_col`` before being written.
    m2_price_col : Series, required when ``is_scaled`` is true
        Values used to fit the ``MinMaxScaler``.
    id_col : Series, optional
        When ``is_scaled`` is true and this is given, it replaces the 'id'
        column in the written file.

    Returns
    -------
    list
        The predicted total price per m2, one value per row of ``test_df``.

    Raises
    ------
    ValueError
        If ``is_scaled`` is true and ``m2_price_col`` is missing.
    """
    # Fail before the expensive fit rather than after it.
    if is_scaled and m2_price_col is None:
        raise ValueError("m2_price_col is required when is_scaled=True")

    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df[['prix_m2_interieur', 'prix_m2_total']]

    # Identity checks instead of `not estimator`: the truthiness of an sklearn
    # ensemble goes through its __len__, which reads `estimators_` and therefore
    # raises AttributeError for a forest that has not been fitted yet.
    if estimator is None or estimator is False:
        model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=1, n_jobs=-1)
    else:
        model = estimator

    model.fit(X_train, y_train)

    X_test = test_df.drop(columns=['id'])

    y_pred_m2 = model.predict(X_test)  # these predictions are prices per m2
    print(len(y_pred_m2))  # debug: number of predictions

    # In each predicted pair the first value is the interior price per m2 and
    # the second the total price per m2.
    y_pred_m2_array = seperate_total_and_inside(y_pred_m2)

    date_filename = current_time_filename()
    print(date_filename)

    # Build the submission in a separate frame. Writing 'prix_m2_tot' / 'prix'
    # into test_df itself would add columns that a later call passes to
    # model.predict as features.
    submission = test_df[['id']].copy()
    prix = np.asarray(y_pred_m2_array[1]) * test_df['m2_total'].to_numpy()

    if is_scaled:
        filename = f"{date_filename}-cluster-prix-m2tot-DeScaled.csv"

        # NOTE(review): this undoes a min-max scaling fitted on m2_price_col, but
        # it is applied to the product of a predicted price per m2 and
        # 'm2_total' -- confirm that this product really lives on the same
        # scaled axis as m2_price_col.
        scaler = MinMaxScaler()
        scaler.fit(m2_price_col.values.reshape(-1, 1))
        # inverse_transform needs a 2D column; flatten it back afterwards.
        prix = scaler.inverse_transform(prix.reshape(-1, 1)).ravel()

        # Only replace the ids when a replacement was actually supplied;
        # assigning None would write empty ids to the file.
        if id_col is not None:
            submission['id'] = id_col
    else:
        filename = f"{date_filename}-cluster-prix-m2tot.csv"

    submission['prix'] = prix
    submission[['id', 'prix']].to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

    return y_pred_m2_array[1]

def seperate_total_and_inside(array_to_separate):
    """Split an iterable of pairs into two parallel lists.

    Each item must be indexable at positions 0 and 1. Returns the tuple
    ``(first_values, second_values)``; callers use it on two-target
    predictions, where position 0 is the interior price per m2 and position 1
    the total price per m2.
    """
    pairs = list(array_to_separate)
    inside_values = [pair[0] for pair in pairs]
    total_values = [pair[1] for pair in pairs]
    return inside_values, total_values

In [None]:
# Refit the supplied estimator on the scaled data and write the de-scaled submission file.
# NOTE(review): train_scaled, test_scaled, train_new, test_new and
# best_rf_total_m2_price are not assigned in any visible cell (the call that
# would produce best_rf_total_m2_price is commented out above), so this fails
# on a fresh kernel -- confirm where they are meant to come from.
prediction = predict_for_total_m2(train_scaled, test_scaled, best_rf_total_m2_price, True, m2_price_col = train_new['prix_m2_total']*train_new['m2_total'], id_col = test_new['id']) 

# le code du cours de Logic

In [None]:
# Train a regression model
#model = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1, n_jobs=-1)
#model.fit(X_train, y_train)

In [None]:
# NOTE(review): `model` is not assigned in any earlier visible cell (its
# definition just above is commented out), so this raises NameError on a fresh
# kernel -- confirm the intended estimator.
# This also rebinds the module-level `grid_search` read by the train_with_* functions.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2, n_jobs=5)
grid_search.fit(X_train, y_train)

# Keep the best estimator found by the search.
best_model = grid_search.best_estimator_

In [None]:
# Prepare the test data (we don't have 'Sales_Qty' for the test set)
# NOTE(review): `test_df` is never assigned at notebook level in the visible
# cells (the prediction data is loaded as `pred_df`), and 'index' /
# 'Sales_Date' look like columns from a different dataset -- confirm before
# running. This also rebinds X_test from the train/test split cell.
X_test = test_df.drop(columns=['index', 'Sales_Date'])

In [None]:
# Make predictions
# Rebind `model` to the grid-search winner and predict on the prepared test features.
model = best_model
predictions = model.predict(X_test)

In [None]:
# Output predictions (You can store them in a new DataFrame and save to CSV)
# Adds a 'Predicted_Sales_Qty' column to test_df in place, then writes the
# 'index' and prediction columns to predictions.csv without the DataFrame index.
test_df['Predicted_Sales_Qty'] = predictions
test_df[['index', 'Predicted_Sales_Qty']].to_csv('predictions.csv', index=False)

In [None]:
# Optionally, evaluate the model on the training set
# NOTE: X_train / y_train are the data the grid search was fitted on, so this
# is a training error, not a held-out score.
train_predictions = model.predict(X_train)
mse = mean_squared_error(y_train, train_predictions)
print(f"Mean Squared Error on training set: {mse}")