# Imports

## Libs

In [13]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
# - Models -
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

# Other imports
import tqdm

## Datasets

In [None]:
# Base names (without extension) of the CSV files read below.
train_name = "train_with_clusters_with_sin_ratios_with_m2_price_scaled"  # file used for training
pred_name = "test_with_clusters_with_sin_ratios_with_m2_price_scaled"  # file to predict on

# Columns the models have to predict.
target_columns = ['prix', 'prix_m2_interieur', 'prix_m2_total']

# Both files are read from the same folder and share the .csv extension.
train_df = pd.read_csv(f"CSV DATA/{train_name}.csv")
pred_df = pd.read_csv(f"CSV DATA/{pred_name}.csv")

# Quick look at the size of what was loaded.
print("Train shape: ", train_df.shape)
print("Prediction file shape: ", pred_df.shape)

Train shape:  (17147, 32)
Prediction file shape:  (4287, 29)


In [6]:
# Hold out 20% of the labelled rows for evaluation; the fixed seed keeps the split repeatable.
# NOTE(review): only the target columns are removed, so `id` stays among the features —
# confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    train_df.drop(columns=target_columns),  # features: every column except the targets
    train_df[target_columns],               # targets
    test_size=0.2,
    random_state=42,
)

In [15]:
# Sanity check: shapes of the feature/target frames produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((13717, 29), (3430, 29), (13717, 3), (3430, 3))

# Cross Validation

In [16]:
def cross_val_func(model, X, y, cv=5):
    """Cross-validate ``model`` on ``(X, y)`` and print a short summary.

    Parameters
    ----------
    model : estimator handed as-is to ``cross_val_score``.
    X, y : data forwarded to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 5).

    Returns
    -------
    The array of scores returned by ``cross_val_score``. No ``scoring`` argument
    is passed, so the scorer is whatever ``cross_val_score`` uses by default.
    """
    fold_scores = cross_val_score(model, X, y, cv=cv)

    # Raw scores, then their mean and spread.
    report = (
        f"Cross-validation scores: {fold_scores}",
        f"Mean cross-validation score: {fold_scores.mean()}",
        f"Standard deviation of scores: {fold_scores.std()}",
    )
    for line in report:
        print(line)

    return fold_scores

In [17]:
def RMSE_calc(x1, x2):
    """Print and return the root mean squared error between ``x1`` and ``x2``.

    Both arguments are passed to ``mean_squared_error`` in the given order; the
    square root of its result is printed and returned.
    """
    mse = mean_squared_error(x1, x2)
    rmse = np.sqrt(mse)
    print("RMSE: ", rmse)
    return rmse

# Model - regression pénalisée

## Useful functions

In [18]:
from sklearn.linear_model import Lasso

In [19]:
def lasso_reg(X_train, y_train, X_test, lasso = None):
    """Fit a regressor on the training data and predict on ``X_test``.

    Parameters
    ----------
    X_train, y_train : training features and targets, passed to ``fit``.
    X_test : features passed to ``predict``.
    lasso : optional estimator exposing ``fit`` and ``predict``. When ``None``
        (the default) a fresh ``Lasso(alpha=0.1)`` is created.

    Returns
    -------
    (y_pred, lasso) : the predictions for ``X_test`` and the fitted estimator.
        A caller-supplied estimator is fitted in place and returned as-is.
    """
    # Compare with None explicitly. Truth-testing an object goes through its
    # __bool__/__len__, so an estimator that defines __len__ could be judged
    # "empty" and silently replaced by the default model.
    if lasso is None:
        lasso = Lasso(alpha=0.1)

    # Train the model.
    lasso.fit(X_train, y_train)

    # Predict on the evaluation set.
    y_pred = lasso.predict(X_test)

    return y_pred, lasso

In [None]:
# Load the targets as they were *before* scaling and fit a scaler on them, so that
# predictions can be mapped back and scored on the unscaled values.
#
# NOTE(review): the scaled targets displayed further down in this notebook are all
# within [0, 1], which looks like min-max scaling rather than standardisation. If the
# "_scaled" CSV was produced with a MinMaxScaler, inverting with a StandardScaler does
# not recover the original values — confirm which scaler produced the file.

from sklearn.preprocessing import StandardScaler

# Use the same folder spelling as the dataset-loading cell ('CSV DATA/'): on a
# case-sensitive filesystem 'CSV data/' is a different directory.
unscaled_data = pd.read_csv('CSV DATA/train_with_clusters_with_sin_ratios_with_m2_price.csv')[target_columns]
scaler = StandardScaler()
scaler.fit(unscaled_data)


In [29]:
# Apply the fitted scaler's inverse transform to the held-out targets.
y_test_descaled = scaler.inverse_transform(y_test)

## Comparaison RMSE du lasso

In [28]:
# Baseline: Lasso with alpha=0.1, fitted on the training split.
# NOTE(review): the cross-validation scores recorded under this cell are ~0, which
# would be consistent with alpha=0.1 shrinking every coefficient to zero on targets of
# this magnitude — check `lasso.coef_` and consider a smaller alpha.
lasso = Lasso(alpha=0.1)
y_pred_lasso, lasso = lasso_reg(X_train, y_train, X_test, lasso)

# Cross-validation on the training split (prints its own summary).
scores_crossval_lasso = cross_val_func(lasso, X_train, y_train, cv = 5)

# Keep an inverse-transformed copy of the predictions, then score on the scaled targets.
y_pred_lasso_descaled = scaler.inverse_transform(y_pred_lasso)
RMSE_lasso = RMSE_calc(y_test, y_pred_lasso)


Cross-validation scores: [ 0.00238959 -0.00093917  0.00047778  0.00142405  0.00180318]
Mean cross-validation score: 0.0010310869086803528
Standard deviation of scores: 0.001164829080732257
RMSE:  0.11499759963766087


In [30]:
# RMSE between the inverse-transformed targets and the inverse-transformed predictions.
RMSE_lasso_whole = RMSE_calc(y_test_descaled, y_pred_lasso_descaled)

RMSE:  11402.428961401441


In [36]:
# Columns selected from train_df for the final fit, in the order they are passed to the model.
pred_features = [
    'id',
    'nb_chambres', 'nb_sdb',
    'm2_interieur', 'm2_jardin', 'm2_etage', 'm2_soussol',
    'nb_etages',
    'vue_mer', 'vue_note', 'etat_note', 'design_note',
    'annee_construction', 'annee_renovation',
    'm2_interieur_15voisins', 'm2_jardin_15voisins',
    'zipcode',
    'cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve',
    'sin_month', 'sin_dayofyear',
    'jardin', 'etage', 'soussol',
    'm2_outside', 'm2_total',
]

In [60]:
# Use every labelled row for the final fit: features from pred_features, targets from target_columns.
final_train = train_df[pred_features]
final_y = train_df[target_columns]
# Display both shapes as a quick check.
final_train.shape, final_y.shape

((17147, 29), (17147, 3))

In [61]:
# Display the target frame used for the final fit.
final_y

Unnamed: 0,prix,prix_m2_interieur,prix_m2_total
0,0.058104,0.293826,0.217792
1,0.067979,0.484948,0.030203
2,0.017882,0.053574,0.021759
3,0.041475,0.097776,0.088872
4,0.056530,0.243571,0.080639
...,...,...,...
17142,0.097317,0.245033,0.273391
17143,0.028336,0.125889,0.055955
17144,0.054097,0.274472,0.127564
17145,0.052737,0.182217,0.046045


In [42]:
# Shape of the frame to predict on.
pred_df.shape

(4287, 29)

In [54]:
# Refit on every labelled row. The targets are in `final_y`; the name `final_test`
# used previously is not assigned anywhere in the visible notebook, so the cell raised
# NameError on a fresh kernel.
lasso.fit(final_train, final_y)

# Select the prediction columns explicitly so they line up with the training features,
# then apply the scaler's inverse transform to the three predicted targets.
lasso_final_pred = scaler.inverse_transform(lasso.predict(pred_df[pred_features]))

# Keep only the 'prix' column. Despite its name, this Series holds inverse-transformed values.
scaled_df_test = pd.DataFrame(lasso_final_pred, columns=target_columns, index=pred_df.index)['prix']

In [55]:
# Display the 'prix' predictions (already passed through scaler.inverse_transform, despite the name).
scaled_df_test

0       568645.364019
1       568645.051754
2       568644.883593
3       568644.451186
4       568644.415095
            ...      
4282    567464.665412
4283    567464.665365
4284    567464.473242
4285    567464.052982
4286    567464.052870
Name: prix, Length: 4287, dtype: float64

In [57]:
# Write the 'prix' predictions to disk; the Series index is written as the first column.
# NOTE(review): that index is pred_df's row index, not the `id` column — confirm this is
# the identifier expected by whoever consumes the file.
scaled_df_test.to_csv('predictions/lasso_pred_flo.csv')

# Model - Random Forest

## Useful functions

### time for filename of prediction

In [9]:
from datetime import datetime

def current_time_filename():
    """Build a file-name stem that embeds the current local date and time.

    Returns
    -------
    str
        ``"predictions_MM_DD_HH_MM"`` (month, day, hour, minute), so prediction
        files written in different minutes get different names.
    """
    return f"predictions_{datetime.now():%m_%d_%H_%M}"

### gridsearch of randomforest

In [10]:
# Base estimator handed to the grid search; the fixed seed makes its fits repeatable.
rf = RandomForestRegressor(random_state=42)

In [11]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values explored by the grid search (3 * 3 * 3 * 3 * 2 = 162 combinations).
param_grid = dict(
    n_estimators=[50, 100, 200],      # number of trees in the forest
    max_depth=[10, 20, 30],           # maximum tree depth (None is deliberately left out)
    min_samples_split=[2, 5, 10],     # minimum number of samples needed to split an internal node
    min_samples_leaf=[1, 2, 4],       # minimum number of samples per leaf
    bootstrap=[True, False],          # with or without bootstrap, as suggested by the teacher
)

In [12]:
# Grid search used to tune the forest on the price-per-m2 targets:
# 5-fold CV over `param_grid`, scored by negative mean absolute error, on all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

def find_rf_best_estimator(train_df, target_columns):
    """Run the notebook-level ``grid_search`` and return its best estimator.

    Parameters
    ----------
    train_df : frame holding both the features and the price columns.
    target_columns : columns removed from ``train_df`` to build the features.

    Returns
    -------
    ``grid_search.best_estimator_`` after fitting.

    Notes
    -----
    The targets are always ``prix_m2_interieur`` and ``prix_m2_total``, whatever
    ``target_columns`` contains. The function fits the global ``grid_search``
    object, so it uses whichever object that name is bound to at call time.
    """
    features = train_df.drop(columns=target_columns)
    targets = train_df[['prix_m2_interieur', 'prix_m2_total']]

    print("X_train shape: ", features.shape)
    print("y_train shape: ", targets.shape)

    grid_search.fit(features, targets)

    best_params = grid_search.best_params_
    print(f"Meilleurs paramètres : {best_params}")

    return grid_search.best_estimator_


### Prediction functions

In [13]:
def predict_for_m2(train_df, test_df):
    """Fit a random forest on the price per interior m2 and write a submission CSV.

    The model is trained on every column of ``train_df`` except ``id`` and the
    three price columns, with ``prix_m2_interieur`` as target. The predicted
    price per m2 is multiplied by ``m2_interieur`` to obtain ``prix``, and the
    ``id``/``prix`` columns are written to
    ``<current_time_filename()>-cluster-prix-m2-1.csv``.

    Parameters
    ----------
    train_df : frame with ``id``, ``prix``, ``prix_m2_interieur``,
        ``prix_m2_total`` and the feature columns.
    test_df : frame with ``id`` and the feature columns. It is modified in
        place: the columns ``prix_m2`` and ``prix`` are added or overwritten.

    Returns
    -------
    None
    """
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df['prix_m2_interieur']

    # Hyper-parameters match the grid-search result noted in this notebook for the
    # interior-m2 price. The seed is fixed so that two runs produce the same file.
    model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=2, n_jobs=-1, random_state=42)

    model.fit(X_train, y_train)

    # `id` must be present (KeyError otherwise, as before). The two output columns this
    # function adds to test_df are dropped as well when they exist, so that calling the
    # function a second time on the same frame does not feed them back as features.
    X_test = test_df.drop(columns=['id']).drop(columns=['prix_m2', 'prix'], errors='ignore')

    y_pred_m2 = model.predict(X_test)  # this is a price per m2, not a price
    print(len(y_pred_m2))  # debug output: number of predictions

    date_filename = current_time_filename()
    print(date_filename)

    filename = f"{date_filename}-cluster-prix-m2-1.csv"
    # Turn the price per m2 into a price and save the result under the time-stamped name.
    test_df['prix_m2'] = y_pred_m2
    test_df['prix'] = test_df['prix_m2'] * test_df['m2_interieur']
    test_df[['id', 'prix']].to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

def predict_for_total_m2(train_df, test_df, estimator = False, is_scaled = False, m2_price_col = None, id_col = None):
    """Fit a forest on both price-per-m2 targets and write a submission CSV.

    The model is trained on every column of ``train_df`` except ``id`` and the
    three price columns, with ``prix_m2_interieur`` and ``prix_m2_total`` as
    targets. The predicted total-m2 price is multiplied by ``m2_total`` to
    obtain ``prix``, which is written with ``id`` to a time-stamped CSV.

    Parameters
    ----------
    train_df : frame with ``id``, the three price columns and the features.
    test_df : frame with ``id``, ``m2_total`` and the features. It is modified
        in place: ``prix_m2_tot`` and ``prix`` are added or overwritten (and
        ``id`` too when ``is_scaled`` is true and ``id_col`` is given).
    estimator : model to fit instead of the default forest. ``False`` (the
        default) or ``None`` selects the default ``RandomForestRegressor``.
    is_scaled : when true, ``prix`` is passed through the inverse transform of
        a ``MinMaxScaler`` fitted on ``m2_price_col`` before being written.
    m2_price_col : values the ``MinMaxScaler`` is fitted on; required when
        ``is_scaled`` is true. Must expose ``.values``.
    id_col : identifiers written to the CSV when ``is_scaled`` is true. When
        ``None``, the ``id`` column already in ``test_df`` is kept.

    Returns
    -------
    list
        The predicted ``prix_m2_total`` values (second model output), as
        returned by ``seperate_total_and_inside``.

    Raises
    ------
    ValueError
        If ``is_scaled`` is true and ``m2_price_col`` is ``None``.
    """
    # Fail before the slow fit rather than after it.
    if is_scaled and m2_price_col is None:
        raise ValueError("m2_price_col is required when is_scaled=True")

    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df[['prix_m2_interieur', 'prix_m2_total']]

    # Test the sentinel by identity, not truthiness: truth-testing an ensemble estimator
    # goes through its __len__, which reads `estimators_` and fails on an unfitted model.
    if estimator is None or estimator is False:
        # Default forest; the seed is fixed so that two runs give the same predictions.
        model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=1, n_jobs=-1, random_state=42)
    else:
        model = estimator

    model.fit(X_train, y_train)

    # `id` must be present (KeyError otherwise, as before). The output columns this
    # function adds to test_df are dropped too when they exist, so that a second call on
    # the same frame does not feed them back as features.
    X_test = test_df.drop(columns=['id']).drop(columns=['prix_m2_tot', 'prix'], errors='ignore')

    y_pred_m2 = model.predict(X_test)  # price per m2, one (interior, total) pair per row
    print(len(y_pred_m2))  # debug output: number of predictions

    # Index 0 holds the interior-m2 predictions, index 1 the total-m2 predictions.
    y_pred_m2_array = seperate_total_and_inside(y_pred_m2)

    date_filename = current_time_filename()
    print(date_filename)

    test_df['prix_m2_tot'] = y_pred_m2_array[1]
    test_df['prix'] = test_df['prix_m2_tot'] * test_df['m2_total']

    if is_scaled:
        filename = f"{date_filename}-cluster-prix-m2tot-DeScaled.csv"

        # NOTE(review): `prix` is the product of two columns here; inverting it with a
        # scaler fitted on m2_price_col only recovers a price if that matches how the
        # inputs were scaled — TODO confirm against the scaling notebook.
        price_scaler = MinMaxScaler()
        price_scaler.fit(m2_price_col.values.reshape(-1, 1))

        # inverse_transform needs a 2-D array; flatten the result back to one column.
        scaled_prices = np.array(test_df['prix']).reshape(-1, 1)
        test_df['prix'] = price_scaler.inverse_transform(scaled_prices).ravel()

        # Only replace the ids when some were supplied; otherwise keep test_df['id'].
        if id_col is not None:
            test_df['id'] = id_col
    else:
        filename = f"{date_filename}-cluster-prix-m2tot.csv"

    test_df[['id', 'prix']].to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

    return y_pred_m2_array[1]

def seperate_total_and_inside(array_to_separate):
    """Split an iterable of indexable items into two parallel lists.

    Parameters
    ----------
    array_to_separate : iterable whose items support ``item[0]`` and ``item[1]``.

    Returns
    -------
    (array_inside, array_tot) : the list of every ``item[0]`` and the list of
        every ``item[1]``, in input order. Both are empty for an empty input.
    """
    # Materialise once so that one-shot iterables are only consumed a single time.
    pairs = list(array_to_separate)
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]

## On cherche les meilleurs paramètres avec grid search

Meilleurs paramètres trouvés par la grid search, selon la configuration des données :

Sans zip code, sans clustering des zip codes, étude du prix
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Clustering des zip codes, étude du prix (52min13.9s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}

Clustering, étude du prix au m2 intérieur (24min14.4s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

Clustering, prix m2 total, sin et scale (18min58.1s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [14]:
#target_columns = ['id', 'prix', 'prix_m2_interieur', 'prix_m2_total']
#best_rf_m2_price = find_rf_best_estimator(train_df, target_columns)

In [15]:
#prediction = predict_for_total_m2(train_scaled, test_scaled, best_rf_total_m2_price, True, m2_price_col = train_new['prix_m2_total']*train_new['m2_total'], id_col = test_new['id']) 

# le code du cours de Logic

In [16]:
# Deliberate error: evaluating the undefined name `a` raises NameError, which stops a
# "Run All" here so the cells below do not run and Kaggle is not spammed.
# NOTE(review): this only works while no variable named `a` exists in the kernel; an
# explicit `raise` would be more robust.
a

NameError: name 'a' is not defined

In [None]:
# Train a regression model
#model = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1, n_jobs=-1)
#model.fit(X_train, y_train)

In [None]:
# Leftover course code: grid search over `param_grid`, fitted on the train split.
# NOTE(review): in the visible notebook `model` is only assigned in a commented-out cell
# and in a later cell, so on a fresh kernel this line raises NameError.
# NOTE(review): this rebinds the global `grid_search` that find_rf_best_estimator reads.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2, n_jobs=5)
grid_search.fit(X_train, y_train)

# Keep the estimator refitted with the best parameter combination.
best_model = grid_search.best_estimator_

In [None]:
# Prepare the test data (we don't have 'Sales_Qty' for the test set)
# NOTE(review): `test_df` is not assigned at notebook level in the visible cells, and the
# 'index'/'Sales_Date' columns do not appear elsewhere in this notebook — this looks like
# code for another dataset. It also overwrites the `X_test` produced by the split.
X_test = test_df.drop(columns=['index', 'Sales_Date'])

In [None]:
# Make predictions with the estimator selected by the grid search.
model = best_model
predictions = model.predict(X_test)

In [None]:
# Output predictions: store them in a new column of test_df, then write the
# 'index' and 'Predicted_Sales_Qty' columns to predictions.csv (row index omitted).
test_df['Predicted_Sales_Qty'] = predictions
test_df[['index', 'Predicted_Sales_Qty']].to_csv('predictions.csv', index=False)

In [None]:
# Optionally, evaluate the model on the training set.
# The error is computed on the same rows the model was fitted on, so it measures fit,
# not generalisation.
train_predictions = model.predict(X_train)
mse = mean_squared_error(y_train, train_predictions)
print(f"Mean Squared Error on training set: {mse}")