# Intro to df

## Import libs and load dfs

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import tqdm

In [2]:
# Load the datasets
folder = "PRAMA2025/"
train_df = pd.read_csv(folder + 'train_with_clusters.csv')
test_df = pd.read_csv(folder + 'test_with_clusters.csv')

print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)

Train shape:  (17147, 24)
Test shape:  (4287, 23)


In [3]:
train_df.head()

Unnamed: 0,id,date,prix,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,...,annee_construction,annee_renovation,m2_interieur_15voisins,m2_jardin_15voisins,zipcode,cluster_tres_bas,cluster_bas,cluster_moyen,cluster_eleve,cluster_tres_eleve
0,4443800785,2014-11-21T00:00:00Z,481000,2,1.0,150.501672,360.460795,85.470085,65.031587,1.0,...,1924,0,123.560015,360.460795,98117,0,0,1,0,0
1,2722059013,2015-02-04T00:00:00Z,550000,2,1.0,117.985879,4046.822742,117.985879,0.0,1.0,...,1908,0,173.727239,646.599777,98042,0,0,0,1,0
2,8856004730,2014-09-17T00:00:00Z,199950,2,2.75,147.714604,1943.236715,147.714604,0.0,1.5,...,1920,0,121.70197,557.413601,98001,1,0,0,0,0
3,2473003210,2015-03-13T00:00:00Z,364808,3,1.75,215.533259,731.605351,150.501672,65.031587,1.0,...,1967,0,184.875511,903.010033,98058,0,1,0,0,0
4,6743700335,2014-06-04T00:00:00Z,470000,3,2.0,167.22408,1176.978818,167.22408,0.0,1.0,...,1956,1990,183.017466,908.119658,98033,0,0,1,0,0


In [4]:
# Preprocess the datasets
# Convert the Sales_Date to datetime format
train_df['date'] = pd.to_datetime(train_df['date'])
test_df['date'] = pd.to_datetime(test_df['date'])

In [5]:
train_df.head()

Unnamed: 0,id,date,prix,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,...,annee_construction,annee_renovation,m2_interieur_15voisins,m2_jardin_15voisins,zipcode,cluster_tres_bas,cluster_bas,cluster_moyen,cluster_eleve,cluster_tres_eleve
0,4443800785,2014-11-21 00:00:00+00:00,481000,2,1.0,150.501672,360.460795,85.470085,65.031587,1.0,...,1924,0,123.560015,360.460795,98117,0,0,1,0,0
1,2722059013,2015-02-04 00:00:00+00:00,550000,2,1.0,117.985879,4046.822742,117.985879,0.0,1.0,...,1908,0,173.727239,646.599777,98042,0,0,0,1,0
2,8856004730,2014-09-17 00:00:00+00:00,199950,2,2.75,147.714604,1943.236715,147.714604,0.0,1.5,...,1920,0,121.70197,557.413601,98001,1,0,0,0,0
3,2473003210,2015-03-13 00:00:00+00:00,364808,3,1.75,215.533259,731.605351,150.501672,65.031587,1.0,...,1967,0,184.875511,903.010033,98058,0,1,0,0,0
4,6743700335,2014-06-04 00:00:00+00:00,470000,3,2.0,167.22408,1176.978818,167.22408,0.0,1.0,...,1956,1990,183.017466,908.119658,98033,0,0,1,0,0


### Convert to date and add year/month/day cols

In [6]:
# Extract useful date features (e.g., year, month, day)
train_df['Year'] = train_df['date'].dt.year
train_df['Month'] = train_df['date'].dt.month
train_df['Day'] = train_df['date'].dt.day
train_df['cos_month'] = train_df['Month'].map(lambda x: np.cos(float(x)))

In [7]:
test_df['Year'] = test_df['date'].dt.year
test_df['Month'] = test_df['date'].dt.month
test_df['Day'] = test_df['date'].dt.day
test_df['cos_month'] = test_df['Month'].map(lambda x: np.cos(float(x)))

In [8]:
train_df = train_df.drop(columns=['date'])
train_df.head()

Unnamed: 0,id,prix,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,...,zipcode,cluster_tres_bas,cluster_bas,cluster_moyen,cluster_eleve,cluster_tres_eleve,Year,Month,Day,cos_month
0,4443800785,481000,2,1.0,150.501672,360.460795,85.470085,65.031587,1.0,0,...,98117,0,0,1,0,0,2014,11,21,0.004426
1,2722059013,550000,2,1.0,117.985879,4046.822742,117.985879,0.0,1.0,0,...,98042,0,0,0,1,0,2015,2,4,-0.416147
2,8856004730,199950,2,2.75,147.714604,1943.236715,147.714604,0.0,1.5,0,...,98001,1,0,0,0,0,2014,9,17,-0.91113
3,2473003210,364808,3,1.75,215.533259,731.605351,150.501672,65.031587,1.0,0,...,98058,0,1,0,0,0,2015,3,13,-0.989992
4,6743700335,470000,3,2.0,167.22408,1176.978818,167.22408,0.0,1.0,0,...,98033,0,0,1,0,0,2014,6,4,0.96017


In [9]:
test_df = test_df.drop(columns=['date'])
test_df.head()

Unnamed: 0,id,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,vue_note,...,zipcode,cluster_tres_bas,cluster_bas,cluster_moyen,cluster_eleve,cluster_tres_eleve,Year,Month,Day,cos_month
0,3600072,4,2.75,206.243032,493.311037,108.695652,97.54738,1.0,0,0,...,98144,0,0,1,0,0,2015,3,30,-0.989992
1,6200017,3,1.0,124.489038,1982.162765,124.489038,0.0,1.5,0,0,...,98032,1,0,0,0,0,2014,11,12,0.004426
2,7600136,2,2.0,104.979562,106.651802,74.321813,30.657748,2.0,0,0,...,98122,0,0,1,0,0,2014,7,18,0.753902
3,11200400,3,2.5,177.44333,416.945373,177.44333,0.0,2.0,0,0,...,98007,0,0,0,1,0,2014,9,23,-0.91113
4,11500890,3,2.5,290.784095,812.894835,290.784095,0.0,2.0,0,0,...,98052,0,0,0,1,0,2015,3,12,-0.989992


## Find categorical cols

In [10]:
def find_out_col_type(df):
    # Find out the column types
    cat_cols = [col for col in df.columns if df[col].dtype == 'object']
    num_cols = [col for col in df.columns if df[col].dtype in ['int64', 'float64']]
    return cat_cols, num_cols

In [11]:
print(find_out_col_type(train_df))
print()
print(find_out_col_type(test_df))

([], ['id', 'prix', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage', 'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note', 'design_note', 'annee_construction', 'annee_renovation', 'm2_interieur_15voisins', 'm2_jardin_15voisins', 'zipcode', 'cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve', 'cos_month'])

([], ['id', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage', 'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note', 'design_note', 'annee_construction', 'annee_renovation', 'm2_interieur_15voisins', 'm2_jardin_15voisins', 'zipcode', 'cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve', 'cos_month'])


# Seperate to have prix au metre carré

In [12]:
print(train_df.columns)
print()
print(test_df.columns)
# what we want to use is "m2_interieur", "m2_jardin", "m2_etage", "m2_soussol"

Index(['id', 'prix', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin',
       'm2_etage', 'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note',
       'etat_note', 'design_note', 'annee_construction', 'annee_renovation',
       'm2_interieur_15voisins', 'm2_jardin_15voisins', 'zipcode',
       'cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve',
       'cluster_tres_eleve', 'Year', 'Month', 'Day', 'cos_month'],
      dtype='object')

Index(['id', 'nb_chambres', 'nb_sdb', 'm2_interieur', 'm2_jardin', 'm2_etage',
       'm2_soussol', 'nb_etages', 'vue_mer', 'vue_note', 'etat_note',
       'design_note', 'annee_construction', 'annee_renovation',
       'm2_interieur_15voisins', 'm2_jardin_15voisins', 'zipcode',
       'cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve',
       'cluster_tres_eleve', 'Year', 'Month', 'Day', 'cos_month'],
      dtype='object')


In [13]:
def create_new_df_with_m2(train_df, test_df):
    # create a new data frame with everything plus a column corresponding to 1 if "m2_jardin" is >0, 0 otherwise
    # we will also do the same for m2_etage and m2_soussol
    train_new = train_df
    train_new['jardin'] = train_df['m2_jardin'].map(lambda x: 1 if x > 0 else 0)
    train_new['etage'] = train_df['m2_etage'].map(lambda x: 1 if x > 0 else 0)
    train_new['soussol'] = train_df['m2_soussol'].map(lambda x: 1 if x > 0 else 0)

    #do the same for test_df
    test_new = test_df
    test_new['jardin'] = test_df['m2_jardin'].map(lambda x: 1 if x > 0 else 0)
    test_new['etage'] = test_df['m2_etage'].map(lambda x: 1 if x > 0 else 0)
    test_new['soussol'] = test_df['m2_soussol'].map(lambda x: 1 if x > 0 else 0)

    # now add three new columns per df, one for interior m2 and one for exterior m2 and one for total m2
    # for interior we already have m2_interieur which is the sum of etage and soussol
    train_new['m2_outside'] = train_new['m2_jardin']
    test_new['m2_outside'] = test_new['m2_jardin']
    train_new['m2_total'] = train_new['m2_interieur'] + train_new['m2_outside']
    test_new['m2_total'] = test_new['m2_interieur'] + test_new['m2_outside']

    # now we add two columns per df, one for prix per m2 interior and one for prix per m2 total
    train_new['prix_m2_interieur'] = train_new['prix'] / train_new['m2_interieur']
    train_new['prix_m2_total'] = train_new['prix'] / train_new['m2_total']

    # now export
    train_name = "train_with_clusters_and_m2.csv"
    test_name = "test_with_clusters_and_m2.csv"
    train_new.to_csv(train_name, index=False)
    test_new.to_csv(test_name, index=False)

    print("Train shape: ", train_new.shape)
    print("Test shape: ", test_new.shape)

    return train_new, test_new

In [14]:
#train_new, test_new = create_new_df_with_m2(train_df, test_df)

# Load the datasets
folder = "PRAMA2025/"
train_new = pd.read_csv(folder + 'train_with_clusters_and_m2.csv')
test_new = pd.read_csv(folder + 'test_with_clusters_and_m2.csv')

In [15]:
train_new.head()

Unnamed: 0,id,prix,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,...,Month,Day,cos_month,jardin,etage,soussol,m2_outside,m2_total,prix_m2_interieur,prix_m2_total
0,4443800785,481000,2,1.0,150.501672,360.460795,85.470085,65.031587,1.0,0,...,11,21,0.004426,1,1,1,360.460795,510.962467,3195.977778,941.360727
1,2722059013,550000,2,1.0,117.985879,4046.822742,117.985879,0.0,1.0,0,...,2,4,-0.416147,1,1,0,4046.822742,4164.808621,4661.574803,132.058889
2,8856004730,199950,2,2.75,147.714604,1943.236715,147.714604,0.0,1.5,0,...,9,17,-0.91113,1,1,0,1943.236715,2090.951319,1353.623774,95.62633
3,2473003210,364808,3,1.75,215.533259,731.605351,150.501672,65.031587,1.0,0,...,3,13,-0.989992,1,1,1,731.605351,947.13861,1692.583324,385.168545
4,6743700335,470000,3,2.0,167.22408,1176.978818,167.22408,0.0,1.0,0,...,6,4,0.96017,1,1,0,1176.978818,1344.202899,2810.6,349.649596


# First train model

In [16]:
from datetime import datetime

def current_time_filename():
    # Obtenir la date et l'heure actuelles au format mois_jour_heure_min pour au cas où on veut spammer le kaggle
    current_time = datetime.now().strftime("%m_%d_%H_%M")

    # Et paf le nom de fichier avec la date et l'heure actuelles
    file_name = f"predictions_{current_time}"

    return file_name

In [17]:
rf = RandomForestRegressor(random_state=42)

In [18]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],  # Nombre d'arbres dans la forêt
    'max_depth': [10, 20, 30],#, None],  # Profondeur maximale des arbres
    'min_samples_split': [2, 5, 10],  # Nombre minimal d'échantillons pour une division interne
    'min_samples_leaf': [1, 2, 4],    # Nombre minimal d'échantillons par feuille
    'bootstrap': [True, False]        # Utiliser bootstrap ou non, via ce qu'à dit le prof
}

In [19]:
# train with price m2
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, 
                           cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_absolute_error')

def train_with_total_m2_price(train_df, test_df):
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df[['prix_m2_interieur', 'prix_m2_total']]

    print("X_train shape: ", X_train.shape)
    print("y_train shape: ", y_train.shape)

    grid_search.fit(X_train, y_train)

    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_

def train_with_m2_price(train_df, test_df):
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df['prix_m2_interieur']

    print("X_train shape: ", X_train.shape)
    print("y_train shape: ", y_train.shape)

    grid_search.fit(X_train, y_train)

    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_

def train_with_price(train_df, test_df):
    X_train = train_df.drop(columns=['id', 'prix'])
    y_train = train_df['prix']

    grid_search.fit(X_train, y_train)

    print(f"Meilleurs paramètres : {grid_search.best_params_}")

    return grid_search.best_estimator_
    

## On cherche les meilleurs paramètres avec grid search

Meilleurs params :
Sans zip code, sans clustering des zip code, etude du prix
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}

Clustering des zips, etude du prix (52min13.9s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}

Clustering, etude du prix au m2 intérieur (24min14.4s)
- Meilleurs paramètres : {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}

In [20]:
# best_rf_m2_price = train_with_m2_price(train_new, test_new)

In [21]:
# best_rf_full_price = train_with_price(train_df, test_df)

In [22]:
best_rf_total_m2_price = train_with_total_m2_price(train_new, test_new)

X_train shape:  (17147, 30)
y_train shape:  (17147, 2)
Fitting 5 folds for each of 162 candidates, totalling 810 fits
Meilleurs paramètres : {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


In [33]:
def predict_for_m2(train_df, test_df):
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df['prix_m2_interieur']

    model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=2, n_jobs=-1)

    model.fit(X_train, y_train)

    X_test = test_df.drop(columns=['id'])

    y_pred_m2 = model.predict(X_test) # la ca predit pour un prix au m2 !
    print(len(y_pred_m2)) # ca c'est pour du debug

    date_filename = current_time_filename()
    print(date_filename)

    filename = f"{date_filename}-cluster-prix-m2-1.csv"
    # Sauvegarder les résultats dans un fichier CSV avec ce nouveau nom
    test_df['prix_m2'] = y_pred_m2
    test_df['prix'] = test_df['prix_m2'] * test_df['m2_interieur']
    test_df[['id', 'prix']].to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

def predict_for_total_m2(train_df, test_df):
    X_train = train_df.drop(columns=['id', 'prix', 'prix_m2_interieur', 'prix_m2_total'])
    y_train = train_df[['prix_m2_interieur', 'prix_m2_total']]

    model = RandomForestRegressor(bootstrap = True, max_depth = 20, min_samples_leaf = 2, min_samples_split = 2, n_estimators = 200, verbose=1, n_jobs=-1)

    model.fit(X_train, y_train)

    X_test = test_df.drop(columns=['id'])

    y_pred_m2 = model.predict(X_test) # la ca predit pour un prix au m2 !
    print(len(y_pred_m2)) # ca c'est pour du debug

    #print(y_pred_m2[0]) # on voit que le premier de chaque duo est le prix au m2 interieur, l'autre au total

    y_pred_m2_array = seperate_total_and_inside(y_pred_m2)

    date_filename = current_time_filename()
    print(date_filename)

    filename = f"{date_filename}-cluster-prix-m2-2.csv"
    # Sauvegarder les résultats dans un fichier CSV avec ce nouveau nom
    test_df['prix_m2_tot'] = y_pred_m2_array[1]
    test_df['prix'] = test_df['prix_m2_tot'] * test_df['m2_total']
    test_df[['id', 'prix']].to_csv(filename, index=False)

    print(f"Fichier sauvegardé sous : {filename}")

def seperate_total_and_inside(array_to_separate):
    array_inside = []
    array_tot = []
    for c in array_to_separate :
        array_inside.append(c[0])
        array_tot.append(c[1])
    return array_inside, array_tot

In [34]:
predict_for_total_m2(train_new, test_new)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    3.1s


4287
predictions_03_14_09_43
Fichier sauvegardé sous : predictions_03_14_09_43-cluster-prix-m2-2.csv


[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    3.7s finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    0.0s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.0s finished


In [None]:
break

# le code du cours de Logic

In [None]:
# Train a regression model
#model = RandomForestRegressor(n_estimators=100, random_state=42, verbose=1, n_jobs=-1)
#model.fit(X_train, y_train)

In [None]:
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, verbose=2, n_jobs=5)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_

In [None]:
# Prepare the test data (we don't have 'Sales_Qty' for the test set)
X_test = test_df.drop(columns=['index', 'Sales_Date'])

In [None]:
# Make predictions
model = best_model
predictions = model.predict(X_test)

In [None]:
# Output predictions (You can store them in a new DataFrame and save to CSV)
test_df['Predicted_Sales_Qty'] = predictions
test_df[['index', 'Predicted_Sales_Qty']].to_csv('predictions.csv', index=False)

In [None]:
# Optionally, evaluate the model on the training set
train_predictions = model.predict(X_train)
mse = mean_squared_error(y_train, train_predictions)
print(f"Mean Squared Error on training set: {mse}")