# Import libs and load dfs

In [None]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

# Other imports
import tqdm

In [None]:
# Load the datasets

# Alternative layout, kept for reference (datasets stored in a dedicated folder):
# folder = "PRAMA2025/"
# modify_df = pd.read_csv(folder + 'train_with_clusters.csv')
# test_df = pd.read_csv(folder + 'test_with_clusters.csv')

# Default layout: the CSV files are read from the "CSV DATA" folder,
# resolved relative to the current working directory.

origin_train_name = "train_data"
origin_test_name = "test_data"

# Original, untouched train and test CSV files
train_df = pd.read_csv(f"CSV DATA/{origin_train_name}.csv")
test_df = pd.read_csv(f"CSV DATA/{origin_test_name}.csv")

# Quick sanity check on what was loaded
print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)

# Transforming the data

### Attention nouvelle règle : 

- Damien is to come to the next classes with no underwear and a kilt.
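- We'll add a value between 0 and 1 representing how far through the year a date falls: 0 for the beginning of the year, 1 for the end of the year. Then we'll multiply this value by pi and take its sine.
- This way we'll have values close to 1 in the middle of the year and close to 0 around the beginning and the end of the year, so the feature follows the seasonal cycle (which of these corresponds to the 'colder' or 'hotter' days depends on the hemisphere the data comes from).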

## Sine of all date data

In [None]:
def sin_ratios_for_date(modify_df, name):
    """Replace the 'date' column with sine-based seasonal features.

    The input frame is left untouched; all work happens on a copy.

    Args:
        modify_df: DataFrame with a 'date' column parseable by
            ``pd.to_datetime``.
        name: Base name of the dataset.

    Returns:
        A tuple ``(df, new_name)`` where ``df`` is a copy of ``modify_df``
        without 'date' and with two added columns, 'sin_month' and
        'sin_dayofyear', and ``new_name`` is ``name + "_with_sin_ratios"``.
    """
    # Work on a copy so the caller's DataFrame does not silently gain columns
    # or have its 'date' column converted.
    result = modify_df.copy()
    dates = pd.to_datetime(result['date'])

    # NOTE(review): unlike 'sin_dayofyear' below, this is sin(month / 12)
    # with no pi factor. Kept as-is so feature values stay the same;
    # confirm whether sin(pi * month / 12) was intended.
    result['sin_month'] = np.sin(dates.dt.month / 12)

    # Fraction of the year elapsed (leap years have 366 days), mapped through
    # sin(pi * ratio): 0 at the start/end of the year, 1 in the middle.
    days_in_year = np.where(dates.dt.is_leap_year, 366, 365)
    day_ratio = dates.dt.dayofyear / days_in_year
    result['sin_dayofyear'] = np.sin(np.pi * day_ratio)

    # Drop the raw date plus any pre-existing calendar helper columns so only
    # the sine encodings remain alongside the other features.
    result = result.drop(
        columns=['date', 'Year', 'Month', 'Day',
                 'day_of_year', 'days_in_year', 'day_ratio'],
        errors='ignore',
    )

    new_name = name + "_with_sin_ratios"

    return result, new_name

## Transform df to have price/square meters

In [None]:
def create_new_df_with_m2(to_modify_df, name):
    """Add surface flags, aggregated surfaces and price-per-m2 columns.

    Works on a copy of ``to_modify_df``; the input frame is not modified.

    Args:
        to_modify_df: DataFrame containing the columns 'm2_jardin',
            'm2_etage', 'm2_soussol', 'm2_interieur' and 'prix'.
        name: Base name of the dataset.

    Returns:
        A tuple ``(df, new_name)`` where ``df`` is the enriched copy and
        ``new_name`` is ``name + "_with_m2_price"``.

    Side effects:
        Writes the enriched frame to "train_with_clusters_and_m2.csv" in the
        current working directory (fixed file name, independent of ``name``)
        and prints its shape.
    """
    modify_df = to_modify_df.copy()

    # 1 when the corresponding surface is > 0, else 0. The flags are computed
    # from the frame being transformed, not from a global DataFrame, so the
    # function works for any input and stays row-aligned.
    for flag, surface in (('jardin', 'm2_jardin'),
                          ('etage', 'm2_etage'),
                          ('soussol', 'm2_soussol')):
        modify_df[flag] = (modify_df[surface] > 0).astype(int)

    # Exterior surface is the garden; total surface is interior + exterior.
    # 'm2_interieur' is taken as given from the input.
    modify_df['m2_outside'] = modify_df['m2_jardin']
    modify_df['m2_total'] = modify_df['m2_interieur'] + modify_df['m2_outside']

    # Price per interior m2 and per total m2.
    # NOTE: a zero surface yields inf (pandas division does not raise).
    modify_df['prix_m2_interieur'] = modify_df['prix'] / modify_df['m2_interieur']
    modify_df['prix_m2_total'] = modify_df['prix'] / modify_df['m2_total']

    # Export the enriched frame
    train_name = "train_with_clusters_and_m2.csv"
    modify_df.to_csv(train_name, index=False)

    print("Train shape: ", modify_df.shape)

    new_name = name + "_with_m2_price"

    return modify_df, new_name

## Scaling the data

In [None]:
def scaling_data(df, name, scaler=None, *, fit=True):
    """Scale the feature columns of ``df`` and return the scaled copy.

    The columns 'id', 'lat', 'long', 'sin_month' and 'sin_dayofyear' are left
    unscaled; every other column is passed to the scaler. ``df`` itself is
    not modified.

    Args:
        df: DataFrame to scale.
        name: Base name of the dataset.
        scaler: Scaler object exposing ``fit_transform`` (and ``transform``
            when ``fit=False``). A new ``MinMaxScaler`` is created when None.
        fit: When True (default), the scaler is (re)fit on ``df`` before
            transforming, even if a scaler was supplied. Pass ``fit=False``
            to apply an already-fitted scaler unchanged, e.g. to scale a test
            set with statistics learned on the train set; ``df`` must then
            contain the same scaled columns the scaler was fitted on.

    Returns:
        A tuple ``(scaler, scaled_df, scaled_name)`` where ``scaled_name`` is
        ``name + "_scaled"``.

    Raises:
        ValueError: If ``fit=False`` is requested without a scaler.
    """
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted scaler")
        scaler = MinMaxScaler()

    # Scale everything except identifiers, coordinates and the sine features
    columns_to_scale = df.columns.difference(['id', 'lat', 'long', 'sin_month', 'sin_dayofyear'])
    values = df[columns_to_scale]

    scaled_df1 = df.copy()  # keep the caller's frame untouched
    if fit:
        scaled_df1[columns_to_scale] = scaler.fit_transform(values)
    else:
        scaled_df1[columns_to_scale] = scaler.transform(values)

    scaled_name = name + "_scaled"

    return scaler, scaled_df1, scaled_name

## Export the csv

In [None]:
def exporting_df(df, name):
    """Write ``df`` to ``CSV DATA/<name>.csv`` without the index.

    The target folder uses the same spelling as the folder the input CSV
    files are read from, so the export also works on case-sensitive
    filesystems. The folder is created if it does not exist yet.

    Args:
        df: DataFrame to export.
        name: File name without the ".csv" extension.
    """
    from pathlib import Path

    out_dir = Path('CSV DATA')
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_dir / f"{name}.csv", index=False)

### Call the transfos

In [None]:
# Encode the date column of both splits as sine features
train_sin, train_sin_name = sin_ratios_for_date(modify_df=train_df, name=origin_train_name)
test_sin, test_sin_name = sin_ratios_for_date(modify_df=test_df, name=origin_test_name)

# Add the price-per-square-metre columns (train split only)
train_sin_m2, train_sin_m2_name = create_new_df_with_m2(
    to_modify_df=train_sin,
    name=train_sin_name,
)

# Scale the three frames. The first call creates the scaler; the next two pass
# it back in and rebind `scaler1` to whatever scaling_data returns.
# NOTE(review): confirm whether the test split is meant to be scaled with
# statistics fitted on the train split.
scaler1, train_sin_m2_scaled, train_sin_m2_scaled_name = scaling_data(
    df=train_sin_m2,
    name=train_sin_m2_name,
)
scaler1, train_sin_scaled, train_sin_scaled_name = scaling_data(
    df=train_sin,
    name=train_sin_name,
    scaler=scaler1,
)
scaler1, test_sin_scaled, test_sin_scaled_name = scaling_data(
    df=test_sin,
    name=test_sin_name,
    scaler=scaler1,
)

In [None]:
# Preview the first rows of the scaled train frame (displayed as the cell output)
train_sin_m2_scaled.head()

In [None]:
# Write the scaled train frame to a CSV file named after its accumulated name
exporting_df(train_sin_m2_scaled, train_sin_m2_scaled_name)

In [None]:
# X_train, y_train, X_test, y_test = train_test_split(train_new.drop(columns=['prix']), train_new['prix'], test_size=0.2, random_state=42)

In [None]:
# clusters = ['cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve']
# train_new.drop(columns=clusters, inplace=True)