# Import libs and load dfs

In [1]:
# Classic imports
import pandas as pd
import numpy as np

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
# - Preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Other imports
import tqdm

# Transforming the data

### Attention nouvelle règle : 

- Damien is to come to the next classes with no underwear and a kilt.
- We'll add a value between 0 and 1 showcasing the evolution of the year : 0 for the beginnin of the year, 1 for the end of the year. Then, we'll multiply this by pi and use the sine of this value.
- This way we'll have values closer to 1 for 'colder' days and 0 for 'hotter' days. 

## Sinus of all date data

In [2]:
def sin_ratios_for_date(modify_df, name):
    # Preprocess the datasets
    # Convert the Sales_Date to datetime format
    modify_df['date'] = pd.to_datetime(modify_df['date'])

    # Extract useful date features (e.g., year, month, day)
    modify_df['Year'] = modify_df['date'].dt.year
    modify_df['Month'] = modify_df['date'].dt.month
    modify_df['Day'] = modify_df['date'].dt.day
    modify_df['sin2_month'] = modify_df['Month'].map(lambda x: ((np.sin(float(x)/12)**2)))  # value will possibly be negative if standardscaler has been applied after calculus of sinus

    modify_df['day_of_year'] = modify_df['date'].dt.dayofyear
    # Calcul du nombre de jours dans l'année (365 ou 366)
    modify_df['days_in_year'] = np.where(modify_df['date'].dt.is_leap_year, 366, 365)
    # Calcul El Famoso ratio (float entre 0 et 1)
    modify_df['day_ratio'] = modify_df['day_of_year'] / modify_df['days_in_year']
    # Calculer le sinus de pi * ratio
    modify_df['sin_dayofyear'] = np.sin(np.pi * modify_df['day_ratio'])
    modify_df.drop(columns=['day_of_year', 'days_in_year', 'day_ratio'], inplace=True)
    modify_df = modify_df.drop(columns=['date', 'Year', 'Month', 'Day'])

    new_name = name + "_with_sin_ratios"

    return modify_df, new_name

## Transform df to have price/square meters

In [3]:
def create_new_df_with_m2(to_modify_df, name, include_price = False):
    # create a new data frame with everything plus a column corresponding to 1 if "m2_jardin" is >0, 0 otherwise
    # we will also do the same for m2_etage and m2_soussol
    modify_df = to_modify_df.copy()
    modify_df['jardin'] = modify_df['m2_jardin'].map(lambda x: 1 if x > 0 else 0)
    modify_df['etage'] = modify_df['m2_etage'].map(lambda x: 1 if x > 0 else 0)
    modify_df['soussol'] = modify_df['m2_soussol'].map(lambda x: 1 if x > 0 else 0)

    # now add three new columns per df, one for interior m2 and one for exterior m2 and one for total m2
    # for interior we already have m2_interieur which is the sum of etage and soussol
    modify_df['m2_outside'] = modify_df['m2_jardin']
    modify_df['m2_total'] = modify_df['m2_interieur'] + modify_df['m2_outside']

    if include_price:
        # now we add two columns per df, one for prix per m2 interior and one for prix per m2 total
        modify_df['prix_m2_interieur'] = modify_df['prix'] / modify_df['m2_interieur']
        modify_df['prix_m2_total'] = modify_df['prix'] / modify_df['m2_total']

    new_name = name + "_with_m2_price"

    return modify_df, new_name

## Scaling the data

In [19]:
def scaling_data(df_train, df_test,name_train, name_test, scaler = None, scaler_type = 'standard'):
    if scaler :
        scaler = scaler
    else :
        if scaler_type == 'minmax':
            # Initialize the MinMaxScaler
            scaler = MinMaxScaler()
            scaler2 = MinMaxScaler()
        elif scaler_type == 'standard':
            # Initialize the StandardScaler
            scaler = StandardScaler()
            scaler2 = StandardScaler()
        else:
            scaler = MinMaxScaler() # Default scaler will be minmax
            scaler2 = MinMaxScaler()
            #raise ValueError("Invalid scaler type. Please choose between 'minmax' and 'standard'.

    # Scale the data, but not the ID column
    columns_to_scale = df_train.columns.difference(['id', 'lat', 'long', 'sin_month', 'sin_dayofyear'])
    columns_to_scale_without_price = df_train.columns.difference(['id', 'lat', 'long', 'sin_month', 'sin_dayofyear', 'prix','prix_m2_interieur', 'prix_m2_total'])
    to_scale_df_train = df_train[columns_to_scale].copy() # to not modify the original one

    scaled_df_train= df_train.copy()
    scaled_df_train[columns_to_scale] = scaler.fit_transform(to_scale_df_train)

    #on entraîne un nouveau scaler sur les mêmes colonnes en enlevant le prix
    df_train_without_price = df_train[columns_to_scale_without_price].copy()
    scaler2.fit(df_train_without_price)

    # df_test_to_scale = df_test[columns_to_scale_without_price].copy()
    df_test_to_scale = df_test[columns_to_scale_without_price].copy()
    print(df_test_to_scale.shape)
    print(df_test.shape)
    scaled_df_test = df_test.copy()
    # scaled_df_test = scaler2.transform(df_test_to_scale)
    scaled_array = scaler2.transform(df_test_to_scale)
    scaled_df_test = pd.DataFrame(scaled_array, columns=df_test_to_scale.columns, index=df_test_to_scale.index)
    # scaled_df_test = pd.DataFrame(scaled_df_test_array, columns=df_test.columns, index=df_test.index)
    # scaled_df_test = df_test.copy()
    # scaled_df_test = scaler2.transform(df_test_to_scale)

    if scaler_type:
        train_scaled_name = name_train + "_scaled" + "_" + scaler_type
        test_scaled_name = name_test + "_scaled" + "_" + scaler_type
    else:
        train_scaled_name = name_train + "_scaled"
        test_scaled_name = name_test + "_scaled"

    return scaler, scaled_df_train, scaled_df_test, train_scaled_name, test_scaled_name

## Export the csv

In [5]:
def exporting_df(df, name):
    df.to_csv('CSV data/'+ name + '.csv', index=False)


# Call the transfos

In [6]:
# Load the datasets
origin_train_name = "train_with_clusters"
origin_test_name = "test_with_clusters"

train_df = pd.read_csv('CSV DATA/' + origin_train_name + '.csv')  # fichier csv VIERGE d'origine du train, ou clusterisé
test_df = pd.read_csv('CSV DATA/' + origin_test_name + '.csv')  # fichier csv VIERGE d'origine du test, ou clusterisé

#little printy print 
print("Train shape: ", train_df.shape)
print("Test shape: ", test_df.shape)

Train shape:  (17147, 24)
Test shape:  (4287, 23)


In [7]:
# dates to sinus function
train_sin, train_sin_name = sin_ratios_for_date(train_df, origin_train_name)
test_sin, test_sin_name = sin_ratios_for_date(test_df, origin_test_name)

# find price per square meter
train_sin_m2, train_sin_m2_name = create_new_df_with_m2(train_sin, train_sin_name, include_price=True)
test_sin_m2, test_sin_m2_name = create_new_df_with_m2(test_sin, test_sin_name, include_price=False)

# Scale the data
# scaler1, train_sin_m2_scaled, train_sin_m2_scaled_name = scaling_data(train_sin_m2, train_sin_m2_name, scaler_type='standard')
# scaler1, train_sin_scaled, train_sin_scaled_name = scaling_data(train_sin, train_sin_name, scaler=scaler1, scaler_type='standard')
# scaler1, test_sin_m2_scaled, test_sin_m2_scaled_name = scaling_data(test_sin_m2, test_sin_m2_name, scaler=scaler1, scaler_type='standard')
# scaler1, test_sin_scaled, test_sin_scaled_name = scaling_data(test_sin, test_sin_name, scaler=scaler1, scaler_type='standard')




In [8]:
train_sin_m2.shape, test_sin_m2.shape

((17147, 32), (4287, 29))

In [20]:
scaler, scaled_df_train, scaled_df_test, train_scaled_name, test_scaled_name = scaling_data(train_sin_m2, test_sin_m2,train_sin_m2_name, test_sin_m2_name, scaler_type = 'standard')

(4287, 27)
(4287, 29)


In [21]:
scaled_df_train.shape, scaled_df_test.shape

((17147, 32), (4287, 27))

In [23]:
scaled_df_test

Unnamed: 0,annee_construction,annee_renovation,cluster_bas,cluster_eleve,cluster_moyen,cluster_tres_bas,cluster_tres_eleve,design_note,etage,etat_note,...,m2_soussol,m2_total,nb_chambres,nb_etages,nb_sdb,sin2_month,soussol,vue_mer,vue_note,zipcode
0,-0.686064,-0.212367,-0.511420,-0.500146,2.047660,-0.499964,-0.500055,-0.565740,0.0,2.448988,...,1.709564,-0.232674,0.674174,-0.918153,0.811236,-1.113308,1.248548,-0.090074,-0.306657,1.241156
1,-0.890303,-0.212367,-0.511420,-0.500146,-0.488362,2.000146,-0.500055,-2.257975,0.0,0.909381,...,-0.656952,0.128619,-0.398785,0.004358,-1.453014,1.538983,-0.800930,-0.090074,-0.306657,-0.854090
2,1.220168,-0.212367,-0.511420,-0.500146,2.047660,-0.499964,-0.500055,1.126496,0.0,-0.630227,...,0.086810,-0.357955,-1.471744,0.926869,-0.159157,0.016459,1.248548,-0.090074,-0.306657,0.829590
3,0.913809,-0.212367,-0.511420,1.999417,-0.488362,-0.499964,-0.500055,0.280378,0.0,-0.630227,...,-0.656952,-0.259676,-0.398785,0.926869,0.487771,0.768639,-0.800930,-0.090074,-0.306657,-1.321779
4,0.675530,-0.212367,-0.511420,1.999417,-0.488362,-0.499964,-0.500055,1.972614,0.0,-0.630227,...,-0.656952,-0.128909,-0.398785,0.926869,0.487771,-1.113308,-0.800930,-0.090074,-0.306657,-0.479939
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4282,-0.788183,-0.212367,1.955338,-0.500146,-0.488362,-0.499964,-0.500055,-0.565740,0.0,2.448988,...,-0.656952,-0.278712,-0.398785,0.926869,1.134700,-1.270457,-0.800930,-0.090074,-0.306657,1.241156
4283,1.288247,-0.212367,-0.511420,-0.500146,2.047660,-0.499964,-0.500055,0.280378,0.0,-0.630227,...,0.379807,-0.353184,-0.398785,0.926869,0.164307,-0.899431,1.248548,-0.090074,-0.306657,1.241156
4284,-0.107386,-0.212367,-0.511420,-0.500146,-0.488362,2.000146,-0.500055,0.280378,0.0,-0.630227,...,0.379807,-0.172800,-0.398785,-0.918153,0.164307,-0.326610,1.248548,-0.090074,-0.306657,-0.854090
4285,-2.115737,-0.212367,-0.511420,-0.500146,-0.488362,-0.499964,1.999781,0.280378,0.0,-0.630227,...,-0.409031,-0.264161,0.674174,0.926869,-0.159157,-1.366523,1.248548,-0.090074,-0.306657,0.829590


In [24]:
scaled_df_train

Unnamed: 0,id,prix,nb_chambres,nb_sdb,m2_interieur,m2_jardin,m2_etage,m2_soussol,nb_etages,vue_mer,...,cluster_tres_eleve,sin2_month,sin_dayofyear,jardin,etage,soussol,m2_outside,m2_total,prix_m2_interieur,prix_m2_total
0,4443800785,-0.167369,-1.471744,-1.453014,-0.505115,-0.271088,-1.050087,0.920725,-0.918153,-0.090074,...,-0.500055,1.538983,0.337523,0.0,0.0,1.248548,-0.271088,-0.281097,0.290452,0.600286
1,2722059013,0.018316,-1.471744,-1.453014,-0.885037,0.679263,-0.629499,-0.656952,-0.918153,-0.090074,...,-0.500055,-1.270457,0.296713,0.0,0.0,-0.800930,0.679263,0.657080,1.520718,-1.039685
2,8856004730,-0.923697,-1.471744,0.811236,-0.537680,0.136954,-0.244962,-0.656952,0.004358,-0.090074,...,-0.500055,0.768639,0.785650,0.0,0.0,-0.800930,0.136954,0.124587,-1.256075,-1.113512
3,2473003210,-0.480051,-0.398785,-0.482621,0.254730,-0.175406,-0.208911,0.920725,-0.918153,-0.090074,...,-0.500055,-1.113308,0.580800,0.0,0.0,1.248548,-0.175406,-0.169103,-0.971542,-0.526783
4,6743700335,-0.196971,-0.398785,-0.159157,-0.309726,-0.060588,0.007391,-0.656952,-0.918153,-0.090074,...,-0.500055,-0.326610,0.972118,0.0,0.0,-0.800930,-0.060588,-0.067151,-0.033046,-0.598759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17142,6738700075,0.569987,0.674174,0.811236,0.862606,-0.268214,0.367895,1.101031,0.004358,-0.090074,...,1.999781,-0.326610,0.998880,0.0,0.0,1.248548,-0.268214,-0.248179,-0.023633,1.086345
17143,9407100500,-0.727114,-0.398785,-0.482621,-0.591954,-0.111459,-0.305046,-0.656952,0.926869,-0.090074,...,-0.500055,-1.113308,0.566702,0.0,0.0,-0.800930,-0.111459,-0.124019,-0.790578,-0.814552
17144,686300930,-0.242719,-0.398785,-0.482621,-0.526825,-0.190807,-0.232945,-0.656952,-0.918153,-0.090074,...,-0.500055,-1.113308,0.523416,0.0,0.0,-0.800930,-0.190807,-0.201616,0.165864,-0.188526
17145,8161020060,-0.268284,0.674174,0.487771,-0.049208,0.157647,0.295794,-0.656952,0.926869,-0.090074,...,-0.500055,-0.326610,0.995105,0.0,0.0,-0.800930,0.157647,0.155931,-0.427985,-0.901190


In [22]:
exporting_df(scaled_df_train, train_scaled_name)
exporting_df(scaled_df_test, test_scaled_name)

In [25]:
exporting_df(train_sin_m2, train_sin_m2_name)

In [10]:
# X_train, y_train, X_test, y_test = train_test_split(train_new.drop(columns=['prix']), train_new['prix'], test_size=0.2, random_state=42)

In [11]:
# clusters = ['cluster_tres_bas', 'cluster_bas', 'cluster_moyen', 'cluster_eleve', 'cluster_tres_eleve']
# train_new.drop(columns=clusters, inplace=True)