# IMPORT

In [2]:
import pandas as pd
import numpy as np

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

import pickle


In [3]:
# Load the raw listings and discard the 'Unnamed: 0' column in a single, re-runnable expression.
df = pd.read_csv('Kangaroo.csv').drop(columns=['Unnamed: 0'])


# CLEANING FUNCTION

In [5]:
def cleaning_dataframe(df, df_giraffe = False, is_training = True):
    """
    Clean the raw listings dataframe and derive the numeric features used by the model.

    The input frame is left untouched: all steps run on a copy, which is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw listings. Must contain the columns read below (epcScore, province, the
        count / surface columns, the has* flags, subtype, type, buildingCondition, ...).
    df_giraffe : pandas.DataFrame or bool, default False
        Secondary dataset with 'propertyId', 'latitude', 'longitude',
        'primaryEnergyConsumptionPerSqm' and 'cadastralIncome'. A DataFrame is merged
        as given; any other value (including the historical True / False flag) makes
        the function read 'data.csv'. Only used when is_training is True.
    is_training : bool, default True
        True  -> drop rows without bedroomCount / bathroomCount, inner-merge the
                 secondary dataset on id == propertyId and keep prices in
                 [50 000, 1 000 000].
        False -> keep those rows, skip the merge and the price filter, and add the
                 four secondary columns filled with 0 when they are absent.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame with the derived columns and without the raw columns listed in
        the final drop.

    Steps
    -----
    1. EPC score: remove the unwanted EPC labels, then map the EPC letter (A++ -> G) to
       a representative kWh/m2.year value per region (Wallonia / Flanders / Brussels)
           df[epcScore] (object) -> df[epc_enum] (float64)
    2. Aberrant values: counts / surfaces above a threshold become NaN
       (parking counts >= 10 are set to 1)
    3. Boolean columns: NaN -> 'False', then true/false -> 1/0
    4. parkingCount = parkingCountIndoor + parkingCountOutdoor (NaN if both are NaN)
    5. Label encoding
           subtype -> subtype_group, province -> province_mapping, type -> isHouse,
           buildingCondition / floodZoneType / heatingType / kitchenType -> *_mapping,
           facedeCount -> facadecount_mapping (category),
           buildingConstructionYear -> buildingConstructionYear_mapping (category)
    6. gardenSurface / terraceSurface = 0 when hasGarden / hasTerrace is 0
    7. (training only) drop rows with no bedroomCount or bathroomCount
    8. Secondary dataset columns (merged in training, zero-filled otherwise)
    9. (training only) price filter
    10. Drop the columns that are not used by the model
    """

    # EPC SCORE
    epc_unwanted = ['C_A', 'F_C', 'G_C', 'D_C', 'F_D', 'E_C', 'G_E', 'E_D', 'C_B', 'X', 'G_F']
    # Single working copy: the caller's frame is never modified.
    data = df[~df['epcScore'].isin(epc_unwanted)].copy()

    wallonia_provinces = ['Liège', 'Walloon Brabant', 'Namur', 'Hainaut', 'Luxembourg']
    flanders_provinces = ['Antwerp', 'Flemish Brabant', 'East Flanders', 'West Flanders', 'Limburg']

    wallonia_epc_map = {
        'A++' : 0,
        'A+' : 30,
        'A' : 65,
        'B' : 125,
        'C' : 200,
        'D' : 300,
        'E' : 375,
        'F' : 450,
        'G' : 510
    }

    flanders_epc_map = {
        'A++' : 0,
        'A+' : 0,
        'A' : 50,
        'B' : 150,
        'C' : 250,
        'D' : 350,
        'E' : 450,
        'F' : 500,
        'G' : 510,
    }

    brussels_epc_map = {
        'A++' : 0,
        'A+' : 0,
        'A' : 45,
        'B' : 75,
        'C' : 125,
        'D' : 175,
        'E' : 250,
        'F' : 300,
        'G' : 350,
    }

    # Create the column up front so it exists even if no row matches any region.
    data['epc_enum'] = np.nan
    region_epc_maps = [
        (wallonia_provinces, wallonia_epc_map),
        (flanders_provinces, flanders_epc_map),
        (['Brussels'], brussels_epc_map),
    ]
    for provinces, epc_map in region_epc_maps:
        in_region = data['province'].isin(provinces)
        data.loc[in_region, 'epc_enum'] = data.loc[in_region, 'epcScore'].map(epc_map)

    # REMOVE ERRORS / TOO BIG VALUES
    # Values at or above the threshold are treated as data-entry errors and blanked.
    nan_thresholds = {
        'bedroomCount': 100,
        'bathroomCount': 100,
        'toiletCount': 25,
        'habitableSurface': 600,
        'landSurface': 1000,
        'gardenSurface': 500,
        'terraceSurface': 250,
    }
    for col, threshold in nan_thresholds.items():
        data.loc[data[col] >= threshold, col] = np.nan

    # Implausible parking counts are replaced by 1 rather than NaN.
    for col in ('parkingCountIndoor', 'parkingCountOutdoor'):
        data.loc[data[col] >= 10, col] = 1

    # BOOLEAN COLUMNS (FILL NaN + MAPPING)
    booleans_columns = ['hasAttic', 'hasBasement', 'hasDressingRoom', 'hasDiningRoom', 'hasLift', 'hasHeatPump', 'hasPhotovoltaicPanels', 'hasThermicPanels', 'hasLivingRoom', 'hasGarden', 'hasAirConditioning', 'hasArmoredDoor', 'hasVisiophone', 'hasOffice', 'hasSwimmingPool', 'hasFireplace', 'hasTerrace']

    boolean_to_num = {'True' : 1,
        'true' : 1,
        'False' : 0,
        'false' : 0,
        False : 0,
        True: 1}

    for col in booleans_columns:
        # Whole-column assignment so the result is numeric instead of staying object.
        # The literal string 'nan' sometimes appears instead of a real NaN.
        data[col] = (
            data[col]
            .fillna('False')
            .replace('nan', 'false')
            .map(boolean_to_num)
        )

    # CREATE A PARKING COLUMN (INDOOR + OUTDOOR)
    # min_count=1 keeps NaN when both counts are missing instead of returning 0.
    data['parkingCount'] = data[['parkingCountIndoor', 'parkingCountOutdoor']].sum(axis=1, min_count=1)

    # LABEL-ENCODING FOR CATEGORIES
    ## subgroup
    subtype_to_group = {
    "APARTMENT": 1,
    "FLAT_STUDIO": 1,
    "DUPLEX": 1,
    "TRIPLEX": 1,
    "PENTHOUSE": 1,
    "LOFT": 1,
    "SERVICE_FLAT": 1,
    "GROUND_FLOOR": 1,
    "KOT": 1,
    "MIXED_USE_BUILDING": 1,

    "HOUSE": 2,
    "TOWN_HOUSE": 2,
    "VILLA": 2,
    "CHALET": 2,
    "BUNGALOW": 2,
    "COUNTRY_COTTAGE": 2,

    "MANOR_HOUSE": 3,
    "MANSION": 3,
    "EXCEPTIONAL_PROPERTY": 3,
    "CASTLE": 3,
    "FARMHOUSE": 3,

    "APARTMENT_BLOCK": 4,
    "APARTMENT_GROUP" : 4,
    "HOUSE_GROUP": 4,

    "OTHER_PROPERTY": 5,
    "PAVILION": 5
    }

    data['subtype_group'] = data['subtype'].map(subtype_to_group)

    ## building construction year (25-year bins)
    years_bins = [1850, 1875, 1900, 1925, 1950, 1975, 2000, 2025, 2050]
    years_labels = [1, 2, 3, 4, 5, 6, 7, 8]
    data['buildingConstructionYear_mapping'] = pd.cut(
        data['buildingConstructionYear'],
        bins=years_bins,
        labels=years_labels,
    )

    ## type
    data['isHouse'] = data['type'].map({
        "APARTMENT" : 0,
        "HOUSE" : 1
    })

    ## provinces
    province_mapping = {
        'Brussels' : 1,
        'Luxembourg' : 2,
        'Antwerp' : 3,
        'Flemish Brabant' : 4,
        'East Flanders' : 5,
        'West Flanders' : 6,
        'Liège' : 7,
        'Walloon Brabant' : 8,
        'Limburg' : 9,
        'Namur' : 10,
        'Hainaut' : 11
    }

    data['province_mapping'] = data['province'].map(province_mapping)

    ## building condition
    condition_mapping = {
            'GOOD' : 5,
            'TO_BE_DONE_UP' : 4,
            'AS_NEW' : 3,
            'JUST_RENOVATED' : 2,
            'TO_RENOVATE' : 1,
            'TO_RESTORE' : 0
        }

    data['buildingCondition_mapping'] = data['buildingCondition'].map(condition_mapping)

    ## flood zone type
    floodZoneType_mapping = {
            "NON_FLOOD_ZONE": 1,
            "POSSIBLE_N_CIRCUMSCRIBED_WATERSIDE_ZONE": 2,
            "CIRCUMSCRIBED_WATERSIDE_ZONE": 3,
            "POSSIBLE_N_CIRCUMSCRIBED_FLOOD_ZONE": 4,
            "POSSIBLE_FLOOD_ZONE": 5,
            "CIRCUMSCRIBED_FLOOD_ZONE": 6,
            "RECOGNIZED_FLOOD_ZONE": 7,
            "RECOGNIZED_N_CIRCUMSCRIBED_WATERSIDE_FLOOD_ZONE": 8,
            "RECOGNIZED_N_CIRCUMSCRIBED_FLOOD_ZONE": 9
            }

    data['floodZoneType_mapping'] = data['floodZoneType'].map(floodZoneType_mapping)

    ## heating type
    heatingType_mapping = {
        'GAS' : 1,
        'FUELOIL' : 2,
        'ELECTRIC' : 3,
        'PELLET' : 4,
        'WOOD' : 4,
        'SOLAR' : 4,
        'CARBON' : 4
    }

    data['heatingType_mapping'] = data['heatingType'].map(heatingType_mapping)

    ## kitchen type
    kitchenType_mapping = {
        "NOT_INSTALLED": 0,
        "USA_UNINSTALLED": 0,

        "USA_SEMI_EQUIPPED": 1,
        "SEMI_EQUIPPED": 1,

        "USA_INSTALLED": 2,
        "INSTALLED": 2,

        "USA_HYPER_EQUIPPED": 3,
        "HYPER_EQUIPPED": 3,
        }

    data['kitchenType_mapping'] = data['kitchenType'].map(kitchenType_mapping)

    ## facade count (the source column really is spelled 'facedeCount')
    facedeCount_bins = [0, 1, 2, 3, 4, float('inf')]
    facedeCount_labels = [1, 2, 3, 4, 5]

    data['facadecount_mapping'] = pd.cut(
        data['facedeCount'],
        bins=facedeCount_bins,
        labels=facedeCount_labels,
        include_lowest=True,
    )

    # MISSING VALUE FOR ...SURFACE
    # No garden / terrace means a surface of 0, not an unknown surface.
    data.loc[data['hasGarden'] == 0, 'gardenSurface'] = 0
    data.loc[data['hasTerrace'] == 0, 'terraceSurface'] = 0

    # IF NO BATHROOM OR BEDROOM COUNT: DROP (training only)
    if is_training:
        data = data.dropna(subset=['bedroomCount', 'bathroomCount'])

    # IMPORT FROM ANOTHER DATASET
    giraffe_cols = [
        "latitude",
        "longitude",
        "primaryEnergyConsumptionPerSqm",
        "cadastralIncome"
    ]
    if is_training:
        # Use the frame supplied by the caller; fall back to the CSV for the legacy bool flag.
        giraffe = df_giraffe if isinstance(df_giraffe, pd.DataFrame) else pd.read_csv('data.csv')
        data = data.merge(
            giraffe[['propertyId'] + giraffe_cols],
            how='inner',
            left_on='id',
            right_on='propertyId'
        )
    else:
        # Add the placeholders to the frame being returned (not to the caller's input),
        # so the output has the same columns as in training.
        for col in giraffe_cols:
            if col not in data.columns:
                data[col] = 0

    # PRICE MARGIN (training only)
    if is_training:
        data = data[(data['price'] >= 50000) & (data['price'] <= 1000000)]

    # REMOVE COLUMNS WE DON'T USE
    df_dropped = data.drop(columns=['url', 'type', 'subtype', 'province', 'monthlyCost', 'diningRoomSurface', 'buildingCondition', 'buildingConstructionYear', 'facedeCount', 'floorCount', 'streetFacadeWidth', 'floodZoneType', 'kitchenType', 'hasBalcony', 'gardenOrientation', 'terraceOrientation', 'accessibleDisabledPeople', 'epcScore', 'kitchenSurface', 'livingRoomSurface', 'roomCount', 'parkingCountIndoor', 'parkingCountOutdoor', 'locality', 'propertyId', 'hasTerrace', 'hasGarden', 'heatingType'], errors='ignore')

    return df_dropped

In [6]:
def stats(X_train):
    """
    Build the dictionary holding everything needed to impute missing values later.

    All values are computed from X_train only.

    # 'mode'
    Most frequent value of each label-encoded column listed in `mode_columns`.

    # 'median'
    Median of the continuous columns gardenSurface and terraceSurface.

    # 'imputers'
    One fitted IterativeImputer per important feature:
    - toiletCount, from bedroomCount, bathroomCount, habitableSurface
    - habitableSurface, from bathroomCount, bedroomCount, parkingCount, isHouse
    - landSurface, from habitableSurface, gardenSurface, parkingCount

    Returns the dictionary {'imputers': {...}, 'mode': {...}, 'median': {...}}.
    """
    mode_columns = (
        'heatingType_mapping', 'facadecount_mapping', 'floodZoneType_mapping',
        'buildingCondition_mapping', 'buildingConstructionYear_mapping', 'epc_enum', 'kitchenType_mapping',
    )
    median_columns = ('gardenSurface', 'terraceSurface')
    # target feature -> columns handed to its imputer (the target itself is included)
    regression_columns = {
        'toiletCount': ['bedroomCount', 'bathroomCount', 'toiletCount', 'habitableSurface'],
        'habitableSurface': ['bathroomCount', 'bedroomCount', 'parkingCount', 'isHouse', 'habitableSurface'],
        'landSurface': ['habitableSurface', 'gardenSurface', 'parkingCount', 'landSurface'],
    }

    stats_from_X_train = {'imputers': {}}

    # most frequent value per label column
    modes = {}
    for col in mode_columns:
        modes[col] = X_train[col].mode()[0]
    stats_from_X_train['mode'] = modes

    # median per continuous column
    medians = {}
    for col in median_columns:
        medians[col] = X_train[col].median()
    stats_from_X_train['median'] = medians

    # regression imputers, fitted in the same order as they are applied later
    for target, related in regression_columns.items():
        imputer = IterativeImputer(max_iter=10, random_state=0, initial_strategy='median')
        stats_from_X_train['imputers'][target] = imputer.fit(X_train[related])

    return stats_from_X_train


In [7]:
def transform_cleaning_traintestsplit(df, stats, is_training=True):
    """
    Impute the missing values of a feature frame with statistics fitted beforehand.

    It must be applied separately to X_train and X_test, with statistics fitted on
    X_train only, to avoid data leakage. The input frame is not modified; a new
    frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to impute.
    stats : dict
        Dictionary with the keys 'imputers' (fitted objects exposing `transform`,
        keyed by 'toiletCount', 'habitableSurface', 'landSurface'), 'mode' and
        'median' (column -> fill value).
    is_training : bool, default True
        When True, rows that still contain a NaN after imputation are removed.

    Steps
    -----
    #### Regression imputation for important features
    Each imputer is applied to its group of related columns; every column of the
    group is written back, rounded to the nearest integer. A group is skipped when
    one of its columns is absent from df.

    #### Imputation with the mode for missing values in label variables
    Note: facadecount_mapping & buildingConstructionYear_mapping (category) -> (int)

    #### Imputation with the median for missing values in continuous variables

    #### Remove rows where any value is still missing (training only)
    """
    # Work on a copy: the frames produced by train_test_split must not be altered,
    # otherwise re-running the calling cell would start from already-imputed data.
    df = df.copy()

    # REGRESSION IMPUTATION
    regression_groups = [
        ('toiletCount', ['bedroomCount', 'bathroomCount', 'toiletCount', 'habitableSurface']),
        ('habitableSurface', ['bathroomCount', 'bedroomCount', 'parkingCount', 'isHouse', 'habitableSurface']),
        ('landSurface', ['habitableSurface', 'gardenSurface', 'parkingCount', 'landSurface']),
    ]
    for feature, features_related in regression_groups:
        if all(col in df.columns for col in features_related):
            df_imputed = stats['imputers'][feature].transform(df[features_related])
            df_imputed = pd.DataFrame(df_imputed, columns=features_related).round()
            for col in features_related:
                # .values: the imputed frame has a fresh RangeIndex, so assign by position.
                df[col] = df_imputed[col].values

    # MODE IMPUTATION
    for col, mode_val in stats['mode'].items():
        if col in df.columns:
            df[col] = df[col].fillna(mode_val)

    # MEDIAN IMPUTATION
    for col, median_val in stats['median'].items():
        if col in df.columns:
            df[col] = df[col].fillna(median_val)

    # CATEGORY TO INT
    for col in ['facadecount_mapping', 'buildingConstructionYear_mapping']:
        if col in df.columns:
            df[col] = df[col].astype(int, errors='ignore')

    # REMOVE ROWS WHERE NAN
    if is_training:
        df_final = df[~df.isna().any(axis=1)]
    else:
        df_final = df

    return df_final

# TRAIN TEST SPLIT

In [1]:
# Clean the raw listings in training mode, then separate the target from the features.
df_final = cleaning_dataframe(df, df_giraffe=True)

X = df_final.drop(columns=['price', 'id'])
y = df_final['price']

# Hold out 20 % of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

NameError: name 'cleaning_dataframe' is not defined

In [9]:
# Fit the imputation statistics on X_train only, then impute X_train with them
stats_from_X_train = stats(X_train)
X_train_clean = transform_cleaning_traintestsplit(X_train, stats_from_X_train)

# Cleaning X_test with the imputations from X_train
X_test_clean = transform_cleaning_traintestsplit(X_test, stats_from_X_train)

# The transform removes rows that still contain a NaN after imputation.
# Keep the targets aligned with the surviving rows so that fitting and scoring
# never compare arrays of different lengths.
y_train = y_train.loc[X_train_clean.index]
y_test = y_test.loc[X_test_clean.index]

# Scaling (fitted on the training features only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_clean)
X_test_scaled = scaler.transform(X_test_clean)

In [27]:
# Training listings located in postcode 1050 with 3 bedrooms and 2 bathrooms
in_postcode_1050 = X_train_clean['postCode'] == 1050
has_three_bedrooms = X_train_clean['bedroomCount'] == 3
has_two_bathrooms = X_train_clean['bathroomCount'] == 2

filtered = X_train_clean[in_postcode_1050 & has_three_bedrooms & has_two_bathrooms]

filtered


Unnamed: 0,bedroomCount,bathroomCount,postCode,habitableSurface,hasAttic,hasBasement,hasDressingRoom,hasDiningRoom,hasLift,hasHeatPump,...,province_mapping,buildingCondition_mapping,floodZoneType_mapping,heatingType_mapping,kitchenType_mapping,facadecount_mapping,latitude,longitude,primaryEnergyConsumptionPerSqm,cadastralIncome
1975,3.0,2.0,1050,107.0,0,0,0,0,1,0,...,1,1.0,1.0,1.0,1.0,2,-1.0,-1.0,86,-1
2122,3.0,2.0,1050,200.0,0,1,0,0,0,0,...,1,3.0,1.0,1.0,3.0,2,-1.0,-1.0,-1,-1
1708,3.0,2.0,1050,165.0,0,0,0,0,1,0,...,1,3.0,1.0,1.0,2.0,2,50.826276,4.365497,341,-1
1705,3.0,2.0,1050,188.0,0,1,1,1,1,0,...,1,3.0,1.0,1.0,3.0,2,50.805169,4.373342,195,3549
1988,3.0,2.0,1050,145.0,0,1,0,0,0,0,...,1,5.0,1.0,1.0,3.0,2,-1.0,-1.0,204,1388
1870,3.0,2.0,1050,143.0,0,1,0,1,0,0,...,1,2.0,1.0,1.0,3.0,3,50.83305,4.379857,208,-1
1866,3.0,2.0,1050,135.0,0,1,0,0,1,0,...,1,5.0,1.0,1.0,2.0,2,50.815393,4.37851,115,3143
1780,3.0,2.0,1050,164.0,0,1,0,0,1,0,...,1,3.0,5.0,1.0,3.0,2,-1.0,-1.0,260,-1
1730,3.0,2.0,1050,196.0,0,1,1,0,0,0,...,1,5.0,1.0,1.0,2.0,2,50.815441,4.380394,331,-1
2028,3.0,2.0,1050,212.0,0,1,1,0,1,1,...,1,3.0,1.0,1.0,3.0,2,50.803128,4.39365,146,-1


In [28]:
# Prices of the selected listings, looked up in y_train by their index labels.
prix_matches = y_train.loc[filtered.index]

In [29]:
# Put each selected listing side by side with its price (column-wise concat, aligned on the index).
filtered_test = pd.concat([filtered, prix_matches], axis=1)

filtered_test

Unnamed: 0,bedroomCount,bathroomCount,postCode,habitableSurface,hasAttic,hasBasement,hasDressingRoom,hasDiningRoom,hasLift,hasHeatPump,...,buildingCondition_mapping,floodZoneType_mapping,heatingType_mapping,kitchenType_mapping,facadecount_mapping,latitude,longitude,primaryEnergyConsumptionPerSqm,cadastralIncome,price
1975,3.0,2.0,1050,107.0,0,0,0,0,1,0,...,1.0,1.0,1.0,1.0,2,-1.0,-1.0,86,-1,395000.0
2122,3.0,2.0,1050,200.0,0,1,0,0,0,0,...,3.0,1.0,1.0,3.0,2,-1.0,-1.0,-1,-1,995000.0
1708,3.0,2.0,1050,165.0,0,0,0,0,1,0,...,3.0,1.0,1.0,2.0,2,50.826276,4.365497,341,-1,550000.0
1705,3.0,2.0,1050,188.0,0,1,1,1,1,0,...,3.0,1.0,1.0,3.0,2,50.805169,4.373342,195,3549,695000.0
1988,3.0,2.0,1050,145.0,0,1,0,0,0,0,...,5.0,1.0,1.0,3.0,2,-1.0,-1.0,204,1388,599000.0
1870,3.0,2.0,1050,143.0,0,1,0,1,0,0,...,2.0,1.0,1.0,3.0,3,50.83305,4.379857,208,-1,595000.0
1866,3.0,2.0,1050,135.0,0,1,0,0,1,0,...,5.0,1.0,1.0,2.0,2,50.815393,4.37851,115,3143,515000.0
1780,3.0,2.0,1050,164.0,0,1,0,0,1,0,...,3.0,5.0,1.0,3.0,2,-1.0,-1.0,260,-1,950000.0
1730,3.0,2.0,1050,196.0,0,1,1,0,0,0,...,5.0,1.0,1.0,2.0,2,50.815441,4.380394,331,-1,795000.0
2028,3.0,2.0,1050,212.0,0,1,1,0,1,1,...,3.0,1.0,1.0,3.0,2,50.803128,4.39365,146,-1,750000.0


In [30]:
# Median price of the selected comparable listings
median_test_price = filtered_test['price'].median()
median_test_price

np.float64(725000.0)

# XGBOOST MODEL

In [11]:
# Gradient-boosted regressor: 3000 trees, learning rate 0.05, 80 % row subsampling, fixed seed.
xgb_model = xgb.XGBRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    subsample=0.8,
    random_state=43,
)

In [12]:
# Train the regressor on the standardized training features.
xgb_model.fit(X_train_scaled, y_train)

In [13]:
# Predict prices for the standardized test features.
y_pred = xgb_model.predict(X_test_scaled)

In [14]:
# Coefficient of determination of the test-set predictions.
r2 = r2_score(y_test, y_pred)
print("R2 Score:", r2)

R2 Score: 0.8350811268123159


In [15]:
# Error metrics of the test-set predictions. The MSE is computed once and reused for the RMSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse, "€")

# This is the mean absolute error (MAE), not a relative absolute error.
# `rae` is kept as an alias because the cell previously exposed the value under that name.
mae = mean_absolute_error(y_test, y_pred)
rae = mae
print("Mean Absolute Error (MAE):", mae, "€")

print("Mean Squared Error:", mse)

Root Mean Squared Error (RMSE): 75067.6222361942 €
Mean Absolute Error (RAE): 50025.73513799862 €
Mean Squared Error: 5635147908.195958


# Save the model and the scaler

In [16]:
# Persist the fitted scaler and the trained model, one pickle file each.
for artifact_path, artifact in (("scaler.pkl", scaler), ("model.pkl", xgb_model)):
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)