In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.min_rows', 20)

In [2]:
df = pd.read_pickle("../data/clean/clean_data.pkl")



In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import xgboost as xgb
from random import *
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import OneHotEncoder


def split_house_apartment(df):
    df_h = df[df['type'] == 'HOUSE']
    df_a = df[df['type'] == 'APARTMENT']
    df_h = df_h.drop('type', axis=1)
    df_a = df_a.drop('type', axis=1)

    return df_h, df_a


def encode_categorical_columns(df):

    encoder = OneHotEncoder()
    categorical_col = df.select_dtypes(include=['object']).columns

    # fit and transform the categorical data into a sparse matrix
    encoded_data = encoder.fit_transform(df[categorical_col])

    # convert into a dense array
    encoded_array = encoded_data.toarray()

    encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out(categorical_col))

    df = df.drop(columns=categorical_col)
    df_encoded = pd.concat([df.reset_index(drop=True), encoded_df], axis=1)

    return df_encoded


def split_train_test(df, random_state):
    X, y = df.drop('price', axis=1), df['price']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

    return X_train, X_test, y_train, y_test


def scale_features(X_train, X_test):

    scaler = MinMaxScaler()
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    df_scaled_X_train = pd.DataFrame(scaled_X_train, columns=X_train.columns)
    df_scaled_X_test = pd.DataFrame(scaled_X_test, columns=X_test.columns)

    return df_scaled_X_train, df_scaled_X_test


def regression_train_eval(X_train, X_test, y_train, y_test, prop_type):

    scaled_X_train, scaled_X_test = scale_features(X_train, X_test)

    model = LinearRegression()
    model.fit(scaled_X_train, y_train)
    y_pred = model.predict(X_test)

    print(f'    LINEAR REGRESSION {prop_type} model:')
    print("        Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("        Training score: %.2f" % model.score(scaled_X_train, y_train))
    print("        Test score: %.2f" % model.score(scaled_X_test, y_test))


def linear_reg(df, prop_type, random_state):
    X_train, X_test, y_train, y_test = split_train_test(df, random_state)
    regression_train_eval(X_train, X_test, y_train, y_test, prop_type)


def main_linear(df, df_h, df_a):
    linear_reg_features = ['price', 'region', 'province', 'netHabitableSurface', 'bedroomCount', 'hasDoubleGlazing', 'condition', 'hasSwimmingPool',
                           'bathroomCount', 'showerRoomCount', 'parkingCountIndoor', 'hasGarden', 'gardenSurface', 'hasTerrace', 'hasLift', 'kitchen', 'latitude', 'longitude']
    df_a = df_a[linear_reg_features]
    df_h = df_h[linear_reg_features]
    df = df[linear_reg_features]

    df_a = encode_categorical_columns(df_a)
    df_h = encode_categorical_columns(df_h)
    df = encode_categorical_columns(df)
    random_state = randint(1, 1000)
    print(f'RANDOM STATE: {random_state}')
    linear_reg(df, 'FULL dataset', random_state)
    linear_reg(df_h, 'HOUSE dataset', random_state)
    linear_reg(df_a, 'APARTMENT dataset', random_state)


def xgb_train_eval(X_train, X_test, y_train, y_test, prop_type):

    # scale features
    scaled_X_train, scaled_X_test = scale_features(X_train, X_test)

    model = xgb.XGBRegressor()
    model.fit(scaled_X_train, y_train)
    y_pred = model.predict(scaled_X_test)

    print(f'    XGBOOST REGRESSION {prop_type} model:')
    print("        Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("        Training score: %.2f" % model.score(scaled_X_train, y_train))
    print("        Test score: %.2f" % model.score(scaled_X_test, y_test))


def xgb_reg(df, prop_type, random_state):
    X_train, X_test, y_train, y_test = split_train_test(df, random_state)
    xgb_train_eval(X_train, X_test, y_train, y_test, prop_type)


def main_xgb(df, df_h, df_a):
    xgb_features = ['price', 'region', 'province', 'district', 'netHabitableSurface', 'bedroomCount', 'hasDoubleGlazing', 'condition', 'hasSwimmingPool', 'bathroomCount',
                    'showerRoomCount', 'parkingCountIndoor', 'hasGarden', 'gardenSurface', 'hasTerrace', 'hasLift', 'kitchen', 'latitude', 'longitude', 'constructionYear']
    df_a = df_a[xgb_features]
    df_h = df_h[xgb_features]
    df = df[xgb_features]

    df_a = encode_categorical_columns(df_a)
    df_h = encode_categorical_columns(df_h)
    df = encode_categorical_columns(df)

    random_state = randint(1, 1000)
    print("===========================================")
    print(f'RANDOM STATE: {random_state}')
    xgb_reg(df, 'FULL dataset', random_state)
    xgb_reg(df_h, 'HOUSE dataset', random_state)
    xgb_reg(df_a, 'APARTMENT dataset', random_state)


In [4]:
def main(clean_df):
    # raw_data_path = Path.cwd() / "data" / "raw" / "properties_data.csv"
    # df = pd.read_csv(raw_data_path)
    # clean_df = clean_data(df)
    df_h, df_a = split_house_apartment(clean_df)
    main_linear(clean_df, df_h, df_a)
    main_xgb(clean_df, df_h, df_a)

In [5]:
main(df)

RANDOM STATE: 201
    LINEAR REGRESSION FULL dataset model:
        Mean squared error: 16908354619044326.00
        Training score: 0.60
        Test score: 0.60
    LINEAR REGRESSION HOUSE dataset model:
        Mean squared error: 7296689829016031.00
        Training score: 0.60
        Test score: 0.52
    LINEAR REGRESSION APARTMENT dataset model:
        Mean squared error: 6948465737002044.00
        Training score: 0.57
        Test score: 0.54
RANDOM STATE: 534
    XGBOOST REGRESSION FULL dataset model:
        Mean squared error: 4633197696.88
        Training score: 0.88
        Test score: 0.71
    XGBOOST REGRESSION HOUSE dataset model:
        Mean squared error: 7525812288.93
        Training score: 0.90
        Test score: 0.65
    XGBOOST REGRESSION APARTMENT dataset model:
        Mean squared error: 2694671693.27
        Training score: 0.92
        Test score: 0.76
