In [15148]:
import pandas as pd
import numpy as np

# Display settings: no limit on the number of columns shown, and 20 rows
# shown whenever a long frame is rendered in truncated form.
pd.options.display.max_columns = None
pd.options.display.min_rows = 20

In [15149]:
# Load the cleaned dataset from its pickle. The commented-out line below is the
# alternative that reads the raw CSV instead.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
#df = pd.read_csv("../data/raw/properties_data.csv")
df = pd.read_pickle("../data/clean/clean_data.pkl")



In [15150]:
# Coarsen the postal code: divide by 100 and round down (drops the last two
# digits), then store it as a string label, going through nullable Int64 so the
# text has no trailing ".0".
# NOTE: this cell is not idempotent -- running it a second time divides the
# already-shortened codes by 100 again.
df['postalCode'] = (
    np.floor(df['postalCode'].astype('float64') / 100)
    .astype('Int64')
    .astype('str')
)


In [15151]:
#df['constructionYear'] = df['constructionYear'].fillna('no_info')


In [15154]:
# Separate houses from apartments. Inside each subset 'type' holds a single
# value, so the column is dropped.
df_h = df.loc[df['type'] == 'HOUSE'].drop(columns='type')
df_a = df.loc[df['type'] == 'APARTMENT'].drop(columns='type')


In [15155]:
# Show the column names available in the full frame.
df.columns

Index(['price', 'type', 'subtype', 'region', 'province', 'district',
       'postalCode', 'latitude', 'longitude', 'bedroomCount',
       'netHabitableSurface', 'constructionYear', 'condition', 'hasLift',
       'kitchen', 'hasGarden', 'gardenSurface', 'hasTerrace', 'terraceSurface',
       'fireplaceExists', 'hasSwimmingPool', 'hasAirConditioning',
       'bathroomCount', 'showerRoomCount', 'parkingCountIndoor', 'heatingType',
       'hasDoubleGlazing', 'saleType'],
      dtype='object')

In [15156]:
# Feature-selection notes
# -----------------------
# Candidates to drop: 'saleType', 'latitude', 'longitude', 'constructionYear',
#                     'fireplaceExists', 'hasAirConditioning'
# Should probably keep: 'condition'
# Not sure about: 'kitchen', 'hasGarden'
# To keep: ['price', 'region', 'province', 'district', 'netHabitableSurface',
#           'bedroomCount', 'hasDoubleGlazing', 'condition', 'hasGarden', 'gardenSurface']

# random_state = 70 gave a negative HOUSE test score with this set of features.
# 'heatingType' had some values with a very low count, which caused the negative
# test score when those scarce values were not split evenly (all of them in
# train, or all of them in test). Grouping them under 'OTHER' fixed the score.
features_70 = [
    'price', 'region', 'province', 'district', 'netHabitableSurface',
    'bedroomCount', 'hasDoubleGlazing', 'condition', 'hasSwimmingPool',
    'bathroomCount', 'showerRoomCount', 'parkingCountIndoor', 'heatingType',
    'hasGarden', 'gardenSurface', 'hasTerrace',
]

# random_state = 22 gives a negative test score on APARTMENT.
features_22 = [
    'price', 'region', 'province', 'district', 'netHabitableSurface',
    'bedroomCount', 'hasDoubleGlazing', 'condition', 'hasSwimmingPool',
    'bathroomCount', 'showerRoomCount', 'parkingCountIndoor',
    'hasGarden', 'gardenSurface', 'hasTerrace', 'hasLift',
]

# RANDOM STATE: 522
# Negative test score with this set; removing 'district' fixed it.
features_522 = features_22 + ['kitchen', 'latitude', 'longitude']

# Same columns as features_522 plus the construction year.
xgb_features = features_522 + ['constructionYear']

# features_522 without 'district' (column order unchanged).
linear_reg_features = [name for name in features_522 if name != 'district']

# Feature set used by the selection step that follows; currently the same
# columns as linear_reg_features.
test_features = list(linear_reg_features)



# Keep only the columns of the feature set under test in the apartment, house
# and combined frames.
df_a = df_a.loc[:, test_features]
df_h = df_h.loc[:, test_features]
df = df.loc[:, test_features]



In [15157]:
# One-hot encode each frame with pd.get_dummies. The three frames are encoded
# independently of one another.
df_a, df_h, df = (pd.get_dummies(frame) for frame in (df_a, df_h, df))


In [15158]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

from random import *


def split_data(df, random_state):
    """Split a frame into train/test features and targets.

    The 'price' column is the target; every other column is a feature. The
    split is delegated to ``train_test_split`` with its default proportions,
    seeded with ``random_state``.

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    target = df['price']
    features = df.drop(columns='price')
    return tuple(train_test_split(features, target, random_state=random_state))


# scale features
def scale_features(X_train, X_test):
    """Min-max scale the features, fitting the scaler on the training set only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Training and test feature frames.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(scaled_X_train, scaled_X_test)``, each carrying the column names and
        the row index of the corresponding input frame.
    """
    scaler = MinMaxScaler()
    # The scaler is fitted on the training data only; the test data is merely
    # transformed with the ranges learned from the training data.
    scaled_X_train = scaler.fit_transform(X_train)
    scaled_X_test = scaler.transform(X_test)

    # Re-attach the original row labels. Without them the scaled frames would
    # get a fresh 0..n-1 index that no longer lines up with the index of the
    # matching target series.
    df_scaled_X_train = pd.DataFrame(scaled_X_train, columns=X_train.columns, index=X_train.index)
    df_scaled_X_test = pd.DataFrame(scaled_X_test, columns=X_test.columns, index=X_test.index)

    return df_scaled_X_train, df_scaled_X_test

def regression_train_eval(X_train, X_test, y_train, y_test, prop_type):
    """Fit a linear regression on scaled features and print its metrics.

    Parameters
    ----------
    X_train, X_test
        Unscaled training / test features; they are scaled here with
        ``scale_features``.
    y_train, y_test
        Training / test targets.
    prop_type : str
        Label printed as the heading of the report.

    Prints the mean squared error on the test set, the training score and the
    test score (both from ``model.score``). Returns None.
    """
    scaled_X_train, scaled_X_test = scale_features(X_train, X_test)

    model = LinearRegression()
    model.fit(scaled_X_train, y_train)

    # Predict on the *scaled* test features: the model was fitted on scaled
    # inputs, so feeding it the raw X_test yields meaningless predictions and a
    # wrong mean squared error.
    y_pred = model.predict(scaled_X_test)

    # weights associated with each features
    # print("Coefficients: \n", model.coef_)

    print(f'    {prop_type}:')
    print("        Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
    print("        Training score: %.2f" % model.score(scaled_X_train, y_train))
    print("        Test score: %.2f" % model.score(scaled_X_test, y_test))

    # Optional diagnostic plot of predicted vs. actual prices:
    # plt.scatter(y_test, y_pred, color="black")
    # plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color="blue")
    # plt.title(f'{prop_type}')
    # plt.xlabel("Actual Price")
    # plt.ylabel("Predicted Price")
    # plt.show()
    
    
def linear_reg(df, prop_type, random_state):
    """Split ``df`` with the given seed, then train and evaluate a linear regression.

    ``prop_type`` is passed through as the heading of the printed report.
    """
    train_test_parts = split_data(df, random_state)
    regression_train_eval(*train_test_parts, prop_type)

In [15159]:
def main_linear(df, df_h, df_a, random_state='rand'):
    """Run the linear-regression evaluation on all listings, houses and apartments.

    ``random_state`` seeds the train/test split of all three runs. With the
    default ``'rand'`` a seed between 1 and 1000 is drawn; either way the seed
    is printed so the run can be repeated.
    """
    seed = randint(1, 1000) if random_state == 'rand' else random_state
    print(f'RANDOM STATE: {seed}')
    for frame, label in ((df, 'ALL'), (df_h, 'HOUSE'), (df_a, 'APARTMENT')):
        linear_reg(frame, label, seed)
    



In [15160]:
import xgboost as xgb

def xgb_train_eval(X_train, X_test, y_train, y_test, prop_type):
    """Train an XGBoost regressor on scaled features and print its metrics.

    The features are scaled with ``scale_features`` and an ``xgb.XGBRegressor``
    with default settings is fitted on the scaled training set. Printed under a
    ``prop_type`` heading: the mean squared error on the test set, then the
    regressor's ``score`` on the training set and on the test set.
    """
    train_scaled, test_scaled = scale_features(X_train, X_test)

    regressor = xgb.XGBRegressor()
    regressor.fit(train_scaled, y_train)
    predictions = regressor.predict(test_scaled)

    print(f'    {prop_type}:')
    print("        Mean squared error: %.2f" % mean_squared_error(y_test, predictions))
    print("        Training score: %.2f" % regressor.score(train_scaled, y_train))
    print("        Test score: %.2f" % regressor.score(test_scaled, y_test))

    # Optional diagnostic plot of predicted vs. actual prices:
    # plt.scatter(y_test, predictions, color="black")
    # plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color="blue")
    # plt.title(f'{prop_type}')
    # plt.xlabel("Actual Price")
    # plt.ylabel("Predicted Price")
    # plt.show()
    
def xgb_reg(df, prop_type, random_state):
    """Split ``df`` with the given seed, then train and evaluate an XGBoost regressor.

    ``prop_type`` is passed through as the heading of the printed report.
    """
    train_test_parts = split_data(df, random_state)
    xgb_train_eval(*train_test_parts, prop_type)

In [15161]:
def main_xgb(df, df_h, df_a, random_state='rand'):
    """Run the XGBoost evaluation on all listings, houses and apartments.

    Parameters
    ----------
    df, df_h, df_a
        Frames for all listings, houses only and apartments only.
    random_state : int or 'rand', default 'rand'
        Seed for the train/test split of all three runs. With ``'rand'`` a seed
        between 1 and 1000 is drawn. The default matches ``main_linear`` so
        both entry points can be called the same way.

    The seed in use is printed first so the run can be repeated.
    """
    if random_state == 'rand':
        random_state = randint(1, 1000)
    print(f'RANDOM STATE: {random_state}')
    xgb_reg(df, 'ALL', random_state)
    xgb_reg(df_h, 'HOUSE', random_state)
    xgb_reg(df_a, 'APARTMENT', random_state)
    
# Two evaluations, each with its own randomly drawn split seed.
for run_number in range(2):
    main_xgb(df, df_h, df_a, 'rand')



RANDOM STATE: 444
    ALL:
        Mean squared error: 4808159271.12
        Training score: 0.90
        Test score: 0.70
    HOUSE:
        Mean squared error: 7377212796.01
        Training score: 0.93
        Test score: 0.67
    APARTMENT:
        Mean squared error: 2942453582.03
        Training score: 0.94
        Test score: 0.71
RANDOM STATE: 212
    ALL:
        Mean squared error: 5139051931.72
        Training score: 0.90
        Test score: 0.69
    HOUSE:
        Mean squared error: 7373415970.20
        Training score: 0.94
        Test score: 0.66
    APARTMENT:
        Mean squared error: 3064920295.18
        Training score: 0.94
        Test score: 0.71


In [15162]:
# Two more evaluations, each with its own randomly drawn split seed.
for run_number in range(2):
    main_xgb(df, df_h, df_a, 'rand')

RANDOM STATE: 850
    ALL:
        Mean squared error: 5203969393.47
        Training score: 0.90
        Test score: 0.70
    HOUSE:
        Mean squared error: 7578165432.94
        Training score: 0.93
        Test score: 0.65
    APARTMENT:
        Mean squared error: 2978626620.33
        Training score: 0.94
        Test score: 0.71
RANDOM STATE: 796
    ALL:
        Mean squared error: 5420327624.63
        Training score: 0.90
        Test score: 0.68
    HOUSE:
        Mean squared error: 7401270925.04
        Training score: 0.93
        Test score: 0.65
    APARTMENT:
        Mean squared error: 2917663522.06
        Training score: 0.94
        Test score: 0.72
