In [1]:
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import time

from catboost import CatBoostRegressor

import re

from statsmodels.stats.outliers_influence import variance_inflation_factor
from xgboost import XGBRegressor
from sklearn.preprocessing import MinMaxScaler


In [2]:
# Let pandas show up to 500 rows of a frame before it truncates the display.
pd.options.display.max_rows = 500

In [3]:
# Load the v5 housing price table; every later cell filters or extends this frame.
df = pd.read_csv(filepath_or_buffer='data/ames_housing_price_data_v5.csv')

In [4]:
# Row filtering, applied in sequence and followed by a fresh 0..n-1 index:
#   1. drop the two outlier sales, identified by PID
#   2. keep only 'Normal' and 'Partial' sale conditions
#   3. drop rows whose BedroomAbvGr is 0
#   4. drop rows zoned 'Nonresidential'
df = (
    df
    .loc[lambda d: ~d['PID'].isin([902207130, 908154205])]
    .loc[lambda d: d['SaleCondition'].isin(['Normal', 'Partial'])]
    .loc[lambda d: d['BedroomAbvGr'] != 0]
    .loc[lambda d: d['MSZoning_com'] != 'Nonresidential']
    .reset_index(drop=True)
)

In [5]:
# Keep the sale price available as its own Series.
# Only the log-price column is removed from the table; 'SalePrice' itself stays in df.
price = df['SalePrice']
df = df.drop(columns=['SalePrice_log'])

In [6]:
# Add the radial average price feature.
# Only PID and 'AvgPrice-0.5' are taken from the lookup file ('AvgPrice-0.25' is not used);
# exact duplicate (PID, AvgPrice-0.5) pairs are collapsed before the join.
avg_price_df = pd.read_csv('data/house_surrounding_avg_prices.csv')
avg_price_df2 = avg_price_df[['PID', 'AvgPrice-0.5']].drop_duplicates()

# Join on PID explicitly. Without `on`, pandas joins on every column name the two
# frames share, so the key would silently change if the main table ever gained a
# column with the same name as one in the lookup.
df2 = df.merge(avg_price_df2, how='left', on='PID')


In [7]:
# Add geographical features.
radial = pd.read_csv('data/house_coordinates_1.0.csv')

# '2204_park' is removed before the numeric prefixes are stripped
# (presumably so it cannot collide with another '*_park' column -- TODO confirm).
radial = radial.drop(columns='2204_park')

# Strip a leading "NNNN_" (four digits + underscore) from column names.
# The pattern is a raw string so that \d reaches the regex engine instead of being
# parsed as an (invalid) string escape, and all renames are applied in one call.
prefixed_columns = {
    col: col[5:]
    for col in radial.columns
    if re.search(r'^\d\d\d\d_', str(col))
}
radial = radial.rename(columns=prefixed_columns)

# Columns removed from the geographical table before it is merged.
rad_drops = [
    'Address',
    'Coords4',
    'latitude',
    'longitude',
    'town_hall',
    'cemetery',
    'motel',
    'camp_site',
    'general',
    'picnic_site',
    'wastewater_plant',
    'spring',
    'beach',
    'street_lamp',
    'helipad',
    'vineyard',
    'crossing',
    'tree',
    'grass',
    'christian',
    'bus_stop',
    'parking',
    'toilet',
    'bench',
    'commercial',
    'waste_basket',
    'drinking_water',
    'convenience',
    'camera_surveillance',
    'comms_tower',
    'residential',
    'gift_shop',
    'jeweller',
    'hairdresser',
    'bookshop',
    'clothes',
    'retail',
    'food_court',
    'artwork',
    'cafe',
    'traffic_signals',
    'beauty_shop',
    'sports_shop',
    'weir',
    'track',
    'turning_circle',
    'computer_shop',
    'bicycle_shop',
    'department_store',
    'parking_bicycle',
    'golf_course',
    'tower',
    'beverages',
    'university'
]
radial = radial.drop(columns=rad_drops)

# No `on` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is just the house identifier -- confirm against the CSV.
df2 = df2.merge(radial, how='left')




In [8]:
# Columns (not rows) to remove from the merged table.
droplist = [
    'PID',
    'GarageFinish',
    'SaleCondition',
    'GarageType_com',
    'Garage_age_bin',
    'sold_datetime',
]
df2 = df2.drop(columns=droplist)

In [9]:
# Replace every remaining missing value with 0; this applies to all columns, whatever their dtype.
df2 = df2.fillna(value=0)

In [10]:
# One shared scaler instance; fit_scale re-fits it for each column it is given.
scaler = MinMaxScaler()

def fit_scale(col):
    """Min-max scale one column of the module-level ``df2``, overwriting it.

    The shared ``scaler`` is fitted on ``df2[[col]]`` and the column is then
    replaced by the transformed values. After the call ``scaler`` holds the
    fit for ``col`` only, i.e. for the most recently scaled column.

    Parameters
    ----------
    col : str
        Name of the column in ``df2`` to rescale.
    """
    column_frame = df2[[col]]
    df2[[col]] = scaler.fit(column_frame).transform(column_frame)

# Min-max scale the four quality/condition columns (each is scaled independently).
for quality_col in ('OverallQual', 'ExterQual', 'OverallCond', 'KitchenQual'):
    fit_scale(quality_col)

# Combined porch area: sum of the four porch columns.
df2['PorchSF'] = df2['OpenPorchSF'] + df2['EnclosedPorch'] + df2['3SsnPorch'] + df2['ScreenPorch']

# 1 when SaleType is 'New', otherwise 0. The comparison already yields booleans,
# so a vectorised cast replaces the former row-by-row apply/lambda.
df2['SaleTypeNew'] = (df2['SaleType'] == 'New').astype(int)

# Split the basement area in two: BSMT_HighQual is the GLQ + ALQ total,
# BSMT_LowQual is what is left of TotalBsmtSF after removing both.
df2['BSMT_LowQual'] = df2['TotalBsmtSF'] - df2['BSMT_GLQ'] - df2['BSMT_ALQ']
df2['BSMT_HighQual'] = df2['BSMT_GLQ'] + df2['BSMT_ALQ']



In [11]:
# Columns selected for the exported table, in output order.
features_to_use = [
    # house attributes from the main table and the derived columns
    'GrLivArea',
    'LotArea',
    'OverallQual',
    'BSMT_LowQual',
    'house_age_years',
    'GarageCars',
    'MasVnrType',
    'FullBath',
    'HalfBath',
    'BsmtExposure_ord',
    'SaleTypeNew',
    'Neighborhood',
    'BldgType',
    'PorchSF',
    'BSMT_HighQual',
    'Fireplaces',
    'Pool',
    'BedroomAbvGr',
    # geographical features
    'water_tower',
    'graveyard',
    'police',
    'optician',
    'slipway',
    'bar',
    'cinema',
    'supermarket',
    'hotel',
    'stop',
    'farmyard',
    'christian_catholic',
    'jewish',
    'muslim',
    'garden_centre',
    'christian_lutheran',
    # scaled quality / condition columns
    'ExterQual',
    'OverallCond',
    'KitchenQual',
]


In [12]:
# Select the chosen feature columns, in the order given by features_to_use.
front_end = df2.loc[:, features_to_use]

# Save the selection. The DataFrame index is written as well (to_csv default),
# so the file starts with an unnamed index column.
front_end.to_csv(path_or_buf='data/ames_housing_price_data_v6.csv')

front_end

Unnamed: 0,GrLivArea,LotArea,OverallQual,BSMT_LowQual,house_age_years,GarageCars,MasVnrType,FullBath,HalfBath,BsmtExposure_ord,...,stop,farmyard,christian_catholic,jewish,muslim,garden_centre,christian_lutheran,ExterQual,OverallCond,KitchenQual
0,856,7890,0.428571,856.0,71.210959,2.0,,1,0,1,...,0,0,1,0,0,0,3,0.333333,0.500000,0.333333
1,1049,4235,0.285714,104.0,25.104110,1.0,Brick Face,2,0,2,...,0,0,1,0,1,0,2,0.666667,0.333333,0.666667
2,1039,8146,0.142857,405.0,109.402740,1.0,,1,0,1,...,0,1,0,0,0,0,3,0.666667,0.833333,0.333333
3,1665,8400,0.714286,167.0,8.838356,2.0,,2,1,1,...,0,0,0,1,0,0,3,0.666667,0.500000,0.666667
4,1922,7301,0.571429,0.0,6.501370,2.0,Brick Face,3,0,0,...,1,0,0,0,0,0,1,0.666667,0.333333,0.666667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,952,8854,0.428571,952.0,93.394521,1.0,,1,0,1,...,0,0,0,0,0,0,3,0.333333,0.500000,0.000000
2467,1733,13680,0.000000,0.0,54.452055,2.0,,2,0,0,...,0,0,0,0,1,0,1,0.333333,0.333333,0.333333
2468,2002,6270,0.285714,1001.0,58.619178,3.0,,2,0,1,...,0,0,1,0,0,0,2,0.333333,0.500000,0.333333
2469,1842,8826,0.571429,144.0,7.501370,2.0,Brick Face,2,1,1,...,1,13,0,0,0,0,0,0.666667,0.333333,0.666667


In [13]:
# Build the model input from a copy of the front-end table.
back_end = front_end.copy()

# Add a "<name>Disc" column for each of the three scores: the score minus OverallQual.
# The new columns are appended in this order, then the three source columns are removed.
for score_col in ('ExterQual', 'OverallCond', 'KitchenQual'):
    back_end[score_col + 'Disc'] = back_end[score_col] - back_end['OverallQual']
back_end = back_end.drop(columns=['ExterQual', 'OverallCond', 'KitchenQual'])

# One-hot encode the categorical columns, dropping the first level of each.
to_dummify2 = ['Neighborhood', 'BldgType', 'MasVnrType']
back_end = pd.get_dummies(back_end, columns=to_dummify2, drop_first=True)

# Load the saved CatBoost model (cbm format) and predict for every row;
# the prediction array is the cell's displayed result.
cbl = CatBoostRegressor()
cbl.load_model("Moritz/HousePriceCatBoost", "cbm")
cbl.predict(back_end)


array([116221.48192247, 136969.10870812, 109278.94280232, ...,
       159091.87826614, 221253.1853162 , 223256.17699558])