# Preparation

In [1]:
# Data Management/Investigation
import pandas as pd
import numpy as np
import missingno as miss
from plotnine import *
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# For pre-processing data 
from sklearn import preprocessing as pp 
from sklearn.compose import ColumnTransformer 

# For splits and CV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold # Cross validation 
from sklearn.model_selection import cross_validate # Cross validation 
from sklearn.model_selection import GridSearchCV # Cross validation + param. tuning.

# Machine learning methods 
from sklearn.linear_model import LinearRegression as LM
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.ensemble import RandomForestRegressor as RF

# For evaluating our model's performance
import sklearn.metrics as m

# Pipeline to combine modeling elements
from sklearn.pipeline import Pipeline

pd.set_option('display.max_columns', 100)

# Data Preprocessing

In [2]:
#Load the data.
test = pd.read_csv('../test_data.csv')
train = pd.read_csv('../train_data.csv')

In [3]:
#Take a look at it.
train.sample(10)

Unnamed: 0,OBJECTID,SSL,BATHRM,HF_BATHRM,HEAT,HEAT_D,AC,NUM_UNITS,ROOMS,BEDRM,AYB,YR_RMDL,EYB,STORIES,SALEDATE,QUALIFIED,SALE_NUM,GBA,BLDG_NUM,STYLE,STYLE_D,STRUCT,STRUCT_D,GRADE,GRADE_D,CNDTN,CNDTN_D,EXTWALL,EXTWALL_D,ROOF,ROOF_D,INTWALL,INTWALL_D,KITCHENS,FIREPLACES,USECODE,LANDAREA,GIS_LAST_MOD_DTTM,LN_PRICE
39637,64504,3516 0111,3.0,1.0,1.0,Forced Air,Y,2.0,7.0,4.0,1905.0,2015.0,1967,2.0,2014-03-19T00:00:00.000Z,U,1,1714,1,4.0,2 Story,7.0,Row Inside,4.0,Above Average,5.0,Very Good,14.0,Common Brick,2.0,Built Up,11.0,Hardwood/Carp,2.0,0.0,24,1533,2018-07-22T18:01:43.000Z,13.122363
13397,1390,0029 0801,2.0,1.0,1.0,Forced Air,Y,1.0,6.0,2.0,1900.0,1960.0,1950,2.0,2003-11-25T00:00:00.000Z,Q,1,1976,1,4.0,2 Story,7.0,Row Inside,5.0,Good Quality,3.0,Average,20.0,Brick/Stone,2.0,Built Up,6.0,Hardwood,1.0,1.0,11,1600,2018-07-22T18:01:43.000Z,13.31546
37107,105137,6117 0056,4.0,0.0,8.0,Ht Pump,Y,1.0,11.0,5.0,2009.0,,2012,2.0,2010-10-05T00:00:00.000Z,Q,1,2166,1,4.0,2 Story,5.0,Town Inside,3.0,Average,4.0,Good,22.0,Brick/Siding,2.0,Built Up,11.0,Hardwood/Carp,1.0,0.0,11,2167,2018-07-22T18:01:43.000Z,12.384219
2160,21570,1111 0098,1.0,2.0,1.0,Forced Air,Y,1.0,7.0,3.0,1926.0,,1957,2.0,2006-04-11T00:00:00.000Z,Q,1,1336,1,4.0,2 Story,7.0,Row Inside,4.0,Above Average,3.0,Average,14.0,Common Brick,6.0,Metal- Sms,6.0,Hardwood,1.0,0.0,11,2462,2018-07-22T18:01:43.000Z,12.860999
35713,80597,4470 0134,1.0,0.0,13.0,Hot Water Rad,N,1.0,7.0,3.0,1938.0,,1954,2.0,1996-12-23T00:00:00.000Z,Q,1,992,1,4.0,2 Story,7.0,Row Inside,3.0,Average,3.0,Average,14.0,Common Brick,6.0,Metal- Sms,6.0,Hardwood,1.0,0.0,11,1224,2018-07-22T18:01:43.000Z,11.487608
1640,90726,5290 0095,1.0,1.0,1.0,Forced Air,Y,1.0,6.0,2.0,1947.0,2016.0,1967,2.0,2016-07-21T00:00:00.000Z,Q,3,800,1,4.0,2 Story,8.0,Semi-Detached,3.0,Average,4.0,Good,14.0,Common Brick,1.0,Comp Shingle,6.0,Hardwood,1.0,0.0,13,2670,2018-07-22T18:01:43.000Z,12.496875
19751,11137,0876 0828,1.0,0.0,1.0,Forced Air,Y,1.0,6.0,3.0,1900.0,1960.0,1954,2.0,2001-04-24T00:00:00.000Z,Q,1,1248,1,4.0,2 Story,7.0,Row Inside,3.0,Average,3.0,Average,14.0,Common Brick,6.0,Metal- Sms,3.0,Wood Floor,1.0,1.0,11,1480,2018-07-22T18:01:43.000Z,12.628067
9061,61827,3292 0061,3.0,0.0,1.0,Forced Air,Y,1.0,8.0,4.0,1931.0,2014.0,1964,2.0,2013-11-20T00:00:00.000Z,Q,1,1966,1,4.0,2 Story,8.0,Semi-Detached,3.0,Average,4.0,Good,14.0,Common Brick,2.0,Built Up,6.0,Hardwood,1.0,1.0,13,4091,2018-07-22T18:01:43.000Z,13.426903
26806,6533,0618 0063,4.0,0.0,1.0,Forced Air,Y,4.0,12.0,4.0,1910.0,1996.0,1957,3.0,2016-12-28T00:00:00.000Z,U,2,1921,1,7.0,3 Story,6.0,Row End,4.0,Above Average,4.0,Good,14.0,Common Brick,6.0,Metal- Sms,2.0,Carpet,1.0,0.0,24,923,2018-07-22T18:01:43.000Z,11.318746
6808,60268,3248 0077,2.0,1.0,1.0,Forced Air,Y,1.0,7.0,4.0,1923.0,2006.0,1964,2.0,2009-08-27T00:00:00.000Z,U,1,1586,1,4.0,2 Story,6.0,Row End,3.0,Average,4.0,Good,14.0,Common Brick,6.0,Metal- Sms,6.0,Hardwood,1.0,1.0,11,2044,2018-07-22T18:01:43.000Z,12.672946


In [4]:
#Understand the structure and type of data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43590 entries, 0 to 43589
Data columns (total 39 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   OBJECTID           43590 non-null  int64  
 1   SSL                43590 non-null  object 
 2   BATHRM             43589 non-null  float64
 3   HF_BATHRM          43588 non-null  float64
 4   HEAT               43589 non-null  float64
 5   HEAT_D             43589 non-null  object 
 6   AC                 43589 non-null  object 
 7   NUM_UNITS          43589 non-null  float64
 8   ROOMS              43579 non-null  float64
 9   BEDRM              43586 non-null  float64
 10  AYB                43586 non-null  float64
 11  YR_RMDL            25007 non-null  float64
 12  EYB                43590 non-null  int64  
 13  STORIES            43561 non-null  float64
 14  SALEDATE           43590 non-null  object 
 15  QUALIFIED          43590 non-null  object 
 16  SALE_NUM           435

In [5]:
#See the unique value of each columns.
train.nunique()

OBJECTID             43590
SSL                  43584
BATHRM                  14
HF_BATHRM                8
HEAT                    14
HEAT_D                  14
AC                       3
NUM_UNITS                7
ROOMS                   30
BEDRM                   21
AYB                    196
YR_RMDL                 98
EYB                     84
STORIES                 28
SALEDATE              6320
QUALIFIED                2
SALE_NUM                15
GBA                   3731
BLDG_NUM                 2
STYLE                   17
STYLE_D                 17
STRUCT                   8
STRUCT_D                 8
GRADE                   13
GRADE_D                 13
CNDTN                    7
CNDTN_D                  7
EXTWALL                 24
EXTWALL_D               24
ROOF                    15
ROOF_D                  15
INTWALL                 12
INTWALL_D               12
KITCHENS                 8
FIREPLACES              14
USECODE                  8
LANDAREA              7673
G

In [6]:
#Check the missing values of every columns.
train.isnull().sum()

OBJECTID                 0
SSL                      0
BATHRM                   1
HF_BATHRM                2
HEAT                     1
HEAT_D                   1
AC                       1
NUM_UNITS                1
ROOMS                   11
BEDRM                    4
AYB                      4
YR_RMDL              18583
EYB                      0
STORIES                 29
SALEDATE                 0
QUALIFIED                0
SALE_NUM                 0
GBA                      0
BLDG_NUM                 0
STYLE                    1
STYLE_D                  1
STRUCT                   1
STRUCT_D                 1
GRADE                    1
GRADE_D                  1
CNDTN                    1
CNDTN_D                  1
EXTWALL                  1
EXTWALL_D                1
ROOF                     1
ROOF_D                   1
INTWALL                  1
INTWALL_D                1
KITCHENS                 2
FIREPLACES               1
USECODE                  0
LANDAREA                 0
G

We can see that YR_RMDL has nearly half of its entries being empty, so I am going to drop that column from the data. For the rest of the missing values, I will drop them as well. It is because we have enough row and feature to work with even when we drop all the missing values.

In [7]:
#Create a list that contains the columns I do not need.
to_drop = ['OBJECTID','SSL','HEAT_D','YR_RMDL','SALEDATE','SALE_NUM','STYLE_D','STRUCT_D','GRADE_D','CNDTN_D','EXTWALL_D','ROOF_D','INTWALL_D','USECODE','GIS_LAST_MOD_DTTM']

In [8]:
#Drop the columns and reset the index.
train = train.drop(to_drop, axis=1).dropna().reset_index()
test = test.drop(to_drop, axis=1).dropna().reset_index()
train = train.drop(['index'], axis = 1)
test = test.drop(['index'], axis = 1)

In [10]:
#Transform the categorical features into 1s and 0s.
train['AC'] = np.where(train['AC']=='Y', 1, 0)
test['AC'] = np.where(test['AC']=='Y', 1, 0)
train['QUALIFIED'] = np.where(train['QUALIFIED']=='Q', 1, 0)
test['QUALIFIED'] = np.where(test['QUALIFIED']=='Q', 1, 0)

# Model Selection

In [20]:
#Seperate the features from the target column.
train_X = train.drop(['LN_PRICE'], axis = 1)
train_y = train['LN_PRICE']
test_X = test.drop(['LN_PRICE'], axis = 1)
test_y = test['LN_PRICE']

In [22]:
# (1) Set the folds index to ensure comparable samples
fold_generator = KFold(n_splits=5, shuffle=True,random_state=111)

# (2) Next specify the preprocessing steps
preprocess = ColumnTransformer(transformers=[('num', pp.MinMaxScaler(), ['GBA','LANDAREA'])])

# (3) Next Let's create our model pipe
pipe = Pipeline(steps=[('pre_process', preprocess),
                       ('model',None)])

# (4) Specify the models and their repsective tuning parameters. 
search_space = [
    # Linear Model
    {'model' : [LM()]},
    
    # KNN with K tuning param
    {'model' : [KNN()],
     'model__n_neighbors':[10,15,20,25,30]},
    
    # Random forest with the N Estimators tuning param
    {'model' : [RF()],
     'model__max_depth':[1,5,10],
     'model__n_estimators':[500,1000,1250]},
]

# (5) Put it all together in the grid search
search = GridSearchCV(pipe, search_space, 
                      cv = fold_generator,
                      scoring='neg_mean_squared_error',
                      n_jobs=4)

# (6) Fit the model to the training data
search.fit(train_X,train_y)

GridSearchCV(cv=KFold(n_splits=5, random_state=111, shuffle=True),
             estimator=Pipeline(steps=[('pre_process',
                                        ColumnTransformer(transformers=[('num',
                                                                         MinMaxScaler(),
                                                                         ['GBA',
                                                                          'LANDAREA'])])),
                                       ('model', None)]),
             n_jobs=4,
             param_grid=[{'model': [LinearRegression()]},
                         {'model': [KNeighborsRegressor()],
                          'model__n_neighbors': [10, 15, 20, 25, 30]},
                         {'model': [RandomForestRegressor(max_depth=10,
                                                          n_estimators=1000)],
                          'model__max_depth': [1, 5, 10],
                          'model__n_estimators': [500, 100

In [23]:
#See the mean score of cross valedation.
search.best_score_ 

-0.5335736953649469

In [24]:
#Show the best model and parameters
search.best_params_

{'model': RandomForestRegressor(max_depth=10, n_estimators=1000),
 'model__max_depth': 10,
 'model__n_estimators': 1000}

In [25]:
#Run the model on the test data.
pred_y = search.predict(test_X)

In [26]:
#Show the mean square error.
m.mean_squared_error(test_y,pred_y)

0.5263238156235672