In [1]:
import pandas_profiling as pp
import pandas as pd
import numpy as np

## Loading in the Data 

In [2]:
reno_df = pd.read_csv('model_data.csv')

In [3]:
reno_df = reno_df.set_index('PID')

## Exploratory Data Analysis 

In [4]:
reno_df.columns

Index(['MSSubClass', 'Foundation', 'PavedDrive', 'BsmtUnfSF', 'AllBathBsmt',
       'AllBathAbv', 'HeatingQC', 'Neighborhood', 'YearBuilt', 'SalePrice',
       'GarageCars', 'PorchArea', 'GoodLivArea', 'CentralAir', 'KitchenQual',
       'ExterQual', 'BsmtCond', 'FireplaceQu', 'GarageQual', 'HasPool'],
      dtype='object')

In [5]:
reno_df.head(3)

Unnamed: 0_level_0,MSSubClass,Foundation,PavedDrive,BsmtUnfSF,AllBathBsmt,AllBathAbv,HeatingQC,Neighborhood,YearBuilt,SalePrice,GarageCars,PorchArea,GoodLivArea,CentralAir,KitchenQual,ExterQual,BsmtCond,FireplaceQu,GarageQual,HasPool
PID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
526301100,1Fl,CBlock,P,441.0,1.0,1.0,Fa,NAmes,1960,215000,2.0,272,2295.0,Y,TA,TA,Gd,Gd,TA,0
526302030,1Fl,CBlock,Y,171.0,1.0,2.0,Gd,NAmes,1954,149900,2.0,280,2300.0,Y,TA,TA,TA,,TA,0
526302040,1Fl,CBlock,Y,235.0,1.0,1.0,TA,NAmes,1956,157500,1.0,280,1797.0,Y,Gd,TA,TA,TA,TA,0


In [6]:
pp.ProfileReport(reno_df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [7]:
reno_df['Neighborhood'].value_counts()

NAmes      410
CollgCr    236
OldTown    213
Edwards    165
Gilbert    143
Somerst    143
Sawyer     139
NWAmes     123
NridgHt    121
SawyerW    113
Mitchel    104
BrkSide    103
Crawfor     92
IDOTRR      76
NoRidge     67
Timber      54
StoneBr     43
SWISU       42
ClearCr     40
MeadowV     34
BrDale      29
Veenker     23
Blmngtn     23
NPkVill     22
Blueste     10
Greens       8
GrnHill      2
Landmrk      1
Name: Neighborhood, dtype: int64

In [8]:
reno_df['MSSubClass'].value_counts()

1Fl        1343
2Fl         639
1FlPUD      170
SPLIT       152
DUP2FAM     140
2FlPUD      135
Name: MSSubClass, dtype: int64

In [9]:
reno_df['Foundation'].value_counts()

CBlock    1139
PConc     1105
BrkTil     278
Slab        42
Stone       10
Wood         5
Name: Foundation, dtype: int64

## Data Preprocessing

In [10]:
from sklearn.preprocessing import LabelEncoder

lencoder = LabelEncoder()
reno_df['Neighborhood'] = lencoder.fit_transform(reno_df['Neighborhood'])
reno_df['MSSubClass'] = lencoder.fit_transform(reno_df['MSSubClass'])
reno_df['Foundation'] = lencoder.fit_transform(reno_df['Foundation'])
reno_df = reno_df.replace({'None':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5})
reno_df = reno_df.replace({'N':0,'P':1,'Y':2})

In [11]:
reno_df['Neighborhood'].value_counts()

15    410
5     236
20    213
7     165
8     143
24    143
22    139
17    123
19    121
23    113
14    104
3     103
6      92
11     76
18     67
26     54
25     43
21     42
4      40
13     34
2      29
0      23
27     23
16     22
1      10
9       8
10      2
12      1
Name: Neighborhood, dtype: int64

In [12]:
reno_df['MSSubClass'].value_counts()

0    1343
2     639
1     170
5     152
4     140
3     135
Name: MSSubClass, dtype: int64

In [13]:
reno_df['Foundation'].value_counts()

1    1139
2    1105
0     278
3      42
4      10
5       5
Name: Foundation, dtype: int64

In [14]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_validate

In [53]:
X = reno_df.drop(columns = 'SalePrice')
y = np.log10(reno_df['SalePrice'])

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [48]:
from catboost import CatBoostRegressor

In [55]:
params = {'iterations': 10000,
          'learning_rate': 0.001,
          'depth': 3,
          'loss_function': 'RMSE',
          'eval_metric': 'RMSE',
          'random_seed': 2,
          'metric_period': 1000,
          'od_type': 'Iter',
          'od_wait': 20,
          'verbose': True,
          'use_best_model': True}

model = CatBoostRegressor(**params)

model.fit(X_train, y_train, 
             eval_set = (X_test, y_test),
             use_best_model = True,
             plot = True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	learn: 0.1697271	test: 0.1642635	best: 0.1642635 (0)	total: 458us	remaining: 4.58s
1000:	learn: 0.1055204	test: 0.1016702	best: 0.1016702 (1000)	total: 384ms	remaining: 3.45s
2000:	learn: 0.0803512	test: 0.0789773	best: 0.0789773 (2000)	total: 847ms	remaining: 3.38s
3000:	learn: 0.0696518	test: 0.0702800	best: 0.0702800 (3000)	total: 1.28s	remaining: 2.99s
4000:	learn: 0.0644866	test: 0.0664192	best: 0.0664192 (4000)	total: 1.7s	remaining: 2.54s
5000:	learn: 0.0615582	test: 0.0645081	best: 0.0645081 (5000)	total: 2.09s	remaining: 2.09s
6000:	learn: 0.0596194	test: 0.0634394	best: 0.0634394 (5998)	total: 2.57s	remaining: 1.71s
7000:	learn: 0.0580587	test: 0.0626877	best: 0.0626876 (6999)	total: 2.98s	remaining: 1.28s
8000:	learn: 0.0567676	test: 0.0621628	best: 0.0621625 (7999)	total: 3.34s	remaining: 834ms
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.06214692869
bestIteration = 8047

Shrink model to first 8048 iterations.


<catboost.core.CatBoostRegressor at 0x7fc1100f2700>

In [56]:
feat_import = [item for item in zip(list(X.columns), model.get_feature_importance())]
feat_import_df = pd.DataFrame(feat_import, columns = ['Feature', 'VarImport'])
feat_import_df = feat_import_df.sort_values('VarImport', ascending = False)
feat_import_df.head(10)

Unnamed: 0,Feature,VarImport
11,GoodLivArea,37.130735
14,ExterQual,11.250752
8,YearBuilt,10.77576
9,GarageCars,9.205007
13,KitchenQual,7.955826
16,FireplaceQu,6.785077
12,CentralAir,3.584609
5,AllBathAbv,3.284424
3,BsmtUnfSF,2.2155
0,MSSubClass,1.352408


In [57]:
model.get_best_score()

{'learn': {'RMSE': 0.056767569173189175},
 'validation': {'RMSE': 0.06214692869483018}}

In [58]:
print("The training r2: %.5f" % (model.score(X_train, y_train)))
print("The test     r2: %.5f" % (model.score(X_test, y_test)))

The training r2: 0.88847
The test     r2: 0.85699
