In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 44kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [8]:
# Packages
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.model_selection import cross_val_score, cross_val_predict
from catboost import CatBoostRegressor
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

# Settings
# Switch on seaborn's default styling for every matplotlib figure.
sns.set()
# Keep the colour list of the current property cycle (read after sns.set()).
colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
# Ask the inline backend for SVG figures.
%config InlineBackend.figure_format = 'svg'
# Seed NumPy's global random number generator.
np.random.seed(111)

In [6]:
# Development set: read the preprocessed CSV and discard its 'Unnamed: 0'
# column (presumably an index that was saved along with the data).
dev = pd.read_csv(
    'https://raw.githubusercontent.com/elvanselvano/purwadhika-final-project/main/dataset/dev_preprocessed_final.csv'
).drop(columns='Unnamed: 0')
dev.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,BEDRM,AYB,EYB,SALEDATE,PRICE,GBA,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,LANDAREA,WARD,SALEYEAR,RMDL
0,4,0,Warm Cool,1,4,1910.0,1972,2003-11-25,1095000.0,2522.0,3 Story,Row Inside,Very Good,4,Common Brick,Metal- Sms,Hardwood,2,5,1680,Ward 2,2003,1
1,3,1,Hot Water Rad,1,5,1900.0,1984,2006-07-12,1602000.0,2484.0,3 Story,Row Inside,Very Good,4,Common Brick,Built Up,Hardwood,2,3,1680,Ward 2,2006,1
2,3,2,Hot Water Rad,1,5,1913.0,1972,2010-02-26,1950000.0,5344.0,4 Story,Row Inside,Very Good,4,Common Brick,Built Up,Hardwood,1,4,2196,Ward 2,2010,0
3,3,1,Hot Water Rad,1,4,1906.0,1972,2011-09-29,1050000.0,2401.0,3 Story,Row Inside,Very Good,3,Common Brick,Metal- Sms,Hardwood,2,1,1627,Ward 2,2011,1
4,3,1,Warm Cool,1,3,1917.0,1967,2011-09-30,1325000.0,2692.0,2 Story,Row Inside,Above Average,5,Stucco,Metal- Sms,Hardwood,2,1,1815,Ward 2,2011,1


In [13]:
# Unseen test set: read the preprocessed CSV and discard its 'Unnamed: 0'
# column (presumably an index that was saved along with the data).
test = pd.read_csv(
    'https://raw.githubusercontent.com/elvanselvano/purwadhika-final-project/main/dataset/test_preprocessed_final.csv'
).drop(columns='Unnamed: 0')
test.head()

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,BEDRM,AYB,EYB,SALEDATE,PRICE,GBA,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,LANDAREA,WARD,SALEYEAR,RMDL
0,3,1,Hot Water Rad,1,5,1910.0,1984,2016-06-21,2100000.0,2522.0,3 Story,Row Inside,Very Good,5,Common Brick,Built Up,Hardwood,2,4,1680,Ward 2,2016,1
1,3,1,Warm Cool,1,3,1908.0,1967,2018-05-03,1430000.0,1488.0,2 Story,Row Inside,Above Average,5,Common Brick,Built Up,Hardwood,2,1,1424,Ward 2,2018,1
2,3,1,Warm Cool,1,4,1880.0,1967,2016-09-14,1550000.0,2552.0,3 Story,Row Inside,Above Average,5,Common Brick,Metal- Sms,Hardwood,2,2,1853,Ward 2,2016,1
3,3,1,Forced Air,1,4,1800.0,1967,2018-03-30,1700000.0,2382.0,3 Story,Row Inside,Above Average,6,Common Brick,Metal- Sms,Hardwood,2,3,1853,Ward 2,2018,1
4,3,2,Forced Air,1,4,1800.0,1967,2017-05-01,1700000.0,2520.0,3 Story,Row Inside,Above Average,5,Common Brick,Built Up,Hardwood,2,2,1853,Ward 2,2017,1


In [14]:
# Target is the sale price; every other column of `dev` is a predictor.
y = dev['PRICE']
X = dev.drop(columns='PRICE')

# Keep 20% of the development rows aside, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Wrap mean absolute error so it can be handed to scikit-learn's CV helpers
mae_scorer = make_scorer(mean_absolute_error)

def cv_mae(regressor, x, y, cv=3, scorer=mae_scorer):
    """Print the mean cross-validated score of ``regressor``.

    Parameters
    ----------
    regressor : estimator handed to ``cross_val_score``.
    x, y : features and target to cross-validate on.
    cv : cross-validation setting forwarded to ``cross_val_score`` (default 3).
    scorer : scoring object forwarded as ``scoring``; defaults to the
        module-level ``mae_scorer``.

    Returns
    -------
    None. The averaged score is printed with the prefix ``MAE:``.
    """
    fold_scores = cross_val_score(regressor, x, y, scoring=scorer, cv=cv)
    print('MAE:', fold_scores.mean())

In [15]:
# Baseline to beat: cross-validated MAE of a DummyRegressor on the training split.
cv_mae(DummyRegressor(), X_train, y_train)

MAE: 333422.46155354084


In [16]:
print(X.dtypes)

# Positional indices of every column whose dtype is not float64; these are the
# columns handed to CatBoost as categorical. Note that this selects the int64
# columns as well as the object (string) ones.
# `np.float` (a plain alias of the builtin float) was deprecated in NumPy 1.20
# and removed in 1.24; `np.float64` picks exactly the same columns.
categorical_features_indices = np.where(X.dtypes != np.float64)[0]

BATHRM          int64
HF_BATHRM       int64
HEAT           object
AC              int64
BEDRM           int64
AYB           float64
EYB             int64
SALEDATE       object
GBA           float64
STYLE          object
STRUCT         object
GRADE          object
CNDTN           int64
EXTWALL        object
ROOF           object
INTWALL        object
KITCHENS        int64
FIREPLACES      int64
LANDAREA        int64
WARD           object
SALEYEAR        int64
RMDL            int64
dtype: object


In [17]:
# CatBoost regressor with library defaults apart from a fixed seed and
# silenced training log.
model = CatBoostRegressor(
    random_seed=42,
    logging_level='Silent'
)

In [18]:
# Fit on the training split; categorical columns are identified by position.
# NOTE(review): X_test is used as the eval_set here and is scored again further
# down. If CatBoost picks its best iteration on the eval_set (presumably the
# default when one is supplied; confirm), that later MAE is optimistic.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_test, y_test),
#     logging_level='Verbose',  # you can uncomment this for text output
    plot=True
);


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [21]:
from catboost import CatBoostClassifier, Pool, metrics, cv

In [22]:
# Cross-validate on the whole development set, re-using the parameters the
# model above was configured with.
cv_params = model.get_params()
full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Predictions of the fitted model for the 20% split held out of training.
preds_raw = model.predict(X_test)

In [24]:
from sklearn.metrics import mean_absolute_error

# MAE on the held-out split (the same split that was passed as eval_set to fit).
mean_absolute_error(y_test, preds_raw)

78783.57087734844

In [25]:
# Three quantile regressors that differ only in the target quantile:
# 2.5% (lower bound), 50% (median) and 97.5% (upper bound).
shared_kwargs = dict(cat_features=categorical_features_indices, random_seed=42)
gbrL = CatBoostRegressor(loss_function='Quantile:alpha=0.025', **shared_kwargs)
gbr = CatBoostRegressor(loss_function='Quantile:alpha=0.5', **shared_kwargs)
gbrH = CatBoostRegressor(loss_function='Quantile:alpha=0.975', **shared_kwargs)

# Train each of them on the complete development set, lowest quantile first.
for quantile_model in (gbrL, gbr, gbrH):
    quantile_model.fit(X, y)

# Score the unseen test rows; all predictions are computed before any of them
# is attached to the frame, so the models only ever see the original columns.
unseen_features = test.drop(columns='PRICE')
y_predL, y_pred, y_predH = [
    quantile_model.predict(unseen_features)
    for quantile_model in (gbrL, gbr, gbrH)
]

for column, values in (('low', y_predL), ('mid', y_pred), ('high', y_predH)):
    unseen_features[column] = values

# The median model supplies the point estimate that is compared with the truth.
print('Testing Score: ', mean_absolute_error(test['PRICE'], y_pred))

unseen_features.head()

0:	learn: 12279.0938703	total: 139ms	remaining: 2m 18s
1:	learn: 12174.5239588	total: 275ms	remaining: 2m 17s
2:	learn: 12078.8836102	total: 400ms	remaining: 2m 12s
3:	learn: 11979.6719918	total: 510ms	remaining: 2m 6s
4:	learn: 11907.8185138	total: 628ms	remaining: 2m 4s
5:	learn: 11813.5474453	total: 754ms	remaining: 2m 4s
6:	learn: 11730.8261631	total: 879ms	remaining: 2m 4s
7:	learn: 11656.8365593	total: 1s	remaining: 2m 4s
8:	learn: 11582.9001649	total: 1.13s	remaining: 2m 4s
9:	learn: 11500.7889260	total: 1.24s	remaining: 2m 3s
10:	learn: 11426.7892696	total: 1.37s	remaining: 2m 3s
11:	learn: 11342.8249302	total: 1.49s	remaining: 2m 2s
12:	learn: 11267.5607350	total: 1.61s	remaining: 2m 2s
13:	learn: 11205.3351178	total: 1.73s	remaining: 2m 1s
14:	learn: 11139.1424161	total: 1.85s	remaining: 2m 1s
15:	learn: 11060.3925091	total: 1.98s	remaining: 2m 1s
16:	learn: 11002.9727320	total: 2.09s	remaining: 2m
17:	learn: 10928.2321427	total: 2.21s	remaining: 2m
18:	learn: 10852.6068771	t

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,BEDRM,AYB,EYB,SALEDATE,GBA,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,LANDAREA,WARD,SALEYEAR,RMDL,low,mid,high
0,3,1,Hot Water Rad,1,5,1910.0,1984,2016-06-21,2522.0,3 Story,Row Inside,Very Good,5,Common Brick,Built Up,Hardwood,2,4,1680,Ward 2,2016,1,500917.631957,982321.296498,1681903.0
1,3,1,Warm Cool,1,3,1908.0,1967,2018-05-03,1488.0,2 Story,Row Inside,Above Average,5,Common Brick,Built Up,Hardwood,2,1,1424,Ward 2,2018,1,308110.736197,550072.780155,1025443.0
2,3,1,Warm Cool,1,4,1880.0,1967,2016-09-14,2552.0,3 Story,Row Inside,Above Average,5,Common Brick,Metal- Sms,Hardwood,2,2,1853,Ward 2,2016,1,356730.460068,690623.513757,1476445.0
3,3,1,Forced Air,1,4,1800.0,1967,2018-03-30,2382.0,3 Story,Row Inside,Above Average,6,Common Brick,Metal- Sms,Hardwood,2,3,1853,Ward 2,2018,1,391464.247337,749965.838051,1580055.0
4,3,2,Forced Air,1,4,1800.0,1967,2017-05-01,2520.0,3 Story,Row Inside,Above Average,5,Common Brick,Built Up,Hardwood,2,2,1853,Ward 2,2017,1,391252.556491,698349.82933,1405119.0


In [33]:
# Eyeball ten randomly drawn rows together with their low / mid / high predictions.
unseen_features.sample(10)

Unnamed: 0,BATHRM,HF_BATHRM,HEAT,AC,BEDRM,AYB,EYB,SALEDATE,GBA,STYLE,STRUCT,GRADE,CNDTN,EXTWALL,ROOF,INTWALL,KITCHENS,FIREPLACES,LANDAREA,WARD,SALEYEAR,RMDL,low,mid,high
10017,2,1,Warm Cool,1,4,2016.0,2017,2016-06-23,2264.0,2 Story,Single,Good Quality,5,Brick/Siding,Comp Shingle,Hardwood,1,1,7031,Ward 8,2016,0,158749.148476,241437.72201,535918.4
4163,3,1,Hot Water Rad,1,4,1915.0,1969,2017-05-08,1948.0,2 Story,Row Inside,Good Quality,4,Common Brick,Metal- Sms,Hardwood,2,0,1613,Ward 4,2017,1,194467.707313,332478.805444,654724.5
6471,3,1,Forced Air,1,4,1910.0,1964,2017-07-24,1520.0,2 Story,Row Inside,Average,4,Common Brick,Built Up,Hardwood,1,0,1600,Ward 5,2017,1,90812.364414,201750.893953,357035.6
4812,2,1,Warm Cool,1,4,1910.0,1964,2017-10-20,1248.0,2 Story,Row End,Average,5,Common Brick,Metal- Sms,Wood Floor,1,0,1393,Ward 1,2017,1,144265.191011,254204.427984,618645.9
99,4,0,Warm Cool,1,3,1914.0,1980,2017-07-13,1544.0,3 Story,Row Inside,Above Average,4,Common Brick,Metal- Sms,Hardwood,2,1,1473,Ward 2,2017,1,285371.652496,403691.697184,782332.5
2858,2,1,Warm Cool,1,3,1980.0,1995,2017-12-05,2610.0,2 Story,Row End,Very Good,4,Common Brick,Comp Shingle,Hardwood,1,1,5860,Ward 3,2017,0,353336.511072,511260.329936,1132585.0
653,2,1,Forced Air,1,3,1900.0,1964,2017-09-08,1314.0,2 Story,Row End,Average,4,Common Brick,Built Up,Hardwood,1,0,915,Ward 5,2017,1,67688.028508,154110.209897,322300.8
8290,1,0,Hot Water Rad,0,2,1939.0,1954,2018-01-11,832.0,2 Story,Row Inside,Average,3,Shingle,Built Up,Hardwood,1,0,1711,Ward 7,2018,0,26245.993557,28671.414006,179274.3
4235,2,0,Warm Cool,1,3,1951.0,1972,2016-06-22,1601.0,Split Level,Single,Above Average,3,Common Brick,Comp Shingle,Hardwood,1,1,7382,Ward 4,2016,1,137891.515637,224826.955406,479193.1
5316,3,1,Forced Air,1,4,1923.0,1982,2016-12-07,1472.0,2 Story,Row Inside,Average,5,Common Brick,Built Up,Hardwood,1,0,2353,Ward 4,2016,1,165423.725497,297044.65291,566017.9


In [29]:
# Quantile-crossing check: rows whose median prediction is below the lower bound
# (first element of the shape is the number of such rows).
unseen_features[unseen_features['mid'] < unseen_features['low']].shape

(911, 25)

In [30]:
# Quantile-crossing check: rows whose upper bound is below the lower bound
# (first element of the shape is the number of such rows).
unseen_features[unseen_features['high'] < unseen_features['low']].shape

(5, 25)

In [32]:
# Quantile-crossing check: rows whose upper bound is below the median prediction
# (first element of the shape is the number of such rows).
unseen_features[unseen_features['high'] < unseen_features['mid']].shape

(0, 25)

In [28]:
# Persist the three quantile models, each under its own file name.
for model_path, fitted_model in (('gbrl', gbrL), ('gbr', gbr), ('gbrh', gbrH)):
    fitted_model.save_model(model_path)