In [None]:
!pip install pycaret[full]

In [None]:
!pip install category_encoders

In [21]:
import category_encoders as ce
import numpy as np

  import pandas.util.testing as tm


## Import Dev and Test

In [18]:
# Both CSV splits are served from the same folder of the project repository.
dataset_root = 'https://raw.githubusercontent.com/elvanselvano/purwadhika-final-project/main/dataset'
dev_path = dataset_root + '/dev.csv'
test_path = dataset_root + '/test.csv'

In [22]:
import pandas as pd

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (DtypeWarning).
dev = pd.read_csv(dev_path, low_memory=False)
test = pd.read_csv(test_path, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
def casting(df):
  """Cast raw columns to their intended dtypes and derive the sale year.

  NUM_UNITS, KITCHENS, CMPLX_NUM and CENSUS_TRACT are converted to pandas'
  nullable ``Int64`` dtype so missing values survive the cast. USECODE and
  ZIPCODE are converted the same way and then turned into categoricals.
  SALEDATE is parsed to datetime and its year is stored in a new SALEYEAR
  column.

  The frame passed in is modified in place and is also returned.
  """
  nullable_int = pd.Int64Dtype()

  # Every column is cast from its own values; CENSUS_TRACT must not be
  # sourced from CMPLX_NUM.
  for column in ('NUM_UNITS', 'KITCHENS', 'CMPLX_NUM', 'CENSUS_TRACT'):
    df[column] = df[column].astype(nullable_int)

  # Codes are integers semantically but are used as categories, not quantities.
  for column in ('USECODE', 'ZIPCODE'):
    df[column] = df[column].astype(nullable_int).astype('category')

  df['SALEDATE'] = pd.to_datetime(df['SALEDATE'])
  df['SALEYEAR'] = df['SALEDATE'].dt.year
  return df

def drop_qualified(df):
  """Keep only rows whose QUALIFIED flag is 'Q', then remove that column.

  Returns a new frame; the input is left untouched.
  """
  is_qualified = df['QUALIFIED'].eq('Q')
  return df.loc[is_qualified].drop(columns='QUALIFIED')

def drop_miss_val1(df):
  """Return a copy of ``df`` without the columns excluded from modelling.

  Raises KeyError if any of the listed columns is absent from ``df``.
  """
  unused_columns = [
    'CMPLX_NUM', 'LIVING_GBA', 'CENSUS_TRACT', 'ASSESSMENT_SUBNBHD',
    'FULLADDRESS', 'NATIONALGRID', 'CENSUS_BLOCK', 'CITY', 'STATE',
    'X', 'Y', 'QUADRANT', 'GIS_LAST_MOD_DTTM', 'SOURCE', 'STORIES',
    'ZIPCODE', 'ASSESSMENT_NBHD', 'SQUARE', 'LONGITUDE', 'LATITUDE',
    'ROOMS', 'SALE_NUM', 'NUM_UNITS', 'BLDG_NUM', 'USECODE',
  ]
  return df.drop(columns=unused_columns)

def yr_rmdl(df):
  """Replace YR_RMDL with a binary RMDL flag.

  RMDL is 1 where YR_RMDL holds a value and 0 where it is missing. The flag
  column is written onto the frame passed in; the returned frame is a new
  one without YR_RMDL.
  """
  remodel_flag = np.where(df['YR_RMDL'].notna(), 1, 0)
  df['RMDL'] = remodel_flag
  return df.drop(columns='YR_RMDL')

def drop_all(df):
  """Return a new frame containing only the rows of ``df`` with no missing values."""
  return df.dropna(axis=0, how='any')

def encoding_categorical(df):
  """Encode AC and CNDTN numerically and discard rows without heating data.

  AC becomes 1 for 'Y' and 0 for anything else (written onto the frame passed
  in). Rows whose HEAT equals 'No Data' are then removed, and CNDTN is
  ordinal-encoded with the explicit Poor(1) .. Excellent(6) mapping.
  """
  condition_levels = {'Poor':1,'Fair':2,'Average':3,'Good':4,'Very Good':5,'Excellent':6}

  df['AC'] = np.where(df['AC'].eq('Y'), 1, 0)
  with_heat_info = df[df['HEAT'].ne('No Data')]

  condition_encoder = ce.OrdinalEncoder(
    cols='CNDTN',
    mapping=[{'col':'CNDTN','mapping':condition_levels}],
  )
  return condition_encoder.fit_transform(with_heat_info)

## Preprocess Dev and Test (Exactly the Same Pipeline)

In [23]:
# Shape before and after preprocessing shows how many rows/columns are removed.
print(dev.shape)
dev = (
    dev
    .pipe(casting)
    .pipe(drop_qualified)
    .pipe(drop_miss_val1)
    .pipe(yr_rmdl)
    .pipe(drop_all)
    .pipe(encoding_categorical)
)
print(dev.shape)

(102661, 49)
(35369, 23)


In [24]:
# Same steps, in the same order, as applied to the dev split.
print(test.shape)
test = (
    test
    .pipe(casting)
    .pipe(drop_qualified)
    .pipe(drop_miss_val1)
    .pipe(yr_rmdl)
    .pipe(drop_all)
    .pipe(encoding_categorical)
)
print(test.shape)

(29526, 49)
(10435, 23)


## Setup PyCaret

In [25]:
import pycaret.regression as reg

In [26]:
# Column roles are declared explicitly rather than left to PyCaret's type inference.
categorical_columns = ['HEAT', 'STYLE', 'STRUCT', 'GRADE', 'EXTWALL', 'ROOF', 'INTWALL', 'WARD', 'AC']
numeric_columns = ['BATHRM', 'HF_BATHRM', 'BEDRM', 'AYB', 'EYB', 'GBA', 'CNDTN', 'KITCHENS', 'FIREPLACES', 'LANDAREA', 'RMDL', 'SALEYEAR']

regression_setup = reg.setup(
    data=dev,
    target='PRICE',
    session_id=42,
    use_gpu=True,
    categorical_features=categorical_columns,
    numeric_features=numeric_columns,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,PRICE
2,Original Data,"(35369, 23)"
3,Missing Values,False
4,Numeric Features,12
5,Categorical Features,9
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(24758, 141)"


## Train Models Using Exactly the Same Setup and the Same Random State

In [30]:
# Lower bound: CatBoost quantile regression at alpha=0.025.
catboost_low = reg.create_model(
    'catboost',
    loss_function='Quantile:alpha=0.025',
    random_state=42,
)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,224847.6139,154041400000.0,392481.1421,0.3768,0.5543,0.625
1,215721.7982,192106800000.0,438299.8901,0.3407,0.5261,0.352
2,229863.4419,235557200000.0,485342.3642,0.4462,0.5372,0.3565
3,219736.3765,151509800000.0,389242.614,0.378,0.5223,0.3477
4,232075.2958,349396300000.0,591097.5102,0.2526,0.5322,0.3531
5,228773.2655,164284500000.0,405320.2069,0.4015,0.5382,0.3617
6,227617.2579,173351800000.0,416355.3198,0.4076,0.5335,0.363
7,226493.9288,150803800000.0,388334.7024,0.3739,0.5282,0.3539
8,221889.7177,149450200000.0,386587.8367,0.403,0.5312,0.3611
9,239718.9926,217497700000.0,466366.4772,0.3611,0.5417,0.3756


In [31]:
# Median: CatBoost quantile regression at alpha=0.5.
catboost_mid = reg.create_model(
    'catboost',
    loss_function='Quantile:alpha=0.5',
    random_state=42,
)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,72752.9791,18516360000.0,136074.8465,0.9251,0.2716,0.6727
1,75910.7106,53801390000.0,231951.2742,0.8154,0.2142,0.1584
2,84728.2727,109089300000.0,330286.7184,0.7435,0.215,0.1595
3,77453.8653,21780360000.0,147581.6954,0.9106,0.211,0.1589
4,81339.869,140477700000.0,374803.5504,0.6995,0.2004,0.1509
5,74279.6075,18742900000.0,136904.7183,0.9317,0.209,0.156
6,79276.3331,34277640000.0,185142.203,0.8829,0.2218,0.1633
7,75503.9555,23160430000.0,152185.4997,0.9038,0.2164,0.1581
8,74836.7785,23449020000.0,153130.7143,0.9063,0.218,0.1631
9,82660.4457,45904180000.0,214252.6082,0.8651,0.2194,0.1648


In [32]:
# Upper bound: CatBoost quantile regression at alpha=0.975.
catboost_high = reg.create_model(
    'catboost',
    loss_function='Quantile:alpha=0.975',
    random_state=42,
)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,235529.5193,117881900000.0,343339.3204,0.5231,0.5383,1.517
1,230078.5825,126916000000.0,356252.7181,0.5645,0.4935,0.5718
2,244747.9922,171764500000.0,414444.8407,0.5962,0.5163,0.6074
3,243183.3477,121760000000.0,348941.2422,0.5002,0.5144,0.6062
4,245894.0007,232301200000.0,481976.3454,0.5031,0.5103,0.6005
5,244622.0062,142058100000.0,376905.9223,0.4825,0.5108,0.6016
6,238031.7253,123103200000.0,350860.5968,0.5793,0.5036,0.5893
7,254897.2189,143378500000.0,378653.5239,0.4047,0.5334,0.6367
8,239640.3796,139527700000.0,373534.0672,0.4426,0.5114,0.6123
9,249575.9975,137874600000.0,371314.6808,0.595,0.5164,0.6159


## Predictions on Unseen Data (Without Finalizing the Models)

In [33]:
# Separate the held-out target from the features passed to predict_model.
data_unseen_target = test['PRICE']
data_unseen_features = test.drop(columns='PRICE')

In [35]:
# Score the same held-out features with each quantile model, in low/mid/high order.
predictions_low, predictions_mid, predictions_high = (
    reg.predict_model(quantile_model, data=data_unseen_features)
    for quantile_model in (catboost_low, catboost_mid, catboost_high)
)

In [39]:
# Summary statistics of the alpha=0.025 model's predictions on the held-out features.
predictions_low['Label'].describe()

count    1.043500e+04
mean     4.764291e+05
std      2.529189e+05
min      6.771662e+04
25%      2.852796e+05
50%      4.253855e+05
75%      6.135230e+05
max      3.009498e+06
Name: Label, dtype: float64

In [40]:
# Summary statistics of the alpha=0.5 model's predictions on the held-out features.
predictions_mid['Label'].describe()

count    1.043500e+04
mean     7.479521e+05
std      5.256905e+05
min      1.386231e+05
25%      4.134811e+05
50%      6.389574e+05
75%      9.048996e+05
max      6.657032e+06
Name: Label, dtype: float64

In [41]:
# Summary statistics of the alpha=0.975 model's predictions on the held-out features.
predictions_high['Label'].describe()

count    1.043500e+04
mean     1.026438e+06
std      7.193845e+05
min      2.901156e+05
25%      6.140511e+05
50%      8.671584e+05
75%      1.176592e+06
max      1.084488e+07
Name: Label, dtype: float64

## Check Invalid Predictions (Crossed Quantiles)

In [42]:
# A negative gap means the alpha=0.5 prediction fell below the alpha=0.025 prediction.
mid_low = predictions_mid['Label'].sub(predictions_low['Label'])
mid_low.loc[mid_low.lt(0)]

3955     -34757.116501
4490     -94774.445127
4857     -37678.705031
5214    -341757.319510
5325    -135464.713922
             ...      
17305    -70765.645440
17306   -120973.734946
17307    -93930.421431
17430   -140449.667476
17431   -133993.514317
Name: Label, Length: 223, dtype: float64

In [43]:
# A negative gap means the alpha=0.975 prediction fell below the alpha=0.025 prediction.
high_low = predictions_high['Label'].sub(predictions_low['Label'])
high_low.loc[high_low.lt(0)]

10279   -144308.024237
10340    -12033.022448
11653    -16608.034537
11654    -30604.922050
11659    -28880.483536
15113    -35290.730512
15629   -735097.841814
16153    -38863.569093
16205    -20719.100908
16318    -27727.134445
16319    -12963.819634
17028     -9441.859949
17050    -21436.957463
Name: Label, dtype: float64