In [57]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_log_error



pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', 500)

In [61]:
# read the train, test and macro files and combine macro information with train and test
train_df = pd.read_csv("train.csv", parse_dates=['timestamp'])
test_df = pd.read_csv("test.csv", parse_dates=['timestamp'])
macro_df = pd.read_csv("macro.csv", parse_dates=['timestamp'])
train_df = pd.merge(train_df, macro_df, how='left', on='timestamp')
test_df = pd.merge(test_df, macro_df, how='left', on='timestamp')
print(train_df.shape, test_df.shape)

# truncate the extreme values in price_doc #
ulimit = np.percentile(train_df.price_doc.values, 99)
llimit = np.percentile(train_df.price_doc.values, 1)
train_df['price_doc'].iloc[train_df['price_doc']>ulimit] = ulimit
train_df['price_doc'].iloc[train_df['price_doc']<llimit] = llimit

(30471, 391) (7662, 390)


In [62]:
# there are few categorical variables in the train data
# we can convert them into numerical variables by label encoding

cat_features = []
for f in train_df.columns:
    if train_df[f].dtype=='object':
        cat_features.append(f)

for f in cat_features:
    print(f)
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_df[f].values.astype('str')) + list(test_df[f].values.astype('str')))
    train_df[f] = lbl.transform(list(train_df[f].values.astype('str')))
    test_df[f] = lbl.transform(list(test_df[f].values.astype('str')))

product_type
sub_area
culture_objects_top_25
thermal_power_plant_raion
incineration_raion
oil_chemistry_raion
radiation_raion
railroad_terminal_raion
big_market_raion
nuclear_reactor_raion
detention_facility_raion
water_1line
big_road1_1line
railroad_1line
ecology
child_on_acc_pre_school
modern_education_share
old_education_build_share


In [24]:
# we also have some null values in the dataset
# one feature idea could be to use the count of nulls in the row.
train_df["null_count"] = train_df.isnull().sum(axis=1)
test_df["null_count"] = test_df.isnull().sum(axis=1)

In [25]:
# impute the missing values with some value which is outside the range of values of the column, say -99.
train_df.fillna(-99, inplace=True)
test_df.fillna(-99, inplace=True)

We have a timestamp variable in the dataset and time could be one of an important factor determining the price. So let us extract some features out of the timestamp variable.

In [26]:
# year and month #
train_df["yearmonth"] = train_df["timestamp"].dt.year*100 + train_df["timestamp"].dt.month
test_df["yearmonth"] = test_df["timestamp"].dt.year*100 + test_df["timestamp"].dt.month

# year and week #
train_df["yearweek"] = train_df["timestamp"].dt.year*100 + train_df["timestamp"].dt.weekofyear
test_df["yearweek"] = test_df["timestamp"].dt.year*100 + test_df["timestamp"].dt.weekofyear

# year #
train_df["year"] = train_df["timestamp"].dt.year
test_df["year"] = test_df["timestamp"].dt.year

# month of year #
train_df["month_of_year"] = train_df["timestamp"].dt.month
test_df["month_of_year"] = test_df["timestamp"].dt.month

# week of year #
train_df["week_of_year"] = train_df["timestamp"].dt.weekofyear
test_df["week_of_year"] = test_df["timestamp"].dt.weekofyear

# day of week #
train_df["day_of_week"] = train_df["timestamp"].dt.weekday
test_df["day_of_week"] = test_df["timestamp"].dt.weekday


In [27]:
# ratio of living area to full area #
train_df["ratio_life_sq_full_sq"] = train_df["life_sq"] / np.maximum(train_df["full_sq"].astype("float"),1)
test_df["ratio_life_sq_full_sq"] = test_df["life_sq"] / np.maximum(test_df["full_sq"].astype("float"),1)
train_df["ratio_life_sq_full_sq"].iloc[train_df["ratio_life_sq_full_sq"]<0] = 0
train_df["ratio_life_sq_full_sq"].iloc[train_df["ratio_life_sq_full_sq"]>1] = 1
test_df["ratio_life_sq_full_sq"].iloc[test_df["ratio_life_sq_full_sq"]<0] = 0
test_df["ratio_life_sq_full_sq"].iloc[test_df["ratio_life_sq_full_sq"]>1] = 1

# ratio of kitchen area to living area #
train_df["ratio_kitch_sq_life_sq"] = train_df["kitch_sq"] / np.maximum(train_df["life_sq"].astype("float"),1)
test_df["ratio_kitch_sq_life_sq"] = test_df["kitch_sq"] / np.maximum(test_df["life_sq"].astype("float"),1)
train_df["ratio_kitch_sq_life_sq"].iloc[train_df["ratio_kitch_sq_life_sq"]<0] = 0
train_df["ratio_kitch_sq_life_sq"].iloc[train_df["ratio_kitch_sq_life_sq"]>1] = 1
test_df["ratio_kitch_sq_life_sq"].iloc[test_df["ratio_kitch_sq_life_sq"]<0] = 0
test_df["ratio_kitch_sq_life_sq"].iloc[test_df["ratio_kitch_sq_life_sq"]>1] = 1

# ratio of kitchen area to full area #
train_df["ratio_kitch_sq_full_sq"] = train_df["kitch_sq"] / np.maximum(train_df["full_sq"].astype("float"),1)
test_df["ratio_kitch_sq_full_sq"] = test_df["kitch_sq"] / np.maximum(test_df["full_sq"].astype("float"),1)
train_df["ratio_kitch_sq_full_sq"].iloc[train_df["ratio_kitch_sq_full_sq"]<0] = 0
train_df["ratio_kitch_sq_full_sq"].iloc[train_df["ratio_kitch_sq_full_sq"]>1] = 1
test_df["ratio_kitch_sq_full_sq"].iloc[test_df["ratio_kitch_sq_full_sq"]<0] = 0
test_df["ratio_kitch_sq_full_sq"].iloc[test_df["ratio_kitch_sq_full_sq"]>1] = 1


Also the next important variables are floor and max_floor. So let us create two variables

 1. Floor number of the house to the total number of floors
 2. Number of floor from the top

In [28]:
# floor of the house to the total number of floors in the house #
train_df["ratio_floor_max_floor"] = train_df["floor"] / train_df["max_floor"].astype("float")
test_df["ratio_floor_max_floor"] = test_df["floor"] / test_df["max_floor"].astype("float")

# num of floor from top #
train_df["floor_from_top"] = train_df["max_floor"] - train_df["floor"]
test_df["floor_from_top"] = test_df["max_floor"] - test_df["floor"]

One more variable from floor area could be the difference between full area and living area.

In [29]:
train_df["extra_sq"] = train_df["full_sq"] - train_df["life_sq"]
test_df["extra_sq"] = test_df["full_sq"] - test_df["life_sq"]

Age of building might have an impact in the rental price and so we can add that one as well.

In [40]:
train_df["age_of_building"] =  train_df["year"] - train_df["build_year"]
test_df["age_of_building"] = test_df["year"] - test_df["build_year"]

Price of the house could also be affected by the availability of other houses at the same time period. So creating a count variable on the number of houses at the given time period might help.

In [31]:
def add_count(df, group_col):
    grouped_df = df.groupby(group_col)["id"].aggregate("count").reset_index()
    grouped_df.columns = [group_col, "count_"+group_col]
    df = pd.merge(df, grouped_df, on=group_col, how="left")
    return df

train_df = add_count(train_df, "yearmonth")
test_df = add_count(test_df, "yearmonth")

train_df = add_count(train_df, "yearweek")
test_df = add_count(test_df, "yearweek")

Since schools generally play an important role in house hunting, let us create some variables around school.

In [32]:
train_df["ratio_preschool"] = train_df["children_preschool"] / train_df["preschool_quota"].astype("float")
test_df["ratio_preschool"] = test_df["children_preschool"] / test_df["preschool_quota"].astype("float")

train_df["ratio_school"] = train_df["children_school"] / train_df["school_quota"].astype("float")
test_df["ratio_school"] = test_df["children_school"] / test_df["school_quota"].astype("float")

In [47]:
# drop the variables which are not needed in model building
train_X = train_df.drop(["id", "timestamp", "price_doc"], axis=1)
test_X = test_df.drop(["id", "timestamp"] , axis=1)

In [49]:
 # use log of the target variable for model building rather than using the actual target variable.
train_y = np.log1p(train_df.price_doc.values)

In this competition, the train and test set are from different time periods and so let us use the last 1 year as validation set for building our models and rest as model development set.

In [77]:
val_time = 201407
dev_indices = np.where(train_X["yearmonth"]<val_time)
val_indices = np.where(train_X["yearmonth"]>=val_time)
dev_X = train_X.iloc[dev_indices]
val_X = train_X.iloc[val_indices]
dev_y = train_y[dev_indices]
val_y = train_y[val_indices]
print(dev_X.shape, val_X.shape)

(20483, 406) (9988, 406)


Обучаем модель catboost на получившихся признаках и смотрим ошибку.

In [64]:
cat = CatBoostRegressor()

In [78]:
cat.fit(dev_X, dev_y, cat_features=cat_features)

0:	learn: 0.5907255	total: 164ms	remaining: 2m 43s
1:	learn: 0.5906770	total: 196ms	remaining: 1m 37s
2:	learn: 0.5906584	total: 228ms	remaining: 1m 15s
3:	learn: 0.5905953	total: 273ms	remaining: 1m 7s
4:	learn: 0.5905868	total: 293ms	remaining: 58.4s
5:	learn: 0.5905504	total: 316ms	remaining: 52.3s
6:	learn: 0.5905344	total: 342ms	remaining: 48.5s
7:	learn: 0.5905193	total: 374ms	remaining: 46.3s
8:	learn: 0.5905166	total: 399ms	remaining: 44s
9:	learn: 0.5904761	total: 432ms	remaining: 42.7s
10:	learn: 0.5904380	total: 460ms	remaining: 41.4s
11:	learn: 0.5903877	total: 560ms	remaining: 46.1s
12:	learn: 0.5903855	total: 581ms	remaining: 44.1s
13:	learn: 0.5903383	total: 628ms	remaining: 44.2s
14:	learn: 0.5903067	total: 656ms	remaining: 43.1s
15:	learn: 0.5902998	total: 679ms	remaining: 41.7s
16:	learn: 0.5902880	total: 706ms	remaining: 40.8s
17:	learn: 0.5902558	total: 733ms	remaining: 40s
18:	learn: 0.5902273	total: 764ms	remaining: 39.5s
19:	learn: 0.5901893	total: 819ms	remainin

162:	learn: 0.5895908	total: 4.45s	remaining: 22.8s
163:	learn: 0.5895902	total: 4.47s	remaining: 22.8s
164:	learn: 0.5895901	total: 4.5s	remaining: 22.8s
165:	learn: 0.5895901	total: 4.51s	remaining: 22.7s
166:	learn: 0.5895901	total: 4.53s	remaining: 22.6s
167:	learn: 0.5895899	total: 4.56s	remaining: 22.6s
168:	learn: 0.5895897	total: 4.58s	remaining: 22.5s
169:	learn: 0.5895890	total: 4.61s	remaining: 22.5s
170:	learn: 0.5895883	total: 4.64s	remaining: 22.5s
171:	learn: 0.5895883	total: 4.66s	remaining: 22.4s
172:	learn: 0.5895882	total: 4.68s	remaining: 22.4s
173:	learn: 0.5895876	total: 4.71s	remaining: 22.4s
174:	learn: 0.5895876	total: 4.72s	remaining: 22.3s
175:	learn: 0.5895876	total: 4.74s	remaining: 22.2s
176:	learn: 0.5895871	total: 4.76s	remaining: 22.1s
177:	learn: 0.5895870	total: 4.79s	remaining: 22.1s
178:	learn: 0.5895866	total: 4.82s	remaining: 22.1s
179:	learn: 0.5895865	total: 4.84s	remaining: 22s
180:	learn: 0.5895864	total: 4.86s	remaining: 22s
181:	learn: 0.589

322:	learn: 0.5522474	total: 17.5s	remaining: 36.8s
323:	learn: 0.5503323	total: 17.6s	remaining: 36.8s
324:	learn: 0.5501525	total: 17.8s	remaining: 36.9s
325:	learn: 0.5501295	total: 18s	remaining: 37.1s
326:	learn: 0.5501124	total: 18.1s	remaining: 37.2s
327:	learn: 0.5500940	total: 18.2s	remaining: 37.2s
328:	learn: 0.5500722	total: 18.3s	remaining: 37.2s
329:	learn: 0.5500564	total: 18.4s	remaining: 37.3s
330:	learn: 0.5500446	total: 18.4s	remaining: 37.3s
331:	learn: 0.5498617	total: 18.6s	remaining: 37.5s
332:	learn: 0.5475831	total: 18.7s	remaining: 37.5s
333:	learn: 0.5472715	total: 18.8s	remaining: 37.6s
334:	learn: 0.5471669	total: 19s	remaining: 37.6s
335:	learn: 0.5471024	total: 19.1s	remaining: 37.7s
336:	learn: 0.5469220	total: 19.2s	remaining: 37.8s
337:	learn: 0.5468299	total: 19.4s	remaining: 38s
338:	learn: 0.5458122	total: 19.5s	remaining: 38s
339:	learn: 0.5456233	total: 19.6s	remaining: 38s
340:	learn: 0.5455200	total: 19.7s	remaining: 38.1s
341:	learn: 0.5453606	

481:	learn: 0.4911903	total: 36.5s	remaining: 39.3s
482:	learn: 0.4911892	total: 36.6s	remaining: 39.2s
483:	learn: 0.4911885	total: 36.7s	remaining: 39.1s
484:	learn: 0.4911569	total: 36.8s	remaining: 39.1s
485:	learn: 0.4910489	total: 36.9s	remaining: 39s
486:	learn: 0.4910482	total: 37s	remaining: 38.9s
487:	learn: 0.4910315	total: 37.1s	remaining: 39s
488:	learn: 0.4910309	total: 37.2s	remaining: 38.9s
489:	learn: 0.4909240	total: 37.3s	remaining: 38.9s
490:	learn: 0.4908309	total: 37.4s	remaining: 38.8s
491:	learn: 0.4907553	total: 37.5s	remaining: 38.7s
492:	learn: 0.4907249	total: 37.6s	remaining: 38.7s
493:	learn: 0.4905614	total: 37.8s	remaining: 38.7s
494:	learn: 0.4905601	total: 37.9s	remaining: 38.6s
495:	learn: 0.4905591	total: 38s	remaining: 38.6s
496:	learn: 0.4905581	total: 38.1s	remaining: 38.5s
497:	learn: 0.4905575	total: 38.1s	remaining: 38.4s
498:	learn: 0.4905420	total: 38.2s	remaining: 38.4s
499:	learn: 0.4904307	total: 38.4s	remaining: 38.4s
500:	learn: 0.490429

640:	learn: 0.4889430	total: 53.6s	remaining: 30s
641:	learn: 0.4889423	total: 53.7s	remaining: 29.9s
642:	learn: 0.4889420	total: 53.8s	remaining: 29.9s
643:	learn: 0.4889412	total: 53.9s	remaining: 29.8s
644:	learn: 0.4889405	total: 53.9s	remaining: 29.7s
645:	learn: 0.4889403	total: 54s	remaining: 29.6s
646:	learn: 0.4889396	total: 54.2s	remaining: 29.6s
647:	learn: 0.4889389	total: 54.3s	remaining: 29.5s
648:	learn: 0.4889384	total: 54.4s	remaining: 29.4s
649:	learn: 0.4889382	total: 54.5s	remaining: 29.3s
650:	learn: 0.4889377	total: 54.6s	remaining: 29.2s
651:	learn: 0.4889373	total: 54.6s	remaining: 29.2s
652:	learn: 0.4889368	total: 54.8s	remaining: 29.1s
653:	learn: 0.4889364	total: 54.9s	remaining: 29.1s
654:	learn: 0.4889360	total: 55s	remaining: 29s
655:	learn: 0.4889352	total: 55.1s	remaining: 28.9s
656:	learn: 0.4889351	total: 55.2s	remaining: 28.8s
657:	learn: 0.4889343	total: 55.3s	remaining: 28.7s
658:	learn: 0.4889340	total: 55.5s	remaining: 28.7s
659:	learn: 0.488933

801:	learn: 0.4888766	total: 1m 11s	remaining: 17.7s
802:	learn: 0.4888764	total: 1m 11s	remaining: 17.6s
803:	learn: 0.4888758	total: 1m 11s	remaining: 17.5s
804:	learn: 0.4888752	total: 1m 12s	remaining: 17.4s
805:	learn: 0.4888751	total: 1m 12s	remaining: 17.4s
806:	learn: 0.4888749	total: 1m 12s	remaining: 17.3s
807:	learn: 0.4888745	total: 1m 12s	remaining: 17.2s
808:	learn: 0.4888743	total: 1m 12s	remaining: 17.1s
809:	learn: 0.4888738	total: 1m 12s	remaining: 17s
810:	learn: 0.4888737	total: 1m 12s	remaining: 17s
811:	learn: 0.4888735	total: 1m 12s	remaining: 16.9s
812:	learn: 0.4888734	total: 1m 12s	remaining: 16.8s
813:	learn: 0.4888731	total: 1m 13s	remaining: 16.7s
814:	learn: 0.4888730	total: 1m 13s	remaining: 16.6s
815:	learn: 0.4888728	total: 1m 13s	remaining: 16.5s
816:	learn: 0.4888723	total: 1m 13s	remaining: 16.4s
817:	learn: 0.4888721	total: 1m 13s	remaining: 16.3s
818:	learn: 0.4888716	total: 1m 13s	remaining: 16.3s
819:	learn: 0.4888714	total: 1m 13s	remaining: 16.

957:	learn: 0.4888322	total: 1m 30s	remaining: 3.97s
958:	learn: 0.4888321	total: 1m 30s	remaining: 3.87s
959:	learn: 0.4888317	total: 1m 30s	remaining: 3.78s
960:	learn: 0.4888315	total: 1m 30s	remaining: 3.69s
961:	learn: 0.4888315	total: 1m 31s	remaining: 3.6s
962:	learn: 0.4888314	total: 1m 31s	remaining: 3.5s
963:	learn: 0.4888305	total: 1m 31s	remaining: 3.41s
964:	learn: 0.4888302	total: 1m 31s	remaining: 3.31s
965:	learn: 0.4888290	total: 1m 31s	remaining: 3.22s
966:	learn: 0.4888288	total: 1m 31s	remaining: 3.13s
967:	learn: 0.4888286	total: 1m 31s	remaining: 3.03s
968:	learn: 0.4888282	total: 1m 31s	remaining: 2.94s
969:	learn: 0.4888280	total: 1m 31s	remaining: 2.84s
970:	learn: 0.4888277	total: 1m 32s	remaining: 2.75s
971:	learn: 0.4888275	total: 1m 32s	remaining: 2.66s
972:	learn: 0.4888275	total: 1m 32s	remaining: 2.56s
973:	learn: 0.4888265	total: 1m 32s	remaining: 2.47s
974:	learn: 0.4888262	total: 1m 32s	remaining: 2.37s
975:	learn: 0.4888260	total: 1m 32s	remaining: 2

<catboost.core.CatBoostRegressor at 0x7faf769206d8>

In [79]:
print('Train RMSLE:', np.sqrt(mean_squared_log_error(dev_y, cat.predict(dev_X))))
print('Test RMSLE:', np.sqrt(mean_squared_log_error(val_y, cat.predict(val_X))))


Train RMSLE: 0.030134504744437005
Test RMSLE: 0.0316381536514007


In [82]:
dict(zip(cat.feature_names_, cat.feature_importances_))

{'full_sq': 33.767229462951526,
 'life_sq': 3.627527758001588,
 'floor': 0.14626184553875302,
 'max_floor': 0.20902912665699241,
 'material': 0.0,
 'build_year': 0.0036586077708640986,
 'num_room': 0.20548366713635396,
 'kitch_sq': 0.2615065575459655,
 'state': 0.4096132019729149,
 'product_type': 1.2559579846532096,
 'sub_area': 23.239668465648155,
 'area_m': 0.0021870725326889767,
 'raion_popul': 0.0,
 'green_zone_part': 0.026999770113223093,
 'indust_part': 0.3522823793810761,
 'children_preschool': 0.0,
 'preschool_quota': 0.06214458712678636,
 'preschool_education_centers_raion': 0.004785541283179527,
 'children_school': 0.0,
 'school_quota': 0.0,
 'school_education_centers_raion': 0.0026813461414880065,
 'school_education_centers_top_20_raion': 0.039450126616804665,
 'hospital_beds_raion': 0.05227361639150078,
 'healthcare_centers_raion': 0.13144437383776372,
 'university_top_20_raion': 0.0,
 'sport_objects_raion': 0.025445259994527906,
 'additional_education_raion': 0.0,
 'cultu