In [39]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor, Pool

df = pd.read_csv('train.csv', index_col=0)

df = df[df.price_doc > 1000000]

cat_features = ['state', 'product_type', 'year','radiation_raion']

for col in cat_features:
    df[col] = df[col].astype('string')

X = df.drop('price_doc', axis=1)  
y = df['price_doc']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=1)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)

best_rmse =float('inf')
best_params = None
best_model = None

param_combinations = [
    {'depth': 5, 'learning_rate': 0.25, 'iterations': 1400,'bagging_temperature': 0.8},
    {'depth': 5, 'learning_rate': 0.25, 'iterations': 1425,'bagging_temperature': 0.8},
    {'depth': 5, 'learning_rate': 0.26, 'iterations': 1400,'bagging_temperature': 0.8},
    {'depth': 5, 'learning_rate': 0.24, 'iterations': 1400,'bagging_temperature': 0.8}
]

for params in param_combinations:
    model = CatBoostRegressor(
        cat_features=cat_features,
        iterations=params['iterations'],
        learning_rate=params['learning_rate'],
        depth=params['depth'], 
        bagging_temperature=params['bagging_temperature'],
        
    )

    eval_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

    model.fit(X_train, y_train, eval_set=eval_pool, early_stopping_rounds=300)

    test_preds = model.predict(X_test)

    rms = mean_squared_error(y_test, test_preds, squared=False)
    print(f'Params: {params} - Root Mean Squared Error: {rms}')
    
    if rms < best_rmse:
        best_rmse = rms
        best_params = params
        best_model = model

print(f'Best Params: {best_params} - Best Root Mean Squared Error: {best_rmse}')

test_preds = best_model.predict(X_test)

rms = mean_squared_error(y_test, test_preds, squared=False)
print(f'Final Root Mean Squared Error with Best Model: {rms}')


0:	learn: 4084543.8861170	test: 4049793.4227262	best: 4049793.4227262 (0)	total: 31.6ms	remaining: 44.1s
1:	learn: 3667447.0792008	test: 3644668.6898955	best: 3644668.6898955 (1)	total: 68.3ms	remaining: 47.7s
2:	learn: 3380015.1587656	test: 3392824.8114805	best: 3392824.8114805 (2)	total: 112ms	remaining: 52.4s
3:	learn: 3161522.9496446	test: 3201240.9861776	best: 3201240.9861776 (3)	total: 146ms	remaining: 50.8s
4:	learn: 3007778.2710278	test: 3087413.6268348	best: 3087413.6268348 (4)	total: 187ms	remaining: 52.1s
5:	learn: 2893404.9515632	test: 2987190.6042187	best: 2987190.6042187 (5)	total: 227ms	remaining: 52.8s
6:	learn: 2800857.8119496	test: 2916853.0812572	best: 2916853.0812572 (6)	total: 282ms	remaining: 56.2s
7:	learn: 2737057.6347798	test: 2852728.6532097	best: 2852728.6532097 (7)	total: 339ms	remaining: 58.9s
8:	learn: 2683501.1197155	test: 2812220.2894758	best: 2812220.2894758 (8)	total: 390ms	remaining: 1m
9:	learn: 2646887.7233820	test: 2800767.8879410	best: 2800767.887



0:	learn: 4084543.8861170	test: 4049793.4227262	best: 4049793.4227262 (0)	total: 41.6ms	remaining: 59.3s
1:	learn: 3667447.0792008	test: 3644668.6898955	best: 3644668.6898955 (1)	total: 85.4ms	remaining: 1m
2:	learn: 3380015.1587656	test: 3392824.8114805	best: 3392824.8114805 (2)	total: 137ms	remaining: 1m 4s
3:	learn: 3161522.9496446	test: 3201240.9861776	best: 3201240.9861776 (3)	total: 188ms	remaining: 1m 6s
4:	learn: 3007778.2710278	test: 3087413.6268348	best: 3087413.6268348 (4)	total: 228ms	remaining: 1m 4s
5:	learn: 2893404.9515632	test: 2987190.6042187	best: 2987190.6042187 (5)	total: 346ms	remaining: 1m 21s
6:	learn: 2800857.8119496	test: 2916853.0812572	best: 2916853.0812572 (6)	total: 436ms	remaining: 1m 28s
7:	learn: 2737057.6347798	test: 2852728.6532097	best: 2852728.6532097 (7)	total: 484ms	remaining: 1m 25s
8:	learn: 2683501.1197155	test: 2812220.2894758	best: 2812220.2894758 (8)	total: 536ms	remaining: 1m 24s
9:	learn: 2646887.7233820	test: 2800767.8879410	best: 2800767



0:	learn: 4063880.8544219	test: 4030234.7009702	best: 4030234.7009702 (0)	total: 51.7ms	remaining: 1m 12s
1:	learn: 3637217.1600994	test: 3615788.5126833	best: 3615788.5126833 (1)	total: 91.5ms	remaining: 1m 3s
2:	learn: 3348821.0623045	test: 3364011.2088642	best: 3364011.2088642 (2)	total: 130ms	remaining: 1m
3:	learn: 3133799.2808204	test: 3175035.0964062	best: 3175035.0964062 (3)	total: 172ms	remaining: 1m
4:	learn: 2983074.5889884	test: 3065825.6517183	best: 3065825.6517183 (4)	total: 213ms	remaining: 59.5s
5:	learn: 2871617.3191929	test: 2969099.0377765	best: 2969099.0377765 (5)	total: 256ms	remaining: 59.6s
6:	learn: 2780400.8737781	test: 2897512.7961676	best: 2897512.7961676 (6)	total: 303ms	remaining: 1m
7:	learn: 2718102.9405188	test: 2834857.1630330	best: 2834857.1630330 (7)	total: 350ms	remaining: 1m
8:	learn: 2667291.6669602	test: 2796842.9962353	best: 2796842.9962353 (8)	total: 391ms	remaining: 1m
9:	learn: 2621268.9097281	test: 2769459.4154306	best: 2769459.4154306 (9)	to



Params: {'depth': 5, 'learning_rate': 0.26, 'iterations': 1400, 'bagging_temperature': 0.8} - Root Mean Squared Error: 2297729.4464563304
0:	learn: 4105373.4243471	test: 4069529.0548779	best: 4069529.0548779 (0)	total: 29.4ms	remaining: 41.1s
1:	learn: 3698379.7426369	test: 3674243.4417756	best: 3674243.4417756 (1)	total: 59.8ms	remaining: 41.8s
2:	learn: 3412551.4426122	test: 3422938.0030694	best: 3422938.0030694 (2)	total: 90.8ms	remaining: 42.3s
3:	learn: 3191967.8578563	test: 3228911.1871061	best: 3228911.1871061 (3)	total: 119ms	remaining: 41.4s
4:	learn: 3035147.5979774	test: 3110829.8012449	best: 3110829.8012449 (4)	total: 155ms	remaining: 43.3s
5:	learn: 2921611.4882629	test: 3012748.3374685	best: 3012748.3374685 (5)	total: 192ms	remaining: 44.6s
6:	learn: 2831133.7756206	test: 2937389.5066438	best: 2937389.5066438 (6)	total: 228ms	remaining: 45.3s
7:	learn: 2762232.3706777	test: 2870893.1546587	best: 2870893.1546587 (7)	total: 286ms	remaining: 49.8s
8:	learn: 2705470.5916948	t



In [40]:
submission = pd.read_csv('submission.csv')
test_df = pd.read_csv('test.csv', index_col=0)
for col in cat_features:
    test_df[col] = test_df[col].astype('string')
test_preds = model.predict(test_df[X.columns])
submission['price_doc'] = test_preds
print(len(test_preds))
print(submission.shape)
submission.to_csv('submission.csv')


9142
(9142, 3)
