# CatBoost

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

In [3]:
df = pd.read_csv("Hitters.csv")
df = df.dropna()
dms = pd.get_dummies(df[['League','Division','NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary','League','Division','NewLeague'],axis=1).astype('float64')
X = pd.concat([X_,dms[['League_N','Division_W','NewLeague_N']]],axis=1)
X_train,X_test,y_train,y_test = train_test_split(X,
                                                y,
                                                test_size=0.25,
                                                random_state=42)

### Model & Tahmin

In [10]:
from catboost import CatBoostRegressor

In [11]:
catb_model=CatBoostRegressor().fit(X_train, y_train)

Learning rate set to 0.029229
0:	learn: 438.1974206	total: 2.67ms	remaining: 2.67s
1:	learn: 432.4168868	total: 4.66ms	remaining: 2.33s
2:	learn: 426.3836690	total: 6.88ms	remaining: 2.28s
3:	learn: 420.2261014	total: 9.5ms	remaining: 2.37s
4:	learn: 414.9976675	total: 11.3ms	remaining: 2.25s
5:	learn: 409.6125323	total: 12.7ms	remaining: 2.1s
6:	learn: 403.9277911	total: 14.1ms	remaining: 2s
7:	learn: 398.4395285	total: 15.4ms	remaining: 1.91s
8:	learn: 392.4517081	total: 16.5ms	remaining: 1.82s
9:	learn: 387.4871123	total: 18.6ms	remaining: 1.84s
10:	learn: 382.6230510	total: 20.3ms	remaining: 1.82s
11:	learn: 378.1012454	total: 21.4ms	remaining: 1.76s
12:	learn: 372.6002306	total: 22.5ms	remaining: 1.7s
13:	learn: 368.4682192	total: 23.5ms	remaining: 1.65s
14:	learn: 364.0565766	total: 24.5ms	remaining: 1.61s
15:	learn: 359.5683249	total: 25.6ms	remaining: 1.57s
16:	learn: 355.1782794	total: 27ms	remaining: 1.56s
17:	learn: 350.4689946	total: 28.7ms	remaining: 1.56s
18:	learn: 346.2

In [12]:
y_pred = catb_model.predict(X_test)
np.sqrt(mean_squared_error(y_test,y_pred))

350.2683163098795

In [13]:
catb_params= {"iterations":[200,500,1000],
              "learning_rate":[0.01,0.1],
              "depth":[3,6,8]}

In [14]:
catb_model = CatBoostRegressor()

In [15]:
catb_cv_model = GridSearchCV(catb_model, catb_params,cv=5,n_jobs=-1, verbose=2).fit(X_train,y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   16.5s


0:	learn: 425.7900818	total: 954us	remaining: 190ms
1:	learn: 404.8723520	total: 1.54ms	remaining: 153ms
2:	learn: 387.4057666	total: 2.08ms	remaining: 137ms
3:	learn: 372.2801584	total: 2.52ms	remaining: 124ms
4:	learn: 358.9204229	total: 3.19ms	remaining: 125ms
5:	learn: 347.0083933	total: 3.77ms	remaining: 122ms
6:	learn: 336.0130818	total: 4.42ms	remaining: 122ms
7:	learn: 324.3923300	total: 4.87ms	remaining: 117ms
8:	learn: 314.8690957	total: 5.38ms	remaining: 114ms
9:	learn: 308.5075563	total: 5.96ms	remaining: 113ms
10:	learn: 298.8587285	total: 6.45ms	remaining: 111ms
11:	learn: 294.7655438	total: 6.89ms	remaining: 108ms
12:	learn: 288.0697862	total: 7.32ms	remaining: 105ms
13:	learn: 282.6697154	total: 7.74ms	remaining: 103ms
14:	learn: 277.6121667	total: 8.12ms	remaining: 100ms
15:	learn: 273.4383979	total: 8.51ms	remaining: 97.9ms
16:	learn: 269.1556201	total: 8.88ms	remaining: 95.6ms
17:	learn: 264.8098704	total: 9.27ms	remaining: 93.7ms
18:	learn: 261.6700768	total: 9.64ms

[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:  1.8min finished


In [16]:
catb_cv_model.best_params_

{'depth': 3, 'iterations': 200, 'learning_rate': 0.1}

In [17]:
catb_tuned=CatBoostRegressor(depth=3, iterations=200, learning_rate=0.1).fit(X_train,y_train)

0:	learn: 425.7900818	total: 658us	remaining: 131ms
1:	learn: 404.8723520	total: 1.23ms	remaining: 122ms
2:	learn: 387.4057666	total: 1.96ms	remaining: 129ms
3:	learn: 372.2801584	total: 3.08ms	remaining: 151ms
4:	learn: 358.9204229	total: 4.01ms	remaining: 156ms
5:	learn: 347.0083933	total: 4.54ms	remaining: 147ms
6:	learn: 336.0130818	total: 5.36ms	remaining: 148ms
7:	learn: 324.3923300	total: 5.92ms	remaining: 142ms
8:	learn: 314.8690957	total: 6.37ms	remaining: 135ms
9:	learn: 308.5075563	total: 6.92ms	remaining: 131ms
10:	learn: 298.8587285	total: 7.51ms	remaining: 129ms
11:	learn: 294.7655438	total: 8.11ms	remaining: 127ms
12:	learn: 288.0697862	total: 8.57ms	remaining: 123ms
13:	learn: 282.6697154	total: 9.03ms	remaining: 120ms
14:	learn: 277.6121667	total: 9.56ms	remaining: 118ms
15:	learn: 273.4383979	total: 10.1ms	remaining: 116ms
16:	learn: 269.1556201	total: 10.9ms	remaining: 117ms
17:	learn: 264.8098704	total: 11.8ms	remaining: 119ms
18:	learn: 261.6700768	total: 12.3ms	re

In [18]:
y_pred=catb_tuned.predict(X_test)

In [19]:
np.sqrt(mean_squared_error(y_test, y_pred))

344.3125832615482