# Catboost

## Packages

In [3]:
import pandas as pd

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from catboost import CatBoostRegressor



## Import Data

In [4]:
df = pd.read_csv('train_df.csv')
df.head()

Unnamed: 0,age,Pedu,traveltime,studytime,failures,famrel,gooutAlc,health,sex,addressInternet,...,Mjob_services,Mjob_teacher,Fjob_at_home,Fjob_health,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_reputation,G3
0,1.021751,-0.940091,-0.642435,-1.233786,0.895343,-1.053136,1.570863,1.039751,1,1,...,1,0,0,0,1,0,0,0,0,8
1,0.238078,-0.940091,-0.642435,-0.042232,-0.449374,0.062115,1.570863,0.320484,1,1,...,0,0,0,0,0,0,0,1,0,13
2,0.238078,-0.940091,0.791247,-0.042232,-0.449374,0.062115,0.657033,1.039751,0,1,...,1,0,0,0,0,0,1,0,0,12
3,-0.545595,1.02465,-0.642435,-1.233786,-0.449374,-1.053136,-1.170628,1.039751,1,1,...,0,1,0,0,0,1,1,0,0,0
4,1.805423,-1.922461,-0.642435,-0.042232,0.895343,0.062115,-0.256798,-0.398784,0,1,...,0,0,0,0,0,0,0,0,1,10


In [5]:
X = df.drop(['G3'], axis = 1)
y = df[['G3']]

## Decision Tree

In [6]:
model = CatBoostRegressor(iterations=500, depth=10, learning_rate=0.05, loss_function='RMSE')
model.fit(X, y, verbose=100)


0:	learn: 4.6434800	total: 55.5ms	remaining: 27.7s
100:	learn: 2.0478722	total: 1.09s	remaining: 4.32s
200:	learn: 0.9164319	total: 2.24s	remaining: 3.33s
300:	learn: 0.4564437	total: 3.39s	remaining: 2.24s
400:	learn: 0.2679980	total: 4.54s	remaining: 1.12s
499:	learn: 0.1760712	total: 5.71s	remaining: 0us


<catboost.core.CatBoostRegressor at 0x280481540>

In [8]:
model = CatBoostRegressor(iterations=500, depth=15, learning_rate=0.05, loss_function='RMSE', verbose = 100)

mse = cross_val_score(model, X, y, cv=8, scoring='neg_mean_squared_error')
r2 = cross_val_score(model, X, y, cv=8, scoring='r2')

print(f"CatBoost: \tMSE: {mse.mean()} \tR2: {r2.mean()}")

CatBoost: 	MSE: -20.70176087520251 	R2: 0.026617043366829282


## Subset of features

In [9]:
features = ['Pedu', 'studytime', 'failures', 'gooutAlc', 'sex', 'addressInternet', 'famsize', 'schoolsup', 'higher', 'romantic', 'Mjob_health', 'Mjob_services', 'Mjob_teacher']

model = CatBoostRegressor(verbose = 100)

param_grid = {
    'iterations': [500, 1000],
    'depth': [5, 10, 15],
    'learning_rate': [0.01, 0.05, 0.1],
    'loss_function': ['RMSE'],
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring = 'r2')
grid_search.fit(X[features], y.values.ravel())

print(grid_search.best_score_)
print(grid_search.best_params_)


0:	learn: 4.6521064	total: 583us	remaining: 291ms
1:	learn: 4.6433794	total: 1.13ms	remaining: 282ms
2:	learn: 4.6370865	total: 1.26ms	remaining: 208ms
3:	learn: 4.6314498	total: 1.67ms	remaining: 207ms
4:	learn: 4.6236725	total: 2.08ms	remaining: 206ms
5:	learn: 4.6184342	total: 2.52ms	remaining: 207ms
6:	learn: 4.6123185	total: 2.99ms	remaining: 211ms
7:	learn: 4.6067436	total: 3.23ms	remaining: 198ms
8:	learn: 4.5983913	total: 3.65ms	remaining: 199ms
9:	learn: 4.5918513	total: 4.09ms	remaining: 200ms
10:	learn: 4.5851181	total: 4.47ms	remaining: 199ms
11:	learn: 4.5792761	total: 4.91ms	remaining: 200ms
12:	learn: 4.5726734	total: 5.35ms	remaining: 201ms
13:	learn: 4.5670250	total: 5.6ms	remaining: 194ms
14:	learn: 4.5596770	total: 5.99ms	remaining: 194ms
15:	learn: 4.5541376	total: 6.39ms	remaining: 193ms
16:	learn: 4.5494752	total: 6.77ms	remaining: 192ms
17:	learn: 4.5454086	total: 7.06ms	remaining: 189ms
18:	learn: 4.5388802	total: 7.45ms	remaining: 189ms
19:	learn: 4.5336319	tot