In [None]:
!pip install -r requirements.txt

In [31]:
import numpy as np
import pandas as pd
from catboost import Pool, CatBoostRegressor
import warnings
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)
%matplotlib notebook

In [6]:
dataset_path = "../EEG/physionet.org/files/chbmit/1.0.0/"
data = pd.read_csv("dataset_regresion.csv")

In [7]:
data = data.drop('Unnamed: 0', axis=1)
data = data.set_index('name')

In [8]:
X, y = data.drop('shift', axis=1), data['shift']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=10)

In [21]:
from sklearn.model_selection import KFold
n_fold = 5
cv = KFold(n_splits=n_fold, shuffle=True, random_state=10)
rmse, r2 = [], []
catreg = []
params = {
            'iterations':300,
            'learning_rate':0.03,
            'depth':6,
            'eval_metric':'RMSE'
}

for fold, (train_index, val_index) in enumerate(cv.split(X)):
    X_train = X.iloc[train_index,:]
    X_val = X.iloc[val_index,:]

    y_train = y.iloc[train_index]
    y_val = y.iloc[val_index]

    clf = CatBoostRegressor(**params)

    train_dataset = Pool(data=X_train,
                     label=y_train,
                     )

    eval_dataset = Pool(data=X_val,
                    label=y_val,
                    )

    clf.fit(train_dataset,
              use_best_model=True,
              verbose = 3,
              eval_set=eval_dataset)

    y_pred = clf.predict(Pool(data=X_test))

    rmse.append(np.sqrt(mean_squared_error(y_test, y_pred)))
    r2.append(r2_score(y_test, y_pred))
    catreg.append(clf)

    print(f"fold: {fold}, rmse: {np.sqrt(mean_squared_error(y_test, y_pred))}")
    print(f"fold: {fold}, r2: {r2_score(y_test, y_pred)}")

print('CV mean rmse:  {0:.4f}, std: {1:.4f}.'.format(np.mean(rmse), np.std(rmse)))
print('CV mean r2: {0:.4f}, std: {1:.4f}.'.format(np.mean(r2), np.std(r2)))

0:	learn: 7.8686290	test: 8.0358084	best: 8.0358084 (0)	total: 660ms	remaining: 3m 17s
3:	learn: 7.7655885	test: 8.0095770	best: 8.0087145 (2)	total: 2.41s	remaining: 2m 58s
6:	learn: 7.6727242	test: 8.0004088	best: 8.0004088 (6)	total: 4.23s	remaining: 2m 57s
9:	learn: 7.5800314	test: 7.9772535	best: 7.9772535 (9)	total: 6.07s	remaining: 2m 56s
12:	learn: 7.4797501	test: 7.9521721	best: 7.9521721 (12)	total: 8.03s	remaining: 2m 57s
15:	learn: 7.3888221	test: 7.9261222	best: 7.9261222 (15)	total: 9.87s	remaining: 2m 55s
18:	learn: 7.2875244	test: 7.9114414	best: 7.9099956 (17)	total: 11.7s	remaining: 2m 53s
21:	learn: 7.1760176	test: 7.9103042	best: 7.8983480 (19)	total: 13.6s	remaining: 2m 52s
24:	learn: 7.0961114	test: 7.8933748	best: 7.8933748 (24)	total: 15.4s	remaining: 2m 49s
27:	learn: 7.0275765	test: 7.8967492	best: 7.8933748 (24)	total: 17.2s	remaining: 2m 47s
30:	learn: 6.9521042	test: 7.9008733	best: 7.8933748 (24)	total: 19.1s	remaining: 2m 45s
33:	learn: 6.8648655	test: 7.

In [22]:
for n in range(5):
    pred = catreg[n].predict(Pool(data=X_test))
    rmse, r2 = np.sqrt(mean_squared_error(y_test, pred)), r2_score(y_test,pred)
    print(f"Catboost fold: {n} - rmse: {rmse}, r2: {r2}")

Catboost fold: 0 - rmse: 7.232887711581439, r2: 0.2258102847880581
Catboost fold: 1 - rmse: 5.978964074674302, r2: 0.47097543382857365
Catboost fold: 2 - rmse: 1.8974755687808993, r2: 0.9467185129186519
Catboost fold: 3 - rmse: 3.0644816050384285, r2: 0.8610245818388288
Catboost fold: 4 - rmse: 3.913338613276957, r2: 0.7733692645247063


In [23]:
for n in range(5):
    pred = catreg[n].predict(Pool(data=X))
    rmse, r2 = np.sqrt(mean_squared_error(y, pred)), r2_score(y, pred)
    print(f"Catboost fold: {n} - rmse: {rmse}, r2: {r2}")

Catboost fold: 0 - rmse: 4.703738919009714, r2: 0.6473661074367784
Catboost fold: 1 - rmse: 5.51668439871411, r2: 0.5149417301516284
Catboost fold: 2 - rmse: 3.7158544203053148, r2: 0.7799329527022918
Catboost fold: 3 - rmse: 4.174881055452411, r2: 0.7222040894772226
Catboost fold: 4 - rmse: 4.728936722032946, r2: 0.64357788726722


In [34]:
# Посмотрим на ошибки модели
catreg[2].predict(Pool(data=X))-y

name
chb01/chb01_03.edf_766976    12.126272
chb01/chb01_03.edf_762368    -0.807984
chb01/chb01_03.edf_761856    -4.648831
chb01/chb01_04.edf_375552     1.423164
chb01/chb01_04.edf_370944    -1.095303
                               ...    
chb24/chb24_17.edf_895232    -0.869604
chb24/chb24_17.edf_894720    -4.754936
chb24/chb24_21.edf_717824    10.061084
chb24/chb24_21.edf_713216    -0.745795
chb24/chb24_21.edf_712704    -1.175741
Name: shift, Length: 540, dtype: float64

In [42]:
print(f"Модель предсказывает начало раньше в {round(sum(catreg[2].predict(Pool(data=X))-y < 0)/len(X)*100, 2)}%")

Модель предсказывает начало раньше в 56.67%


In [32]:
catreg[2].save_model("start_time.cbm")