In [47]:
from enum import Enum
from datetime import datetime

import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
from tensorflow import keras
from stellargraph import StellarGraph
from stellargraph.layer import Node2Vec, link_classification, GraphSAGE, link_regression
from stellargraph.data import BiasedRandomWalk, UnsupervisedSampler
from stellargraph.mapper import Node2VecLinkGenerator, Node2VecNodeGenerator

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import plotly.express as px

from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import optuna
from sklearn.model_selection import KFold

# Задание - Предсказание уровня экспресси белка

<img src='https://www.researchgate.net/publication/313504607/figure/fig3/AS:459880453677066@1486655453033/Protein-protein-interaction-PPI-network-of-DEGs-by-STRING-The-interaction-score-was.png'>




<div class="alert alert-info">
<b>Про биологию</b>
    
Экспрессия — процесс, в ходе которого наследственная информация от гена (последовательности нуклеотидов ДНК) преобразуется в функциональный продукт — белок. Уровнем экспрессии называют - количество белка, производящегося в этом процессе. Чем выше экспрессия белка, тем большее количество этого белка появляется в клетках человека. 
    
    

<div class="alert alert-info">    
<b>Важность задачи</b>
    
Существует множество причин необходимости в знании уровня экспресии белка. Например - это позволяет ученым разрабатывать лекарственные средства и оптимизировать их разработку. Теперь вам предстоит побыть в роли биоинформатика и помочь науке!
    
</div>


<div class="alert alert-info">
<b>Про Датасет</b>
    
Датасет представляет собой граф взаимойдествия белков. Где узлы это белки, взаимодействие между белками это ребро. 

Для каждого белка известен уровень его экспрессии. Ниже приведен список ребер `edges`. Информация по экспрессии белков, разбитая на `train` и `test`.
   
    
</div>

In [2]:
#Список ребер графа 
edges = pd.read_csv("./edges.csv", sep=",", header=None) # Подгрузим данные
edges.head()

Unnamed: 0,0,1
0,344,50
1,344,153
2,344,532
3,344,679
4,344,986


In [3]:
#Подгрузим тренирочную выборку
train = pd.read_csv("./train.csv", sep=",") # Подгрузим данные
train.head()

Unnamed: 0,target,node
0,0.251968,11142
1,0.689541,2243
2,0.678245,15514
3,0.2725,20944
4,0.248888,8721


In [4]:
# Подгрузим отложенную выборку для валидации
test = pd.read_csv("./test.csv", sep=",")
test.head()

Unnamed: 0,target,node
0,0.279231,817
1,0.380795,9574
2,0.686527,1607
3,0.303594,4782
4,0.367374,24125


<div class="alert alert-info">
<b>Про Задачу</b>
    
Вам предлагается предсказать экспрессию белков (`target`) по приведенным данным для отложенной выборки. Ответы в отложенной выборке `test` даны вам для самостоятельной валидации.


    
   
    

<div class="alert alert-info">
<b>Замечание и комментарии</b>
    
    

По ряду причин датасет был упрощен так, чтобы выполнялись следующие условия:
* у графа одна компонента связанности. 
* удалены слишком крупные хабы
* плотность связей графа уменьшена
* решить задачу можно классическими ML подходами
    
   

<div class="alert alert-info">
<b>Оценка результатов</b>
    


Оценка точности модели будет оцениваться по метрике MSE на отложенной выборке `test`
        
</div>

<div class="alert alert-info">
<b>Автор задачи</b>

По всем дополнительным вопросами писать Александру Миленькину
* Телеграмм: Alerin75infskin
* Почта: milenkin.aa@phystech.edu
        
</div>

## EDA

In [5]:
px.histogram(train['target'], nbins=100, histnorm='percent')

In [6]:
high_expression_counts = edges[edges[0].isin(train[train['target'] > 2.0]['node'])].groupby(0).agg(count=(1, 'count'))
high_expression_counts[['count']].quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Unnamed: 0,count
0.05,27.45
0.25,130.75
0.5,297.5
0.75,450.0
0.95,973.1


In [8]:
low_expression_counts = edges[edges[0].isin(train[train['target'] <= 2.0]['node'])].groupby(0).agg(count=(1, 'count'))
low_expression_counts[['count']].quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Unnamed: 0,count
0.05,4.0
0.25,19.0
0.5,37.0
0.75,62.0
0.95,148.0


In [9]:
px.histogram(test['target'], nbins=100, histnorm='percent')

In [10]:
test_high_expression_counts = edges[edges[0].isin(test[test['target'] > 2.0]['node'])].groupby(0).agg(count=(1, 'count'))
test_high_expression_counts[['count']].quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Unnamed: 0,count
0.05,44.4
0.25,226.0
0.5,402.0
0.75,624.0
0.95,1294.75


In [11]:
test_low_expression_counts = edges[edges[0].isin(test[test['target'] <= 2.0]['node'])].groupby(0).agg(count=(1, 'count'))
test_low_expression_counts[['count']].quantile([0.05, 0.25, 0.5, 0.75, 0.95])

Unnamed: 0,count
0.05,4.0
0.25,18.0
0.5,36.0
0.75,62.0
0.95,162.3


Из данных выше можно сделать предположение, что количество связей коррелирует с уровнем экспрессии:

In [12]:
counts = edges.groupby(0).agg(count=(1, 'count')).reset_index()
node_count_target = pd.merge(train, counts, left_on='node', right_on=0, how='left')[['node', 'count', 'target']]
node_count_target['count'] = node_count_target['count'].fillna(0)

node_count_target[['target', 'count']].corr()

Unnamed: 0,target,count
target,1.0,0.791109
count,0.791109,1.0


In [13]:
print(f'Mean target in train set: {train["target"].mean():.3f}')
print(f'Median target in train set: {train["target"].median():.3f}')

Mean target in train set: 0.531
Median target in train set: 0.340


In [14]:
print(f'Mean target in test set: {test["target"].mean():.3f}')
print(f'Median target in test set: {test["target"].median():.3f}')

Mean target in test set: 0.569
Median target in test set: 0.342


## Homework

### Baseline

In [15]:
# baseline
mean_expression = train['target'].mean()
print(f'Mean expression (target): {mean_expression:.3f}')

Mean expression (target): 0.531


In [16]:
print(f'Baseline MSE: {mean_squared_error(test["target"], [mean_expression] * test.shape[0]):.3f}')

Baseline MSE: 0.754


Хорошая модель должна давать MSE ниже 0.754

### ML model

In [35]:
class NotebookMode(Enum):
    FromScratch = 0
    Predefined = 1

mode = NotebookMode.Predefined

In [17]:
edges = nx.read_edgelist('./edges.csv', delimiter=',')
print(nx.info(edges))

Name: 
Type: Graph
Number of nodes: 10000
Number of edges: 594174
Average degree: 118.8348


In [40]:
G = StellarGraph.from_networkx(edges)

In [41]:
def train_model(batch_size, epochs, emb_size, walk_number, walk_length):
    rw = BiasedRandomWalk(G, n=walk_number, length=walk_length, p=0.5, q=2.0, seed=1)
    unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=rw)
    generator = Node2VecLinkGenerator(G, batch_size)
    node2vec = Node2Vec(emb_size, generator=generator)
    x_inp, x_out = node2vec.in_out_tensors()

    prediction = link_regression(output_dim=1, edge_embedding_method="dot")(x_out)

    model = keras.Model(inputs=x_inp, outputs=prediction)

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=keras.losses.MSE)

    history = model.fit(
        generator.flow(unsupervised_samples), epochs=epochs, shuffle=True
    )

    timestamp = datetime.strftime(datetime.now(), '%Y-%m-%dT%H-%M-%S')
    model.save_weights(f'./model-{timestamp}-batch_size-{batch_size}-emb_size-{emb_size}.weights')

    x_inp_src = x_inp[0]
    x_out_src = x_out[0]
    embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

    node_gen = Node2VecNodeGenerator(G, batch_size).flow([str(a) for a in G.nodes()])
    node_embeddings = embedding_model.predict(node_gen, verbose=0)

    df = pd.DataFrame(node_embeddings)
    df['node'] = pd.Series(G.nodes(), dtype=np.int64)

    df_train = df[df['node'].isin(train['node'])]
    df_train = pd.merge(df, train, on='node')

    df_test = df[df['node'].isin(test['node'])]
    df_test = pd.merge(df, test, on='node')

    dt = DecisionTreeRegressor(random_state=1)
    dt.fit(df_train[list(df_train)[:-2]], df_train['target'])

    mse = mean_squared_error(dt.predict(df_test[list(df_test)[:-2]]), df_test['target'])

    return dt, mse

Грубо оценим параметры embedding-ов

In [36]:
batch_size = 64
epochs = 2
emb_size = 64
walk_number = 10
walk_length = 5

mses = []

if mode == NotebookMode.FromScratch:
    for i in tqdm(range(1, 5)):
        for j in tqdm(range(1, 5)):
            model, mse = train_model(64 * i, epochs, 64 * j, walk_number, walk_length)
            mses.append({'batch_size': 64 * i, 'emb_size': 64 * j, 'mse': mse})
else:
    mses=[
        {'batch_size': 64, 'emb_size': 64, 'mse': 1.3511612146704406},
        {'batch_size': 64, 'emb_size': 128, 'mse': 1.2261683918563566},
        {'batch_size': 64, 'emb_size': 192, 'mse': 1.5015654672647705},
        {'batch_size': 64, 'emb_size': 256, 'mse': 1.3650220678643608},
        {'batch_size': 128, 'emb_size': 64, 'mse': 1.20258656355635},
        {'batch_size': 128, 'emb_size': 128, 'mse': 1.2764228195233913},
        {'batch_size': 128, 'emb_size': 192, 'mse': 1.35218135321154},
        {'batch_size': 128, 'emb_size': 256, 'mse': 1.1116913482023314},
        {'batch_size': 192, 'emb_size': 64, 'mse': 1.3060879966580796},
        {'batch_size': 192, 'emb_size': 128, 'mse': 1.2750152115094364},
        {'batch_size': 192, 'emb_size': 192, 'mse': 1.3842525803808972},
        {'batch_size': 192, 'emb_size': 256, 'mse': 1.3887266921972257},
        {'batch_size': 256, 'emb_size': 64, 'mse': 1.3721984636750395},
        {'batch_size': 256, 'emb_size': 128, 'mse': 1.4770992893051553},
        {'batch_size': 256, 'emb_size': 192, 'mse': 1.3716800160102811},
        {'batch_size': 256, 'emb_size': 256, 'mse': 1.6835760014453267}
    ]

In [37]:
sorted(mses, key=lambda x: x['mse'])

[{'batch_size': 128, 'emb_size': 256, 'mse': 1.1116913482023314},
 {'batch_size': 128, 'emb_size': 64, 'mse': 1.20258656355635},
 {'batch_size': 64, 'emb_size': 128, 'mse': 1.2261683918563566},
 {'batch_size': 192, 'emb_size': 128, 'mse': 1.2750152115094364},
 {'batch_size': 128, 'emb_size': 128, 'mse': 1.2764228195233913},
 {'batch_size': 192, 'emb_size': 64, 'mse': 1.3060879966580796},
 {'batch_size': 64, 'emb_size': 64, 'mse': 1.3511612146704406},
 {'batch_size': 128, 'emb_size': 192, 'mse': 1.35218135321154},
 {'batch_size': 64, 'emb_size': 256, 'mse': 1.3650220678643608},
 {'batch_size': 256, 'emb_size': 192, 'mse': 1.3716800160102811},
 {'batch_size': 256, 'emb_size': 64, 'mse': 1.3721984636750395},
 {'batch_size': 192, 'emb_size': 192, 'mse': 1.3842525803808972},
 {'batch_size': 192, 'emb_size': 256, 'mse': 1.3887266921972257},
 {'batch_size': 256, 'emb_size': 128, 'mse': 1.4770992893051553},
 {'batch_size': 64, 'emb_size': 192, 'mse': 1.5015654672647705},
 {'batch_size': 256, '

Наиоблее удачными получились `batch_size` равный 128 и `emb_size` равный 256. Обучим embedding-и с этими параметрами на большем числе эпох

In [100]:
batch_size = 128
epochs = 3
emb_size = 256
walk_number = 10
walk_length = 5

rw = BiasedRandomWalk(G, n=walk_number, length=walk_length, p=0.5, q=2.0, seed=1)
unsupervised_samples = UnsupervisedSampler(G, nodes=list(G.nodes()), walker=rw)
generator = Node2VecLinkGenerator(G, batch_size)
node2vec = Node2Vec(emb_size, generator=generator)
x_inp, x_out = node2vec.in_out_tensors()

prediction = link_regression(output_dim=1, edge_embedding_method="dot")(x_out)

model = keras.Model(inputs=x_inp, outputs=prediction)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=keras.losses.MSE)

if mode == NotebookMode.FromScratch:
    history = model.fit(
        generator.flow(unsupervised_samples), epochs=epochs, shuffle=True
    )

    timestamp = datetime.strftime(datetime.now(), '%Y-%m-%dT%H-%M-%S')
    model.save_weights(f'./model.weights')
else:
    model.load_weights(f'./model.weights')

x_inp_src = x_inp[0]
x_out_src = x_out[0]
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out_src)

node_gen = Node2VecNodeGenerator(G, batch_size).flow([str(a) for a in G.nodes()])
node_embeddings = embedding_model.predict(node_gen, verbose=0)

link_regression: using 'dot' method to combine node embeddings into edge embeddings


#### Модель `DecisionTreeRegressor` только с embedding-ами и параметрами по умолчанию

In [101]:
def create_folds(df: pd.DataFrame) -> pd.DataFrame:
    n_splits: int = 5
    df["kfold"] = -1
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    for fold, (train_indicies, valid_indicies) in enumerate(kf.split(X=df)):
        df.loc[valid_indicies, "kfold"] = fold
    return df

In [117]:
df = pd.DataFrame(node_embeddings)
df['node'] = pd.Series(G.nodes(), dtype=np.int64)
df = pd.merge(df, counts, left_on='node', right_on=0, how='left')
df = df.rename(columns={'0_x': 0})
df = df.drop(['0_x', '0_y'], axis='columns', errors='ignore')
df['count'] = df['count'].fillna(0)
df = create_folds(df)

target_column = 'target'
emb_columns = [x for x in range(256)]
kfold_column = 'kfold'
count_column = ['count']

In [121]:
df_train = df[df['node'].isin(train['node'])]
df_train = pd.merge(df_train, train, on='node')

df_test = df[df['node'].isin(test['node'])]
df_test = pd.merge(df_test, test, on='node')

total_sec = 0
val_scores = []
test_scores = []

print(f'{"fold":<6} | {"val score":<12} | {"test score":<12} | {"elapsed (sec)":<10}')

for fold in sorted(df[kfold_column].unique()):
    start = datetime.now()
    xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain[target_column]
    yvalid = xvalid[target_column]
    
    xtrain = xtrain[emb_columns]
    xvalid = xvalid[emb_columns]

    model = DecisionTreeRegressor(random_state=1)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    mse_valid = mean_squared_error(yvalid, preds_valid)

    mse_tst = mean_squared_error(df_test[target_column], model.predict(df_test[emb_columns]))

    sec_elapsed = (datetime.now() - start).total_seconds()
    total_sec += sec_elapsed
    print('{:<6} | {:<12} | {:<12} | {:<10}'.format(fold, format(mse_valid, '.5f'), format(mse_tst, '.5f'), format(sec_elapsed, '.5f')))
    val_scores.append(mse_valid)
    test_scores.append(mse_tst)

val_mean_metric: float = np.mean(val_scores)
val_std: float = np.std(val_scores)

test_mean_metric: float = np.mean(test_scores)
test_std: float = np.std(test_scores)

total_training_time: float = total_sec / 60

print('-' * 30)
print(f'Mean val score: {val_mean_metric:5f}')
print(f'Std val: {val_std:5f}')
print(f'Mean test score: {test_mean_metric:5f}')
print(f'Std test: {test_std:5f}')
print(f'Total elapsed (min): {total_training_time:5f}')

fold   | val score    | test score   | elapsed (sec)
0      | 1.04350      | 1.59020      | 11.57365  
1      | 1.04441      | 1.38573      | 12.69134  
2      | 0.97755      | 1.34441      | 12.40381  
3      | 1.02528      | 1.38428      | 14.31029  
4      | 0.71688      | 1.56583      | 11.52680  
------------------------------
Mean val score: 0.961524
Std val: 0.124709
Mean test score: 1.454089
Std test: 0.102553
Total elapsed (min): 1.041765


#### Модель `DecisionTreeRegressor` с embedding-ами + дополнительной фичёй "количество связей" и параметрами по умолчанию

In [122]:
df_train = df[df['node'].isin(train['node'])]
df_train = pd.merge(df, train, on='node')

df_test = df[df['node'].isin(test['node'])]
df_test = pd.merge(df, test, on='node')

total_sec = 0
val_scores = []
test_scores = []

print(f'{"fold":<6} | {"val score":<12} | {"test score":<12} | {"elapsed (sec)":<10}')

for fold in sorted(df[kfold_column].unique()):
    start = datetime.now()
    xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain[target_column]
    yvalid = xvalid[target_column]
    
    xtrain = xtrain[emb_columns + count_column]
    xvalid = xvalid[emb_columns + count_column]

    model = DecisionTreeRegressor(random_state=1)
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    mse_valid = mean_squared_error(yvalid, preds_valid)

    mse_tst = mean_squared_error(df_test[target_column], model.predict(df_test[emb_columns + count_column]))

    sec_elapsed = (datetime.now() - start).total_seconds()
    total_sec += sec_elapsed
    print('{:<6} | {:<12} | {:<12} | {:<10}'.format(fold, format(mse_valid, '.5f'), format(mse_tst, '.5f'), format(sec_elapsed, '.5f')))
    val_scores.append(mse_valid)
    test_scores.append(mse_tst)

val_mean_metric: float = np.mean(val_scores)
val_std: float = np.std(val_scores)

test_mean_metric: float = np.mean(test_scores)
test_std: float = np.std(test_scores)

total_training_time: float = total_sec / 60

print('-' * 30)
print(f'Mean val score: {val_mean_metric:5f}')
print(f'Std val: {val_std:5f}')
print(f'Mean test score: {test_mean_metric:5f}')
print(f'Std test: {test_std:5f}')
print(f'Total elapsed (min): {total_training_time:5f}')

fold   | val score    | test score   | elapsed (sec)
0      | 0.26012      | 0.63882      | 7.86585   
1      | 0.38395      | 0.48325      | 8.10773   
2      | 0.41519      | 0.50760      | 8.21514   
3      | 0.33332      | 0.44087      | 7.41453   
4      | 0.29717      | 0.52012      | 9.01373   
------------------------------
Mean val score: 0.337951
Std val: 0.056259
Mean test score: 0.518130
Std test: 0.066133
Total elapsed (min): 0.676950


In [30]:
df_train.to_csv('./train_emb.csv', index=None)
df_test.to_csv('./test_emb.csv', index=None)

In [28]:
regressors = [GradientBoostingRegressor, CatBoostRegressor, LGBMRegressor, XGBRegressor]

models = {}
for reg in tqdm(regressors):
    model = reg(random_state=1)
    model.fit(df_train.drop(['target', 'node'], axis='columns'), df_train['target'])
    mse = mean_squared_error(model.predict(df_test.drop(['target', 'node'], axis='columns')), df_test['target'])
    models[type(model).__name__] = mse

for name, mse in models.items():
    print(f'{name} = {mse:.5f}')

 25%|██▌       | 1/4 [01:04<03:13, 64.38s/it]

Learning rate set to 0.056868
0:	learn: 0.6539479	total: 84.3ms	remaining: 1m 24s
1:	learn: 0.6369086	total: 116ms	remaining: 57.8s
2:	learn: 0.6215903	total: 142ms	remaining: 47.3s
3:	learn: 0.6053108	total: 168ms	remaining: 41.9s
4:	learn: 0.5896859	total: 194ms	remaining: 38.6s
5:	learn: 0.5749495	total: 226ms	remaining: 37.5s
6:	learn: 0.5619029	total: 262ms	remaining: 37.2s
7:	learn: 0.5498287	total: 291ms	remaining: 36.1s
8:	learn: 0.5381778	total: 319ms	remaining: 35.1s
9:	learn: 0.5270785	total: 346ms	remaining: 34.2s
10:	learn: 0.5166052	total: 373ms	remaining: 33.6s
11:	learn: 0.5071540	total: 400ms	remaining: 32.9s
12:	learn: 0.4972954	total: 428ms	remaining: 32.5s
13:	learn: 0.4902496	total: 459ms	remaining: 32.4s
14:	learn: 0.4825290	total: 495ms	remaining: 32.5s
15:	learn: 0.4754537	total: 526ms	remaining: 32.4s
16:	learn: 0.4677756	total: 555ms	remaining: 32.1s
17:	learn: 0.4613365	total: 581ms	remaining: 31.7s
18:	learn: 0.4553377	total: 609ms	remaining: 31.4s
19:	learn

100%|██████████| 4/4 [01:50<00:00, 27.51s/it]

GradientBoostingRegressor = 0.30095
CatBoostRegressor = 0.30737
LGBMRegressor = 0.31346
XGBRegressor = 0.32156





In [126]:
def objective(trial):
    df_train = df[df['node'].isin(train['node'])]
    df_train = pd.merge(df, train, on='node')

    X = df_train[df_train[kfold_column] != 1][emb_columns + count_column]
    y = df_train[df_train[kfold_column] != 1][target_column]
    
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=0.3)

    param = {
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.01, 0.1),
        'depth': trial.suggest_int('depth', 1, 12),
        'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
        'bootstrap_type': trial.suggest_categorical(
            'bootstrap_type', ['Bayesian', 'Bernoulli', 'MVS']
        ),
        'random_state': 1
    }

    if param['bootstrap_type'] == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0, 10)
    elif param['bootstrap_type'] == 'Bernoulli':
        param['subsample'] = trial.suggest_float('subsample', 0.1, 1)

    gbm = CatBoostRegressor(**param)

    gbm.fit(train_x, train_y, eval_set=[(valid_x, valid_y)], verbose=0, early_stopping_rounds=100)

    pred_y = gbm.predict(valid_x)
    score = mean_squared_error(valid_y, pred_y)
    return score

In [127]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=60*60)

[32m[I 2022-04-17 00:40:34,109][0m A new study created in memory with name: no-name-00fa01c0-5816-4903-957c-25e01b0da14d[0m
[32m[I 2022-04-17 00:40:40,091][0m Trial 0 finished with value: 0.14499392758651286 and parameters: {'colsample_bylevel': 0.043452950266044926, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.9836409720579784}. Best is trial 0 with value: 0.14499392758651286.[0m
[32m[I 2022-04-17 00:40:41,999][0m Trial 1 finished with value: 0.18257259064184814 and parameters: {'colsample_bylevel': 0.08803063806912659, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.6475143720691334}. Best is trial 0 with value: 0.14499392758651286.[0m
[32m[I 2022-04-17 00:40:43,939][0m Trial 2 finished with value: 0.1382242928476628 and parameters: {'colsample_bylevel': 0.05755573460492932, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.8267094768160684}. Best 

In [128]:
print(f'Number of finished trials: {len(study.trials)}')

print('Best trial:')
trial = study.best_trial

print(f'\tValue: {trial.value}')

print('\tParams: ')
for key, value in trial.params.items():
    print(f'\t\t{key}: {value}')

Number of finished trials: 50
Best trial:
	Value: 0.11247919127810914
	Params: 
		colsample_bylevel: 0.0728563451458045
		depth: 4
		boosting_type: Plain
		bootstrap_type: MVS


In [129]:
df_train = df[df['node'].isin(train['node'])]
df_train = pd.merge(df, train, on='node')

df_test = df[df['node'].isin(test['node'])]
df_test = pd.merge(df, test, on='node')

total_sec = 0
val_scores = []
test_scores = []

print(f'{"fold":<6} | {"val score":<12} | {"test score":<12} | {"elapsed (sec)":<10}')

for fold in sorted(df[kfold_column].unique()):
    start = datetime.now()
    xtrain =  df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)

    ytrain = xtrain[target_column]
    yvalid = xvalid[target_column]
    
    xtrain = xtrain[emb_columns + count_column]
    xvalid = xvalid[emb_columns + count_column]

    model = CatBoostRegressor(random_state=1, **trial.params)
    model.fit(xtrain, ytrain, silent=True)

    preds_valid = model.predict(xvalid)
    mse_valid = mean_squared_error(yvalid, preds_valid)

    mse_tst = mean_squared_error(df_test[target_column], model.predict(df_test[emb_columns + count_column]))

    sec_elapsed = (datetime.now() - start).total_seconds()
    total_sec += sec_elapsed
    print('{:<6} | {:<12} | {:<12} | {:<10}'.format(fold, format(mse_valid, '.5f'), format(mse_tst, '.5f'), format(sec_elapsed, '.5f')))
    val_scores.append(mse_valid)
    test_scores.append(mse_tst)

val_mean_metric: float = np.mean(val_scores)
val_std: float = np.std(val_scores)

test_mean_metric: float = np.mean(test_scores)
test_std: float = np.std(test_scores)

total_training_time: float = total_sec / 60

print('-' * 30)
print(f'Mean val score: {val_mean_metric:5f}')
print(f'Std val: {val_std:5f}')
print(f'Mean test score: {test_mean_metric:5f}')
print(f'Std test: {test_std:5f}')
print(f'Total elapsed (min): {total_training_time:5f}')

fold   | val score    | test score   | elapsed (sec)
0      | 0.12796      | 0.30242      | 4.05563   
1      | 0.16170      | 0.30128      | 3.81658   
2      | 0.22124      | 0.30690      | 3.80824   
3      | 0.20481      | 0.30228      | 3.86723   
4      | 0.16134      | 0.30292      | 3.77861   
------------------------------
Mean val score: 0.175410
Std val: 0.033473
Mean test score: 0.303162
Std test: 0.001945
Total elapsed (min): 0.322105


Итоговая модель получилась в два раза лучше baseline модели