In [36]:
import gc
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import typing as ty
from torch import Tensor
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from tqdm import trange
from IPython.display import clear_output
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error

In [37]:
def DataLoadAndScale(dataset_folder, normalizer='standard', regression=False, seed=777, noise=1e-3):
    """Load the ``.npy`` splits of one dataset and build two views of them.

    Files are read from ``data_tabular/<dataset_folder>/`` as
    ``N_{train,val,test}.npy`` (features) and ``y_{train,val,test}.npy`` (targets).

    Args:
        dataset_folder: Sub-folder of ``data_tabular`` holding the splits.
        normalizer: ``'standard'`` selects ``StandardScaler``; any other value
            selects a ``QuantileTransformer`` fitted on noise-jittered train data.
        regression: When equal to ``True`` all targets are standardized with the
            train-target mean and std.
        seed: Seed of the generator producing the jitter (quantile branch only).
        noise: Jitter magnitude (quantile branch only).

    Returns:
        tuple: ``(data_model, data_cb)``.
            ``data_model`` holds torch tensors of imputed + normalized features
            and of the targets, plus ``'mean'`` / ``'std'`` of the train targets.
            ``data_cb`` holds the features exactly as loaded (no imputation or
            scaling), the same targets as numpy arrays, and ``'mean'`` / ``'std'``.
    """
    def _read(prefix, split):
        return np.load(f'data_tabular/{dataset_folder}/{prefix}_{split}.npy')

    features, targets = {}, {}
    for split in ('train', 'test', 'val'):
        features[split] = _read('N', split)
        targets[split] = _read('y', split)

    # Statistics are taken from the train targets before any rescaling.
    m = targets['train'].mean()
    s = targets['train'].std()
    if regression == True:
        targets = {split: (values - m) / s for split, values in targets.items()}

    # This view keeps the features untouched: it is built before imputation.
    data_cb = {
        'X_train': features['train'],
        'X_val': features['val'],
        'X_test': features['test'],
        'y_train': targets['train'],
        'y_val': targets['val'],
        'y_test': targets['test'],
        'mean': m,
        'std': s,
    }

    # Mean imputation fitted on train only, then applied to the other splits.
    imputer = SimpleImputer(strategy='mean')
    filled_train = imputer.fit_transform(features['train'])
    filled_val = imputer.transform(features['val'])
    filled_test = imputer.transform(features['test'])

    if normalizer == 'standard':
        scaler = StandardScaler()
        fit_input = filled_train
    else:
        # Per-column jitter applied to the data the transformer is fitted on;
        # val/test are transformed without jitter.
        column_std = np.std(filled_train, axis=0, keepdims=True)
        jitter_scale = noise / np.maximum(column_std, noise)
        gaussian = np.random.default_rng(seed).standard_normal(filled_train.shape).astype('float32')
        fit_input = filled_train + jitter_scale * gaussian
        scaler = QuantileTransformer(
            output_distribution='normal',
            n_quantiles=max(min(filled_train.shape[0] // 30, 100), 10),  # can be tuned here
            subsample=int(1e9),
        )

    data_model = {
        'X_train': torch.from_numpy(scaler.fit_transform(fit_input)),
        'X_test': torch.from_numpy(scaler.transform(filled_test)),
        'X_val': torch.from_numpy(scaler.transform(filled_val)),
        'y_train': torch.from_numpy(targets['train']),
        'y_test': torch.from_numpy(targets['test']),
        'y_val': torch.from_numpy(targets['val']),
        'mean': m,
        'std': s,
    }

    return data_model, data_cb

In [38]:
def model_num_params(model):
    """Print the size of every named parameter of ``model`` and return the total.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, tensor)`` pairs (e.g. a ``torch.nn.Module``).

    Returns:
        int: Total number of elements over all named parameters.
    """
    total = 0
    for name, tensor in model.named_parameters():
        # numel() is an exact Python int and is 1 for 0-dim parameters, whereas
        # np.prod over an empty shape yields the float 1.0 and would turn the
        # running total into a float.
        count = tensor.numel()
        print('{: <19} ~  {: <7} params'.format(name, count))
        total += count
    print(f'\nIn total: {total} params')
    return total

In [39]:
def create_model_and_optimizer(model_class, model_params, optimizer_params, device=device):
    """Instantiate a model on ``device`` together with an AdamW optimizer for it.

    Args:
        model_class: Callable building the model from keyword arguments.
        model_params: Keyword arguments forwarded to ``model_class``.
        optimizer_params: Keyword arguments forwarded to ``torch.optim.AdamW``.
        device: Target device; defaults to the module-level ``device`` as it was
            when this function was defined.

    Returns:
        tuple: ``(model, optimizer)``.
    """
    net = model_class(**model_params).to(device)
    adamw = torch.optim.AdamW(net.parameters(), **optimizer_params)
    return net, adamw

In [40]:
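# NOTE: legacy full-batch version of get_results, kept commented out for reference only;
# the active, mini-batched get_results is defined further below.
# def get_results(model, criterion, X_train, y_train, X_val, y_val, X_test, y_test, metric, regression=False):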
#     model.eval()
#     X_train_dev = X_train.to(device)
#     X_val_dev = X_val.to(device)
#     X_test_dev = X_test.to(device)
#     y_val_dev = y_val.to(device)
#     y_test_dev = y_test.to(device)
#     with torch.no_grad():
#         preds_val = model(X_val_dev, None)
#         preds_test = model(X_test_dev, None)
#         preds_train = model(X_train_dev, None)
#         val_loss = criterion(preds_val, y_val_dev)
#         test_loss = criterion(preds_test, y_test_dev)
        
#         if regression == False:
#             preds_train = torch.argmax(torch.softmax(preds_train, dim=-1), dim=-1)
#             preds_val = torch.argmax(torch.softmax(preds_val, dim=-1), dim=-1)
#             preds_test = torch.argmax(torch.softmax(preds_test, dim=-1), dim=-1)
        
#         preds_train = preds_train.detach().cpu().numpy()
#         preds_val = preds_val.detach().cpu().numpy()
#         preds_test = preds_test.detach().cpu().numpy()

#         return metric(y_val, preds_val), val_loss, metric(y_test, preds_test), test_loss, metric(y_train, preds_train)

In [41]:
def epoch_eval(model, data, labels, batch_size, criterion, metric, regression=False):
    """Evaluate ``model`` on a dataset in mini-batches.

    Every batch, including the final (possibly smaller) one, contributes to the
    result in proportion to its size. Earlier behaviour skipped the last batch
    and divided by ``num_batches - 1``, which raised ``ZeroDivisionError``
    whenever the data fit into a single batch.

    Batches are moved to the module-level ``device``; the model is called as
    ``model(batch, None)``.

    Args:
        model: Model to evaluate (switched to eval mode here).
        data: Feature tensor, indexed along its first dimension.
        labels: Target tensor aligned with ``data``.
        batch_size: Number of rows per evaluation batch.
        criterion: Loss callable ``criterion(preds, targets)`` returning a
            scalar tensor. Assumed to be a per-batch mean -- TODO confirm.
        metric: Callable ``metric(y_true, y_pred)`` on numpy arrays. For
            regression it is treated as a mean squared error (its square root
            is reported).
        regression: If false, predictions are the argmax over the last
            dimension of the model output; otherwise the raw output is used.

    Returns:
        tuple: ``(average_loss, average_metric)``. For regression the metric is
        the square root of the size-weighted mean of the per-batch metric.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    num_samples = data.shape[0]
    if num_samples == 0:
        raise ValueError('epoch_eval received an empty dataset')

    model.eval()
    loss_sum = 0.0
    metric_sum = 0.0
    with torch.no_grad():
        for start in range(0, num_samples, batch_size):
            batch_X = data[start:start + batch_size].to(device)
            batch_y = labels[start:start + batch_size].to(device)
            rows = batch_X.shape[0]

            preds = model(batch_X, None)
            loss_sum += criterion(preds, batch_y).item() * rows

            if not regression:
                # softmax is monotonic, so argmax of the raw scores is enough.
                preds = torch.argmax(preds, dim=-1)
            preds = preds.detach().cpu().numpy()
            batch_y = batch_y.detach().cpu().numpy()
            metric_sum += metric(batch_y, preds) * rows

    average_metric = metric_sum / num_samples
    if regression:
        # Root taken once over the weighted mean of the squared-error metric.
        average_metric = average_metric ** 0.5
    return loss_sum / num_samples, average_metric

In [51]:
def epoch_eval_labels(model, data, batch_size, regression=False):
    """Predict labels for ``data`` in mini-batches and return them as one tensor.

    Per-batch outputs are kept as tensors and concatenated once, instead of
    being expanded element by element into a Python list and converted back.

    Batches are moved to the module-level ``device``; the model is called as
    ``model(batch, None)``.

    Args:
        model: Model used for prediction (switched to eval mode here).
        data: Feature tensor, indexed along its first dimension.
        batch_size: Number of rows per batch.
        regression: If false, the prediction is the argmax over the last
            dimension of the model output; otherwise the raw output is kept.

    Returns:
        torch.Tensor: CPU tensor of predictions, ``int32`` for classification
        and ``float32`` for regression. Empty (shape ``(0,)``) when ``data``
        has no rows.
    """
    out_dtype = torch.float32 if regression else torch.int32
    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, data.shape[0], batch_size):
            preds = model(data[start:start + batch_size].to(device), None)
            if not regression:
                # softmax is monotonic, so argmax of the raw scores is enough.
                preds = torch.argmax(preds, dim=-1)
            chunks.append(preds.detach().cpu().to(out_dtype))
    if not chunks:
        return torch.empty(0, dtype=out_dtype)
    return torch.cat(chunks, dim=0)

In [43]:
def get_results(model, criterion, X_train, y_train, X_val, y_val, X_test, y_test, metric, eval_batch_size, regression=False):
    """Run ``epoch_eval`` on the train, validation and test splits.

    Returns:
        tuple: ``(val_metric, val_loss, test_metric, test_loss, train_metric)``.
        The train loss is computed by ``epoch_eval`` but not returned.
    """
    def _evaluate(features, targets):
        return epoch_eval(model, features, targets, eval_batch_size, criterion, metric, regression)

    _, train_metric = _evaluate(X_train, y_train)
    val_loss, val_metric = _evaluate(X_val, y_val)
    test_loss, test_metric = _evaluate(X_test, y_test)
    return val_metric, val_loss, test_metric, test_loss, train_metric

In [1]:
def _plot_training_curves(epoch_index, num_epochs, losses, metrics, model_num=None):
    """Clear the cell output and redraw the loss / metric curves so far."""
    suffix = '' if model_num is None else f'_model{model_num}'
    clear_output(True)
    fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(20, 10))
    fig.suptitle(f'#{epoch_index}/{num_epochs}:')
    for ax, title, curves in ((loss_ax, 'loss', losses), (metric_ax, 'accuracy', metrics)):
        ax.set_title(title + suffix)
        for line_style, split in (('r.-', 'train'), ('g.-', 'val'), ('y.-', 'test')):
            ax.plot(curves[split], line_style, label=split)
        ax.legend()
        ax.grid()
    plt.show()
    # One figure is created per epoch; close it explicitly so they do not pile up.
    plt.close(fig)


def train_loop(model, optimizer, X_train, y_train, X_val, y_val, X_test, y_test, mean, std,
               criterion, num_epochs, batch_size, eval_batch_size, patience, metric, file_to_load, model_num=None, regression=False):
    """Train ``model`` for ``num_epochs`` epochs, checkpointing the best epoch.

    After each epoch the model is evaluated with ``get_results``; whenever the
    validation metric improves (higher for classification, lower for
    regression) the model's ``state_dict`` is saved to ``file_to_load``. The
    loss/metric curves are redrawn after every epoch.

    The reported train loss is a size-weighted average over all batches.
    Earlier behaviour skipped the last batch and divided by
    ``num_batches - 1``, which raised ``ZeroDivisionError`` when the training
    set fit into a single batch.

    Batches are moved to the module-level ``device`` and are taken in storage
    order (no shuffling); the model is called as ``model(batch, None)``.

    Args:
        model: Model to train.
        optimizer: Optimizer stepping the model's parameters.
        X_train, y_train, X_val, y_val, X_test, y_test: Feature / target
            tensors of the three splits.
        mean, std: Accepted for interface compatibility; not used here.
        criterion: Loss callable ``criterion(preds, targets)``. Assumed to be a
            per-batch mean -- TODO confirm.
        num_epochs: Number of passes over the training set.
        batch_size: Training batch size.
        eval_batch_size: Batch size forwarded to ``get_results``.
        patience: Accepted for interface compatibility; not used here (no
            early stopping is performed).
        metric: Metric callable forwarded to ``get_results``.
        file_to_load: Path the best ``state_dict`` is written to.
        model_num: Optional index appended to the plot titles.
        regression: Selects the direction of "better" for checkpointing and is
            forwarded to ``get_results``.

    Returns:
        tuple: ``(epoch_test_accuracy, val_losses)`` -- per-epoch test metric
        and per-epoch validation loss.

    Raises:
        ValueError: If ``X_train`` has no rows.
    """
    num_samples = X_train.shape[0]
    if num_samples == 0:
        raise ValueError('train_loop received an empty training set')

    losses = {'train': [], 'val': [], 'test': []}
    metrics = {'train': [], 'val': [], 'test': []}
    # Sentinel that any real metric value is expected to beat on the first epoch.
    best_val_metric = 1e9 if regression else -10

    for epoch_index in trange(num_epochs):
        model.train(True)
        epoch_loss_sum = 0.0
        for start in range(0, num_samples, batch_size):
            batch_X = X_train[start:start + batch_size].to(device)
            batch_y = y_train[start:start + batch_size].to(device)
            preds = model(batch_X, None)

            loss = criterion(preds, batch_y)
            epoch_loss_sum += loss.item() * batch_X.shape[0]

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        losses['train'].append(epoch_loss_sum / num_samples)

        val_metric, val_loss, test_metric, test_loss, train_metric = get_results(
            model, criterion, X_train, y_train, X_val, y_val, X_test, y_test,
            metric, eval_batch_size, regression=regression)

        improved = val_metric < best_val_metric if regression else val_metric > best_val_metric
        if improved:
            torch.save(model.state_dict(), f'{file_to_load}')
            best_val_metric = val_metric

        metrics['train'].append(train_metric)
        metrics['val'].append(val_metric)
        metrics['test'].append(test_metric)
        losses['val'].append(val_loss)
        losses['test'].append(test_loss)

        _plot_training_curves(epoch_index, num_epochs, losses, metrics, model_num)

    return metrics['test'], losses['val']

In [2]:
def LearnNModels(N, model_name, model_params, optimizer_params, data, train_params, metric, file_to_load, regression=False):
    """Train ``N`` models one after another and collect their predictions.

    Each run builds a fresh model/optimizer, trains it with ``train_loop``,
    reloads the checkpoint written to ``file_to_load`` and predicts labels for
    the train, validation and test features. The same checkpoint path is
    reused by every run.

    Args:
        N: Number of models to train.
        model_name: Model class passed to ``create_model_and_optimizer``.
        model_params: Keyword arguments for the model.
        optimizer_params: Keyword arguments for the optimizer.
        data: Dict unpacked into ``train_loop``; also read for ``'X_train'``,
            ``'X_val'``, ``'X_test'`` and ``'y_test'``.
        train_params: Dict unpacked into ``train_loop``; also read for
            ``'eval_batch_size'``.
        metric: Callable ``metric(y_true, y_pred)``.
        file_to_load: Checkpoint path written by ``train_loop`` and read back here.
        regression: Forwarded to training and prediction.

    Returns:
        dict: Lists with one entry per model under ``'train_labels'``,
        ``'val_labels'``, ``'test_labels'``, ``'accuracy'`` (metric on the test
        split) and ``'models'`` (the trained models moved to CPU).
    """
    collected = {
        'train_labels': [],
        'val_labels': [],
        'test_labels': [],
        'accuracy': [],
        'models': [],
    }
    for run_number in range(1, N + 1):
        net, optim = create_model_and_optimizer(model_name, model_params, optimizer_params)
        train_loop(net, optim, **data, **train_params, metric=metric,
                   model_num=run_number, file_to_load=file_to_load, regression=regression)

        # Continue from the best checkpoint rather than the last epoch's weights.
        net.load_state_dict(torch.load(f'{file_to_load}'))
        net.eval()
        with torch.no_grad():
            for split in ('train', 'val', 'test'):
                predicted = epoch_eval_labels(net, data[f'X_{split}'], train_params['eval_batch_size'], regression)
                collected[f'{split}_labels'].append(predicted)
            collected['accuracy'].append(metric(data['y_test'], collected['test_labels'][-1]))

        collected['models'].append(net.cpu())
    return collected

In [3]:
def CatBoostNLearning(teacher_training, cb_type, catboost_params, data_cb, metric, N=5, ensemble_size=5, regression=False, ensemble_N=3):
    """Fit CatBoost on true labels, on single-teacher labels and on ensembled teacher labels.

    Three kinds of students are fitted, each scored with ``metric`` against
    ``data_cb['y_test']``:

    * a baseline fitted on ``data_cb['y_train']`` / ``data_cb['y_val']``;
    * one student per teacher (first ``N`` teachers), fitted on that teacher's
      train/val predictions;
    * ``ensemble_N`` students, the k-th fitted on the aggregate of teachers
      ``[k * ensemble_size, (k + 1) * ensemble_size)`` -- element-wise mode for
      classification, mean for regression.

    The ensemble groups are validated before any fitting starts, so a
    ``teacher_training`` with too few label sets fails immediately instead of
    after all the baseline and distilled fits have already run.

    Args:
        teacher_training: Dict with ``'models'``, ``'train_labels'`` and
            ``'val_labels'`` lists (label entries are torch tensors).
        cb_type: Callable building the CatBoost estimator from keyword arguments.
        catboost_params: Keyword arguments for ``cb_type``.
        data_cb: Dict with ``'X_train'``, ``'X_val'``, ``'X_test'``,
            ``'y_train'``, ``'y_val'``, ``'y_test'``.
        metric: Callable ``metric(y_true, y_pred)``.
        N: Maximum number of individual teachers to distill from.
        ensemble_size: Number of teachers aggregated per ensemble.
        regression: Selects mean (true) or mode (false) aggregation.
        ensemble_N: Number of ensembles to build.

    Returns:
        dict: ``'accuracy'`` maps ``'original'`` and ``'distill_<i>'`` to
        scores; ``'accuracy_ensemble'`` is the list of ensemble-student scores.

    Raises:
        ValueError: If any requested ensemble group has no teacher labels.
    """
    groups = [slice(k * ensemble_size, (k + 1) * ensemble_size) for k in range(ensemble_N)]
    for key in ('train_labels', 'val_labels'):
        for k, group in enumerate(groups):
            if len(teacher_training[key][group]) == 0:
                raise ValueError(
                    f"ensemble {k} has no entries in teacher_training['{key}'] "
                    f"(ensemble_size={ensemble_size}, ensemble_N={ensemble_N}, "
                    f"available={len(teacher_training[key])})"
                )

    def _fit_and_score(train_targets, val_targets):
        """Fit a fresh estimator on the given targets and score it on the test split."""
        student = cb_type(**catboost_params)
        student.fit(data_cb['X_train'], train_targets, eval_set=(data_cb['X_val'], val_targets))
        return metric(data_cb['y_test'], student.predict(data_cb['X_test']))

    def _aggregate(label_sets):
        """Combine several teachers' labels element-wise (mean or mode)."""
        stacked = torch.stack(label_sets, dim=-1)
        if regression:
            return torch.mean(stacked, dim=-1)
        # torch.mode returns (values, indices); only the values are labels.
        return torch.mode(stacked, dim=-1).values

    accuracy = {'original': _fit_and_score(data_cb['y_train'], data_cb['y_val'])}

    for i in trange(len(teacher_training['models'][:N])):
        accuracy[f'distill_{i + 1}'] = _fit_and_score(
            teacher_training['train_labels'][i].numpy(),
            teacher_training['val_labels'][i].numpy(),
        )

    accuracy_ensemble = []
    for group in groups:
        train_targets = _aggregate(teacher_training['train_labels'][group])
        val_targets = _aggregate(teacher_training['val_labels'][group])
        accuracy_ensemble.append(_fit_and_score(train_targets.numpy(), val_targets.numpy()))

    return {'accuracy' : accuracy, 'accuracy_ensemble' : accuracy_ensemble}

In [49]:
def PrintResults(results_models_learning, results_cb_learning, regression=False, std = None):
    """Print teacher, baseline and distilled scores, then the mean ensemble score.

    For regression every score ``v`` is reported as ``v ** 0.5 * std``.

    Args:
        results_models_learning: Dict whose ``'accuracy'`` list holds one score
            per teacher model.
        results_cb_learning: Dict with ``'accuracy'`` (keys ``'original'`` and
            ``'distill_<i>'``) and ``'accuracy_ensemble'`` (list of scores).
        regression: Whether to apply the ``** 0.5 * std`` conversion.
        std: Scale factor used for regression; required when ``regression`` is true.

    Raises:
        ValueError: If ``regression`` is true and ``std`` is ``None``.

    Note:
        When ``'accuracy_ensemble'`` is empty the averaged line is skipped
        (there is nothing to average) instead of dividing by zero.
    """
    if regression and std is None:
        raise ValueError('std must be provided when regression=True')

    def _report(value):
        """Convert a raw score to the reported scale."""
        return value ** 0.5 * std if regression else value

    # Every key except 'original' corresponds to one distilled student.
    for i in range(len(results_cb_learning['accuracy']) - 1):
        model_metric = _report(results_models_learning['accuracy'][i])
        cb_original_metric = _report(results_cb_learning['accuracy']['original'])
        cb_metric = _report(results_cb_learning['accuracy'][f'distill_{i + 1}'])
        print(f'model_{i}_metric : {model_metric}')
        print(f'distill_{i} : {cb_original_metric} ---------> {cb_metric}')

    ensemble_scores = [_report(score) for score in results_cb_learning['accuracy_ensemble']]
    if ensemble_scores:
        print('averaged ensembles accuracy : ', sum(ensemble_scores, 0.0) / len(ensemble_scores))