In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import Pool, CatBoostClassifier, cv, CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score, f1_score
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import RobustScaler
from scipy.interpolate import interp1d
from scipy.fft import fft
from tqdm import tqdm
from joblib import dump, load

import os
import copy

import pickle


In [2]:
class Ensemble:
    """Equal-weight averaging ensemble over a list of ``(model, name)`` pairs.

    On construction the training frame is extended with Cartesian coordinates
    and split into a train part and a hold-out part. ``fit`` trains every
    model and scores the averaged hold-out prediction; ``predict`` averages
    the models' predictions for new data.

    Parameters
    ----------
    models : list of (model, name) tuples
        Models whose name starts with ``'cb'`` are fitted on a CatBoost
        ``Pool``; every other model is fitted with ``fit(X, y)``.
    train : pandas.DataFrame
        Must contain the columns 'Дальность (м)', 'Азимут', 'У.М.' and the
        target column of the selected task. The frame is not modified; only
        the target column is dropped from the features.
    top_5_least_important : list
        Stored on the instance; this class does not use it.
    line : float, default 1
        Threshold applied to the averaged prediction when ``task == 'class'``
        (values strictly above it become 1.0, all others 0.0). Not used when
        ``task == 'regr'``.
    task : {'regr', 'class'}, default 'regr'
        'regr' targets 'Доля сигнала в ВП'; 'class' targets 'Тип марсианина'.
    test_size : float, default 0.1
        Hold-out fraction passed to ``train_test_split``.
    random_state : int or None, default None
        Seed passed to ``train_test_split``; ``None`` keeps the split random.

    Raises
    ------
    ValueError
        If ``task`` is not one of the supported values.

    Notes
    -----
    NOTE(review): CatBoost models are fitted without an ``eval_set``, so an
    ``early_stopping_rounds`` setting on them presumably has no effect —
    confirm against the CatBoost documentation.
    """

    # Target column for each supported task.
    _TARGETS = {'regr': 'Доля сигнала в ВП', 'class': 'Тип марсианина'}

    def __init__(self, models, train, top_5_least_important, line=1, task="regr",
                 test_size=0.1, random_state=None):
        if task not in self._TARGETS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {sorted(self._TARGETS)}"
            )

        # Work on an extended copy so the caller's frame keeps its columns.
        train = self._with_cartesian_coords(train)

        self.train = train
        self.models = models
        self.top_5_least_important = top_5_least_important
        self.task = task
        self.line = line

        target = self._TARGETS[task]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            train.drop(target, axis=1),
            train[target],
            test_size=test_size,
            shuffle=True,
            random_state=random_state,
        )
        # CatBoost-native containers for the same split.
        self.train_cb = Pool(data=self.X_train, label=self.y_train)
        self.test_cb = Pool(data=self.X_test, label=self.y_test)

    @staticmethod
    def _with_cartesian_coords(frame):
        """Return a copy of ``frame`` with x_coord / y_coord / z_coord columns.

        The formula treats 'Дальность (м)' as a range and 'Азимут' / 'У.М.'
        as azimuth and elevation angles. ``np.sin`` / ``np.cos`` work in
        radians — assumes the angle columns are in radians, TODO confirm.
        """
        distance = frame['Дальность (м)']
        azimuth = frame['Азимут']
        elevation = frame['У.М.']
        return frame.assign(
            x_coord=distance * np.sin(azimuth) * np.cos(elevation),
            y_coord=distance * np.cos(azimuth) * np.cos(elevation),
            z_coord=distance * np.sin(elevation),
        )

    def _combine(self, predictions):
        """Average per-model predictions; threshold them for the 'class' task."""
        if not predictions:
            raise ValueError("Ensemble needs at least one model")
        # Accumulate in float so integer predictions can be averaged and the
        # arrays returned by the models are never modified.
        total = np.zeros(np.shape(predictions[0]), dtype=float)
        for prediction in predictions:
            total = total + prediction
        mean = total / len(predictions)
        if self.task == 'class':
            # Single comparison: correct for any threshold, including negative.
            return (mean > self.line).astype(float)
        return mean

    def fit(self):
        """Train every model and evaluate on the hold-out split.

        Returns
        -------
        scores : list of dict
            One ``{name: score}`` per model. For 'regr' the score is R² * 70;
            for 'class' it is the mean squared error of the raw predictions
            (lower is better), not an F1 value.
        total_score : float
            Score of the averaged ensemble: R² * 70 for 'regr', or F1 * 30 of
            the thresholded average for 'class'.

        Raises
        ------
        ValueError
            If the ensemble has no models.
        """
        if not self.models:
            raise ValueError("Ensemble needs at least one model")

        y_true = np.array(self.y_test)
        scores = []
        holdout_preds = []
        for model, name in tqdm(self.models, desc='Обучение моделей', ncols=100):
            if name[:2] == 'cb':
                model.fit(self.train_cb)
            else:
                model.fit(self.X_train, self.y_train)
            preds = model.predict(self.X_test)
            if self.task == 'regr':
                score = r2_score(y_true, preds) * 70
            else:
                score = mean_squared_error(y_true, preds)
            scores.append({name: score})
            holdout_preds.append(preds)

        combined = self._combine(holdout_preds)
        if self.task == 'class':
            total_score = f1_score(y_true, combined) * 30
        else:
            total_score = r2_score(y_true, combined) * 70

        return scores, total_score

    def predict(self, test):
        """Predict with every model and average the results.

        Parameters
        ----------
        test : pandas.DataFrame
            Feature frame containing 'Дальность (м)', 'Азимут' and 'У.М.'.
            It is not modified; the coordinate columns are added to a copy.

        Returns
        -------
        preds : list of (name, ndarray)
            Raw prediction of each model.
        sums : ndarray
            Averaged prediction; for the 'class' task it is thresholded at
            ``self.line`` into 0.0 / 1.0.

        Raises
        ------
        ValueError
            If the ensemble has no models.
        """
        features = self._with_cartesian_coords(test)
        preds = [(name, model.predict(features)) for model, name in self.models]
        sums = self._combine([prediction for _, prediction in preds])
        return preds, sums
            
        

In [None]:
# Feature names handed to Ensemble, which stores the list without using it.
top_5_least_important = [
    'Количество импульсов',
    'Тип_измерения',
    '№ испытания',
    'Фаза Hor',
    'Фаза Ver',
]

train = pd.read_csv('mars-train-regr.csv')
# NOTE(review): the variable is called pub_tests but the file name says
# "private_test" — confirm which split is intended.
pub_tests = pd.read_csv('mars-private_test-reg.csv')
train_b = pd.read_csv('mars-train-class.csv')
test_b = pd.read_csv('mars-public_test-class.csv')

# Restrict the classification frames to the regression columns, then stack all
# three frames into one enlarged training set.
features = train.columns
train_b = train_b[features]
test_b = test_b[features]
train_plus = pd.concat([train, train_b, test_b], axis=0, ignore_index=True)

# (model, name) pairs; Ensemble.fit trains names starting with "cb" on a
# CatBoost Pool and all other models with fit(X, y).
# NOTE(review): Ensemble.fit passes no eval_set, so "early_stopping_rounds"
# presumably has no effect — confirm against the CatBoost documentation.
# NOTE(review): cb_2 and cb_3 share random_seed 1011 — confirm this is intended.
models = [
    (CatBoostRegressor(**{
        "iterations": 10000,
        "depth": 11,
        "learning_rate": 0.1,
        "loss_function": "RMSE",
        "grow_policy": "Lossguide",
        "early_stopping_rounds": 100,
        "verbose": False,
        "l2_leaf_reg": 1.4,
        "random_seed": 1010,
    }), "cb_1"),
    (CatBoostRegressor(**{
        "iterations": 10000,
        "depth": 11,
        "learning_rate": 0.1,
        "loss_function": "RMSE",
        "grow_policy": "SymmetricTree",
        "early_stopping_rounds": 100,
        "verbose": False,
        "l2_leaf_reg": 1.4,
        "random_seed": 1011,
    }), "cb_2"),
    (CatBoostRegressor(**{
        "iterations": 10000,
        "depth": 11,
        "learning_rate": 0.1,
        "loss_function": "RMSE",
        "grow_policy": "Depthwise",
        "early_stopping_rounds": 100,
        "verbose": False,
        "l2_leaf_reg": 1.4,
        "random_seed": 1011,
    }), "cb_3"),
    (RandomForestRegressor(n_estimators=1000, max_depth=19, n_jobs=-1, random_state=1010), "rf_1"),
    (RandomForestRegressor(n_estimators=1000, max_depth=21, n_jobs=-1, random_state=1010), "rf_2"),
    (RandomForestRegressor(n_estimators=1000, max_depth=20, n_jobs=-1, random_state=1010), "rf_3"),
]

# Pass the task by keyword: the fourth positional parameter of Ensemble is
# `line`, so a positional "regr" would be bound to the threshold instead.
ensemble = Ensemble(models, train_plus, top_5_least_important, task="regr")
scores, total_score = ensemble.fit()

    

In [5]:
# Hold-out R² of the averaged regression ensemble, scaled by 70 (see Ensemble.fit).
total_score

62.5958349756209

In [6]:
# The first three entries of `models` are the CatBoost regressors; write each
# one with CatBoost's own save_model, using the model's label as the file name.
for i, (model, name) in enumerate(models[:3]):
    model.save_model(name)

In [24]:
# Everything after the first three entries (the random forests, still paired
# with their names) is serialized with joblib into a single file.
with open("models_reg.joblib", mode="wb") as file:
    dump(value=models[3:], filename=file)

In [4]:
train = pd.read_csv('mars-train-class.csv')
pub_tests = pd.read_csv('mars-private_test-class.csv')

# Three CatBoost regressors that differ only in tree-growing policy, seed and
# label; their averaged output is thresholded by Ensemble for the class task.
models_class = [
    (
        CatBoostRegressor(
            iterations=10000,
            depth=11,
            learning_rate=0.1,
            loss_function="RMSE",
            grow_policy=grow_policy,
            early_stopping_rounds=100,
            verbose=False,
            l2_leaf_reg=1.4,
            random_seed=random_seed,
        ),
        label,
    )
    for grow_policy, random_seed, label in (
        ("Lossguide", 1010, "cb_class_1"),
        ("SymmetricTree", 1011, "cb_class_2"),
        ("Depthwise", 1011, "cb_class_3"),
    )
]

# NOTE(review): the ensemble is built from `train_plus`, which is defined in the
# regression cell, not from the `train` frame loaded above — confirm this is
# intended; the cell fails on a fresh kernel unless that cell has been run.
ensemble_class = Ensemble(models_class, train_plus, [], line=0.5, task='class')
F1_scores, score = ensemble_class.fit()

Обучение моделей: 100%|██████████████████████████████████████████████| 3/3 [18:30<00:00, 370.00s/it]


In [5]:
# Hold-out F1 of the thresholded, averaged ensemble, scaled by 30 (see Ensemble.fit).
score

29.985639061752035

In [7]:
# Write the three classification-task CatBoost models with CatBoost's own
# save_model, using each model's label as the file name.
for i, (model, name) in enumerate(models_class[:3]):
    model.save_model(name)