In [1]:
from os.path import isdir
RANDOM_STATE = 42
if isdir("/kaggle"):
    PATH = "/kaggle/input/icr-identify-age-related-conditions/"
    SUB_PATH = "/kaggle/working/"
else:  # Для запуска локально (когда все необходимые файлы в одной дирректории)
    PATH = "./"
    SUB_PATH = "./"

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import warnings

In [3]:
def get_sample_weight(y: pd.Series): # Веса для log_loss
    weights = [1 / (y.shape[0] - y.sum()), 1 / y.sum()]
    weights_sample = np.zeros(y.shape[0])
    weights_sample[y == 0] = weights[0]
    weights_sample[y == 1] = weights[1]
    return weights_sample

# Организационный момент
1. Класс, обучающий модель должен удовлетворять следующим условиям:
    1. В `.fit(X, y)` подается X без целевого значения, в y - целевое значение. Предполагается, что X **не предобработан**
    2. В `.predict_proba(X)` или `.predict(X)` подаются непредобработанные данные. В стекинге отдается предпочтение методу `.predict_proba`, хотя и `.predict` допустим, если невозможен `.predict_proba`
2. В главе "Модули с моделями" размещается код класса с подглавой для соответствующей модели (аналогично "CatBoost модель")
3. В словаре `MODELS` определенны не сами классы, а **объекты** соответствующих классов
4. Если предполагается тюнить модель, то гиперпараметры определяются в `__init__` класса

# Идеи и замечания

1. CatBoost лучше отрабатывает с неглубокими деревьями нежели в глубокими, так же на больших reg_lambda (>100) <font color="green">(kaggle 0.23)</font>
2. В данных существуют моды с одинаковым значением, что может свидетельствовать о том, что на месте этих значений были NaNы, заменим такие значения на NaNы
3. Максимальное отклонение признака, то есть если распределение признака $\xi_i$, то максимальное отклонение равно $\max_i \frac{|E \xi_i - x_i|}{D \xi_i}$

## Модули с моделями

### CatBoost модель

In [4]:
class NaNizeTransformer(TransformerMixin):  # Трансформер, который восстанавливает NaNы, которые были скрыты создателями оригинального датасета
    def __init__(self, diff=2, include=[], exclude=[]):
        super().__init__()
        self.diff = diff
        if len(set(include) & set(exclude)) != 0:
            raise ValueError("include and exclude has mutual elements")
        self.include = include
        self.exclude = exclude
    
    def fit(self, X: pd.DataFrame, y=None):
        self.nulls = {}
        for col in X.columns:
            if col in self.include or (col not in self.exclude and pd.api.types.is_numeric_dtype(X[col].dtype)):
                temp = X[col].value_counts().sort_values(ascending=False)
                if col in self.include or temp.shape[0] > 5 and temp.values[0] / temp.values[1] > self.diff:
                    self.nulls[col] = temp.index[0]
        return self
    
    def transform(self, X: pd.DataFrame):
        X_copy = X.copy(deep=True)
        for col, key in self.nulls.items():
            X_copy[col].replace(key, np.nan, inplace=True)
        return X_copy
    
    def inverse_transform(self, X: pd.DataFrame):
        X_copy = X.copy(deep=True)
        for col, key in self.nulls.items():
            X_copy[col].fillna(value=key)
        return X_copy
                

class CatBoostTransformer(TransformerMixin):
    def __init__(self, bad_features, random_state=RANDOM_STATE):
        self.random_state = random_state
        self.bad_features = bad_features
    
    def fit(self, X, y):
        X_trans = X.copy(deep=True)
        X_trans.columns = X_trans.columns.str.strip()
        self.naninput = NaNizeTransformer()
        X_trans = self.naninput.fit_transform(X_trans)
        for col in X_trans.columns:
            if pd.api.types.is_string_dtype(X_trans[col]):
                continue
            X_trans[col] = np.log1p(X_trans[col])
        self.mean0 = X_trans.drop(columns=['EJ', 'Id']).loc[y == 0].mean()
        self.var0 = X_trans.drop(columns=['EJ', 'Id']).loc[y == 0].std()
        return self
    
    def transform(self, X):
        X_trans = X.copy(deep=True)
        X_trans.columns = X_trans.columns.str.strip()
        X_trans.drop(columns=self.bad_features, inplace=True)
        X_trans = self.naninput.transform(X_trans)
        for col in X_trans.columns:
            if col == "EJ":
                continue
            X_trans[col] = np.log1p(X_trans[col])
        vardrop = ['EJ']
        comp = X_trans.drop(columns=vardrop)
        # cols = X_trans.columns.values
        comp = np.abs(comp - self.mean0) / self.var0
        X_trans['max_variance'] = np.nanmax(comp.values, axis=1).ravel()
        # X_trans['max_variance_name'] = cols[np.nanargmax(comp.values, axis=1).ravel()]
        return X_trans


class CatBoostModel(BaseEstimator):
    def __init__(self, random_state=RANDOM_STATE, verbose=False):
        super().__init__()
        self.random_state = random_state
        self.bad_features = ['Id']
        self.categorical = ["EJ"]
        self.verbose = verbose
    
    def fit(self, X, y):
        self.transformer = CatBoostTransformer(self.bad_features, random_state=self.random_state)
        self.weights = [1 / (X.shape[0] - y.sum()), 1 / y.sum()]
        X_trans = self.transformer.fit_transform(X, y)
        X_train, X_val, y_train, y_val = train_test_split(X_trans, y, test_size=0.1, random_state=RANDOM_STATE, stratify=y)
        CATBOOST_PARAMS = {'n_estimators': 1000000,
                           "eval_metric": "Logloss:use_weights=True",
                           "class_weights": self.weights,
                           'max_depth': 4,
                           'max_leaves': 7,
                           'min_child_samples': 9,
                           'reg_lambda': 640.3664246084949,
                           'grow_policy': 'Lossguide',
                           'random_seed': RANDOM_STATE}
        
        CATBOOST_FIT_PARAMS = {"eval_set": (X_val, y_val),
                               "early_stopping_rounds": 50,
                               "cat_features": self.categorical,
                               "use_best_model": True,
                               "verbose": self.verbose}
        if not self.verbose:
            CATBOOST_PARAMS['verbose'] = -1
        else:
            CATBOOST_FIT_PARAMS['metric_period'] = 100
        model = CatBoostClassifier(**CATBOOST_PARAMS)
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=FutureWarning)
            model.fit(X_train, y_train, **CATBOOST_FIT_PARAMS)
        self.model = model
        return self
    
    def predict(self, X):
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=FutureWarning)
            return self.model.predict(self.transformer.transform(X))
    
    def predict_proba(self, X):
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=FutureWarning)
            return self.model.predict_proba(self.transformer.transform(X))

### SVM модель

In [5]:
class SVMModel(BaseEstimator):
    def __init__(self, **model_params):
        super().__init__()
        self.model_params = model_params
    
    def fit(self, X: pd.DataFrame, y):
        self.nantransform = NaNizeTransformer()
        X_trans = self.nantransform.fit_transform(X, y)
        self.ej_encoder = LabelEncoder()
        X_trans['EJ'] = self.ej_encoder.fit_transform(X_trans['EJ'])
        self.model = Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', StandardScaler()), ('SVM', SVC(**self.model_params))])
        X_trans.drop(columns=['Id'], inplace=True)
        self.model.fit(X_trans, y)
        return self
    
    def predict_proba(self, X: pd.DataFrame):
        X_trans = self.nantransform.transform(X)
        X_trans['EJ'] = self.ej_encoder.transform(X_trans['EJ'])
        X_trans.drop(columns=['Id'], inplace=True)
        return self.model.predict_proba(X_trans)
        


### Случайный лес

In [6]:
class RandomForestModel(BaseEstimator):
    def __init__(self, **model_params):
        super().__init__()
        self.model_params = model_params
    
    def fit(self, X: pd.DataFrame, y):
        self.nantransform = NaNizeTransformer()
        X_trans = self.nantransform.fit_transform(X, y)
        X_trans = self.nantransform.inverse_transform(X_trans)
        self.ej_encoder = LabelEncoder()
        X_trans['EJ'] = self.ej_encoder.fit_transform(X_trans['EJ'])
        self.model = Pipeline(steps=[('imputer', SimpleImputer(strategy="constant", fill_value=0)), ('RandomForest', RandomForestClassifier(**self.model_params))])
        X_trans.drop(columns=['Id'], inplace=True)
        self.model.fit(X_trans, y)
        return self
    
    def predict_proba(self, X: pd.DataFrame):
        X_trans = self.nantransform.transform(X)
        X_trans = self.nantransform.inverse_transform(X_trans)
        X_trans['EJ'] = self.ej_encoder.transform(X_trans['EJ'])
        X_trans.drop(columns=['Id'], inplace=True)
        return self.model.predict_proba(X_trans)

## Метамодели

### Среднее

In [7]:
class StackingMean(BaseEstimator):
    def __init__(self):
        super().__init__()
    
    def fit(self, X=None, y=None):
        return self
    
    def predict_proba(self, X):
        return np.hstack([1 - np.mean(X.to_numpy(), axis=1).reshape([-1, 1]), np.mean(X.to_numpy(), axis=1).reshape([-1, 1])])

### Фиксированная логистическая регрессия

In [8]:
class FixedLogRegressor(BaseEstimator):
    def __init__(self, inter, weights):
        super().__init__()
        self.model = LogisticRegression()
        self.model.coef_ = weights
        self.model.intercept_ = inter
        self.model.classes_ = np.array([0, 1])
    
    def fit(self, X=None, y=None):
        return self
    
    def predict_proba(self, X):
        return self.model.predict_proba(X)

## Стекинг

In [9]:
class ModelStacker(BaseEstimator):
    def __init__(self, models, metamodel=None, random_state=RANDOM_STATE, outer_frac=0.2):
        """
            models - словарь: название модели - объект модели
            metamodel - объект метамодели
            Если в models всего одна модель, то metamodels игнорируется
        """
        super().__init__()
        self.models = models
        self.metamodel = metamodel
        self._random_state = random_state
        self._outer_frac = outer_frac
    
    def fit(self, X, y, inner=True, outer=True):
        """
            X, y - датасет
            inner - обучать ли внутренние модели
            outer - обучать ли метамодель
            outer_frac - доля объектов при обучении метамодели (если inner или outer = False, либо в models 1 модель, то этот параметр игнорируется)
            Если inner = False, то все внутренние модели должны быть обученны. Если в self.models всего 1 модель, то метамодель игнорируется
        """
        if inner and outer and len(self.models) > 1 and self._outer_frac > 0:
            X_inner, X_outer, y_inner, y_outer = train_test_split(X, y, test_size=self._outer_frac, stratify=y)
            for model in self.models:
                self.models[model].fit(X_inner, y_inner)
            temp = {}
            for model in self.models:
                if hasattr(self.models[model], "predict_proba"):
                    temp[model] = self.models[model].predict_proba(X_outer)[:, 1]
                else:
                    temp[model] = self.models[model].predict(X_outer)
            temp = pd.DataFrame(temp)
            self.metamodel.fit(temp, y_outer)
            
        elif inner and (len(self.models) == 1 or self._outer_frac == 0):
            for model in self.models:
                self.models[model].fit(X, y)
        elif len(self.models) > 1 and outer:
            temp = {}
            for model in self.models:
                if hasattr(self.models[model], "predict_proba"):
                    temp[model] = self.models[model].predict_proba(X)[:, 1]
                else:
                    temp[model] = self.models[model].predict(X)
            temp = pd.DataFrame(temp)
            self.metamodel.fit(temp, y)
        return self
    
    def predict_proba(self, X):
        if len(self.models) == 1:
            for model in self.models:
                return self.models[model].predict_proba(X)
        else:
            temp = {}
            for model in self.models:
                if hasattr(self.models[model], "predict_proba"):
                    temp[model] = self.models[model].predict_proba(X)[:, 1]
                else:
                    temp[model] = self.models[model].predict(X)
            return self.metamodel.predict_proba(pd.DataFrame(temp))

## Обучение

In [10]:
X = pd.read_csv(PATH + 'train.csv')
y = X.Class
X.drop(columns=['Class'], inplace=True)
test = pd.read_csv(PATH + "test.csv")

In [11]:
MODELS = {
          "CatBoost": CatBoostModel(random_state=RANDOM_STATE, verbose=True),
          "SVM": SVMModel(**{'C': 43.01520973313692,
                             'kernel': 'rbf',
                             'degree': 1,
                             'class_weight': None,
                             'probability': True}),
          "RandomForest": RandomForestModel(**{'n_estimators': 52,
                                               'criterion': 'entropy',
                                               'max_depth': 8,
                                               'min_samples_leaf': 8,
                                               'class_weight': 'balanced'})
        }

In [12]:
model = ModelStacker(MODELS, 
                     metamodel=FixedLogRegressor(np.array([-2.54545349]), np.array([[2.76411647, 2.48100103, 2.66959559]])), 
                     random_state=RANDOM_STATE, outer_frac=0)
model.fit(X, y)
ans = model.predict_proba(test)



0:	learn: 0.6843782	test: 0.6849267	best: 0.6849267 (0)	total: 56.8ms	remaining: 15h 47m 2s
100:	learn: 0.3449253	test: 0.4276315	best: 0.4276315 (100)	total: 371ms	remaining: 1h 1m 8s
200:	learn: 0.2574482	test: 0.3741315	best: 0.3741315 (200)	total: 688ms	remaining: 57m 1s
300:	learn: 0.2107060	test: 0.3509243	best: 0.3505165 (296)	total: 1.03s	remaining: 56m 54s
400:	learn: 0.1803666	test: 0.3401527	best: 0.3401527 (400)	total: 1.35s	remaining: 55m 59s
500:	learn: 0.1581670	test: 0.3306029	best: 0.3306029 (500)	total: 1.66s	remaining: 55m 11s
600:	learn: 0.1414376	test: 0.3225213	best: 0.3220734 (593)	total: 1.98s	remaining: 54m 44s
700:	learn: 0.1278044	test: 0.3141805	best: 0.3136874 (696)	total: 2.29s	remaining: 54m 20s
800:	learn: 0.1173923	test: 0.3073774	best: 0.3073774 (800)	total: 2.59s	remaining: 53m 56s
900:	learn: 0.1087084	test: 0.3002718	best: 0.3002718 (900)	total: 2.9s	remaining: 53m 40s
1000:	learn: 0.1008553	test: 0.2979944	best: 0.2978670 (994)	total: 3.21s	remaini



In [13]:
submission = pd.read_csv(PATH + 'sample_submission.csv')
submission[['class_0', 'class_1']] = ans

In [14]:
submission.to_csv(SUB_PATH + 'submission.csv', index=False)

# Анализ данных

## Общие замечания

1. В train.csv есть NaNы в EL, BQ, CC, FS, CB, FL, FC, DU, GL. ~~При этом в test.csv NaNов нет,~~ настоящий тест скрыт
2. Сильно коррелированные признаки DV и CR, BC и BZ
3. У данных часто существуют моды с единственным значением. 

## train.csv + greeks.csv

In [15]:
import plotly.express as px
X_train_joined = pd.read_csv(PATH + 'train.csv')
temp = pd.read_csv(PATH + 'greeks.csv')
X_train_joined = X_train_joined.merge(temp, how='left', on='Id')

In [16]:
temp = 0
X_train_joined.dtypes[temp:temp+20]

Id      object
AB     float64
AF     float64
AH     float64
AM     float64
AR     float64
AX     float64
AY     float64
AZ     float64
BC     float64
BD     float64
BN     float64
BP     float64
BQ     float64
BR     float64
BZ     float64
CB     float64
CC     float64
CD     float64
CF     float64
dtype: object

In [17]:
X_train_joined.to_numpy()

array([['000ff2bfdfe9', 0.209377, 3109.03329, ..., 'G', 'D', '3/19/2019'],
       ['007255e47698', 0.145282, 978.76416, ..., 'M', 'B', 'Unknown'],
       ['013f2bd269f5', 0.47003, 2635.10654, ..., 'M', 'B', 'Unknown'],
       ...,
       ['fd8ef6377f76', 0.4273, 2459.1072, ..., 'M', 'B', '7/24/2019'],
       ['fe1942975e40', 0.363205, 1263.53524, ..., 'M', 'B', '1/31/2019'],
       ['ffcca4ded3bb', 0.482849, 2672.53426, ..., 'M', 'B', 'Unknown']],
      dtype=object)

In [18]:
X_train_joined.columns = X_train_joined.columns.str.strip()

In [19]:
X_train_joined.Epsilon.replace("Unknown", np.nan, inplace=True)
X_train_joined.Epsilon = pd.to_datetime(X_train_joined.Epsilon)

In [20]:
X_train_joined

Unnamed: 0,Id,AB,AF,AH,AM,AR,AX,AY,AZ,BC,...,GF,GH,GI,GL,Class,Alpha,Beta,Gamma,Delta,Epsilon
0,000ff2bfdfe9,0.209377,3109.03329,85.200147,22.394407,8.138688,0.699861,0.025578,9.812214,5.555634,...,2003.810319,22.136229,69.834944,0.120343,1,B,C,G,D,2019-03-19
1,007255e47698,0.145282,978.76416,85.200147,36.968889,8.138688,3.632190,0.025578,13.517790,1.229900,...,27981.562750,29.135430,32.131996,21.978000,0,A,C,M,B,NaT
2,013f2bd269f5,0.470030,2635.10654,85.200147,32.360553,8.138688,6.732840,0.025578,12.824570,1.229900,...,13676.957810,28.022851,35.192676,0.196941,0,A,C,M,B,NaT
3,043ac50845d5,0.252107,3819.65177,120.201618,77.112203,8.138688,3.685344,0.025578,11.053708,1.229900,...,2094.262452,39.948656,90.493248,0.155829,0,A,C,M,B,NaT
4,044fb8a146ec,0.380297,3733.04844,85.200147,14.103738,8.138688,3.942255,0.054810,3.396778,102.151980,...,8524.370502,45.381316,36.262628,0.096614,1,D,B,F,B,2020-03-25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,fd3dafe738fd,0.149555,3130.05946,123.763599,9.513984,13.020852,3.499305,0.077343,8.545512,2.804172,...,8095.932828,24.640462,69.191944,21.978000,0,A,B,M,B,2020-09-13
613,fd895603f071,0.435846,5462.03438,85.200147,46.551007,15.973224,5.979825,0.025882,12.622906,3.777550,...,3085.308063,29.648928,124.808872,0.145340,0,A,B,M,B,2020-09-08
614,fd8ef6377f76,0.427300,2459.10720,130.138587,55.355778,10.005552,8.070549,0.025578,15.408390,1.229900,...,6474.652866,26.166072,119.559420,21.978000,0,A,C,M,B,2019-07-24
615,fe1942975e40,0.363205,1263.53524,85.200147,23.685856,8.138688,7.981959,0.025578,7.524588,1.229900,...,1965.343176,25.116750,37.155112,0.184622,0,A,C,M,B,2019-01-31


In [21]:
X_train_joined.isna().sum().sort_values(ascending=False)[:12]

Epsilon    144
BQ          60
EL          60
CC           3
FS           2
CB           2
DU           1
FL           1
GL           1
FC           1
Class        0
FD           0
dtype: int64

Обнаружение пиков

In [22]:
["AF", "AH", "AM", "AR", "AX", "AY", "AZ", "BC", "BQ", "BR", "BZ", "CB", "CD", "CF", "CL", "CR", "CS", "CU", "CW", "DF", "DH", "DI", "DL", "DU", "DV", "DY", "EB", "EE", "EG", "EH", "EL", "EP", "EU", "FD", "FI", "FL", "FR", "FS", "GE", "GF", "GH", "GL"]

['AF',
 'AH',
 'AM',
 'AR',
 'AX',
 'AY',
 'AZ',
 'BC',
 'BQ',
 'BR',
 'BZ',
 'CB',
 'CD',
 'CF',
 'CL',
 'CR',
 'CS',
 'CU',
 'CW',
 'DF',
 'DH',
 'DI',
 'DL',
 'DU',
 'DV',
 'DY',
 'EB',
 'EE',
 'EG',
 'EH',
 'EL',
 'EP',
 'EU',
 'FD',
 'FI',
 'FL',
 'FR',
 'FS',
 'GE',
 'GF',
 'GH',
 'GL']

In [23]:
col = "DA"
temp = X_train_joined[col].value_counts().sort_values(ascending=False)
peak = temp.index[0]
temp.iloc[:20]

90.58636    2
55.94960    2
47.00620    2
39.03668    2
47.67744    2
48.30988    2
31.17774    1
70.81970    1
47.27586    1
74.06532    1
55.22404    1
19.21570    1
63.21684    1
76.77356    1
62.34384    1
6.90640     1
55.11928    1
26.97376    1
33.67646    1
57.91288    1
Name: DA, dtype: int64

In [24]:
is_log = True
if is_log:
    temp = px.histogram(np.log1p(X_train_joined[col]))
else:
    temp = px.histogram(X_train_joined[col])
temp.show()

In [25]:
X_train_joined_feat = CatBoostTransformer(['Id']).fit_transform(X, y)

In [26]:
px.imshow(np.abs(X_train_joined_feat.corr(numeric_only=True)), height=1000)

In [27]:
y.corr(X_train_joined_feat.max_variance)

0.3383272791753242

In [28]:
X_train_joined_feat

Unnamed: 0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,...,FL,FR,FS,GB,GE,GF,GH,GI,GL,max_variance
0,0.190105,8.042389,,3.152497,,0.530546,,2.380676,1.880325,8.325448,...,2.116034,1.007429,0.090592,2.512776,,7.603305,3.141400,4.260352,0.113635,3.997082
1,0.135651,6.887312,,3.636767,,1.533030,,2.675375,,8.612127,...,,,0.450395,2.331435,,10.239337,3.405702,3.500499,,2.258235
2,0.385283,7.877058,,3.507374,,2.045476,,2.626447,,8.544182,...,2.164421,0.680852,0.787921,3.639631,4.495461,9.523541,3.368083,3.588857,0.179769,2.309937
3,0.224828,8.248176,4.797455,4.358146,,1.544439,,2.489372,,8.335834,...,1.963211,,0.250343,2.971930,4.423850,7.647434,3.712319,4.516265,0.144817,4.599578
4,0.322299,8.225248,,2.714942,,1.597822,0.053361,,4.636203,8.653424,...,2.214088,3.902000,0.115036,2.856972,4.991180,9.050802,3.836897,3.617991,0.092228,16.533265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
612,0.139375,8.049127,4.826421,2.352706,2.640546,1.503923,0.074498,2.256071,1.336098,8.332954,...,,0.815772,,2.299292,5.385176,8.999241,3.244172,4.251234,,2.230363
613,0.361754,8.605760,,3.861803,2.831637,1.943024,0.025553,2.611753,1.563928,8.640309,...,2.417979,0.807529,0.355363,3.608114,6.210588,8.034731,3.422598,4.834764,0.135702,2.228703
614,0.355785,7.807960,4.876255,4.031685,2.398400,2.205033,,2.797793,,8.680991,...,,,,3.042716,4.866741,8.775805,3.301969,4.792143,,3.747275
615,0.309839,7.142460,,3.206230,,2.195218,,2.142955,,8.416016,...,2.327960,0.580896,0.513139,3.242377,,7.583931,3.262577,3.641660,0.169423,1.733931


In [29]:
X_train_joined.Alpha.value_counts()

A    509
B     61
G     29
D     18
Name: Alpha, dtype: int64

In [30]:
X_train_joined.nunique().sort_values()[:20]

EJ           2
Class        2
Beta         3
Alpha        4
Delta        4
Gamma        8
DV          39
BN          53
BZ         115
CL         123
EH         127
AR         130
CH         135
DF         137
AY         148
FS         161
DH         191
Epsilon    197
AB         217
AH         227
dtype: int64

D - деменция?

In [31]:
np.exp(X_train_joined_feat.median(numeric_only=True).sort_values()[30:40])

DN    26.248800
GH    31.608946
EL    34.455029
CS    36.250955
FC    37.393960
CW    38.026504
EU    39.316746
GI    42.007968
CB    47.138618
DA    50.180940
dtype: float64

In [32]:
px.histogram(np.exp(X_train_joined_feat["EL"]))

In [33]:
X_train_joined.EJ.value_counts()

B    395
A    222
Name: EJ, dtype: int64

## test.csv

In [34]:
X_test = pd.read_csv(PATH + "test.csv")

In [35]:
X_test.isna().sum().sort_values(ascending=False)[:20]

Id     0
DH     0
DL     0
DN     0
DU     0
DV     0
DY     0
EB     0
EE     0
EG     0
EH     0
EJ     0
EL     0
EP     0
EU     0
FC     0
FD     0
FE     0
FI     0
FL     0
dtype: int64

In [36]:
raise ValueError

ValueError: 

## Калибровка логистической регрессии

In [None]:
model = ModelStacker(MODELS, 
                     metamodel=LogisticRegression(class_weight='balanced'), 
                     random_state=RANDOM_STATE, outer_frac=0.25)
model.fit(X, y)
print(f"np.{model.metamodel.intercept_.__repr__()}, np.{model.metamodel.coef_.__repr__()}")

# Тюнинг (CatBoost)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import optuna
import warnings
RANDOM_STATE_FRAC = 102
X = pd.read_csv(PATH + "train.csv")
y = X.Class
X.drop(columns=['Class'], inplace=True)
X = CatBoostTransformer(['Id']).fit_transform(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=RANDOM_STATE_FRAC, stratify=y)
weights_sample = np.zeros(X_train.shape[0])  # Для весов в log_loss
weights = [1 / (X.shape[0] - y.sum()), 1 / y.sum()]
weights_sample[y_train == 0] = weights[0]
weights_sample[y_train == 1] = weights[1]
categorical = ['EJ']
def func(trial: optuna.Trial):
    sample_params = {
        'n_estimators': 1000000,
        "eval_metric": "Logloss:use_weights=True",
        "class_weights": weights,
        'max_depth': trial.suggest_int("max_depth", 3, 11),
        'max_leaves': trial.suggest_int("max_leaves", 5, 200, log=True),
        'min_child_samples': trial.suggest_int("min_child_samples", 1, 300, log=True),
        'reg_lambda': trial.suggest_float("reg_lambda", 1, 1000.0, log=True),
        'grow_policy': 'Lossguide',
        'verbose': -1
    }
    fold = StratifiedKFold(shuffle=True, random_state=RANDOM_STATE_FRAC)
    metric_list = []  # Значение log_loss с каждого фолда
    temp = []
    for param in ['max_depth', 'max_leaves', 'min_child_samples', 'reg_lambda']:
        temp.append(f"{param}={sample_params[param]}")
    print(', '.join(temp))
    for i, (tr_id, tst_id) in enumerate(fold.split(X_train, y_train)):  # Цикл с фолдами
        print(f'Fold {i}')
        model = CatBoostClassifier(**sample_params)
        X_val, X_es, y_val, y_es = train_test_split(X_train.iloc[tr_id], y_train.iloc[tr_id], test_size=0.15, random_state=RANDOM_STATE_FRAC, shuffle=True, stratify=y_train.iloc[tr_id])
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', category=FutureWarning)
            model.fit(X_val, y_val,
                    eval_set=(X_es, y_es),
                    early_stopping_rounds=50,
                    cat_features=categorical,
                    use_best_model=True,
                    verbose=False)
            temp = model.predict_proba(X_train.iloc[tst_id])[:, 1]
        metric_list.append(log_loss(y_train.iloc[tst_id], temp, eps=1e-15, sample_weight=weights_sample[tst_id]))
    return np.mean(metric_list)

In [None]:
opt = optuna.create_study(sampler=optuna.samplers.TPESampler(n_startup_trials=100), direction='minimize')

In [None]:
opt.optimize(func, n_trials=150)

In [None]:
optuna.visualization.plot_slice(opt)

In [None]:
opt.best_params

# Оптимизация (Любая удовлетворяющая требованиям функция)

In [None]:
import optuna
from sklearn.metrics import log_loss
def tune_model(X, y,
               model_class,
               study_init_params={},
               optimize_params={},
               model_params: dict={},
               fit_params={},
               random_state=None,
               test_size=0.1):
    """
        X, y - данные для тюнинга, подобные train.csv
        model_class - класс модели для тюнинга
        study_init_params - параметры для инициализации optuna.Study
        optimize_params - параметры для study.optimize()
        model_params - словарь вида "имя гиперпараметра: кортеж", где кортеж имеет вид в зависимости от действия перебора:
            ('const', param) - если гиперпараметр всегда константа param - любой объект
            ('int', a, b, is_log) - если перебирается целое число в пределах [a, b], is_log - булево значение
            ('float', a, b, is_log) - если перебирается float в пределах [a, b], is_log - булево значение
            ('cat', [...]) - если значение выбирается из конечного множества вариантов
        fit_params - параметры для fit
        random_state - сид для разбиения данных на валидацию и train
        test_size - доля валидационной выборки

        Возвращается optuna.Study объект
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
    weights_sample = get_sample_weight(y_val)
    def func(trial: optuna.Trial):
        _model_params = {}
        for name, contents in model_params.items():
            if contents[0] == 'const':
                _model_params[name] = trial.suggest_categorical(name, [contents[1]])
            if contents[0] == 'int':
                _model_params[name] = trial.suggest_int(name, contents[1], contents[2], log=contents[3])
            if contents[0] == 'float':
                _model_params[name] = trial.suggest_float(name, contents[1], contents[2], log=contents[3])
            if contents[0] == 'cat':
                _model_params[name] = trial.suggest_categorical(name, contents[1])
        model = model_class(**_model_params)
        model.fit(X_train, y_train, **fit_params)
        if hasattr(model, "predict_proba"):
            temp = model.predict_proba(X_val)[:, 1].ravel()
        else:
            temp = model.predict(X_val)
        return log_loss(y_val, temp, eps=1e-15, sample_weight=weights_sample)
    opt = optuna.create_study(**study_init_params)
    opt.optimize(func, **optimize_params)
    return opt


In [None]:
temp = tune_model(X, y,
                  SVMModel,
                  study_init_params={'sampler': optuna.samplers.TPESampler(n_startup_trials=700), 
                                     'direction': 'minimize'},
                  optimize_params={'n_trials': 1000},
                  model_params={'C': ('float', 0.001, 100, True),
                                'kernel': ('cat', ['poly', 'rbf', 'sigmoid']),
                                'degree': ('int', 1, 5, False),
                                'class_weight': ('cat', ['balanced', None]),
                                'probability': ('const', True)},
                 random_state=30)

In [None]:
temp = tune_model(X, y,
                  RandomForestModel,
                  study_init_params={'sampler': optuna.samplers.TPESampler(n_startup_trials=700), 
                                     'direction': 'minimize'},
                  optimize_params={'n_trials': 1000},
                  model_params={'n_estimators': ('int', 10, 100, True),
                                'criterion': ('cat', ['gini', 'entropy']),
                                'max_depth': ('int', 5, 9, False),
                                'min_samples_leaf': ('int', 2, 40, True),
                                'class_weight': ('cat', ['balanced', 'balanced_subsample'])},
                 random_state=30)

In [None]:
tune_model(X, y,
           ModelStacker,
           study_init_params={'sampler': optuna.samplers.TPESampler(n_startup_trials=30), 
                                     'direction': 'minimize'},
           optimize_params={'n_trials': 50},
           model_params={'models': ('const', {"CatBoost": CatBoostModel(random_state=RANDOM_STATE, verbose=False),
                                              "SVM": SVMModel(**{'C': 43.01520973313692,
                                                                 'kernel': 'rbf',
                                                                 'degree': 1,
                                                                 'class_weight': None,
                                                                 'probability': True})}),
                         'metamodel': ('const', RandomForestClassifier()),
                         'random_state': ('const', RANDOM_STATE),
                         'outer_frac': ('float', 0.05, 0.9, False)
                        })

In [None]:
optuna.visualization.plot_slice(temp)

In [None]:
temp.best_params

# KFold валидация (Любой класс модели)

In [None]:
from sklearn.model_selection import StratifiedKFold
from copy import deepcopy
from sklearn.metrics import log_loss
def kfoldscore(X: pd.DataFrame, y: pd.DataFrame,
               model,
               folds=10,
               random_state=42):
    score_list = []
    for tr_ind, tst_ind in StratifiedKFold(folds, shuffle=True, random_state=random_state).split(X, y):
        temp = deepcopy(model)
        temp.fit(X.iloc[tr_ind], y.iloc[tr_ind])
        score_list.append(log_loss(y.iloc[tst_ind], temp.predict_proba(X.iloc[tst_ind])[:, 1], 
                                   eps=1e-15, 
                                   sample_weight=get_sample_weight(y.iloc[tst_ind])))
    print(f"Mean = {np.mean(score_list)}, min = {min(score_list)}, max = {max(score_list)}")



In [None]:
kfoldscore(X, y, SVMModel(**{'C': 43.01520973313692,
 'kernel': 'rbf',
 'degree': 1,
 'class_weight': None,
 'probability': True}), folds=15)

In [None]:
kfoldscore(X, y, CatBoostModel(random_state=RANDOM_STATE, verbose=False), folds=15)

In [None]:
kfoldscore(X, y, RandomForestModel(), folds=15)

In [None]:
kfoldscore(X, y, ModelStacker(MODELS, 
                     metamodel=FixedLogRegressor(np.array([-2.44671242]), np.array([[3.09712072, 1.98385503, 1.9165799 ]])), 
                     random_state=RANDOM_STATE, outer_frac=0), folds=15)

In [None]:
kfoldscore()

In [None]:
kfoldscore(X, y, CatBoostModel(random_state=RANDOM_STATE, verbose=False), folds=15)