In [132]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from lightgbm import LGBMClassifier
import random
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score
import seaborn as sns
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, auc, f1_score
import tqdm
random.seed(42)
np.random.seed(42)

In [2]:
!pip install -U lightautoml

Looking in links: /kaggle/input/lightautoml-038-dependecies
Processing /kaggle/input/lightautoml-038-dependecies/lightautoml-0.3.8-py3-none-any.whl
Processing /kaggle/input/lightautoml-038-dependecies/AutoWoE-1.3.2-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/cmaes-0.10.0-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/joblib-1.2.0-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/json2html-1.3.0.tar.gz (from lightautoml==0.3.8)
  Preparing metadata (setup.py) ... [?25ldone
[?25hProcessing /kaggle/input/lightautoml-038-dependecies/lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/poetry_core-1.8.1-py3-none-any.whl (from

In [133]:
import random
import torch
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import os
def seed_everything(seed=42):
    """Seed the random number generators used in this notebook.

    Seeds Python's `random`, NumPy's global RNG and PyTorch (CPU and every
    CUDA device), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Exported for child processes only: hash randomisation of the interpreter
    # that is already running is fixed at start-up and is not affected by this.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (safe no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run; disable it
    # so that the deterministic flag above is actually effective.
    torch.backends.cudnn.benchmark = False
seed_everything()

In [134]:
# Load the training parquet file from the Kaggle input directory.
df = pd.read_parquet('/kaggle/input/purple-hack/train_ai_comp_final_dp.parquet')

In [135]:
# Load the test parquet file that the submission is produced for.
test = pd.read_parquet('/kaggle/input/purple-hack-with-test/test_sber.parquet')

In [136]:
# Shrink the memory footprint of `df` by narrowing column dtypes in place:
# int64 -> int32, float64 -> float32, low-cardinality object -> category.
int32_limits = np.iinfo(np.int32)
for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == 'int64':
        # Narrow only when every value fits: a plain astype('int32') can wrap
        # out-of-range values around silently instead of raising.
        if df[col].min() >= int32_limits.min and df[col].max() <= int32_limits.max:
            df[col] = df[col].astype('int32')
    elif col_dtype == 'float64':
        df[col] = df[col].astype('float32')
    elif col_dtype == 'object':
        # Convert to category when fewer than half of the rows hold distinct
        # values (NaN counts as a value, as with .unique()). The length check
        # avoids a ZeroDivisionError on an empty frame.
        if len(df[col]) and df[col].nunique(dropna=False) / len(df[col]) < 0.5:
            df[col] = df[col].astype('category')


In [137]:
# Select features to drop because their absolute correlation with another
# feature reaches the threshold (0.95 by default).
def get_correlated_feats(corr_matrix, feat_stats, greater_is_better=True, corr_threshold=0.95):
    """Return the names of features flagged as redundant by a greedy scan.

    Columns of `corr_matrix` are visited in order. For every column that has
    not been flagged yet, the group of index labels whose absolute correlation
    with it is >= `corr_threshold` is formed (this includes already flagged
    labels). Groups of a single label are ignored.

    Args:
        corr_matrix: Square DataFrame of pairwise correlations.
        feat_stats: Optional Series of per-feature scores indexed by feature
            name. When None, every group member except the first (in index
            order) is flagged.
        greater_is_better: With `feat_stats`, keep the member with the largest
            absolute score when True, the smallest when False. Ties for the
            best score keep only the first tied member.
        corr_threshold: Absolute correlation at or above which two features
            are treated as duplicates.

    Returns:
        List of flagged feature names, in `corr_matrix` column order.
    """
    feature_names = corr_matrix.columns.to_list()
    drop_votes = dict.fromkeys(feature_names, 0)

    for name in tqdm.tqdm(feature_names, desc='Get correlated features'):
        if drop_votes[name] != 0:
            continue

        group = corr_matrix.index.values[np.abs(corr_matrix[name]) >= corr_threshold]
        if len(group) <= 1:
            continue

        if feat_stats is None:
            losers = group[1:]
        else:
            magnitudes = np.abs(feat_stats.loc[group])
            if greater_is_better:
                is_loser = magnitudes < magnitudes.max()
            else:
                is_loser = magnitudes > magnitudes.min()

            losers = magnitudes[is_loser].index.to_list()
            tied_best = magnitudes[~is_loser].index.to_list()
            # Several members share the best score: keep the first one only.
            for extra in tied_best[1:]:
                drop_votes[extra] += 1

        for loser in losers:
            drop_votes[loser] += 1

    return [name for name, votes in drop_votes.items() if votes > 0]

In [138]:
# Work on a copy so the loaded frame `df` keeps all of its columns.
clean_df = df.copy()

# NOTE: despite the name, this is the per-column share of values equal to 0,
# not the share of NaNs.
nan_percentage = (clean_df == 0).mean()
mostly_zero = nan_percentage > 0.95
cols_to_drop = nan_percentage.index[mostly_zero].tolist()

# Identifier/target/service columns plus every column that is >95% zeros.
ignore_features = ['id', 'target', 'sample_ml_new', 'feature756'] + cols_to_drop
print(len(ignore_features))
clean_df = clean_df.drop(columns=ignore_features)

# Pairwise correlation between the remaining columns (columns are the variables).
corr_mx = pd.DataFrame(
    np.corrcoef(clean_df.values, rowvar=False),
    index=clean_df.columns,
    columns=clean_df.columns,
)

194


In [139]:
# Flag redundant members of each group of features correlated at >= 0.95.
# With greater_is_better=False the member with the smallest share of zeros
# (`nan_percentage`) in a group is the one that is kept.
corr_feats = get_correlated_feats(corr_mx, feat_stats=nan_percentage, greater_is_better=False, corr_threshold=0.95)
len(corr_feats)

Get correlated features: 100%|██████████| 886/886 [00:00<00:00, 3107.48it/s]


275

In [140]:
# Remove the flagged correlated features and report the shape of the raw frame
# ("Было" = before) against the shape of the filtered frame ("Стало" = after).
df_no_corr = clean_df.drop(columns=corr_feats)
print("Было:", df.shape)
print("Стало:", df_no_corr.shape)

Было: (519615, 1079)
Стало: (519615, 611)


In [141]:
# Feature names that survived the zero-share and correlation filters.
final_feats = df_no_corr.columns

# Build the design matrix from the (dtype-narrowed) `df` and the label vector.
# NOTE(review): 'feature642' is excluded here without a recorded reason — confirm.
X = df[final_feats].drop(columns = ['feature642'])
y = df['target']

In [143]:
def add_engineered_features(frame):
    """Return a copy of `frame` with the three hand-crafted columns appended.

    Added columns:
        sin341: sine of `feature341`.
        log940: log(1 + `feature940`).
        new1:   product of `feature1004` and `feature994`.

    Using `assign` builds a new frame instead of writing into `frame`, so train
    and test go through exactly the same code and no column is set on a frame
    that pandas may consider a slice of another one.

    Args:
        frame: DataFrame containing `feature341`, `feature940`, `feature1004`
            and `feature994`.

    Returns:
        A new DataFrame; existing columns with the same names are overwritten.
    """
    return frame.assign(
        sin341=np.sin(frame['feature341']),
        log940=np.log1p(frame['feature940']),
        new1=frame['feature1004'] * frame['feature994'],
    )


X = add_engineered_features(X)
test = add_engineered_features(test)

  X['sin341'] = np.sin(X['feature341'])
  X['log940'] = np.log1p(X['feature940'])
  X['new1'] = (X['feature1004'] * X['feature994'])


In [144]:
# Give features and labels the same fresh default index, discarding the old one.
X = X.reset_index(drop = True)
y = y.reset_index(drop = True)

In [145]:
from sklearn.preprocessing import PolynomialFeatures

# Columns expanded into a bias term, the columns themselves and their pairwise
# products. The order matters: the generated `poly_<i>` names are positional.
POLY_SOURCE_COLS = [
    'feature1004', 'feature318', 'feature341', 'feature994', 'feature952',
    'feature944', 'feature320', 'feature1000', 'feature943', 'feature993',
    'new1', 'sin341', 'log940',
]


def build_poly_frame(frame, transformer):
    """Expand `POLY_SOURCE_COLS` of `frame` with an already fitted transformer.

    Args:
        frame: DataFrame holding every column listed in `POLY_SOURCE_COLS`.
        transformer: Fitted `PolynomialFeatures` instance.

    Returns:
        DataFrame with columns `poly_0 ... poly_{k-1}` that carries the index
        of `frame`, so a column-wise concat lines the rows up one-to-one even
        when `frame` does not have a default 0..n-1 index.
    """
    values = transformer.transform(frame[POLY_SOURCE_COLS])
    names = [f'poly_{i}' for i in range(values.shape[1])]
    return pd.DataFrame(values, columns=names, index=frame.index)


# Fit once on the training frame and reuse the same transformer for test.
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit(X[POLY_SOURCE_COLS])

X_poly = build_poly_frame(X, poly)
X = pd.concat([X, X_poly], axis=1)

test_poly = build_poly_frame(test, poly)
test = pd.concat([test, test_poly], axis=1)

In [146]:
# Fixed list of columns kept for modelling: raw `feature*` columns, the
# engineered `sin341` / `log940` / `new1` columns and positional `poly_<i>`
# columns. NOTE(review): how this subset was chosen is not recorded here.
feats2 = [
    'feature208', 'feature310', 'poly_63', 'feature998', 'poly_74', 
    'feature534', 'poly_53', 'feature950', 'poly_76', 'feature992', 
    'poly_14', 'feature943', 'feature357', 'feature341', 'feature920', 
    'feature944', 'feature572', 'feature210', 'poly_19', 'feature952', 
    'feature309', 'feature990', 'feature817', 'feature945', 'feature999',
    'feature156', 'poly_81', 'poly_11', 'feature940', 'feature342', 'feature187', 
    'feature191', 'feature1', 'poly_1', 'feature993', 'feature35', 'poly_58', 
    'feature356', 'poly_7', 'feature444', 'feature320', 'feature192', 'poly_88', 
    'poly_83', 'feature928', 'feature193', 'feature559', 'poly_46', 'feature948', 
    'feature101', 'poly_4', 'poly_31', 'feature1056', 'poly_5', 'feature994', 
    'feature688', 'feature941', 'feature1000', 'feature139', 'poly_34', 'poly_17', 
    'feature494', 'feature869', 'feature861', 'feature190', 'feature898', 'feature551', 
    'feature935', 'feature350', 'feature939', 'poly_57', 'sin341', 'feature922', 'poly_23', 
    'feature287', 'feature930', 'feature195', 'feature862', 'poly_65', 'feature543', 
    'feature349', 'feature532', 'feature713', 'feature212', 'feature946', 'feature782', 
    'poly_79', 'poly_82', 'poly_3', 'poly_55', 'feature936', 'poly_71', 'poly_56', 
    'feature1036', 'feature985', 'poly_78', 'poly_28', 'feature758', 'feature472', 
    'feature1003', 'feature997', 'poly_22', 'feature318', 'poly_36', 'poly_50', 'poly_27', 
    'feature1002', 'feature112', 'feature942', 'feature988', 'poly_52', 'feature1069', 
    'feature1004', 'new1', 'feature270', 'feature951', 'poly_77', 'feature43', 'poly_25', 
    'log940', 'feature435', 'feature548', 'poly_2', 'poly_29', 'feature989', 'feature546', 
    'feature949', 'feature94', 'feature47', 'poly_30', 'feature497', 'feature128', 'poly_68'
]
# Restrict train and test to the same columns, in the same order.
X = X[feats2]
test = test[feats2]

In [149]:
# Per-network training settings, passed to the preset under the keys '0' and '1'.
# NOTE(review): presumably '0' configures 'denselight' and '1' configures
# 'autoint' (matching the order in `use_algos` and the save paths) — confirm.
denselight_params = {
    'bs': 1024,
    'lr': 0.0006672367170464204,
    'weight_decay': 2.9204338471814107e-05,
    'weight_decay_bin': 1,
    'freeze_defaults': True,
    'n_epochs': 15,
    'path_to_save': '/kaggle/working/denselight',
}
autoint_params = {
    'bs': 1024,
    'lr': 1e-3,
    'freeze_defaults': True,
    'n_epochs': 10,
    'path_to_save': '/kaggle/working/autoint',
}

# LightAutoML preset for a binary task scored by AUC and trained with logloss,
# restricted to the two neural-network algorithms listed in `use_algos`.
automl_production = TabularAutoML(
    task=Task('binary', metric='auc', loss='logloss'),
    reader_params={'n_jobs': 4, 'cv': 2, 'random_state': 42, 'advanced_roles': False},
    debug=True,
    general_params={'use_algos': [['denselight', 'autoint']]},
    nn_params={'0': denselight_params, '1': autoint_params},
)


In [150]:
import joblib
class SoftVotingClassifier():
    """Blend positive-class probabilities of several models with fixed weights.

    Members are either LightAutoML `TabularAutoML` presets (recognised by their
    type name) or estimators exposing `fit(X, y)` and `predict_proba(X)`.

    Args:
        estimators: List of unfitted members. They are fitted in place; no
            clones are made.
        weights: Optional sequence with one multiplier per member, used as
            given (not normalised). When None, the legacy blend is applied:
            each AutoML member is scaled by 1/3 and every other member by 2/9,
            which sums to 1 for three regular members plus one AutoML member.
    """

    _AUTOML_TYPE_REPR = "<class 'lightautoml.automl.presets.tabular_presets.TabularAutoML'>"

    def __init__(self, estimators, weights=None):
        self.estimators = estimators
        self.weights = weights

    @classmethod
    def _is_automl(cls, estimator):
        """Tell whether `estimator` is a LightAutoML TabularAutoML preset."""
        return str(type(estimator)) == cls._AUTOML_TYPE_REPR

    def _positive_proba(self, estimator, X):
        """Return the 1-D array of positive-class probabilities of one member."""
        if self._is_automl(estimator):
            return estimator.predict(X).data.flatten()
        return estimator.predict_proba(X)[:, 1]

    def fit(self, X, y):
        """Fit every member on (X, y) and return self.

        AutoML members are trained on X with y appended as a column and the
        role `target` mapped to 'target'. Every other member is fitted
        directly and then dumped to 'cat_<position>.pkl' in the working
        directory.
        """
        self.estimators_ = list(self.estimators)
        for i, estimator in enumerate(self.estimators_):
            if self._is_automl(estimator):
                df_train = pd.concat([X, y], axis=1)
                estimator.fit_predict(df_train, roles =  {'target': 'target'}, verbose = 3)
            else:
                estimator.fit(X, y)
                joblib.dump(estimator, 'cat_' + str(i) + '.pkl')
        return self

    def predict_proba(self, X):
        """Return `(blended, per_member)` positive-class probabilities.

        Returns:
            blended: Weighted sum of the members' probabilities.
            per_member: List with each member's unweighted probabilities.

        Raises:
            ValueError: If `weights` was given with a length different from
                the number of members.
        """
        # Run inference once per member and reuse the result for the blend.
        probabilities = [self._positive_proba(estimator, X) for estimator in self.estimators_]

        weights = getattr(self, 'weights', None)
        if weights is None:
            # Legacy blend, kept operation-for-operation: (p * 1) / 3 and (p * 2) / 9.
            all_probabilities = [
                p * 1/3 if self._is_automl(estimator) else p * 2/9
                for estimator, p in zip(self.estimators_, probabilities)
            ]
        else:
            if len(weights) != len(probabilities):
                raise ValueError(
                    'weights must have one entry per estimator: got '
                    + str(len(weights)) + ' for ' + str(len(probabilities)) + ' estimators'
                )
            all_probabilities = [p * w for p, w in zip(probabilities, weights)]

        mean_probabilities = np.sum(all_probabilities, axis=0)
        return mean_probabilities, probabilities

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the blended probability is >= `threshold`."""
        mean_probabilities, _ = self.predict_proba(X)
        return (mean_probabilities >= threshold).astype(int)

In [74]:
# Keyword arguments shared by every CatBoostClassifier in the ensemble.
params = dict(
    iterations=2654,
    learning_rate=0.03354989749053529,
    depth=5,
    l2_leaf_reg=9.633986278110502,
    random_strength=8.976484608530859,
    bagging_temperature=0.44428949970755705,
    border_count=158,
    verbose=500,
    task_type='GPU',
    class_weights=[1, 10],
)
# 0.7658591942913879
# NOTE(review): presumably the score reached with these settings; the metric
# and the validation split are not recorded — confirm.

In [154]:
from sklearn.ensemble import VotingClassifier
from sklearn.base import ClassifierMixin, TransformerMixin, clone

# Three CatBoost members that differ only in their random seed (42, 43, 44).
catboost_members = [CatBoostClassifier(**params, random_seed=42 + offset) for offset in range(3)]
# The LightGBM member is switched off: repeating the list 0 times leaves it empty.
lgbm_members = [LGBMClassifier(is_unbalance=True, max_depth = 6, random_seed = 42)] * 0
automl_members = [automl_production]

# Member order: CatBoost models first, then the LightAutoML preset.
model = SoftVotingClassifier(catboost_members + lgbm_members + automl_members)
model.fit(X, y)

0:	learn: 0.6820260	total: 13.2ms	remaining: 34.9s
500:	learn: 0.4891738	total: 5.81s	remaining: 25s
1000:	learn: 0.4728566	total: 11.8s	remaining: 19.4s
1500:	learn: 0.4604504	total: 17.6s	remaining: 13.5s
2000:	learn: 0.4497023	total: 23.3s	remaining: 7.62s
2500:	learn: 0.4394678	total: 29.3s	remaining: 1.79s
2653:	learn: 0.4364880	total: 31.1s	remaining: 0us
0:	learn: 0.6814623	total: 12.2ms	remaining: 32.4s
500:	learn: 0.4892420	total: 5.85s	remaining: 25.1s
1000:	learn: 0.4730384	total: 11.7s	remaining: 19.4s
1500:	learn: 0.4604896	total: 17.6s	remaining: 13.5s
2000:	learn: 0.4494862	total: 23.6s	remaining: 7.69s
2500:	learn: 0.4394482	total: 29.6s	remaining: 1.81s
2653:	learn: 0.4365051	total: 31.4s	remaining: 0us
0:	learn: 0.6818211	total: 13ms	remaining: 34.4s
500:	learn: 0.4889623	total: 5.81s	remaining: 25s
1000:	learn: 0.4728696	total: 11.6s	remaining: 19.2s
1500:	learn: 0.4604591	total: 17.6s	remaining: 13.5s
2000:	learn: 0.4495535	total: 23.4s	remaining: 7.65s
2500:	learn:

<__main__.SoftVotingClassifier at 0x7e6d3c3d17b0>

In [None]:
import csv

# Probability cut-off at or above which a row is labelled positive.
DECISION_THRESHOLD = 0.405

# predict_proba returns (blended probabilities, per-model probabilities);
# only the blend goes into the submission.
pred = model.predict_proba(test)[0]
pred_binary = (pred >= DECISION_THRESHOLD).astype(int)

submission = pd.read_csv("/kaggle/input/purple-hack-with-test/sample_submission.csv")
submission["target_prob"] = pred
submission["target_bin"] = pred_binary

# to_csv writes every column with its own dtype. Going through
# `submission.values` would upcast an all-numeric frame to one common dtype,
# so integer columns such as target_bin could be written as floats ("1.0").
submission.to_csv("submission.csv", index=False)

# Last expression of the cell, so the preview is actually displayed.
submission.head(3)