In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import matthews_corrcoef
import optuna
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [41]:
# Read the competition training set (indexed by `id`) and discard exact
# duplicate rows as part of the same expression.
df = (
    pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv', index_col='id')
    .drop_duplicates()
)

# Separate the label column from the feature columns.
y = df['class']
X = df.drop(columns='class')

: 

In [42]:
# Split the feature columns by dtype: float64 columns are handled as numeric,
# object columns as categorical.
num_var = X.select_dtypes(include=['float64']).columns.to_list()
cat_var = X.select_dtypes(include=['object']).columns.to_list()

# Standardise the numeric columns and one-hot encode the categorical ones.
transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_var),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_var),
    ]
)
X_trans = transformer.fit_transform(X)

# Output column names, in the same order as the transformer steps above
# (numeric block first, then the one-hot block).
num_cols, cat_cols = (
    transformer.named_transformers_[step].get_feature_names_out().tolist()
    for step in ('num', 'cat')
)
columns = [*num_cols, *cat_cols]

# Wrap the transformed matrix in a sparse-backed DataFrame with readable names.
# NOTE(review): from_spmatrix needs a SciPy sparse input, so this relies on the
# ColumnTransformer returning sparse output for this data - confirm.
X_trans = pd.DataFrame.sparse.from_spmatrix(X_trans, columns=columns)

# Encode the target as integers: 'p' -> 1, 'e' -> 0.
y_trans = y.map({'p': 1, 'e': 0})

In [None]:
# Filter out low-variance columns (threshold 0.001) and convert the filtered
# matrix to a dense array, which is what the model cells below are fed.
sel = VarianceThreshold(threshold=0.001)
X_sel = sel.fit_transform(X_trans).toarray()

In [None]:
# Base learners for the stacking ensemble, keyed by a short name.
# Insertion order matters: it fixes the column order of the stacked
# prediction frames built from this dict later on.
models = {}

# --- XGBoost ---
xgb_params = {'eta': 0.14125997609617025,
 'max_depth': 20,
 'lambda': 0.9674532237426338,
 'min_child_weight': 4.082841046413778,
 'subsample': 0.8776652508409101,
 'colsample_bytree': 0.7417850753962157}

models['xgb'] = xgb.XGBClassifier(**xgb_params)

# --- LightGBM (default boosting, i.e. gbdt) ---
lgbm_params = {'num_leaves': 140,
 'learning_rate': 0.09154835441455036,
 'feature_fraction': 0.5806836555634359,
 'bagging_fraction': 0.9884282473485232,
 'bagging_freq': 7,
 'min_child_samples': 98,
 'verbose': -1}

models['lgb'] = lgb.LGBMClassifier(**lgbm_params)

# --- LightGBM with DART boosting ---
# boosting_type must be set explicitly: LightGBM defaults to 'gbdt', so
# without it the model registered as 'dart' is just a second plain GBDT.
dart_params = {'boosting_type': 'dart',
 'num_leaves': 236,
 'learning_rate': 0.17745389398567907,
 'feature_fraction': 0.5590877395748017,
 'bagging_fraction': 0.9950711489582625,
 'bagging_freq': 1,
 'min_child_samples': 85,
 'verbose': -1}

models['dart'] = lgb.LGBMClassifier(**dart_params)

# --- CatBoost ---
catboost_params = {'iterations': 159,
                   'learning_rate': 0.17379868130781356,
                   'depth': 11,
                   'border_count': 80,
                   'verbose': 0}

models['catboost'] = cb.CatBoostClassifier(**catboost_params)

In [None]:
# Hold-out predictions per base model, plus the matching targets under 'target'.
oof_preds = {}
kf = KFold(n_splits=5, shuffle=True, random_state=2)


def _rows_at(data, positions):
    """Return the rows of ``data`` at the given integer positions.

    pandas objects are indexed through ``.iloc`` so the lookup is always
    positional and never label-based; anything else (e.g. a NumPy array) is
    indexed directly.
    """
    if hasattr(data, 'iloc'):
        return data.iloc[positions]
    return data[positions]


def get_oof_predictions(X, y, model):
    """Fit ``models[model]`` on the first ``kf`` split and predict its held-out rows.

    Only the first of the K splits is used, so this is a single hold-out
    evaluation rather than a full out-of-fold pass over every row.

    Parameters
    ----------
    X : feature matrix whose rows can be selected by integer position
        (NumPy array or pandas object).
    y : targets aligned row-for-row with ``X`` (NumPy array or pandas object).
    model : key into the module-level ``models`` dict.

    Returns
    -------
    The fitted model's predictions for the held-out rows of the first split.

    Side effects
    ------------
    Fits ``models[model]`` in place, prints the fold number and its MCC, and
    stores the held-out targets in the module-level ``y_oof``.
    """
    global y_oof

    # KFold.split yields positional indices, so every selection below goes
    # through _rows_at. Plain ``y[idx]`` on an id-indexed Series would be a
    # label lookup and breaks as soon as ids stop matching row positions
    # (for example after duplicate rows have been dropped).
    fold, (train_index, test_index) = next(enumerate(kf.split(X)))
    y_oof = _rows_at(y, test_index)

    print(f'Fold {fold}')
    estimator = models[model]
    estimator.fit(_rows_at(X, train_index), _rows_at(y, train_index))

    oof_pred = estimator.predict(_rows_at(X, test_index))
    print(f'MCC: {matthews_corrcoef(y_oof, oof_pred)}')

    return oof_pred


for model in models:
    print(f'Model: {model}')
    oof_preds[model] = get_oof_predictions(X_sel, y_trans, model)

# kf has a fixed random_state, so every model saw the same hold-out rows and
# the y_oof left behind by the last call lines up with all predictions.
oof_preds['target'] = y_oof

Fold 0
Fold 0
Fold 0
[LightGBM] [Info] Number of positive: 43797, number of negative: 36203
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1003
[LightGBM] [Info] Number of data points in the train set: 80000, number of used features: 122
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547462 -> initscore=0.190423
[LightGBM] [Info] Start training from score 0.190423
Fold 0
0:	learn: 0.4862876	total: 110ms	remaining: 17.4s
1:	learn: 0.3916988	total: 161ms	remaining: 12.6s
2:	learn: 0.3251627	total: 205ms	remaining: 10.6s
3:	learn: 0.2706753	total: 250ms	remaining: 9.67s
4:	learn: 0.2261945	total: 290ms	remaining: 8.93s
5:	learn: 0.1900897	total: 366ms	remaining: 9.33s
6:	learn: 0.1640041	total: 449ms	remaining: 9.76s
7:	learn: 0.1438769	total: 517ms	remaining: 9.76s
8:	learn: 0.130

In [None]:
# Assemble the stacking table: one column of hold-out predictions per base
# model plus the true labels in 'target'.
df_stack = pd.DataFrame(data=oof_preds)
df_stack.head(5)

{'xgb': array([0, 0, 0, ..., 1, 1, 0]),
 'lgb': array([0, 1, 0, ..., 1, 1, 0]),
 'dart': array([0, 1, 0, ..., 1, 1, 0]),
 'catboost': array([0, 0, 0, ..., 1, 1, 0]),
 'target': id
 4        0
 9        0
 10       0
 14       1
 17       1
         ..
 99966    0
 99967    1
 99972    1
 99981    1
 99988    0
 Name: class, Length: 20000, dtype: int64}

In [None]:
# Meta-learner: logistic regression over the base models' hold-out predictions.
y_stack = df_stack['target']
X_stack = df_stack.drop(columns='target')

lr = LogisticRegression()
lr.fit(X_stack, y_stack)

Unnamed: 0_level_0,xgb,lgb,dart,catboost,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
4,0,0,0,0,0
9,0,1,1,0,0
10,0,0,0,0,0
14,1,1,1,1,1
17,1,1,1,1,1


In [None]:
# Show the fitted meta-learner's coefficients, one per column of X_stack.
lr.coef_

In [None]:
# Read the competition test set, indexed by `id`.
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', index_col='id')

# Push it through the already-fitted transformer and variance filter
# (transform only, nothing is refitted on test data).
X_test_trans = pd.DataFrame.sparse.from_spmatrix(
    transformer.transform(df_test),
    columns=columns,
)
X_test_sel = sel.transform(X_test_trans).toarray()

# Collect every base model's class predictions, keyed by model name.
test_preds = {}
for model in models.keys():
    print(f'Predict with model: {model}')
    test_preds[model] = models[model].predict(X_test_sel)

# Stack them column-wise and let the meta-learner produce the final prediction.
df_test_stack = pd.DataFrame(data=test_preds)
lr_pred = lr.predict(df_test_stack)

0.9817050911597321

In [None]:
# Translate the meta-learner's numeric predictions back to the competition's
# labels (1 -> 'p', anything else -> 'e').
# The labels go into a new name instead of overwriting lr_pred, so re-running
# this cell does not feed already-mapped strings back through the mapping
# (which would turn every prediction into 'e').
lr_pred_labels = ['p' if pred == 1 else 'e' for pred in lr_pred]

submission = pd.DataFrame({'id': df_test.index,
                           'class': lr_pred_labels})
submission.head()

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV, so it contains only the 'id' and 'class' columns.
submission.to_csv('submission.csv', index=False)