In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import matthews_corrcoef
import optuna
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
%%time
df = pd.read_csv('/kaggle/input/playground-series-s4e8/train.csv', index_col='id')
df.drop_duplicates(inplace=True)
X = df.drop('class', axis=1)
y = df['class']

CPU times: user 21.2 s, sys: 3.63 s, total: 24.8 s
Wall time: 26.2 s


In [None]:
%%time
num_var = X.select_dtypes(['float64']).columns.to_list()
cat_var = X.select_dtypes(['object']).columns.to_list()
transformer = ColumnTransformer([('num', StandardScaler(), num_var),
                                 ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_var)])

X_trans = transformer.fit_transform(X)

num_cols = transformer.named_transformers_['num'].get_feature_names_out().tolist()
cat_cols = transformer.named_transformers_['cat'].get_feature_names_out().tolist()
columns = num_cols + cat_cols

X_trans = pd.DataFrame.sparse.from_spmatrix(X_trans, columns=columns)
y_trans = y.map({'p': 1, 'e': 0})

CPU times: user 30.4 s, sys: 2.21 s, total: 32.6 s
Wall time: 32.7 s


In [None]:
%%time
sel = VarianceThreshold(threshold=0.001)
X_sel = sel.fit_transform(X_trans)
X_sel = X_sel.toarray()

CPU times: user 11.4 s, sys: 5.16 s, total: 16.5 s
Wall time: 16.6 s


In [None]:
%%time
models = {}

xgb_params = {'eta': 0.14125997609617025,
 'max_depth': 20,
 'lambda': 0.9674532237426338,
 'min_child_weight': 4.082841046413778,
 'subsample': 0.8776652508409101,
 'colsample_bytree': 0.7417850753962157}

models['xgb'] = xgb.XGBClassifier(**xgb_params)

lgbm_params = {'num_leaves': 140,
 'learning_rate': 0.09154835441455036,
 'feature_fraction': 0.5806836555634359,
 'bagging_fraction': 0.9884282473485232,
 'bagging_freq': 7,
 'min_child_samples': 98,
 'verbose': -1}

models['lgb'] = lgb.LGBMClassifier(**lgbm_params)

dart_params = {'num_leaves': 236,
 'learning_rate': 0.17745389398567907,
 'feature_fraction': 0.5590877395748017,
 'bagging_fraction': 0.9950711489582625,
 'bagging_freq': 1,
 'min_child_samples': 85,
 'verbose': -1}

models['dart'] = lgb.LGBMClassifier(**dart_params)

catboost_params = {'iterations': 159,
                   'learning_rate': 0.17379868130781356,
                   'depth': 11,
                   'border_count': 80,
                   'verbose': 0}

models['catboost'] = cb.CatBoostClassifier(**catboost_params)

CPU times: user 0 ns, sys: 1.02 ms, total: 1.02 ms
Wall time: 3.34 ms


In [None]:
%%time
kf = KFold(n_splits=5, shuffle=True, random_state=2)

def get_oof_predictions(X, y, model):
    '''
    Get the dataset X, label y and model, and generate the out-of-fold
    predictions to be used to train the meta-model.
    '''
    y_oof = [] # store probabilistic predicitons of out-of-fold dataset
    y_true = [] # store real values
    mcc_mean = []
     
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        X_train = X[train_index]
        y_train = y[train_index]
        X_test = X[test_index]
        y_test = y[test_index]
        y_true.extend(y_test)
        
        print(f'Fold {i}')
        models[model].fit(X_train, y_train)
        y_proba = models[model].predict_proba(X_test)[:,1]
        y_oof.extend(y_proba)
        y_pred = (y_proba > 0.5).astype(int)
        mcc = matthews_corrcoef(y_test, y_pred)
        mcc_mean.append(mcc)
        print(f'MCC: {mcc}')
        
    print(f'Mean MCC of {model}: {np.mean(mcc_mean)}')
    return (y_oof, y_true)

oof_preds = {} # store predictions of each model

for model in models:
    print(f'Model: {model}')
    oof_preds[model], y_true = get_oof_predictions(X_sel, y_trans, model)
    
oof_preds['class'] = y_true

Model: xgb
Fold 0
MCC: 0.9843118600695787
Fold 1
MCC: 0.9844996603792536
Fold 2
MCC: 0.9843067695955716
Fold 3
MCC: 0.984248772248641
Fold 4
MCC: 0.9845661546397148
Mean MCC of xgb: 0.984386643386552
Model: lgb
Fold 0
MCC: 0.983806738097231
Fold 1
MCC: 0.9840403402516217
Fold 2
MCC: 0.9838248918371466
Fold 3
MCC: 0.983744170760554
Fold 4
MCC: 0.9840095531905231
Mean MCC of lgb: 0.9838851388274152
Model: dart
Fold 0
MCC: 0.9841140817755413
Fold 1
MCC: 0.9843991228178812
Fold 2
MCC: 0.9841638458649895
Fold 3
MCC: 0.9842226892535362
Fold 4
MCC: 0.9843975997198631
Mean MCC of dart: 0.9842594678863623
Model: catboost
Fold 0
Fold 1
MCC: 0.9841609596714157
Fold 2
MCC: 0.9840417173631987
Fold 3
MCC: 0.9840620294231217
Fold 4
MCC: 0.9842080014256588
Mean MCC of catboost: 0.9840701654329855
CPU times: user 2h 16min 3s, sys: 1min 1s, total: 2h 17min 4s
Wall time: 1h 8min 36s


In [8]:
%%time
df_stack = pd.DataFrame(oof_preds)
df_stack.to_csv('stacking_dataset.csv', index=False)
X_stack_train, X_stack_val, y_stack_train, y_stack_val  = train_test_split(df_stack.drop('class', axis=1), df_stack['class'], test_size=0.2)

CPU times: user 35.5 s, sys: 896 ms, total: 36.4 s
Wall time: 36.4 s


In [11]:
%%time
lr = LogisticRegression()
y_stack_pred = lr.fit(X_stack_train, y_stack_train).predict(X_stack_val)
mcc = matthews_corrcoef(y_stack_val, y_stack_pred)
print(f'MCC of Stacking Model: {mcc}')

MCC of Stacking Model: 0.9846946130187121
CPU times: user 14.7 s, sys: 9.89 s, total: 24.6 s
Wall time: 8.22 s


In [None]:
lr.coef_

# Create submission

In [12]:
%%time
# load test dataset
df_test = pd.read_csv('/kaggle/input/playground-series-s4e8/test.csv', index_col='id')

# preprocess
X_test_trans = transformer.transform(df_test)
X_test_trans = pd.DataFrame.sparse.from_spmatrix(X_test_trans, columns=columns)
X_test_sel = sel.transform(X_test_trans).toarray()

# make predictions
test_preds = {}

for model in models:
    print(f'Predict with model: {model}')
    test_preds[model] = models[model].predict_proba(X_test_sel)[:,1]
    
df_test_stack = pd.DataFrame(test_preds)

lr_pred = lr.fit(df_stack.drop('class', axis=1), df_stack['class']).predict(df_test_stack)



Predict with model: xgb
Predict with model: lgb
Predict with model: dart
Predict with model: catboost
CPU times: user 5min 14s, sys: 15.8 s, total: 5min 30s
Wall time: 5min 12s


In [13]:
lr_pred = list(map(lambda x: 'p' if x==1 else 'e', lr_pred))
submission = pd.DataFrame({'id': df_test.index,
 
                           'class': lr_pred})
submission.head()

Unnamed: 0,id,class
0,3116945,e
1,3116946,p
2,3116947,p
3,3116948,p
4,3116949,e


In [15]:
submission.to_csv('submission.csv', index=False)