<a href="https://www.kaggle.com/code/elcaiseri/icr-tabularnn-bagging-baseline-cv-0-05?scriptVersionId=139625052" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

This notebook uses [ICR | MultiHeads Ensemble Baseline [CV 0.23] & EDA](https://www.kaggle.com/code/elcaiseri/icr-multiheads-ensemble-baseline-cv-0-23-eda) as a reference.

## Imports

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns; sns.set()
import matplotlib.pyplot as plt 

from tqdm.auto import tqdm

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import KFold

from imblearn.over_sampling import RandomOverSampler, SMOTE

from sklearn.impute import SimpleImputer

## Data Exploration

In [None]:
# Root folder of the competition input files.
MAIN_DIR = "/kaggle/input/icr-identify-age-related-conditions/"


def _read_input(file_name):
    """Load one CSV file located under MAIN_DIR into a DataFrame."""
    return pd.read_csv(MAIN_DIR + file_name)


train = _read_input("train.csv")
test = _read_input("test.csv")
sub = _read_input("sample_submission.csv")
greeks = _read_input("greeks.csv")

In [None]:
# Peek at the first five rows of the training table.
train.head(5)

In [None]:
# Peek at the first five rows of the greeks table.
greeks.head(5)

In [None]:
# Table dimensions, plus the number of rows for each value of the Class column.
train.shape, train["Class"].value_counts()

The target is imbalanced.

In [None]:
# Number of missing entries in every column of the training table.
train.isnull().sum()

There are some missing values in the dataset; let's explore how we can impute them.

## Data Preprocessing

In [None]:
# define heads: column names grouped by the first letter of the name.
# NOTE(review): several names carry a trailing space ('BD ', 'CD ', 'CW ', 'FD ');
# presumably this mirrors the CSV header -- confirm before "cleaning" them.
# NOTE(review): `heads` / `n_heads` are not referenced by any later cell shown
# in this notebook.
heads = [
    ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ'],
    ['BC', 'BD ', 'BN', 'BP', 'BQ', 'BR', 'BZ'], 
    ['CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'CW '], 
    ['DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY'], 
    ['EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU'], # EJ is categorical
    ['FC', 'FD ', 'FE', 'FI', 'FL', 'FR', 'FS'], 
    ['GB', 'GE', 'GF', 'GH', 'GI', 'GL'], 
]

# Number of column groups defined above.
n_heads = len(heads)

n_heads

In [None]:
cat_col = 'EJ'

# Replace the two category letters with integer codes, identically in both tables.
ej_codes = {"A": 0, "B": 1}
for frame in (train, test):
    frame[cat_col] = frame[cat_col].map(ej_codes)

In [None]:
# Columns that are not numeric model features: row id, the categorical
# column (handled separately) and the target.
drop_cols = ["Id", "EJ", "Class"]

# Everything else, in the order the columns appear in the training table.
feat_cols = list(filter(lambda name: name not in drop_cols, train.columns))

In [None]:
# Name of the target column and its values as a NumPy array.
target_col = 'Class'
labels = train[target_col].to_numpy()

## Model

The idea behind MultiHeads is simple: we use the heads (groups of features) as inputs to predict the class/target, and ensemble the predictions of multiple models (linear, tree, ensemble, etc.).

### Metrics

In [None]:
# https://www.kaggle.com/code/datafan07/icr-simple-eda-baseline
def balance_logloss(y_true, y_pred):
    """Return the balanced (class-averaged) binary log loss.

    The negative log-likelihood is averaged separately over the samples of
    each class, and the two class means are then averaged, so both classes
    carry the same weight regardless of how many samples they have.

    Args:
        y_true: 1-D array-like of integer 0/1 labels. Both classes must be
            present, otherwise the per-class count lookup fails.
        y_pred: 1-D array-like with the predicted probability of class 1.

    Returns:
        The balanced log loss as a NumPy float.
    """
    # Accept plain sequences as well as arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred, dtype=float)

    proba = np.stack([1 - y_pred, y_pred]).T
    # Keep probabilities away from 0 and 1 so the logarithm stays finite.
    proba = np.clip(proba, 1e-15, 1 - 1e-15)
    # Re-normalise so every row still sums to one after clipping.
    proba = proba / np.sum(proba, axis=1)[:, None]
    nc = np.bincount(y_true)

    loss_0 = -np.sum(np.where(y_true == 0, 1, 0) * np.log(proba[:, 0])) / nc[0]
    loss_1 = -np.sum(np.where(y_true != 0, 1, 0) * np.log(proba[:, 1])) / nc[1]

    return (loss_0 + loss_1) / 2

# balanced preds
def boost_preds(yp):
    """Re-weight two-column class probabilities by the inverse column totals.

    Each column of ``yp`` is multiplied by the reciprocal of its column sum,
    then every row is rescaled so that it sums to one again.

    Args:
        yp: 2-D array with exactly two columns (one per class).

    Returns:
        Array of the same shape holding the re-weighted, row-normalised values.
    """
    c_0, c_1 = yp.sum(axis=0)
    # One reciprocal weight per column, shaped (1, 2) so it broadcasts over rows.
    inverse_mass = np.array([[1 / c_0, 1 / c_1]])
    reweighted = yp * inverse_mass
    row_totals = np.sum(reweighted, axis=1, keepdims=1)
    return reweighted / row_totals

# https://www.kaggle.com/competitions/icr-identify-age-related-conditions/discussion/412507#2291644
def more_boost(oof, c):
    """Multiply the odds of probability ``oof`` by ``c`` and return the new probability.

    Computes ``c*oof / (1 - oof + c*oof)``; works element-wise on arrays.
    """
    scaled = c * oof
    return scaled / (1 - oof + scaled)

### Impute

In [None]:
# Numeric features plus the categorical column are imputed together.
all_cols = feat_cols + [cat_col]

# Column medians are learned from the training table only and then applied
# to both tables.
imputer = SimpleImputer(strategy="median").fit(train[all_cols])

for frame in (train, test):
    frame[all_cols] = imputer.transform(frame[all_cols])

### Normalization

In [None]:
# Standardise the numeric features (zero mean, unit variance) using
# statistics learned from the training table only.
# The categorical column is deliberately excluded: it is later cast to int32
# and used as an embedding index, so it has to keep its integer codes.
sc = StandardScaler()
sc.fit(train[feat_cols])

train[feat_cols] = sc.transform(train[feat_cols])
test[feat_cols] = sc.transform(test[feat_cols])

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential, Model


def keras_balance_logloss(y_true, y_pred):
    """Balanced binary log loss expressed with Keras backend ops.

    Keras counterpart of ``balance_logloss``: the negative log-likelihood is
    averaged per class within the batch and the two class means are averaged.

    Args:
        y_true: tensor of 0/1 labels.
        y_pred: tensor with the predicted probability of class 1.

    Returns:
        Scalar tensor with the balanced log loss of the batch.
    """
    proba = K.stack([1 - y_pred, y_pred], axis=1)
    proba = K.clip(proba, K.epsilon(), 1 - K.epsilon())
    proba /= K.sum(proba, axis=1, keepdims=True)

    is_neg = K.cast(K.equal(y_true, 0), dtype=K.floatx())
    is_pos = K.cast(K.not_equal(y_true, 0), dtype=K.floatx())

    # A mini-batch can contain a single class only; clamp the counts to one so
    # the absent class contributes 0 instead of a division by zero.
    n_neg = K.maximum(K.sum(is_neg), 1.0)
    n_pos = K.maximum(K.sum(is_pos), 1.0)

    # Each class is weighted by the inverse of its own sample count.
    loss_neg = -K.sum(is_neg * K.log(proba[:, 0])) / n_neg
    loss_pos = -K.sum(is_pos * K.log(proba[:, 1])) / n_pos

    return (loss_neg + loss_pos) / 2

In [None]:
def create_model(hidden_sizes, lr=0.2, num_categories=2):
    """Build and compile a two-input network for binary classification.

    The numeric features pass through one dense layer, the categorical code
    through an embedding of the same width; both are concatenated and fed to
    the remaining dense layers and a single sigmoid output unit.

    Args:
        hidden_sizes: sequence of layer widths. The first entry sizes the
            numeric dense layer and the embedding; every further entry adds
            one dense layer after the concatenation.
        lr: learning rate handed to the Adam optimizer.
            NOTE(review): 0.2 is far above Adam's usual default -- confirm
            that this is intentional.
        num_categories: number of distinct integer codes of the categorical
            input (size of the embedding table).

    Returns:
        A compiled Keras ``Model`` taking ``[numeric, categorical]`` inputs and
        trained with ``keras_balance_logloss``.
    """
    # Input layer for numeric features
    input_numerical = layers.Input(shape=(len(feat_cols),))

    x = layers.Dense(hidden_sizes[0], activation='relu')(input_numerical)

    # Learned embedding for the single categorical feature
    input_categorical = layers.Input(shape=(len([cat_col]),))
    embedded = layers.Embedding(num_categories, hidden_sizes[0])(input_categorical)
    flattened = layers.Flatten()(embedded)

    # Concatenate numeric and categorical features
    concatenated = layers.concatenate([x, flattened])

    # Hidden layers
    for size in hidden_sizes[1:]:
        concatenated = layers.Dense(size, activation='relu')(concatenated)

    # Output layer
    output = layers.Dense(1, activation='sigmoid')(concatenated)

    model = Model(inputs=[input_numerical, input_categorical], outputs=output)

    # Compile the model; `learning_rate` is the supported keyword, the old
    # `lr` alias is deprecated and rejected by newer Keras releases.
    opt = keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=opt, loss=keras_balance_logloss)

    return model


### Training

In [None]:
# Bagged K-fold training.
# Every bag reshuffles the folds; every fold trains one fresh network per
# layout, adds its validation predictions to `oof` and its test predictions
# to `result`.
BAGS = 6
n_splits = 5

# Hidden-layer layouts of the networks trained on every fold.
MODEL_LAYOUTS = ([64, 64], [128, 128], [128, 64])
n_models = len(MODEL_LAYOUTS)

loss = []
# Sum of test predictions scaled by 1 / (BAGS * n_models); it still contains
# one term per fold, so it has to be divided by n_splits afterwards.
result = np.zeros(len(test))

count = 0
# Out-of-fold predictions, averaged over bags and models once all bags ran.
oof = np.zeros(len(train))

# The input arrays do not depend on the bag, so they are built once.
train_df, train_cat = train[feat_cols].values, train[cat_col].values.astype(np.int32)
test_df, test_cat = test[feat_cols].values, test[cat_col].values.astype(np.int32)
test_cat = test_cat.reshape(-1, 1)

for rs in tqdm(range(BAGS), total=BAGS):
    # The bag index seeds the shuffle. KFold ignores `groups`, so none are passed.
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=rs)

    for idx, (train_idx, val_idx) in enumerate(kf.split(train_df, labels)):
        # select fold
        print("--> FOLD:", idx+1, end=" | ")
        xr, xrc = train_df[train_idx], train_cat[train_idx].reshape(-1, 1)
        xt, xtc = train_df[val_idx], train_cat[val_idx].reshape(-1, 1)
        yr, yt = labels[train_idx], labels[val_idx]

        # "balanced" class weights: n_samples / (n_classes * class_count)
        cw = len(yr) / (2 * np.bincount(yr))
        cw = dict(zip(np.unique(yr), cw))

        # Fresh, untrained networks for every fold. Reusing a model across
        # folds would let it validate on rows it was trained on in an earlier
        # fold, which leaks into the out-of-fold score.
        clfs = [create_model(layout) for layout in MODEL_LAYOUTS]

        # train
        for clf in clfs:
            clf.fit([xr, xrc], yr,
                    epochs=100,
                    batch_size=32,
                    validation_data=([xt, xtc], yt),
                    shuffle=True,
                    verbose=0,
                    class_weight=cw,
                   )

            yp = clf.predict([xt, xtc], verbose=0)
            oof[val_idx] += yp[:, 0] / BAGS / n_models
            count += 1

            # test
            result += clf.predict([test_df, test_cat], verbose=0)[:, 0] / BAGS / n_models

    # tracking loss
    # `oof` is pre-divided by BAGS, so after rs + 1 bags it is rescaled to the
    # mean over the bags finished so far before it is scored.
    log_loss = balance_logloss(labels, oof * BAGS / (rs + 1))
    loss.append(log_loss)
    print("=> logLoss:", log_loss)

    print("="*12)

Now the training logs look better.

## Evaluate

In [None]:
# Balanced log loss of the accumulated out-of-fold predictions against the labels.
overall_cv = balance_logloss(labels, oof)

overall_cv

In [None]:
# Smallest and average of the losses recorded after each bag.
np.min(loss), np.mean(loss)

In [None]:
# Search the integer odds multiplier (1..49) that gives the lowest balanced
# log loss on the out-of-fold predictions.
bst_cv = np.inf
bst_c = 0

for b in range(1, 50):
    cv_loss = balance_logloss(labels, more_boost(oof, b))
    if cv_loss < bst_cv:
        print(b, "=>", cv_loss)
        bst_cv = cv_loss
        bst_c = b

# Keep the boosted predictions of the winning multiplier (not of the last one
# tried) so that later cells inspect the right distribution; fall back to the
# unboosted predictions if no multiplier was selected.
oof_ = more_boost(oof, bst_c) if bst_c else oof

Watch out, good people: we are overfitting.

In [None]:
# Overlay the histograms of the raw and the boosted out-of-fold predictions.
for values, name in ((oof, "oof"), (oof_, "more oof")):
    plt.hist(values, bins=50, label=name)
plt.legend()
plt.show()

In [None]:
# `result` holds one term per fold, so dividing by the number of folds yields
# the final averaged test prediction.
# Disabled alternative: more_boost(result/n_splits, bst_c)
preds = np.divide(result, n_splits)

## Submission

In [None]:
# Build the submission table: the row id plus one probability column per class.
sub = test[['Id']].assign(Class_0=1 - preds, Class_1=preds)
sub.to_csv('submission.csv', index=False)
sub.head()

This is a baseline; if you have any further recommendations, write them in the comment section.

**Upvote** if you like it, your feedback is highly appreciated