# Import

In [2]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import warnings
import sklearn
import matplotlib.pyplot as plt
from sklearn import utils
from sklearn.model_selection import StratifiedKFold
import random 
import os
import datetime
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Config

In [42]:
class cfg:
    running_on_kaggle = False
    running_on_private_test = True

    filter_warnings = True

    seed = 100
    create_test_seed = 100

    val_folds_num = 10
    val_models_num = 50
    
    add_noise = True
    noise_strength = 0.04
    undersample_train = True
    undersample_val = False
    double_positive_data = True
    use_val_for_training = False

    use_only_cols = [
        # 'AB', 'AF', 'AM', 'AX', 'AY', 'BC', 'BP', 'BQ', 'CC', 'CD', 'CR',
        # 'CU', 'CW', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DU', 'DY', 'EB', 'EE',
        # 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD', 'FE', 'FI', 'FL', 'FR', 'GE',
        # 'GF', 'GL',
        # '1/CH', '1/DE', '1/DN', '1/EB', '1/EU', 
        # 'ln_FR', 'ln_BC', 'ln_EB',
        # 'epsilon_num',
    ]

    add_log_cols = True
    log_cols = ['DY', 'FR', 'BC', 'EB'] 
    add_inverse_cols = True
    inverse_cols = [ 'CH', 'CL', 'DA', 'DE', 'DN', 'EB', 'EU']

    class lgb:
        max_bin_by_feature = {
            # 'EP': 4, 'FR': 6, 'EL': 4, 'DL': 5, 'GL': 6, 'FI': 6, 'AM': 6, 
            # 'CS': 8, 'EB': 5, 'DL': 8, 'EE': 4, 'DN': 4, 'GH': 5, 'AB_2*BQ': 8,
        }
        use_early_stopping = True
        boost_rounds = 400
        learning_rate = 0.1
        early_stopping_rounds = 50
        early_stopping_verbose = 0
        min_child_samples = 20
        num_leaves = 16
        max_depth = 10
        max_bin = 16
        is_unbalance = False
        colsample_bytree = 0.7
        subsample_freq = 1
        subsample = 0.6
        reg_alpha = 0
        reg_lambda = 0
        keep_over_iteration=70

    class xgb: 
        boost_rounds = 500
        learning_rate = 0.05
        early_stopping_rounds = 50
        early_stopping_verbose = 0
        gamma = 0.1
        min_child_weight = 0.5
        max_leaves = 16
        max_depth = 10
        scale_pos_weight = 1 # probably add weights (pos_samples/neg_samples)
        colsample_bytree = 0.7
        # subsample_freq = 1
        subsample = 0.6
        reg_alpha = 0
        reg_lambda = 0
        max_bin = 32
        keep_over_iteration = 140

if cfg.running_on_kaggle:
    data_dir = '/kaggle/input/icr-identify-age-related-conditions'
else:
    data_dir = '../icr-identify-age-related-conditions'

class paths:
    prepared_dir = '../prepared_data'
    prepared_data_folds = f'{prepared_dir}/prepared_folds.parquet'
    splits_dir = f'{prepared_dir}/splits'

    original_train  = f'{data_dir}/train.csv'
    original_test = f'{data_dir}/test.csv'
    original_greeks = f'{data_dir}/greeks.csv'
    original_submission = f'{data_dir}/sample_submission.csv'

all_data_df          = pd.read_csv(paths.original_train)
test_sample_df       = pd.read_csv(paths.original_test)
greeks_df            = pd.read_csv(paths.original_greeks)
submission_sample_df = pd.read_csv(paths.original_submission)

all_data_df.columns = [col.replace(' ', '') for col in all_data_df.columns]
test_sample_df.columns = [col.replace(' ', '') for col in test_sample_df.columns]
greeks_df.columns = [col.replace(' ', '') for col in greeks_df.columns]

In [4]:
numerical_features = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 
                      'BC', 'BD', 'BN', 'BP', 'BQ', 'BR', 'BZ', 
                      'CB', 'CC', 'CD', 'CF', 'CH', 'CL', 'CR', 'CS', 'CU', 'CW', 
                      'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY', 
                      'EB', 'EE', 'EG', 'EH', 'EL', 'EP', 'EU', 
                      'FC', 'FD', 'FE', 'FI', 'FL', 'FR', 'FS', 
                      'GB', 'GE', 'GF', 'GH', 'GI', 'GL']

categorical_features = ['EJ']
features = numerical_features + categorical_features

print('Cols not in features:')
for col in all_data_df.columns:
    if col not in features:
        print(col, end=' ')

Cols not in features:
Id Class 

In [5]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
seed_everything(cfg.seed)

# Helper functions

In [10]:
def competition_log_loss(y_true, y_pred):
    # y_true: correct labels 0, 1
    # y_pred: predicted probabilities of class=1
    # Implements the Evaluation equation with w_0 = w_1 = 1.
    # Calculate the number of observations for each class
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1
    # Calculate the average log loss for each class
    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    return (log_loss_0 + log_loss_1)/2

In [11]:
def get_accuracy(y_true, y_pred):
    def _format(x):
        return np.round(x,4)
    
    c_ll_result = competition_log_loss(y_true, y_pred)
    log_loss = sklearn.metrics.log_loss(y_true, y_pred, labels=[0,1])

    thr = 0.5
    y_pred_int = (y_pred > thr).astype('int8')
    true_pos = ((y_true==1) & (y_pred_int==1)).sum()
    # true_neg = ((y_true==0) & (y_pred_int==0)).sum()
    false_pos = ((y_true==0) & (y_pred_int==1)).sum()
    false_neg = ((y_true==1) & (y_pred_int==0)).sum()
    acc = (y_true==y_pred_int).sum() / len(y_true)
    prec = true_pos / (true_pos+false_pos)
    rec = true_pos / (true_pos+false_neg)

    # return (c_ll_result, acc, prec, rec)
    return (_format(c_ll_result), _format(log_loss), _format(acc), _format(prec), _format(rec))

# Preprocessing functions

In [12]:
def epilon_to_num(x):
    if x == 'Unknown':
        return 10
    else:
        m,d,y=map(int, x.split('/'))
        num = 2020-y
        if num<=1:
            num=1
        return num
def add_epsilon_num(df):
    resulting_len = df.merge(greeks_df[['Id', 'Epsilon']], on='Id').shape[0]
    # print(resulting_len, df.shape)
    # print(df.columns)
    if resulting_len == 0:
        df['epsilon_num'] = 0
    elif resulting_len == len(df):
        df = pd.merge(df, greeks_df[['Id', 'Epsilon']], on='Id')
        df['epsilon_num'] = df.Epsilon.apply(epilon_to_num) 
        df = df.drop(columns=['Epsilon'])
    else:
        raise Exception('Error in adding epsilon num')
    return df

In [13]:
def label_encode(df):
    df['EJ'] = df['EJ'].replace({'A': 0, 'B': 1})
    return df

In [14]:
# It was better to undersample with repetition (having multiple models, do not know without that)
def undersample(df, random_state, replace=False):
    negative_values_sampled_df = df.loc[df.Class==0].sample(
        n=df["Class"].value_counts()[1], 
        random_state=random_state, 
        replace=replace,
        ignore_index=True
    )
    return utils.shuffle(pd.concat([negative_values_sampled_df, df.loc[df.Class==1]], ignore_index=True), random_state=cfg.seed).reset_index(drop=True)

In [15]:
random_uniform_arr = np.random.random_sample(len(all_data_df))

def add_noise(df, noise_strength=cfg.noise_strength):
    n = len(df)
    for col in numerical_features:
        if col in df.columns:
            noise = (random_uniform_arr[0:n]-0.5) * noise_strength
            df[col] = np.clip(df[col] + noise, 
                                        df[col].min(), 
                                        df[col].max())

    return df

In [16]:
def add_inverse_cols(train_val_df, test_df):
    new_cols_train = {}
    new_cols_test = {}
    for col in cfg.inverse_cols:
        new_cols_train[f'1/{col}'] = 1 / (train_val_df[col]+1e-6)
        new_cols_test[f'1/{col}'] = 1 / (test_df[col]+1e-6)

    train_val_df = pd.concat([train_val_df, pd.DataFrame(new_cols_train, index=train_val_df.index)], axis=1)
    test_df = pd.concat([test_df, pd.DataFrame(new_cols_test, index=test_df.index)], axis=1)

    return train_val_df, test_df

In [17]:
def add_log_cols(train_val_df, test_df):
    new_cols_train = {}
    new_cols_test = {}
    for col in cfg.log_cols:
        new_cols_train[f'ln_{col}'] = np.log(train_val_df[col])
        new_cols_test[f'ln_{col}'] = np.log(test_df[col])

    train_val_df = pd.concat([train_val_df, pd.DataFrame(new_cols_train, index=train_val_df.index)], axis=1)
    test_df = pd.concat([test_df, pd.DataFrame(new_cols_test, index=test_df.index)], axis=1)

    return train_val_df, test_df

#### Preprocess

In [39]:
def preprocess(train_val_df, test_df):
    
    train_val_df = label_encode(train_val_df)
    test_df = label_encode(test_df)

    train_val_df = add_epsilon_num(train_val_df)
    test_df = add_epsilon_num(test_df)
    # test_df['epsilon_num'] = 1

    if cfg.add_inverse_cols:
        train_val_df, test_df = add_inverse_cols(train_val_df, test_df)

    if cfg.add_log_cols:
        train_val_df, test_df = add_log_cols(train_val_df, test_df)

    if len(cfg.use_only_cols) > 0:
        train_val_df = train_val_df[['Class', 'Id'] + cfg.use_only_cols]
        if 'Class' in test_df.columns:
            test_df = test_df[['Class', 'Id'] + cfg.use_only_cols]
        else:
            test_df = test_df[['Id'] + cfg.use_only_cols]

    return train_val_df, test_df

#### Get folds

In [19]:
def get_folds_df(df, greeks_df, stratified_col='Alpha', n_splits=cfg.val_folds_num, seed=None):
    if seed is None:
        skf = StratifiedKFold(n_splits=n_splits, random_state=None)
    else:
        skf = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
        
    folds_df = pd.merge(df, greeks_df, on='Id', how='left')[['Id', stratified_col]]
    folds_df['fold'] = 0
    fold = 0
    for train_index, test_index in skf.split(folds_df, folds_df[stratified_col]):
        folds_df.loc[test_index, 'fold'] = fold
        fold+=1

    return folds_df[['Id', 'fold']]

In [20]:
# helpers
def get_df_fold_equal(df, folds_df, fold_num):
    return pd.merge(
        df,
        folds_df.loc[folds_df.fold==fold_num].Id, 
        on='Id', how='inner'
    ).reset_index(drop=True)
def get_df_fold_not_equal(df, folds_df, fold_num):
    return pd.merge(
        df,
        folds_df.loc[folds_df.fold!=fold_num].Id, 
        on='Id', how='inner'
    ).reset_index(drop=True)

# Train

### Classifier

* XGBClassifier

In [21]:
def custom_eval_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    return ('custom_eval_loss', (log_loss_0 + log_loss_1)/2, False) 

def xgb_custom_eval_loss(y_true, y_pred):
    N_0 = np.sum(1 - y_true)
    N_1 = np.sum(y_true)
    # Calculate the predicted probabilities for each class
    p_1 = np.clip(y_pred, 1e-15, 1 - 1e-15)
    p_0 = 1 - p_1

    log_loss_0 = -np.sum((1 - y_true) * np.log(p_0)) / N_0
    log_loss_1 = -np.sum(y_true * np.log(p_1)) / N_1

    return (log_loss_0 + log_loss_1)/2

In [22]:
def double_positives(train_df):
    train_df = pd.concat([
        train_df, 
        train_df.loc[train_df.Class==1],
        # train_df.loc[train_df.Class==1],
        ], ignore_index=True)
    return utils.shuffle(train_df, random_state=cfg.seed).reset_index(drop=True)

In [29]:
def xgb_train_predict(train_val_df, test_df):
    features = list(train_val_df.columns)
    features.remove('Id')
    features.remove('Class')
    clf_li = []

    val_preds_df = train_val_df[['Id', 'Class']].copy()
    output_test = None
    for i in range(cfg.val_models_num):

        if cfg.val_folds_num > 1:
            if i%cfg.val_folds_num==0:
                folds_df = get_folds_df(
                    train_val_df, greeks_df, 
                    n_splits=cfg.val_folds_num, 
                    stratified_col='Alpha', 
                    seed=i
                )
            
            val_fold = i%cfg.val_folds_num
            # print(val_fold)
            if cfg.use_val_for_training:
                train_df = train_val_df.copy()
            else:
                train_df = get_df_fold_not_equal(train_val_df.copy(), folds_df, val_fold)
            val_df = get_df_fold_equal(train_val_df.copy(), folds_df, val_fold)
        else:
            train_df = train_val_df.copy()
            val_df = train_val_df.copy()
            val_fold = 0

        if cfg.double_positive_data:
            train_df = double_positives(train_df)

        # Sample to have equal num of classes
        if cfg.undersample_train:
            train_df = undersample(train_df, random_state=i)

        if cfg.add_noise:
            train_df = add_noise(train_df, noise_strength=cfg.noise_strength*np.arange(1.0, 1.5, 0.1)[i%5])
            
        # if i==0:
        #     print(train_df.shape)

        X_train = train_df.drop('Class', axis=1)
        y_train = train_df['Class']
        
        if cfg.undersample_val:
            val_df = undersample(val_df, random_state=i)

        # if i==0:
        #     print(y_train.value_counts())

        X_val = val_df.drop('Class', axis=1)
        y_val = val_df['Class']

        X_test=test_df.copy()
        if not cfg.running_on_private_test:
            y_test = X_test['Class']
            X_test = X_test.drop("Class",axis=1)
        

        clf = xgb.XGBClassifier(
            tree_method='hist',
            max_leaves=cfg.xgb.max_leaves,
            n_estimators=cfg.xgb.boost_rounds,
            objective='binary:logistic',
            learning_rate=cfg.xgb.learning_rate,
            metric=None,
            # is_unbalance=cfg.is_unbalance,
            scale_pos_weight=cfg.xgb.scale_pos_weight, # (sum(negative instances) / sum(positive instances))
            random_state=cfg.seed,
            max_bin=cfg.xgb.max_bin,
            gamma=cfg.xgb.gamma,
            max_depth=cfg.xgb.max_depth,
            min_child_weight=cfg.xgb.min_child_weight,
            colsample_bytree=cfg.xgb.colsample_bytree,
            subsample=cfg.xgb.subsample,
            reg_alpha=cfg.xgb.reg_alpha,
            reg_lambda=cfg.xgb.reg_lambda,
            eval_metric=xgb_custom_eval_loss,
        )

        clf.fit(
            X_train.drop("Id",axis=1), 
            y_train,
            eval_set=[
                (X_val.drop("Id",axis=1), y_val)],
            early_stopping_rounds = cfg.xgb.early_stopping_rounds,
            verbose = cfg.xgb.early_stopping_verbose
        )
        
        clf_li.append(clf)

        # Val OOF:
        new_col = f'preds_{val_fold}_{i}'
        pred_val = clf.predict_proba(X_val.drop("Id",axis=1), iteration_range=(0, clf.best_iteration))

        if len(pred_val.shape)==2 and pred_val.shape[1]==2:
            pred_val = pred_val[:,1]
        val_df[new_col] = pred_val
        val_preds_df = pd.merge(val_preds_df, val_df[['Id', new_col]], on='Id', how='left')

        # Predict test:
        if (clf.best_iteration > cfg.xgb.keep_over_iteration):

            pred_test = clf.predict_proba(X_test.drop("Id",axis=1), iteration_range=(0, clf.best_iteration))

            if len(pred_test.shape)==2 and pred_test.shape[1]==2:
                pred_test = pred_test[:,1]

            if output_test is None:
                output_test = pd.DataFrame(pred_test, columns=['pred' + str(i + 1)])
                output_test2 = output_test
            else:
                output_test = pd.DataFrame(pred_test, columns=['pred' + str(i + 1)])
                output_test2 = pd.concat([output_test2, output_test], axis=1)

    # Val OOF:
    subset = [col for col in val_preds_df.columns if 'preds_' in col]
    val_preds_df = val_preds_df.dropna(axis=0, how='all', subset=subset)
    val_preds_df['preds'] = val_preds_df[subset].mean(axis=1, skipna=True, numeric_only=True)
    val_oof_acc = get_accuracy(y_pred=val_preds_df['preds'], y_true=val_preds_df['Class'])

    pred_test = output_test2.mean(axis='columns')
    
    return pred_test, clf_li, val_oof_acc

#### LGBClassifier

In [24]:
def lgb_train_predict(train_val_df, test_df):
    features = list(train_val_df.columns)
    features.remove('Id')
    features.remove('Class')
    clf_li = []

    max_bin_by_feature = []
    if len(cfg.lgb.max_bin_by_feature)>0:
        for f in features:
            if f in cfg.lgb.max_bin_by_feature:
                max_bin_by_feature.append(cfg.lgb.max_bin_by_feature[f])
            else:
                max_bin_by_feature.append(cfg.lgb.max_bin)

    val_preds_df = train_val_df[['Id', 'Class']].copy()
    output_test = None
    for i in range(cfg.val_models_num):

        if cfg.val_folds_num > 1:
            if i%cfg.val_folds_num==0:
                folds_df = get_folds_df(
                    train_val_df, greeks_df, 
                    n_splits=cfg.val_folds_num, 
                    stratified_col='Alpha', 
                    seed=i
                )
            
            val_fold = i%cfg.val_folds_num
            if cfg.use_val_for_training:
                train_df = train_val_df.copy()
            else:
                train_df = get_df_fold_not_equal(train_val_df.copy(), folds_df, val_fold)
            val_df = get_df_fold_equal(train_val_df.copy(), folds_df, val_fold)
        else:
            train_df = train_val_df.copy()
            val_df = train_val_df.copy()
            val_fold = 0

        if cfg.double_positive_data:
            train_df = double_positives(train_df)

        # Sample to have equal num of classes
        if cfg.undersample_train:
            train_df = undersample(train_df, random_state=i)

        if cfg.add_noise:
            train_df = add_noise(train_df, noise_strength=cfg.noise_strength*np.arange(1.0, 1.5, 0.1)[i%5])
            
        # if i==0:
        #     print(train_df.shape)

        X_train = train_df.drop('Class', axis=1)
        y_train = train_df['Class']
        
        if cfg.undersample_val:
            val_df = undersample(val_df, random_state=i)

        X_val = val_df.drop('Class', axis=1)
        y_val = val_df['Class']

        X_test=test_df.copy()
        if not cfg.running_on_private_test:
            y_test = X_test['Class']
            X_test = X_test.drop("Class",axis=1)
        
        clf = lgb.LGBMClassifier(
            boosting_type='GBDT',
            # boosting_type='dart',
            # n_estimators=80,
            num_leaves=cfg.lgb.num_leaves,
            n_estimators=cfg.lgb.boost_rounds,
            objective='binary',
            metric='None',
            # objective=custom_loss,
            # class_weight='balanced', # is_unbalance should be used for binary, and class_weight seems to not work at all
            is_unbalance=cfg.lgb.is_unbalance,
            random_state=cfg.seed,
            subsample_for_bin=200_0000,
            max_depth=cfg.lgb.max_depth,
            min_child_samples=cfg.lgb.min_child_samples,
            learning_rate=cfg.lgb.learning_rate,
            colsample_bytree=cfg.lgb.colsample_bytree,
            subsample_freq=cfg.lgb.subsample_freq,
            subsample=cfg.lgb.subsample,
            reg_alpha=cfg.lgb.reg_alpha,
            reg_lambda=cfg.lgb.reg_lambda,
            force_col_wise=True,
            max_bin=cfg.lgb.max_bin,
            max_bin_by_feature=max_bin_by_feature,
            first_metric_only=True,
        )

        categorical_features = ['EJ'] if 'EJ' in features else []

        fit_callbacks = [lgb.log_evaluation(2000)]
        if cfg.lgb.use_early_stopping:
            fit_callbacks.append(
                lgb.early_stopping(
                    cfg.lgb.early_stopping_rounds, 
                    first_metric_only=True,
                    verbose=cfg.lgb.early_stopping_verbose)
            )

        clf.fit(
            X_train.drop("Id",axis=1), 
            y_train,
            eval_set=[
                (X_val.drop("Id",axis=1), y_val), 
                (X_train.drop("Id",axis=1), y_train),
                # (X_test.drop("Id",axis=1), y_test),
            ], 
            eval_names=['val', 'train'],
            # eval_metric='binary_logloss',
            eval_metric=custom_eval_loss,
            init_score=None, 
            feature_name=features,
            categorical_feature=categorical_features,
            callbacks=fit_callbacks
        )

        clf_li.append(clf)

        # Val OOF:
        new_col = f'preds_{val_fold}_{i}'
        pred_val = clf.predict_proba(X_val.drop("Id",axis=1), start_iteration=0,
                                      num_iteration=clf.best_iteration_, raw_score=False)
        if len(pred_val.shape)==2 and pred_val.shape[1]==2:
            pred_val = pred_val[:,1]
        val_df[new_col] = pred_val
        val_preds_df = pd.merge(val_preds_df, val_df[['Id', new_col]], on='Id', how='left')

        # Predict test:
        if (clf.best_iteration_ > cfg.lgb.keep_over_iteration):
            pred_test = clf.predict_proba(X_test.drop("Id",axis=1), start_iteration=0,
                                        num_iteration=clf.best_iteration_, raw_score=False)
            if len(pred_test.shape)==2 and pred_test.shape[1]==2:
                pred_test = pred_test[:,1]

            if output_test is None:
                output_test = pd.DataFrame(pred_test, columns=['pred' + str(i + 1)])
                output_test2 = output_test
            else:
                output_test = pd.DataFrame(pred_test, columns=['pred' + str(i + 1)])
                output_test2 = pd.concat([output_test2, output_test], axis=1)

    # Val OOF:
    subset = [col for col in val_preds_df.columns if 'preds_' in col]
    val_preds_df = val_preds_df.dropna(axis=0, how='all', subset=subset)
    val_preds_df['preds'] = val_preds_df[subset].mean(axis=1, skipna=True, numeric_only=True)
    val_oof_acc = get_accuracy(y_pred=val_preds_df['preds'], y_true=val_preds_df['Class'])

    pred_test = output_test2.mean(axis='columns')
    
    return pred_test, clf_li, val_oof_acc

In [25]:
if cfg.filter_warnings:
    warnings.filterwarnings('ignore')

#### run

In [43]:
if cfg.running_on_private_test:
    oof_vals = []
    for val in [0]:
        print(val)
        # cfg.seed = val

        train_val_df = all_data_df.copy()
        test_df = test_sample_df.copy()
        
        train_val_df, test_df = preprocess(train_val_df, test_df)
        print(train_val_df.shape, test_df.shape)
        pred_lgb, clf_li, val_oof_acc = lgb_train_predict(train_val_df, test_df)
        clf = clf_li[-1]

        print(val_oof_acc)
        oof_vals.append(val_oof_acc[0])

    print(oof_vals)
    print(np.round(np.mean(oof_vals), 5), 
          np.round(np.max(oof_vals)-np.min(oof_vals), 5), 
          np.round(np.std(oof_vals), 5))
    
# 0
# (617, 48) (5, 47)
# (0.1812, 0.1639, 0.9433, 0.792, 0.9167)
# 0.1812

0
(617, 70) (5, 69)
(0.1904, 0.1677, 0.94, 0.7795, 0.9167)
[0.1904]
0.1904 0.0 0.0


In [34]:
if cfg.running_on_private_test:
    oof_vals = []
    for val in [100]:
        print(val)
        # cfg.seed = val

        train_val_df = all_data_df.copy()
        test_df = test_sample_df.copy()
        train_val_df, test_df = preprocess(train_val_df, test_df)
        print(train_val_df.shape, test_df.shape)
        pred_xgb, clf_li, val_oof_acc = xgb_train_predict(train_val_df, test_df)
        clf = clf_li[-1]

        print(val_oof_acc)
        oof_vals.append(val_oof_acc[0])
        
    print(oof_vals)
    print(np.round(np.mean(oof_vals), 5), 
          np.round(np.max(oof_vals)-np.min(oof_vals), 5), 
          np.round(np.std(oof_vals), 5))
    
# 100
# (617, 48) (5, 47)
# (0.1993, 0.1709, 0.9433, 0.7967, 0.9074)
# [0.1993]
# 0.1993 0.0 0.0

100
(617, 70) (5, 69)
(0.2034, 0.1724, 0.9303, 0.76, 0.8796)
[0.2034]
0.2034 0.0 0.0


In [None]:
pred = (pred_lgb + pred_xgb) / 2

# Submission

In [None]:
submit=pd.DataFrame(test_df["Id"], columns=["Id"])

In [None]:
submit["class_0"]=1-pred
submit["class_1"]=pred

In [None]:
submit.to_csv('submission.csv',index=False)

In [None]:
submit

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.77905,0.22095
1,010ebe33f668,0.77905,0.22095
2,02fa521e1838,0.77905,0.22095
3,040e15f562a2,0.77905,0.22095
4,046e85c7cc7f,0.77905,0.22095
