In [1]:
!pip install dirty_cat 

Collecting dirty_cat
  Downloading dirty_cat-0.4.1-py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.8/125.8 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dirty_cat
Successfully installed dirty_cat-0.4.1


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import warnings

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from dirty_cat import (
    TableVectorizer,
    SimilarityEncoder,
    MinHashEncoder,
    GapEncoder
)

import catboost as cb
import lightgbm as lgb

pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# 1. Read Data

In [3]:
TARGET = 'DiagPeriodL90D'
DROP_COLS = ['patient_id', 'patient_gender']
DATA_PATH = Path('/kaggle/input/widsdatathon2024-challenge1')

N_FOLDS = 10
MODEL_SEED = 0
FOLD_SEED = 1

In [4]:
train = pd.read_csv(DATA_PATH / 'training.csv')
test = pd.read_csv(DATA_PATH / 'test.csv')

train.drop(DROP_COLS, axis=1, inplace=True)
test.drop(DROP_COLS, axis=1, inplace=True)

# 2. Split Folds

In [5]:
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=FOLD_SEED)

fold = 0
for _, val_idx in skf.split(train, train[TARGET]):
    fold += 1
    train.loc[train.index.isin(val_idx), 'fold'] = fold

# 3. Models

## Model 1

In [6]:
CAT_FEATURE_NAMES_1 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment',
    'metastatic_first_novel_treatment_type'
]

FEATURE_NAMES_1 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment',
    'metastatic_first_novel_treatment_type',
    'patient_age'
]

In [7]:
oof_1 = train[['fold', TARGET]]
val_scores = []
fis_1 = []

pred_probs_1 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_1] = df_train[CAT_FEATURE_NAMES_1].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_1] = df_val[CAT_FEATURE_NAMES_1].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_1] = df_test[CAT_FEATURE_NAMES_1].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_1]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_1]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_1]

    similarity_encoder_1 = SimilarityEncoder(ngram_range=(1, 1), random_state=MODEL_SEED+1)
    similarity_encoder_2 = SimilarityEncoder(ngram_range=(1, 1), random_state=MODEL_SEED+1)

    ct = make_column_transformer(
        (similarity_encoder_1, ['breast_cancer_diagnosis_code',
                              'breast_cancer_diagnosis_desc',
                               'metastatic_cancer_diagnosis_code']),
         (similarity_encoder_2, ['patient_race',
                                'payer_type','patient_state', 
                                'metastatic_first_novel_treatment',
                                'metastatic_first_novel_treatment_type']),
         remainder='passthrough'
    )

    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)



    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    best_params = {'learning_rate': 0.08044312037292342,
     'depth': 5,
     'l2_leaf_reg': 82.94720357045378,
     'subsample': 0.5769825967089384,
     'colsample_bylevel': 0.7242665041142794}

    model = cb.CatBoostClassifier(
        eval_metric='AUC',
        use_best_model=True,
        random_seed=MODEL_SEED,
        class_weights=class_weights,
        **best_params
    )

    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )
    
    pred_probs = model.predict_proba(
        X_val,
        thread_count=-1,
        ntree_end=model.best_iteration_
    )[:, 1]


    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_names_,
                      columns=[f'{fold}_importance'])

    oof_1.loc[oof_1.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_1.loc[oof_1.fold == fold][TARGET],
        oof_1.loc[oof_1.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_1.append(fi)

    pred_probs_1 += model.predict_proba(
        X_test, 
        thread_count=-1, 
        ntree_end=model.best_iteration_
    )[:, 1] / N_FOLDS

Fold 1
(11610, 200) (1291, 200)
0:	test: 0.7510974	best: 0.7510974 (0)	total: 63.6ms	remaining: 1m 3s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8036555729
bestIteration = 26

Shrink model to first 27 iterations.
Fold 2
(11610, 201) (1291, 201)
0:	test: 0.7755494	best: 0.7755494 (0)	total: 9.66ms	remaining: 9.65s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8169335653
bestIteration = 46

Shrink model to first 47 iterations.
Fold 3
(11610, 201) (1291, 201)
0:	test: 0.7614080	best: 0.7614080 (0)	total: 9.56ms	remaining: 9.55s
100:	test: 0.7987248	best: 0.7996521 (92)	total: 933ms	remaining: 8.31s
200:	test: 0.8043898	best: 0.8043898 (200)	total: 1.79s	remaining: 7.13s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8043897572
bestIteration = 200

Shrink model to first 201 iterations.
Fold 4
(11609, 197) (1291, 197)
0:	test: 0.7603950	best: 0.7603950 (0)	total: 8.95ms	remaining: 8.95s
100:	test: 0.7944693	best: 0.7959095 

In [8]:
np.mean(val_scores)

0.8118450665336718

## Model 2

In [9]:
oof_2 = train[['fold', TARGET]]
val_scores = []
fis_2 = []

pred_probs_2 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_1] = df_train[CAT_FEATURE_NAMES_1].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_1] = df_val[CAT_FEATURE_NAMES_1].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_1] = df_test[CAT_FEATURE_NAMES_1].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_1]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_1]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_1]

    print(X_train.shape, X_val.shape)


    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    params = {
        'learning_rate': 0.07897250131885453,
        'depth': 6, 
        'l2_leaf_reg': 3.0969061741208472, 
        'subsample': 0.9704455072410895, 
        'colsample_bylevel': 0.6786789881556758
    }
    
    model = cb.CatBoostClassifier(
                eval_metric='AUC',
                use_best_model=True,
                cat_features=CAT_FEATURE_NAMES_1,
                class_weights=class_weights,
                **params
            )

    model.fit(X_train, y_train, 
              cat_features=CAT_FEATURE_NAMES_1,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50,
              verbose=100)

    pred_probs = model.predict_proba(
        X_val,
        ntree_end=model.best_iteration_,
        thread_count=-1
    )[:, 1]

        
    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_names_,
                      columns=[f'{fold}_importance'])

    oof_2.loc[oof_2.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_2.loc[oof_2.fold == fold][TARGET],
        oof_2.loc[oof_2.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_2.append(fi)

    pred_probs_2 += model.predict_proba(
        X_test, 
        thread_count=-1, 
        ntree_end=model.best_iteration_
    )[:, 1] / N_FOLDS

Fold 1
(11610, 9) (1291, 9)
0:	test: 0.7568750	best: 0.7568750 (0)	total: 10.2ms	remaining: 10.2s
100:	test: 0.7982042	best: 0.7984267 (99)	total: 1.65s	remaining: 14.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8037259216
bestIteration = 145

Shrink model to first 146 iterations.
Fold 2
(11610, 9) (1291, 9)
0:	test: 0.7797293	best: 0.7797293 (0)	total: 8.58ms	remaining: 8.57s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8215932056
bestIteration = 38

Shrink model to first 39 iterations.
Fold 3
(11610, 9) (1291, 9)
0:	test: 0.7446778	best: 0.7446778 (0)	total: 14.9ms	remaining: 14.9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8018712747
bestIteration = 43

Shrink model to first 44 iterations.
Fold 4
(11609, 9) (1291, 9)
0:	test: 0.7706326	best: 0.7706326 (0)	total: 19.4ms	remaining: 19.4s
100:	test: 0.7985457	best: 0.7985457 (100)	total: 1.66s	remaining: 14.8s
200:	test: 0.8071014	best: 0.8071666 (187)	total: 3.4

In [10]:
np.mean(val_scores)

0.8140687516371526

## Model 3

In [11]:
oof_3 = train[['fold', TARGET]]
val_scores = []
fis_3 = []

pred_probs_3 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_1] = df_train[CAT_FEATURE_NAMES_1].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_1] = df_val[CAT_FEATURE_NAMES_1].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_1] = df_test[CAT_FEATURE_NAMES_1].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_1]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_1]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_1]

    gap_encoder = GapEncoder(n_components=10, random_state=MODEL_SEED+3)
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
                   
    ct = make_column_transformer(
        (gap_encoder, ['breast_cancer_diagnosis_code',
                        'breast_cancer_diagnosis_desc',
                        'metastatic_cancer_diagnosis_code',
                        'metastatic_first_novel_treatment',
                        'metastatic_first_novel_treatment_type']),
         (onehot_encoder, 
            ['patient_race',
            'payer_type',
            'patient_state']),
         remainder='passthrough'
    )

    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)



    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    best_params = {
        'learning_rate': 0.08347871150863913,
        'depth': 5, 
        'l2_leaf_reg': 67.16862310527561,
        'subsample': 0.674488872663687,
        'colsample_bylevel': 0.6678716652309943
    }

    model = cb.CatBoostClassifier(
        eval_metric='AUC',
        use_best_model=True,
        class_weights=class_weights,
        **best_params
    )


    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )

    pred_probs = model.predict_proba(
        X_val,
        thread_count=-1,
        ntree_end=model.best_iteration_
    )[:, 1]


    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_names_,
                      columns=[f'{fold}_importance'])

    oof_3.loc[oof_3.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_3.loc[oof_3.fold == fold][TARGET],
        oof_3.loc[oof_3.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_3.append(fi)

    pred_probs_3 += model.predict_proba(
        X_test, 
        thread_count=-1, 
        ntree_end=model.best_iteration_
    )[:, 1] / N_FOLDS

Fold 1
(11610, 112) (1291, 112)
0:	test: 0.7604500	best: 0.7604500 (0)	total: 5.34ms	remaining: 5.33s
100:	test: 0.8016474	best: 0.8016577 (99)	total: 521ms	remaining: 4.63s
200:	test: 0.8049615	best: 0.8051713 (196)	total: 1.01s	remaining: 4.03s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8054833593
bestIteration = 217

Shrink model to first 218 iterations.
Fold 2
(11610, 111) (1291, 111)
0:	test: 0.7915671	best: 0.7915671 (0)	total: 5.38ms	remaining: 5.38s
100:	test: 0.8162058	best: 0.8194354 (58)	total: 505ms	remaining: 4.5s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8194354199
bestIteration = 58

Shrink model to first 59 iterations.
Fold 3
(11610, 112) (1291, 112)
0:	test: 0.7594523	best: 0.7594523 (0)	total: 5.11ms	remaining: 5.1s
100:	test: 0.7979637	best: 0.7983513 (96)	total: 507ms	remaining: 4.51s
200:	test: 0.7990023	best: 0.8020478 (161)	total: 1.01s	remaining: 4.02s
Stopped by overfitting detector  (50 iterations wait)

bestT

In [12]:
np.mean(val_scores)

0.811287471371631

In [13]:
CAT_FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code'
]

FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'patient_age'
]

##  Model 4

In [14]:
oof_4 = train[['fold', TARGET]]
val_scores = []
fis_4 = []

pred_probs_4 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_2] = df_train[CAT_FEATURE_NAMES_2]
    df_val[CAT_FEATURE_NAMES_2] = df_val[CAT_FEATURE_NAMES_2]
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_2] = df_test[CAT_FEATURE_NAMES_2]

    X_train = df_train[FEATURE_NAMES_2]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_2]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_2]

    similarity_encoder_1 = SimilarityEncoder(ngram_range=(2, 3), random_state=MODEL_SEED+1)
    similarity_encoder_2 = SimilarityEncoder(ngram_range=(2, 3), random_state=MODEL_SEED+1)
    
    vect = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')
    ct = make_column_transformer(
        (vect, 'breast_cancer_diagnosis_desc'),
        (similarity_encoder_1, ['breast_cancer_diagnosis_code',
                                'metastatic_cancer_diagnosis_code']),
         (similarity_encoder_2, ['patient_race', 'payer_type', 'patient_state']),
         remainder='passthrough'
    )

    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)


    model = cb.CatBoostClassifier(
        eval_metric='AUC',
        use_best_model=True,
        class_weights=class_weights,
        learning_rate=0.05,
        random_state=MODEL_SEED+4
    )


    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )

    pred_probs = model.predict_proba(
        X_val,
        thread_count=-1,
        ntree_end=model.best_iteration_
    )[:, 1]


    oof_4.loc[oof_4.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_4.loc[oof_4.fold == fold][TARGET],
        oof_4.loc[oof_4.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)

    pred_probs_4 += model.predict_proba(X_test,        
                                        thread_count=-1,
                                        ntree_end=model.best_iteration_)[:, 1] / N_FOLDS

Fold 1
(11610, 234) (1291, 234)
0:	test: 0.7762094	best: 0.7762094 (0)	total: 9.32ms	remaining: 9.31s
100:	test: 0.8045612	best: 0.8094971 (59)	total: 949ms	remaining: 8.45s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8094970709
bestIteration = 59

Shrink model to first 60 iterations.
Fold 2
(11610, 234) (1291, 234)
0:	test: 0.8039830	best: 0.8039830 (0)	total: 10.8ms	remaining: 10.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8237420378
bestIteration = 48

Shrink model to first 49 iterations.
Fold 3
(11610, 234) (1291, 234)
0:	test: 0.7537899	best: 0.7537899 (0)	total: 9.71ms	remaining: 9.7s
100:	test: 0.7979330	best: 0.7992006 (82)	total: 929ms	remaining: 8.27s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7992005833
bestIteration = 82

Shrink model to first 83 iterations.
Fold 4
(11609, 231) (1291, 231)
0:	test: 0.7561868	best: 0.7561868 (0)	total: 10.6ms	remaining: 10.6s
100:	test: 0.7932747	best: 0.7932747 (100

In [15]:
np.mean(val_scores)

0.81165350883962

## Model 5

In [16]:
CAT_FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code'
]

FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'patient_age'
]

In [17]:
oof_5 = train[['fold', TARGET]]
val_scores = []
fis_5 = []

pred_probs_5 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_2] = df_train[CAT_FEATURE_NAMES_2]
    df_val[CAT_FEATURE_NAMES_2] = df_val[CAT_FEATURE_NAMES_2]

    df_test = test
    df_test[CAT_FEATURE_NAMES_2] = df_test[CAT_FEATURE_NAMES_2]

    X_train = df_train[FEATURE_NAMES_2]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_2]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_2]

    vect = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, analyzer='char')
    ohe = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
    imp = SimpleImputer(strategy='constant', fill_value='missingness', add_indicator=True)
    imp_ohe = make_pipeline(imp, ohe)
    
    ct = make_column_transformer((vect, 'breast_cancer_diagnosis_desc'),
                                (imp_ohe, ['patient_race',
                                            'payer_type',
                                            'patient_state',
                                            'breast_cancer_diagnosis_code',
                                            'metastatic_cancer_diagnosis_code']),
                                 remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)
    
    print(X_train.shape, X_val.shape)

    model = cb.CatBoostClassifier(
        eval_metric='AUC',
        use_best_model=True,
        class_weights=class_weights,
        learning_rate=0.05,
        random_state=MODEL_SEED+5
    )


    model.fit(
        X_train, y_train, 
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )

    pred_probs = model.predict_proba(
        X_val,
        thread_count=-1,
        ntree_end=model.best_iteration_
    )[:, 1]


    oof_5.loc[oof_5.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_5.loc[oof_5.fold == fold][TARGET],
        oof_5.loc[oof_5.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)

    pred_probs_5 += model.predict_proba(X_test,       
                                        thread_count=-1,
                                        ntree_end=model.best_iteration_)[:, 1] / N_FOLDS

Fold 1
(11610, 181) (1291, 181)
0:	test: 0.7580351	best: 0.7580351 (0)	total: 5.43ms	remaining: 5.43s
100:	test: 0.7942135	best: 0.7942135 (100)	total: 582ms	remaining: 5.18s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.7946432683
bestIteration = 117

Shrink model to first 118 iterations.
Fold 2
(11610, 181) (1291, 181)
0:	test: 0.7680975	best: 0.7680975 (0)	total: 5.86ms	remaining: 5.86s
100:	test: 0.8136502	best: 0.8138549 (99)	total: 580ms	remaining: 5.16s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8150060116
bestIteration = 123

Shrink model to first 124 iterations.
Fold 3
(11610, 181) (1291, 181)
0:	test: 0.7602197	best: 0.7602197 (0)	total: 6.65ms	remaining: 6.64s
100:	test: 0.7912627	best: 0.7912627 (100)	total: 580ms	remaining: 5.16s
200:	test: 0.7993247	best: 0.7993247 (200)	total: 1.14s	remaining: 4.54s
300:	test: 0.8014466	best: 0.8026886 (260)	total: 1.7s	remaining: 3.95s
Stopped by overfitting detector  (50 iterations wait)



In [18]:
np.mean(val_scores)

0.8092440112718553

## Model 6

In [19]:
def classify_bmi(bmi):
    if bmi < 16:
        return 'Severe Thinness'
    elif 16 <= bmi < 17:
        return 'Moderate Thinness'
    elif 17 <= bmi < 18.5:
        return 'Mild Thinness'
    elif 18.5 <= bmi < 25:
        return 'Normal'
    elif 25 <= bmi < 30:
        return 'Overweight'
    elif 30 <= bmi < 35:
        return 'Obese Class I'
    elif 35 <= bmi < 40:
        return 'Obese Class II'
    else:
        return 'Obese Class III'
    
train['bmi'] = train['bmi'].apply(lambda x: classify_bmi(x))
test['bmi'] = test['bmi'].apply(lambda x: classify_bmi(x))

g_1 = ['payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code']

train_agg = train.groupby(g_1)['patient_age'].agg(['min']).rename({'min': 'min_1'}, axis=1).reset_index()
train = train.merge(train_agg,  how='left')

test_agg = test.groupby(g_1)['patient_age'].agg(['min']).rename({'min': 'min_1'}, axis=1).reset_index()
test = test.merge(test_agg, how='left')

g_2 = ['bmi', 'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code']

train_agg = train.groupby(g_2)['patient_age'].agg(['mean']).rename({'mean': 'mean_1'}, axis=1).reset_index()
train = train.merge(train_agg,  how='left')

test_agg = test.groupby(g_2)['patient_age'].agg(['mean']).rename({'mean': 'mean_1'}, axis=1).reset_index()
test = test.merge(test_agg,  how='left')

In [20]:
CAT_FEATURE_NAMES_3 = [
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment', 
]

FEATURE_NAMES_3 = [
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment',
    'patient_age', 'min_1', 'mean_1']

oof_6 = train[['fold', TARGET]]
val_scores = []
fis_6 = []

pred_probs_6 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_3] = df_train[CAT_FEATURE_NAMES_3].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_3] = df_val[CAT_FEATURE_NAMES_3].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_3] = df_test[CAT_FEATURE_NAMES_3].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_3]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_3]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_3]

    print(X_train.shape, X_val.shape)


    classes = np.unique(y_train)
    weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights = dict(zip(classes, weights))

    
    model = cb.CatBoostClassifier(
                eval_metric='AUC',
                use_best_model=True,
                cat_features=CAT_FEATURE_NAMES_3,
                class_weights=class_weights,
                random_seed=MODEL_SEED+7,
                learning_rate=0.05
            )

    model.fit(X_train, y_train, 
              cat_features=CAT_FEATURE_NAMES_3,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=50,
              verbose=100)

    pred_probs = model.predict_proba(
        X_val,
        ntree_end=model.best_iteration_,
        thread_count=-1
    )[:, 1]

        
    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_names_,
                      columns=[f'{fold}_importance'])

    oof_6.loc[oof_6.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_6.loc[oof_6.fold == fold][TARGET],
        oof_6.loc[oof_6.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_6.append(fi)

    pred_probs_6 += model.predict_proba(
        X_test, 
        thread_count=-1, 
        ntree_end=model.best_iteration_
    )[:, 1] / N_FOLDS

Fold 1
(11610, 9) (1291, 9)
0:	test: 0.7568750	best: 0.7568750 (0)	total: 11.5ms	remaining: 11.4s
100:	test: 0.7992837	best: 0.8004976 (72)	total: 1.58s	remaining: 14.1s
200:	test: 0.8037298	best: 0.8037298 (200)	total: 3.19s	remaining: 12.7s
300:	test: 0.8034688	best: 0.8049449 (262)	total: 4.99s	remaining: 11.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8049448722
bestIteration = 262

Shrink model to first 263 iterations.
Fold 2
(11610, 9) (1291, 9)
0:	test: 0.7786984	best: 0.7786984 (0)	total: 11.8ms	remaining: 11.8s
100:	test: 0.8170717	best: 0.8172712 (97)	total: 1.53s	remaining: 13.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.8179798419
bestIteration = 110

Shrink model to first 111 iterations.
Fold 3
(11610, 9) (1291, 9)
0:	test: 0.7446778	best: 0.7446778 (0)	total: 12.1ms	remaining: 12.1s
100:	test: 0.7975263	best: 0.7981134 (92)	total: 1.62s	remaining: 14.4s
200:	test: 0.8000998	best: 0.8003709 (195)	total: 3.29s	remaining: 1

In [21]:
np.mean(val_scores)

0.8124921371747338

# Model 7

In [22]:
CAT_FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code'
]

FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'patient_age'
]

In [23]:
oof_7 = train[['fold', TARGET]]
val_scores = []
fis_7 = []

pred_probs_7 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_2] = df_train[CAT_FEATURE_NAMES_2]
    df_val[CAT_FEATURE_NAMES_2] = df_val[CAT_FEATURE_NAMES_2]
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_2] = df_test[CAT_FEATURE_NAMES_2]

    X_train = df_train[FEATURE_NAMES_2]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_2]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_2]

    similarity_encoder_1 = SimilarityEncoder(ngram_range=(2, 3), random_state=MODEL_SEED+1)
    similarity_encoder_2 = SimilarityEncoder(ngram_range=(2, 3), random_state=MODEL_SEED+1)
    
    vect = TfidfVectorizer(ngram_range=(1, 2), analyzer='word')
    ct = make_column_transformer(
        (vect, 'breast_cancer_diagnosis_desc'),
        (similarity_encoder_1, ['breast_cancer_diagnosis_code',
                                'metastatic_cancer_diagnosis_code']),
         (similarity_encoder_2, ['patient_race', 'payer_type', 'patient_state']),
         remainder='passthrough'
    )

    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)


    model = lgb.LGBMClassifier(
        n_estimators=100_000,
        learning_rate=0.1,
        random_state=MODEL_SEED,
        n_jobs=-1,
        reg_alpha=0.05,
        class_weight='balanced',
        max_depth=5

    )

    model.fit(
        X_train, y_train, 
        eval_metric='auc',
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )

    pred_probs = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]


    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_name_,
                      columns=[f'{fold}_importance'])

    oof_7.loc[oof_7.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_7.loc[oof_7.fold == fold][TARGET],
        oof_7.loc[oof_7.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_7.append(fi)

    pred_probs_7 += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1] / N_FOLDS

Fold 1
(11610, 234) (1291, 234)
Fold 2
(11610, 234) (1291, 234)
[100]	valid_0's auc: 0.822642	valid_0's binary_logloss: 0.480389
Fold 3
(11610, 234) (1291, 234)
[100]	valid_0's auc: 0.787115	valid_0's binary_logloss: 0.509644
Fold 4
(11609, 231) (1291, 231)
[100]	valid_0's auc: 0.801466	valid_0's binary_logloss: 0.489066
Fold 5
(11610, 233) (1291, 233)
[100]	valid_0's auc: 0.807452	valid_0's binary_logloss: 0.485063
Fold 6
(11610, 233) (1291, 233)
Fold 7
(11610, 235) (1290, 235)
Fold 8
(11610, 233) (1290, 233)
[100]	valid_0's auc: 0.810053	valid_0's binary_logloss: 0.477464
Fold 9
(11611, 234) (1290, 234)
Fold 10
(11610, 233) (1290, 233)
[100]	valid_0's auc: 0.812145	valid_0's binary_logloss: 0.483692


## Model 8

In [24]:
CAT_FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code'
]

FEATURE_NAMES_2 = [
    'patient_race',
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'patient_age'
]

In [25]:
oof_8 = train[['fold', TARGET]]
val_scores = []
fis_8 = []

pred_probs_8 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_2] = df_train[CAT_FEATURE_NAMES_2]
    df_val[CAT_FEATURE_NAMES_2] = df_val[CAT_FEATURE_NAMES_2]

    df_test = test
    df_test[CAT_FEATURE_NAMES_2] = df_test[CAT_FEATURE_NAMES_2]

    X_train = df_train[FEATURE_NAMES_2]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_2]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_2]

    vect = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, analyzer='char')
    ohe = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
    imp = SimpleImputer(strategy='constant', fill_value='missingness', add_indicator=True)
    imp_ohe = make_pipeline(imp, ohe)
    
    ct = make_column_transformer((vect, 'breast_cancer_diagnosis_desc'),
                                (imp_ohe, ['patient_race',
                                            'payer_type',
                                            'patient_state',
                                            'breast_cancer_diagnosis_code',
                                            'metastatic_cancer_diagnosis_code']),
                                 remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)
    
    print(X_train.shape, X_val.shape)

    model = lgb.LGBMClassifier(
        n_estimators=1000,
        learning_rate=0.1,
        random_state=MODEL_SEED,
        n_jobs=-1,
        class_weight='balanced',
        reg_alpha=0.05,
        importance_type='gain',
        max_depth=5

    )

    model.fit(
        X_train, y_train, 
        eval_metric='auc',
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=50,
        verbose=100
    )

    pred_probs = model.predict_proba(X_val, num_iteration=model.best_iteration_)[:, 1]


    fi = pd.DataFrame(data=model.feature_importances_,
                      index=model.feature_name_,
                      columns=[f'{fold}_importance'])

    oof_8.loc[oof_8.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_8.loc[oof_8.fold == fold][TARGET],
        oof_8.loc[oof_8.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    fis_8.append(fi)

    pred_probs_8 += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:, 1] / N_FOLDS

Fold 1
(11610, 181) (1291, 181)
[100]	valid_0's auc: 0.788818	valid_0's binary_logloss: 0.481372
Fold 2
(11610, 181) (1291, 181)
[100]	valid_0's auc: 0.820778	valid_0's binary_logloss: 0.480174
Fold 3
(11610, 181) (1291, 181)
Fold 4
(11609, 179) (1291, 179)
[100]	valid_0's auc: 0.796085	valid_0's binary_logloss: 0.487326
Fold 5
(11610, 180) (1291, 180)
Fold 6
(11610, 180) (1291, 180)
[100]	valid_0's auc: 0.817835	valid_0's binary_logloss: 0.474444
Fold 7
(11610, 182) (1290, 182)
Fold 8
(11610, 180) (1290, 180)
Fold 9
(11611, 181) (1290, 181)
[100]	valid_0's auc: 0.788227	valid_0's binary_logloss: 0.48698
Fold 10
(11610, 180) (1290, 180)
[100]	valid_0's auc: 0.815003	valid_0's binary_logloss: 0.48554


## Model 9

In [26]:
oof_9 = train[['fold', TARGET]]
val_scores = []
fis_9 = []

pred_probs_9 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_2] = df_train[CAT_FEATURE_NAMES_2]
    df_val[CAT_FEATURE_NAMES_2] = df_val[CAT_FEATURE_NAMES_2]

    df_test = test
    df_test[CAT_FEATURE_NAMES_2] = df_test[CAT_FEATURE_NAMES_2]

    X_train = df_train[FEATURE_NAMES_2]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_2]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_2]

    vect = TfidfVectorizer(ngram_range=(1, 1), sublinear_tf=True, analyzer='char')
    ohe = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
    imp = SimpleImputer(strategy='constant', fill_value='missingness', add_indicator=True)
    imp_ohe = make_pipeline(imp, ohe)
    
    ct = make_column_transformer((vect, 'breast_cancer_diagnosis_desc'),
                                (imp_ohe, ['patient_race',
                                            'payer_type',
                                            'patient_state',
                                            'breast_cancer_diagnosis_code',
                                            'metastatic_cancer_diagnosis_code']),
                                 remainder='passthrough')
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)
    
    print(X_train.shape, X_val.shape)

    model = LogisticRegression(solver='liblinear', random_state=1, class_weight='balanced')

    model.fit(
        X_train, y_train
    )

    pred_probs = model.predict_proba(X_val)[:, 1]


    oof_9.loc[oof_9.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_9.loc[oof_9.fold == fold][TARGET],
        oof_9.loc[oof_9.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)
    pred_probs_9 += model.predict_proba(X_test)[:, 1] / N_FOLDS

Fold 1
(11610, 181) (1291, 181)
Fold 2
(11610, 181) (1291, 181)
Fold 3
(11610, 181) (1291, 181)
Fold 4
(11609, 179) (1291, 179)
Fold 5
(11610, 180) (1291, 180)
Fold 6
(11610, 180) (1291, 180)
Fold 7
(11610, 182) (1290, 182)
Fold 8
(11610, 180) (1290, 180)
Fold 9
(11611, 181) (1290, 181)
Fold 10
(11610, 180) (1290, 180)


In [27]:
np.mean(val_scores)

0.796763093944179

## Model 10

In [28]:
CAT_FEATURE_NAMES_3 = [
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment', 
]

FEATURE_NAMES_3 = [
    'payer_type',
    'patient_state',
    'breast_cancer_diagnosis_code',
    'breast_cancer_diagnosis_desc',
    'metastatic_cancer_diagnosis_code',
    'metastatic_first_novel_treatment',
    'patient_age', 'min_1', 'mean_1']

oof_10 = train[['fold', TARGET]]
val_scores = []
fis_10 = []

pred_probs_10 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_3] = df_train[CAT_FEATURE_NAMES_3].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_3] = df_val[CAT_FEATURE_NAMES_3].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_3] = df_test[CAT_FEATURE_NAMES_3].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_3].fillna(-9999)
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_3].fillna(-9999)
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_3].fillna(-9999)
    
    ct = make_column_transformer(
        (SimilarityEncoder(ngram_range=(1, 1)), CAT_FEATURE_NAMES_3),
        remainder='passthrough'
    )
    
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)
    
    print(X_train.shape, X_val.shape)

    
    model = RandomForestClassifier(n_estimators=100, random_state=MODEL_SEED+10, class_weight='balanced')

    model.fit(X_train, y_train)

    pred_probs = model.predict_proba(X_val)[:, 1]

        

    oof_10.loc[oof_10.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_10.loc[oof_10.fold == fold][TARGET],
        oof_10.loc[oof_10.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)

    pred_probs_10 += model.predict_proba(X_test)[:, 1] / N_FOLDS

Fold 1
(11610, 194) (1291, 194)
Fold 2
(11610, 195) (1291, 195)
Fold 3
(11610, 195) (1291, 195)
Fold 4
(11609, 191) (1291, 191)
Fold 5
(11610, 193) (1291, 193)
Fold 6
(11610, 194) (1291, 194)
Fold 7
(11610, 196) (1290, 196)
Fold 8
(11610, 194) (1290, 194)
Fold 9
(11611, 195) (1290, 195)
Fold 10
(11610, 194) (1290, 194)


In [29]:
np.mean(val_scores)

0.7706800266954982

## Model 11

In [30]:
oof_11 = train[['fold', TARGET]]
val_scores = []

pred_probs_11 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_1] = df_train[CAT_FEATURE_NAMES_1].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_1] = df_val[CAT_FEATURE_NAMES_1].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_1] = df_test[CAT_FEATURE_NAMES_1].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_1]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_1]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_1]

    similarity_encoder_1 = SimilarityEncoder(ngram_range=(1, 1), random_state=MODEL_SEED+1)
    similarity_encoder_2 = SimilarityEncoder(ngram_range=(1, 1), random_state=MODEL_SEED+1)

    ct = make_column_transformer(
        (similarity_encoder_1, ['breast_cancer_diagnosis_code',
                              'breast_cancer_diagnosis_desc',
                               'metastatic_cancer_diagnosis_code']),
         (similarity_encoder_2, ['patient_race',
                                'payer_type','patient_state', 
                                'metastatic_first_novel_treatment',
                                'metastatic_first_novel_treatment_type']),
         remainder='passthrough'
    )
    
    scaler = StandardScaler()
    ct = make_pipeline(ct, scaler)
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)


    model = LogisticRegression(solver='saga', penalty='l2', C=0.05)

    model.fit(
        X_train, y_train, 
    )
    
    pred_probs = model.predict_proba(X_val)[:, 1]

    oof_11.loc[oof_11.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_11.loc[oof_11.fold == fold][TARGET],
        oof_11.loc[oof_11.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)

    pred_probs_11 += model.predict_proba(X_test)[:, 1] / N_FOLDS

Fold 1
(11610, 200) (1291, 200)
Fold 2
(11610, 201) (1291, 201)
Fold 3
(11610, 201) (1291, 201)
Fold 4
(11609, 197) (1291, 197)
Fold 5
(11610, 199) (1291, 199)
Fold 6
(11610, 200) (1291, 200)
Fold 7
(11610, 202) (1290, 202)
Fold 8
(11610, 200) (1290, 200)
Fold 9
(11611, 201) (1290, 201)
Fold 10
(11610, 200) (1290, 200)


In [31]:
np.mean(val_scores)

0.7957863750211152

## Model 12

In [32]:
oof_12 = train[['fold', TARGET]]
val_scores = []

pred_probs_12 = 0
for fold in range(1, N_FOLDS+1):
    print('Fold', fold)
    
    df_train = train[train.fold != fold]
    df_val = train[train.fold == fold]
    
    df_train = df_train[~df_train['breast_cancer_diagnosis_desc'].str.contains(' male')]
    df_train[CAT_FEATURE_NAMES_1] = df_train[CAT_FEATURE_NAMES_1].fillna('NaN')
    df_val[CAT_FEATURE_NAMES_1] = df_val[CAT_FEATURE_NAMES_1].fillna('NaN')
    
    df_test = test
    df_test[CAT_FEATURE_NAMES_1] = df_test[CAT_FEATURE_NAMES_1].fillna('NaN')

    X_train = df_train[FEATURE_NAMES_1]
    y_train = df_train[TARGET]
    
    X_val = df_val[FEATURE_NAMES_1]
    y_val = df_val[TARGET]
    
    X_test = df_test[FEATURE_NAMES_1]

    gap_encoder = GapEncoder(n_components=10, random_state=MODEL_SEED+12)
    onehot_encoder = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
                   
    ct = make_column_transformer(
        (gap_encoder, ['breast_cancer_diagnosis_code',
                        'breast_cancer_diagnosis_desc',
                        'metastatic_cancer_diagnosis_code',
                        'metastatic_first_novel_treatment',
                        'metastatic_first_novel_treatment_type']),
         (onehot_encoder, 
            ['patient_race',
            'payer_type',
            'patient_state']),
         remainder='passthrough'
    )
    
    scaler = StandardScaler()
    ct = make_pipeline(ct, scaler)
    
    X_train = ct.fit_transform(X_train)
    X_val = ct.transform(X_val)
    X_test = ct.transform(X_test)

    print(X_train.shape, X_val.shape)

    model = MLPClassifier(
        hidden_layer_sizes=(64, 32, 16),
        activation='relu',
        solver='adam',
        learning_rate='constant',
        learning_rate_init=0.01,
        random_state=MODEL_SEED+12,
        alpha=0.01,
        batch_size=32,
        max_iter=10_000,
        early_stopping=True
    )


    model.fit(
        X_train, y_train
    )

    pred_probs = model.predict_proba(
        X_val
    )[:, 1]


    oof_12.loc[oof_12.fold == fold, f'fold_pred_probs'] = pred_probs

    val_score = roc_auc_score(
        oof_12.loc[oof_12.fold == fold][TARGET],
        oof_12.loc[oof_12.fold == fold][f'fold_pred_probs']
    )
    
    val_scores.append(val_score)

    pred_probs_12 += model.predict_proba(
        X_test
    )[:, 1] / N_FOLDS

Fold 1
(11610, 112) (1291, 112)
Fold 2
(11610, 111) (1291, 111)
Fold 3
(11610, 112) (1291, 112)
Fold 4
(11609, 111) (1291, 111)
Fold 5
(11610, 112) (1291, 112)
Fold 6
(11610, 111) (1291, 111)
Fold 7
(11610, 112) (1290, 112)
Fold 8
(11610, 112) (1290, 112)
Fold 9
(11611, 111) (1290, 111)
Fold 10
(11610, 112) (1290, 112)


In [33]:
np.mean(val_scores)

0.7990414181912051

# 4. Ensemble

In [34]:
from sklearn.linear_model import Ridge
ridge = Ridge(random_state=1, alpha=0.1)

X = pd.concat(
    [
        oof_1[['fold_pred_probs']],
        oof_2[['fold_pred_probs']],
        oof_3[['fold_pred_probs']],
        oof_4[['fold_pred_probs']],
        oof_5[['fold_pred_probs']],
        oof_6[['fold_pred_probs']],
        oof_7[['fold_pred_probs']],
        oof_8[['fold_pred_probs']],
        oof_9[['fold_pred_probs']],
        oof_10[['fold_pred_probs']],
        oof_11[['fold_pred_probs']],
        oof_12[['fold_pred_probs']]
    ],
    axis=1
)

y = oof_1[TARGET]
ridge.fit(X, y)
w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12 = ridge.coef_
sum_w = w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w9 + w10 + w11 + w12
w1 = w1 / sum_w
w2 = w2 / sum_w
w3 = w3 / sum_w
w4 = w4 / sum_w
w5 = w5 / sum_w
w6 = w6 / sum_w
w7 = w7 / sum_w
w8 = w8 / sum_w
w9 = w9 / sum_w
w10 = w10 / sum_w
w11 = w11 / sum_w
w12 = w12 / sum_w


print(w1 + w2 + w3 + w4 + w5 + w6 + w7 + w8 + w8 + w9 + w10 + w11 + w12)
w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12

1.0904545351561716


(0.2743511782740821,
 0.25844136681259394,
 0.17799484411763417,
 0.06803838789333422,
 0.03159291921899259,
 0.11460303553515859,
 0.11653545677662894,
 0.09045453515617152,
 -0.10346165473580368,
 -0.04788741554062401,
 -0.08327513193210556,
 0.10261247842393693)

In [35]:
ensemble_pred_probs = (
      w1*oof_1['fold_pred_probs'] 
    + w2*oof_2['fold_pred_probs'] 
    + w3*oof_3['fold_pred_probs']
    + w4*oof_4['fold_pred_probs']
    + w5*oof_5['fold_pred_probs']
    + w6*oof_6['fold_pred_probs']
    + w7*oof_7['fold_pred_probs']
    + w8*oof_8['fold_pred_probs']
    + w9*oof_9['fold_pred_probs']
    + w10*oof_10['fold_pred_probs']
    + w11*oof_11['fold_pred_probs']
    + w12*oof_12['fold_pred_probs']
) 

roc_auc_score(train[TARGET], ensemble_pred_probs)

0.8158701914756127

In [36]:
# y_pred = (
#       w1*pred_probs_1 + w2*pred_probs_2 + w3*pred_probs_3 
#     + w4*pred_probs_4 + w5*pred_probs_5 + w6*pred_probs_6
#     + w7*pred_probs_7 + w8*pred_probs_7 + w9*pred_probs_9 
#     + w10*pred_probs_10 + w11*pred_probs_11 + w12*pred_probs_12
# )

y_pred = (
      pred_probs_1 + pred_probs_2 + pred_probs_3 
    + pred_probs_4 + pred_probs_5 + pred_probs_6
    + pred_probs_7 + pred_probs_7 + pred_probs_9 
    + pred_probs_10 + pred_probs_11 + pred_probs_12
) / 12
y_pred

array([0.71639232, 0.69717231, 0.6710562 , ..., 0.81629541, 0.0662792 ,
       0.80950949])

# Submission

In [37]:
sub = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv')
sub['DiagPeriodL90D'] = y_pred
sub.to_csv('submission.csv', index=False)
sub

Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.716392
1,593679,0.697172
2,184532,0.671056
3,447383,0.669022
4,687972,0.704964
...,...,...
5787,977076,0.740500
5788,922960,0.771449
5789,759690,0.816295
5790,911717,0.066279
