In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings("ignore")

# First Model (catboost)

The first model is based on catboost joker code (was published in code section): https://www.kaggle.com/code/onurkoc83/catboost-joker

In [2]:
# importing datasets
train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
test = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/test.csv')
train.drop(columns=['patient_id'],inplace=True)
test.drop(columns=['patient_id'],inplace=True)
numerical_cols = train.select_dtypes(exclude=['object']).columns
categorical_columns = train.select_dtypes(include=['object']).columns

In [3]:
# Filling missing data with mean or mode
for col in categorical_columns:
    if col != 'DiagPeriodL90D':
        mode = train[col].mode()[0]
        train[col].fillna(mode, inplace=True)
        test[col].fillna(mode, inplace=True)

for col in numerical_cols:
    if col != 'DiagPeriodL90D':
        mean = train[col].median()
        train[col].fillna(mean, inplace=True)
        test[col].fillna(mean, inplace=True)

In [4]:
test['DiagPeriodL90D'] = 2
df = pd.concat([train,test])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in categorical_columns.to_list()+['patient_zip3']:
    encoder.fit(df[[col]])
    df[col] = encoder.transform(df[[col]])
cols = ['breast_cancer_diagnosis_code','metastatic_cancer_diagnosis_code','patient_zip3','patient_age','payer_type',
        'patient_state','breast_cancer_diagnosis_desc']

train = df[df['DiagPeriodL90D']!=2]
test = df[df['DiagPeriodL90D']==2].drop(columns=['DiagPeriodL90D'])   

In [5]:
X = train[cols+['DiagPeriodL90D']].drop(columns=['DiagPeriodL90D'], axis=1)
y = train['DiagPeriodL90D']
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

params = {
    
    'depth':2,
    'random_state': 42,
    'eval_metric': 'AUC',
    'verbose': False,
    'loss_function': 'Logloss',
    'learning_rate':0.3,
    'iterations':1000
}

auc_scores = []
test_preds = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)
    preds = model.predict_proba(X_test)[:, 1]
    preds_test = model.predict_proba(test[cols])[:, 1]
    test_preds.append(preds_test)
    auc_score = roc_auc_score(y_test, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

    
print(f"Ortalama AUC Skoru: {np.mean(auc_scores)}")
print(pd.DataFrame([1 if prob >= 0.5 else 0 for prob in np.mean(test_preds,axis=0)], columns=['test_preds'])['test_preds'].value_counts())

AUC Score: 0.8150712440203627
AUC Score: 0.8078315497545498
AUC Score: 0.800331364098467
AUC Score: 0.8080924285608196
AUC Score: 0.8031005206052644
Ortalama AUC Skoru: 0.8068854214078927
test_preds
1    4396
0    1396
Name: count, dtype: int64


In [6]:
submission = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv')
submission['DiagPeriodL90D'] = np.mean(test_preds,axis=0)
submission.to_csv('submission1.csv',index=False)

# Second Model (LightGBM)

In [7]:
# importing datasets
train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
test = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/test.csv')
train.drop(columns=['patient_id'],inplace=True)
test.drop(columns=['patient_id'],inplace=True)
numerical_cols = train.select_dtypes(exclude=['object']).columns
categorical_columns = train.select_dtypes(include=['object']).columns

In [8]:
test['DiagPeriodL90D'] = 2
df = pd.concat([train,test])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in categorical_columns.to_list()+['patient_zip3']:
    encoder.fit(df[[col]])
    df[col] = encoder.transform(df[[col]])
cols = ['breast_cancer_diagnosis_code','metastatic_cancer_diagnosis_code','patient_zip3','patient_age','payer_type',
        'patient_state','breast_cancer_diagnosis_desc']

train = df[df['DiagPeriodL90D']!=2]
test = df[df['DiagPeriodL90D']==2].drop(columns=['DiagPeriodL90D'])   

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np

X = train[cols+['DiagPeriodL90D']].drop(columns=['DiagPeriodL90D'], axis=1)
y = train['DiagPeriodL90D']

# Stratejik çapraz doğrulama için katlama ayarları
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model için parametreler
params = {
            'objective'           : 'binary',
            'boosting_type'       : 'gbdt',
            'metric'              : "auc",
            'colsample_bytree'    : 0.56,
            'subsample'           : 0.35,
            'learning_rate'       : 0.05,
            'max_depth'           : 8,
            'n_estimators'        : 3000,
            'num_leaves'          : 140,
            'reg_alpha'           : 0.14,
            'reg_lambda'          : 0.85,
            'verbosity'           : -1, 
            }


auc_scores = []
test_preds = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
        
    model = lgb.LGBMClassifier(**params)
    
    model.fit(X_train, y_train, 
              eval_set=[(X_test, y_test)], 
              eval_metric='auc',
              callbacks=[lgb.callback.log_evaluation(0), 
                         lgb.callback.early_stopping(stopping_rounds=10, 
                                                     verbose=False)],
              )

    preds = model.predict_proba(X_test)[:, 1]
    preds_test = model.predict_proba(test[cols])[:, 1]
    test_preds.append(preds_test)
    
    auc_score = roc_auc_score(y_test, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

    
print(f"Ortalama AUC Skoru: {np.mean(auc_scores)}")
print(pd.DataFrame([1 if prob >= 0.5 else 0 for prob in np.mean(test_preds,axis=0)], columns=['test_preds'])['test_preds'].value_counts())

AUC Score: 0.8065708858816607
AUC Score: 0.8022218551780121
AUC Score: 0.8003828996663312
AUC Score: 0.8077492849039839
AUC Score: 0.8075232966374482
Ortalama AUC Skoru: 0.8048896444534872
test_preds
1    4418
0    1374
Name: count, dtype: int64


In [10]:
submission = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv')
submission['DiagPeriodL90D'] = np.mean(test_preds,axis=0)
submission.to_csv('submission2.csv',index=False)

# Third Model (XGBoost)

In [11]:
# importing datasets
train = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/training.csv')
test = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/test.csv')
train.drop(columns=['patient_id'],inplace=True)
test.drop(columns=['patient_id'],inplace=True)
numerical_cols = train.select_dtypes(exclude=['object']).columns
categorical_columns = train.select_dtypes(include=['object']).columns

In [12]:
test['DiagPeriodL90D'] = 2
df = pd.concat([train,test])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
for col in categorical_columns.to_list()+['patient_zip3']:
    encoder.fit(df[[col]])
    df[col] = encoder.transform(df[[col]])
cols = ['breast_cancer_diagnosis_code','metastatic_cancer_diagnosis_code','patient_zip3','patient_age','payer_type',
        'patient_state','breast_cancer_diagnosis_desc']

train = df[df['DiagPeriodL90D']!=2]
test = df[df['DiagPeriodL90D']==2].drop(columns=['DiagPeriodL90D'])   

In [13]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import xgboost as xgb
import numpy as np

X = train[cols+['DiagPeriodL90D']].drop(columns=['DiagPeriodL90D'], axis=1)
y = train['DiagPeriodL90D']

# Stratejik çapraz doğrulama için katlama ayarları
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Model için parametreler
params = {'objective'             : 'binary:logistic',
          'eval_metric'           : "auc",
          'colsample_bytree'      : 0.5,
          'learning_rate'         : 0.055,
          'max_depth'             : 9,
          'n_estimators'          : 3000,                         
          'reg_alpha'             : 0.2,
          'reg_lambda'            : 0.6,
          'min_child_weight'      : 25,
          'verbosity'             : 0,
         }


auc_scores = []
test_preds = []
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # XGBoost modelini başlat
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    
    # Modeli eğit
    model = xgb.train(params, dtrain, num_boost_round=1000, 
                      evals=[(dtest, "Test")], 
                      early_stopping_rounds=10,
                      verbose_eval=False)
    
    # Tahminleri yap
    preds = model.predict(dtest)
    preds_test = model.predict(xgb.DMatrix(test[cols]))
    test_preds.append(preds_test)
    
    # AUC skorunu hesapla
    auc_score = roc_auc_score(y_test, preds)
    auc_scores.append(auc_score)
    print(f"AUC Score: {auc_score}")

# Ortalama AUC skorunu yazdır
print(f"Ortalama AUC Skoru: {np.mean(auc_scores)}")
print(pd.DataFrame([1 if prob >= 0.5 else 0 for prob in np.mean(test_preds,axis=0)], columns=['test_preds'])['test_preds'].value_counts())

AUC Score: 0.8056672251924997
AUC Score: 0.795164043154156
AUC Score: 0.7907086172591017
AUC Score: 0.7938420438045926
AUC Score: 0.8021475927448164
Ortalama AUC Skoru: 0.7975059044310333
test_preds
1    4419
0    1373
Name: count, dtype: int64


In [14]:
submission = pd.read_csv('/kaggle/input/widsdatathon2024-challenge1/sample_submission.csv')
submission['DiagPeriodL90D'] = np.mean(test_preds,axis=0)
submission.to_csv('submission3.csv',index=False)

# Doleh's Ensemble Method

In [15]:
df1 = pd.read_csv('/kaggle/working/submission1.csv')
df2 = pd.read_csv('/kaggle/working/submission2.csv')
df3 = pd.read_csv('/kaggle/working/submission3.csv')

In [16]:
import pandas as pd
from sklearn.metrics import mean_absolute_error

def calculate_mae(df1, df2):
    array1 = df1.to_numpy()
    array2 = df2.to_numpy()
    mae = mean_absolute_error(array1, array2)
    return mae

mae_result = calculate_mae(df1['DiagPeriodL90D'], df2['DiagPeriodL90D'])
print(f"Mean Absolute Error Between model1 and model2: {mae_result}")
mae_result = calculate_mae(df1['DiagPeriodL90D'], df3['DiagPeriodL90D'])
print(f"Mean Absolute Error Between model1 and model3: {mae_result}")
mae_result = calculate_mae(df2['DiagPeriodL90D'], df3['DiagPeriodL90D'])
print(f"Mean Absolute Error Between model2 and model3: {mae_result}")

Mean Absolute Error Between model1 and model2: 0.12130881866049413
Mean Absolute Error Between model1 and model3: 0.12131218938377931
Mean Absolute Error Between model2 and model3: 0.017230854855586675


In [17]:
predictions_df = pd.DataFrame({'df1': df1['DiagPeriodL90D'], 'df2': df2['DiagPeriodL90D'], 'df3': df3['DiagPeriodL90D']})
# create an empty list to store the chosen predictions
preds = []
for _, row in predictions_df.iterrows():
    # get the median of the row
    row_median = row.median()
    # get the mean of the row excluding the median
    row_mean = row[row != row_median].mean()
    # calculate the mean between the median and the mean
    row_mean_median = (row_median + row_mean) / 2
    preds.append(row_mean_median)

preds1 = pd.DataFrame()
preds1['DiagPeriodL90D'] = preds

In [18]:
df = preds1.copy()
df['patient_id'] = df1['patient_id']
df.to_csv('submission_wids.csv', index = False)