In [8]:
import pandas as pd

train = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv", nrows=5)
print("--- ALL COLUMNS ---")
print(train.columns.tolist())

print("\n--- COLUMN TYPES ---")
print(train.dtypes)

--- ALL COLUMNS ---
['id', 'age', 'alcohol_consumption_per_week', 'physical_activity_minutes_per_week', 'diet_score', 'sleep_hours_per_day', 'screen_time_hours_per_day', 'bmi', 'waist_to_hip_ratio', 'systolic_bp', 'diastolic_bp', 'heart_rate', 'cholesterol_total', 'hdl_cholesterol', 'ldl_cholesterol', 'triglycerides', 'gender', 'ethnicity', 'education_level', 'income_level', 'smoking_status', 'employment_status', 'family_history_diabetes', 'hypertension_history', 'cardiovascular_history', 'diagnosed_diabetes']

--- COLUMN TYPES ---
id                                      int64
age                                     int64
alcohol_consumption_per_week            int64
physical_activity_minutes_per_week      int64
diet_score                            float64
sleep_hours_per_day                   float64
screen_time_hours_per_day             float64
bmi                                   float64
waist_to_hip_ratio                    float64
systolic_bp                             int64
di

In [9]:
def prepare_data_safe(df):
    # Standardize to lowercase for easier matching
    df.columns = [c.lower() for c in df.columns]
    cols = df.columns
    
    # 1. BMI Calculation (Checking for weight_kg or weight)
    w_col = 'weight_kg' if 'weight_kg' in cols else 'weight'
    h_col = 'height_cm' if 'height_cm' in cols else 'height'
    
    if w_col in cols and h_col in cols:
        df['bmi'] = df[w_col] / ((df[h_col] / 100) ** 2)
    
    # 2. Blood Pressure (Checking for systolic/diastolic)
    if 'systolic_bp' in cols and 'diastolic_bp' in cols:
        df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
        df['map'] = df['diastolic_bp'] + (df['pulse_pressure'] / 3)
        
    return df

In [10]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OrdinalEncoder
from scipy.stats import rankdata
import warnings

warnings.filterwarnings("ignore")

# --- 1. DATA LOADING ---
train = pd.read_csv("/kaggle/input/playground-series-s5e12/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s5e12/test.csv")

TARGET = "diagnosed_diabetes"
ID_COL = "id"

# --- 2. MEDICAL FEATURE ENGINEERING (Clean & Direct) ---
def prepare_data(df):
    # Ratios often help trees find signal in blood work
    df['cholesterol_ratio'] = df['cholesterol_total'] / (df['hdl_cholesterol'] + 1e-5)
    df['ldl_hdl_ratio'] = df['ldl_cholesterol'] / (df['hdl_cholesterol'] + 1e-5)
    df['pulse_pressure'] = df['systolic_bp'] - df['diastolic_bp']
    
    # Lifestyle interactions
    df['activity_to_diet'] = df['physical_activity_minutes_per_week'] * df['diet_score']
    return df

train = prepare_data(train)
test = prepare_data(test)

X = train.drop(columns=[ID_COL, TARGET])
y = train[TARGET].astype(int) # Force to int for classification
X_test = test.drop(columns=[ID_COL])

# --- 3. CATEGORICAL IDENTIFICATION ---
cat_cols = ['gender', 'ethnicity', 'education_level', 'income_level', 
            'smoking_status', 'employment_status']

# LightGBM needs numeric codes
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_lgb = X.copy()
X_test_lgb = X_test.copy()
X_lgb[cat_cols] = encoder.fit_transform(X[cat_cols].astype(str))
X_test_lgb[cat_cols] = encoder.transform(X_test[cat_cols].astype(str))

# --- 4. TRAINING SETUP (GPU T4 x2) ---
N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_lgb = np.zeros(len(X))
oof_cat = np.zeros(len(X))
test_preds_lgb = np.zeros(len(X_test))
test_preds_cat = np.zeros(len(X_test))

# --- 5. DUAL-MODEL TRAINING LOOP ---
print("üöÄ Training LightGBM and CatBoost on GPU...")

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    X_tr_lgb, X_val_lgb = X_lgb.iloc[tr_idx], X_lgb.iloc[val_idx]
    X_tr_cat, X_val_cat = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # LightGBM
    m_lgb = lgb.LGBMClassifier(
        objective='binary', metric='auc', device='gpu', n_estimators=3000,
        learning_rate=0.02, num_leaves=63, random_state=42, verbose=-1
    )
    m_lgb.fit(X_tr_lgb, y_tr, eval_set=[(X_val_lgb, y_val)], 
              callbacks=[lgb.early_stopping(stopping_rounds=100)])
    oof_lgb[val_idx] = m_lgb.predict_proba(X_val_lgb)[:, 1]
    test_preds_lgb += m_lgb.predict_proba(X_test_lgb)[:, 1] / N_SPLITS

    # CatBoost
    m_cat = CatBoostClassifier(
        iterations=3000, learning_rate=0.03, depth=6, task_type='GPU',
        eval_metric='AUC', random_seed=42, verbose=0
    )
    m_cat.fit(X_tr_cat, y_tr, cat_features=cat_cols, eval_set=(X_val_cat, y_val))
    oof_cat[val_idx] = m_cat.predict_proba(X_val_cat)[:, 1]
    test_preds_cat += m_cat.predict_proba(X_test)[:, 1] / N_SPLITS
    print(f"Fold {fold+1} done.")

# --- 6. OPTIMIZED RANK BLENDING ---
best_auc = 0
best_w = 0
for w in np.linspace(0, 1, 11):
    blend = (rankdata(oof_cat) * w) + (rankdata(oof_lgb) * (1-w))
    score = roc_auc_score(y, blend)
    if score > best_auc:
        best_auc = score
        best_w = w

print(f"\nüèÜ Best CV AUC: {best_auc:.6f} at Weight {best_w}")

# --- 7. FINAL SUBMISSION ---
final_test_preds = (rankdata(test_preds_cat) * best_w) + (rankdata(test_preds_lgb) * (1-best_w))
submission = pd.DataFrame({"id": test[ID_COL], TARGET: final_test_preds})
submission.to_csv("submission_master.csv", index=False)

üöÄ Training LightGBM and CatBoost on GPU...




Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[2390]	valid_0's auc: 0.726805


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 1 done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1864]	valid_0's auc: 0.725255


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 2 done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1949]	valid_0's auc: 0.725767


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 3 done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1762]	valid_0's auc: 0.726415


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 4 done.
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1578]	valid_0's auc: 0.726456


Default metric period is 5 because AUC is/are not implemented for GPU


Fold 5 done.

üèÜ Best CV AUC: 0.726389 at Weight 0.30000000000000004


In [11]:
# 1. Inspect the top of the dataframe
print("--- Final Submission Preview ---")
display(submission.head(10))

# 2. Check the distribution (to ensure no clipping or constant values)
print("\n--- Prediction Statistics ---")
print(submission[TARGET].describe())

# 3. Save with the exact name 'submission.csv' for Kaggle
submission.to_csv("submission.csv", index=False)
print("\n‚úÖ SUCCESS: 'submission.csv' saved and ready for final submission!")

--- Final Submission Preview ---


Unnamed: 0,id,diagnosed_diabetes
0,700000,94453.8
1,700001,192690.6
2,700002,236722.7
3,700003,40414.4
4,700004,285662.3
5,700005,173369.9
6,700006,222133.6
7,700007,289734.6
8,700008,140175.1
9,700009,254503.4



--- Prediction Statistics ---
count    300000.000000
mean     150000.500000
std       86408.008146
min           1.900000
25%       75221.600000
50%      149933.950000
75%      224756.575000
max      300000.000000
Name: diagnosed_diabetes, dtype: float64

‚úÖ SUCCESS: 'submission.csv' saved and ready for final submission!
