In [None]:
# Install lifelines from wheel/sdist files stored in the attached Kaggle input
# dataset (local paths, no package index involved).
# NOTE(review): the first four packages are presumably lifelines' dependencies,
# installed first so that the final lifelines install can resolve them locally
# -- confirm against lifelines 0.30.0's requirements.
!pip install /kaggle/input/pip-install-lifelines/autograd-1.7.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/autograd-gamma-0.5.0.tar.gz
!pip install /kaggle/input/pip-install-lifelines/interface_meta-1.3.0-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/formulaic-1.0.2-py3-none-any.whl
!pip install /kaggle/input/pip-install-lifelines/lifelines-0.30.0-py3-none-any.whl

In [None]:
import pandas as pd
# Read the competition training table; the "ID" column becomes the row index.
df = pd.read_csv(
    "/kaggle/input/equity-post-HCT-survival-predictions/train.csv",
    index_col="ID",
)

In [None]:
# Fraction of missing values above which a column is dropped (1.0 == 100%).
# A missing fraction can never exceed 1.0, so with this setting no column is
# actually removed; lower the value (e.g. 0.5) to filter out sparse columns.
threshold = 1.0

# Fraction of missing entries per column (a value in [0, 1], not a percentage).
missing_percent = df.isnull().mean()

# Columns whose missing fraction is strictly above the threshold.
columns_to_drop = missing_percent[missing_percent > threshold].index.tolist()

# Report what was removed; the message is derived from `threshold` so the two
# cannot drift apart.
print(f"Columns removed due to missing values > {threshold:.0%}:")
print(columns_to_drop)

# `drop` returns a new frame, so the raw `df` stays available for re-runs.
data_cleaned = df.drop(columns=columns_to_drop)

In [None]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer  # Enable Iterative Imputer
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler

def pmm_imputation(df):
    """Impute missing values in ``df`` with scikit-learn's ``IterativeImputer``.

    The imputer is run with ``sample_posterior=True`` (draws from the
    estimator's predictive posterior). Imputed values are clipped to each
    column's observed range, so the result stays within values that are
    plausible for that column, in the spirit of predictive mean matching.

    Pipeline:
      1. Drop columns that have at most one distinct observed value.
      2. Label-encode ``object`` columns, leaving missing entries as NaN so
         that they are imputed rather than treated as a ``"nan"`` category.
      3. Standardise numeric columns.
      4. Run the imputer with per-column bounds.
      5. Decode categorical codes, undo the scaling, and write the imputed
         values only into the cells that were originally missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and the surviving columns in
        their original order. Observed cells are returned unchanged; missing
        cells are filled. Imputed categorical cells hold the string form of
        the chosen category.
    """
    # Step 1: drop uninformative columns first, so the column lists built below
    # never reference a column that no longer exists. Columns without any
    # observed value are dropped too: they carry no information and the imputer
    # would discard them, changing the shape of its output.
    uninformative = [col for col in df.columns if df[col].nunique() <= 1]
    observed = df.drop(columns=uninformative)  # new frame; caller's data untouched
    work = observed.copy()

    # Step 2: identify categorical and numerical columns among the survivors.
    cat_cols = work.select_dtypes(include=['object']).columns.tolist()
    num_cols = work.select_dtypes(include=['number']).columns.tolist()

    # Step 3: label-encode only the observed categorical values; missing
    # entries stay NaN so that the imputer fills them.
    label_encoders = {}
    for col in cat_cols:
        values = work[col].to_numpy()
        known = work[col].notna().to_numpy()
        le = LabelEncoder()
        codes = np.full(len(work), np.nan)
        codes[known] = le.fit_transform(values[known].astype(str))
        work[col] = codes
        label_encoders[col] = le  # kept for decoding after imputation

    # Step 4: scale numerical columns (StandardScaler ignores NaN when fitting).
    scaler = StandardScaler()
    if num_cols:
        work[num_cols] = scaler.fit_transform(work[num_cols])

    # Step 5: impute. Bounds are per column and expressed in the transformed
    # space; a single global ``min_value=0`` would forbid any below-mean
    # imputation for standardised columns.
    lower = work.min().to_numpy(dtype=float)
    upper = work.max().to_numpy(dtype=float)
    print("\n⚡ Applying Predictive Mean Matching (PMM) Imputation...")
    pmm_imputer = IterativeImputer(
        sample_posterior=True,
        max_iter=10,
        random_state=42,
        min_value=lower,
        max_value=upper,
    )
    filled = pd.DataFrame(
        pmm_imputer.fit_transform(work),
        columns=work.columns,
        index=work.index,
    )

    # Step 6: map categorical codes back to labels. Codes are rounded and
    # clipped to the valid class range so inverse_transform cannot fail.
    for col in cat_cols:
        le = label_encoders[col]
        codes = filled[col].round().clip(0, len(le.classes_) - 1).astype(int)
        filled[col] = le.inverse_transform(codes)

    # Step 7: reverse scaling for numerical columns.
    if num_cols:
        filled[num_cols] = scaler.inverse_transform(filled[num_cols])

    # Step 8: copy imputed values into the originally missing cells only, so
    # observed data (e.g. binary indicators) never picks up floating-point
    # noise from the scale / inverse-scale round trip.
    for col in observed.columns:
        gaps = observed[col].isna().to_numpy()
        if gaps.any():
            observed.loc[gaps, col] = filled[col].to_numpy()[gaps]

    print("\n✅ PMM Imputation Complete!")
    return observed

# Run the imputation pipeline on the column-filtered training frame.
imputed_pmm = pmm_imputation(data_cleaned)

# Sanity check: total number of cells still missing (0 means nothing is left).
remaining_missing = imputed_pmm.isnull().sum().sum()
print("\nMissing Values After Imputation:")
print(remaining_missing)

# Save the imputed dataset
# imputed_pmm.to_csv("imputed_data.csv", index=False)


In [None]:
# IMPORTS FINAL
import pandas as pd
import numpy as np
import lifelines
from lifelines import NelsonAalenFitter
import lightgbm as lgb
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including ones raised by LightGBM and pandas.
warnings.filterwarnings('ignore')

# Load data: the training frame is the imputed table produced earlier in the
# notebook; test data and the data dictionary are read from the competition files.
train = imputed_pmm.copy()
test = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/test.csv', index_col='ID')
data_description = pd.read_csv('/kaggle/input/equity-post-HCT-survival-predictions/data_dictionary.csv')

# Uniformly sample 85% of the rows for training. The remaining 15% is not
# used anywhere in this cell.
train_split = train.sample(frac=0.85, random_state=42)

# Column lists derived from the data dictionary. They are not used by the
# model below: `cat_cols` is recomputed from the actual dtypes further down.
cat_cols = []
num_cols = []
for v, t in data_description[['variable', 'type']].values:
    if t == 'Categorical' and v != 'efs':
        cat_cols.append(v)
    elif v not in ['efs_time', 'efs']:
        num_cols.append(v)

# Event indicator as exact integers. `efs` arrives as a float column that may
# have gone through a scaling round trip upstream, so comparing it with `== 0`
# (or letting it be cast to bool) is fragile; round once and reuse.
event_observed = train_split['efs'].round().astype(int)

# Target engineering: negative Nelson-Aalen cumulative hazard evaluated at each
# row's own time, with an extra -0.15 shift for rows whose indicator is 0.
naf = NelsonAalenFitter()
naf.fit(train_split['efs_time'], event_observed)
train_split['naf_label'] = -naf.cumulative_hazard_at_times(train_split['efs_time']).values
train_split.loc[event_observed == 0, 'naf_label'] -= 0.15

# Number of boosting rounds. The original configuration put `n_estimators: 1999`
# in the params dict while also passing 1000 to `lgb.train`; LightGBM gives the
# params alias precedence, so 1999 was the effective value. It is now stated once.
NUM_BOOST_ROUND = 1999

# Model parameters for LightGBM
lgbm_naf_params = {
    'task': 'train',
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 10,
    'max_depth': 5,
    'min_data_in_leaf': 10,
    'min_sum_hessian_in_leaf': 1e-3,
    'feature_fraction': 1.0,
    'bagging_fraction': 1.0,
    'bagging_freq': 1,
    'lambda_l1': 0.0,
    'lambda_l2': 0.0,
    'metric': 'rmse',
    'seed': 53,
    'num_threads': 4,
    'device_type': 'cpu'
}

# Columns that must never be fed to the model as features.
target_cols = ['efs', 'efs_time', 'naf_label']

# Convert object-typed features to pandas categoricals. The test columns reuse
# the train categories so both frames share one category-to-code mapping;
# test values not seen in training become missing.
cat_cols = train_split.drop(columns=target_cols).select_dtypes(include='object').columns.tolist()
train_split[cat_cols] = train_split[cat_cols].astype('category')
for col in cat_cols:
    test[col] = test[col].astype(train_split[col].dtype)

# Prepare the LightGBM training dataset
feature_cols = train_split.drop(columns=target_cols).columns
train_lgb_naf = lgb.Dataset(train_split[feature_cols],
                            label=train_split['naf_label'],
                            categorical_feature=cat_cols)

# Train the LightGBM model
best_naf = lgb.train(lgbm_naf_params, train_lgb_naf, num_boost_round=NUM_BOOST_ROUND)

# Select the test features in exactly the column order used for training.
test_features = test[feature_cols].copy()

# Make predictions on the test data
preds_lgb_naf = best_naf.predict(test_features)

# Prepare the final submission DataFrame
submission = pd.DataFrame({
    'ID': test.index,
    'prediction': preds_lgb_naf
})

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

# # Print the required format output
# print("ID\nprediction")
# for idx, pred in submission.iterrows():
#     print(f"{int(pred['ID'])}\n{pred['prediction']}")
# print("total values")
# print(len(submission))


In [None]:
# Display the first rows of the submission frame as the cell output.
submission.head()