In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import set_config
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier, plot_importance
from scipy import stats

In [2]:
train_df =  pd.read_csv("full_training_df.csv")


In [3]:
df_9 = train_df[train_df['modben'] != 9]

df_dropna_modben = df_9.dropna(subset=['modben'])

In [4]:
X = df_dropna_modben.drop(columns=['PID', "modben"]).copy()
cat_features = ["age_category", "sexcd", "bmi_category", "srdecc1", "surgcd1", "spcsuc1", "scdecc1",
                      "hemccd1", "mhpsyccd", "mhneurcd", "mhcardcd", "mhmetacd", "tx1_r", "ais1", "ais4", "ais8", "ais16"]


In [5]:
y = df_dropna_modben["modben"]
for col in cat_features:
    X[col] = X[col].astype(str).fillna("missing")

y = y.astype(str).fillna("missing")

In [6]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score


# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Define model
model = CatBoostClassifier(
    iterations=600,
    learning_rate=0.05,
    depth=5,
    loss_function='MultiClass',  # use 'Logloss' for binary classification
    eval_metric='TotalF1',
    verbose=100,
    early_stopping_rounds=20,
    boosting_type='Ordered',
    random_seed=20,
    auto_class_weights='Balanced',

)

# # Train

model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_features)




0:	learn: 0.3619166	test: 0.3159071	best: 0.3159071 (0)	total: 316ms	remaining: 3m 8s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.6530265621
bestIteration = 49

Shrink model to first 50 iterations.


<catboost.core.CatBoostClassifier at 0x22cd54d53a0>

In [7]:
Modben_missing = train_df[(train_df['modben'] == 9) | (train_df['modben'].isna())]

In [8]:
X_na_test = Modben_missing.drop(columns=['PID', "modben"]).copy()
for col in X_na_test.columns:
    # Ensure all values in cat_features columns are strings
    X_na_test[col] = X_na_test[col].astype(str).fillna("missing")
    # Explicitly convert any remaining numeric types to string
    X_na_test[col] = X_na_test[col].apply(str)

na_pred = model.predict(X_na_test)

In [9]:
min(na_pred)

array(['1.0'], dtype=object)

In [9]:
X_na_test["modben"] = na_pred.flatten() # Flatten na_pred to a 1D array before assigning
X_na_test["PID"] = Modben_missing["PID"]

  X_na_test["modben"] = na_pred.flatten() # Flatten na_pred to a 1D array before assigning
  X_na_test["PID"] = Modben_missing["PID"]


In [10]:
X_na_test

Unnamed: 0,elbfll01,wrextl01,elbexl01,finfll01,finabl01,hipfll01,kneexl01,ankdol01,gretol01,ankpll01,...,surgcd1,spcsuc1,scdecc1,hemccd1,mhpsyccd,mhneurcd,mhcardcd,mhmetacd,modben,PID
21,5.0,4.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,4.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,4.0,PID_361
22,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,1.0,...,,,,,0.0,0.0,0.0,0.0,4.0,PID_74
36,5.0,5.0,5.0,5.0,5.0,0.0,0.0,0.0,0.0,0.0,...,,,,,0.0,0.0,0.0,0.0,4.0,PID_726
42,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,0.0,0.0,0.0,0.0,4.0,PID_770
46,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,PID_351
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
650,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,,1.0,0.0,0.0,0.0,1.0,0.0,1.0,PID_657
651,,,,,,,,,,,...,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,4.0,PID_333
658,,,,,,,,,,,...,,,,,0.0,0.0,0.0,0.0,4.0,PID_579
661,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,PID_266


In [11]:
final_test = pd.concat([df_dropna_modben, X_na_test])

In [13]:
final_test.to_csv("full_training_df_W_pred.csv", index=False)

In [12]:
target_variables = ["PID", "modben", "age_category", "sexcd", "bmi_category", "srdecc1", "surgcd1", "spcsuc1", "scdecc1",
                    "hemccd1", "mhpsyccd", "mhneurcd", "mhcardcd", "mhmetacd", "tx1_r", "ais1", "elbfll01", "wrextl01",
                    "elbexl01", "finfll01", "finabl01", "hipfll01", "kneexl01", "ankdol01", "gretol01", "ankpll01",
                    "elbflr01", "wrextr01", "elbexr01", "finflr01", "finabr01", "hipflr01", "kneetr01", "ankdor01",
                    "gretor01", "ankplr01", "c2ltl01", "c3ltl01", "c4ltl01", "c5ltl01", "c6ltl01", "c7ltl01", "c8ltl01",
                    "t1ltl01", "t2ltl01", "t3ltl01", "t4ltl01", "t5ltl01", "t6ltl01", "t7ltl01", "t8ltl01", "t9ltl01",
                    "t10ltl01", "t11ltl01", "t12ltl01", "l1ltl01", "l2ltl01", "l3ltl01", "l4ltl01", "l5ltl01", "s1ltl01",
                    "s2ltl01", "s3ltl01", "s45ltl01", "c2ltr01", "c3ltr01", "c4ltr01", "c5ltr01", "c6ltr01", "c7ltr01",
                    "c8ltr01", "t1ltr01", "t2ltr01", "t3ltr01", "t4ltr01", "t5ltr01", "t6ltr01", "t7ltr01", "t8ltr01",
                    "t9ltr01", "t10ltr01", "t11ltr01", "t12ltr01", "l1ltr01", "l2ltr01", "l3ltr01", "l4ltr01", "l5ltr01",
                    "s1ltr01", "s2ltr01", "s3ltr01", "s45ltr01", "c2ppl01", "c3ppl01", "c4ppl01", "c5ppl01", "c6ppl01",
                    "c7ppl01", "c8ppl01", "t1ppl01", "t2ppl01", "t3ppl01", "t4ppl01", "t5ppl01", "t6ppl01", "t7ppl01",
                    "t8ppl01", "t9ppl01", "t10ppl01", "t11ppl01", "t12ppl01", "l1ppl01", "l2ppl01", "l3ppl01", "l4ppl01",
                    "l5ppl01", "s1ppl01", "s2ppl01", "s3ppl01", "s45ppl01", "c2ppr01", "c3ppr01", "c4ppr01", "c5ppr01",
                    "c6ppr01", "c7ppr01", "c8ppr01", "t1ppr01", "t2ppr01", "t3ppr01", "t4ppr01", "t5ppr01", "t6ppr01",
                    "t7ppr01", "t8ppr01", "t9ppr01", "t10ppr01", "t11ppr01", "t12ppr01", "l1ppr01", "l2ppr01", "l3ppr01",
                    "l4ppr01", "l5ppr01", "s1ppr01", "s2ppr01", "s3ppr01", "s45ppr01"]


cat_features = ["age_category", "sexcd", "bmi_category", "srdecc1", "surgcd1", "spcsuc1", "scdecc1",
                      "hemccd1", "mhpsyccd", "mhneurcd", "mhcardcd", "mhmetacd", "tx1_r", "ais1"]

training_df = final_test[target_variables]

In [13]:
X = training_df.drop(columns=['PID', "modben"]).copy()
y = training_df["modben"]
for col in cat_features:
    X[col] = X[col].astype(str).fillna("missing")

y = y.astype(str).fillna("missing")

In [14]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Define model
model = CatBoostClassifier(
    iterations=600,
    learning_rate=0.05,
    depth=5,
    loss_function='MultiClass',  # use 'Logloss' for binary classification
    eval_metric='TotalF1',
    verbose=100,
    early_stopping_rounds=50,
    boosting_type='Ordered',
    random_seed=50,
    auto_class_weights='Balanced',

)
# Split data
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


model.fit(X_train, y_train, eval_set=(X_val, y_val), cat_features=cat_features)




0:	learn: 0.2386233	test: 0.1994414	best: 0.1994414 (0)	total: 159ms	remaining: 1m 35s
100:	learn: 0.4892254	test: 0.4155744	best: 0.4554351 (73)	total: 11.2s	remaining: 55.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.4554350637
bestIteration = 73

Shrink model to first 74 iterations.


<catboost.core.CatBoostClassifier at 0x22cd61f07a0>

In [15]:
from sklearn.metrics import f1_score

# Predict class labels for the validation set
y_pred = model.predict(X_val)

# Flatten predictions (CatBoost may return shape (n, 1) as ndarray)
y_pred = y_pred.ravel()

# Compute F1 scores
macro_f1 = f1_score(y_val, y_pred, average='macro')
weighted_f1 = f1_score(y_val, y_pred, average='weighted')

print(f"Macro F1: {macro_f1:.4f}")
print(f"Weighted F1: {weighted_f1:.4f}")


Macro F1: 0.4736
Weighted F1: 0.5678


In [16]:
# Final model training on all data
final_model = CatBoostClassifier(
    iterations=model.tree_count_,  # Use optimal iteration count
    learning_rate=0.05,
    depth=5,
    loss_function='MultiClass',
    eval_metric='TotalF1',
    boosting_type='Ordered',
    random_seed=20,
    auto_class_weights='Balanced',
    verbose=100
)
final_model.fit(X, y, cat_features=cat_features)


0:	learn: 0.2978038	total: 132ms	remaining: 9.63s
73:	learn: 0.4869692	total: 7.44s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x22cd6aaa090>

In [17]:
test_features_df = pd.read_csv("test_features.csv")
#
test_outcomes_df = pd.read_csv("test_outcomes_Fun_template_update.csv")
metadata_df = pd.read_csv("metadata.csv")

In [18]:

testing_df = test_features_df.merge(test_outcomes_df, on="PID", how="inner")
full_testing_df = testing_df.merge(metadata_df, on="PID", how="inner")

In [19]:
testing_df = full_testing_df[target_variables]

In [20]:
test = testing_df.merge(test_outcomes_df, on="PID", how="inner")
test = test.drop(columns=["time-DELETE THIS COLUMN FOR SUBMISSION"], axis=1)

In [21]:
test_final = test.drop(columns=["modben_x", "modben_y", "PID"], axis=1)

In [22]:
for col in cat_features:
    test_final[col] = test_final[col].astype(str).fillna("missing")

In [23]:
final_pred = model.predict(test_final)

In [24]:
len(final_pred)

118

In [25]:
result_df = test_final.copy()
result_df["modben"] = final_pred.flatten()

In [26]:
result_df["PID"] = test['PID']

In [27]:
submission_df = result_df[["PID", "modben"]]

In [28]:
submission_df

Unnamed: 0,PID,modben
0,PID_510,1.0
1,PID_448,1.0
2,PID_334,1.0
3,PID_581,1.0
4,PID_452,1.0
...,...,...
113,PID_147,6.0
114,PID_783,7.0
115,PID_114,7.0
116,PID_364,7.0


In [29]:
# with open(f"predicted_catboost.csv", "wb") as input_csv:
#     submission_df.to_csv(input_csv, index=False)