In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score,
    f1_score, roc_auc_score, classification_report, confusion_matrix
)

In [2]:
# Labelled training workbook and the unlabelled workbook to be scored.
# Both paths are relative to the notebook's working directory.
TRAIN_XLSX = "recruiting_zeta-disease_training-data_take-home-challenge.xlsx"
PREDICT_XLSX = "recruiting_zeta-disease_prediction-data_take-home-challenge.xlsx"

train_df = pd.read_excel(TRAIN_XLSX)
test_df = pd.read_excel(PREDICT_XLSX)

In [10]:
# Raw input columns used as model features. Later cells extend this list
# in place, so it must remain a mutable list.
features = [
    'age',
    'weight',
    'bmi',
    'blood_pressure',
    'insulin_test',
    'liver_stress_test',
    'cardio_stress_test',
    'years_smoking',
]

In [11]:
# Clip every feature to the 1st-99th percentile range of the training data.
# The bounds are computed from train_df only and then re-used for test_df,
# so the prediction set never influences the clipping thresholds.
for col in features:
    train_col = train_df[col]
    lower, upper = train_col.quantile(0.01), train_col.quantile(0.99)
    train_df[col] = train_col.clip(lower=lower, upper=upper)
    test_df[col] = test_df[col].clip(lower=lower, upper=upper)

In [12]:
ENGINEERED_FEATURES = ['age_smoke', 'bmi_age_ratio', 'total_stress']


def _add_engineered_features(frame):
    """Add the derived feature columns to ``frame`` in place.

    Columns written:
      * ``age_smoke``      - ``age * years_smoking``
      * ``bmi_age_ratio``  - ``bmi / (age + 1)``
      * ``total_stress``   - ``liver_stress_test + cardio_stress_test``

    Re-running on the same frame simply overwrites the three columns with
    the same values, so the operation is safe to repeat.
    """
    frame['age_smoke'] = frame['age'] * frame['years_smoking']
    # The +1 presumably guards against a zero denominator when age == 0.
    frame['bmi_age_ratio'] = frame['bmi'] / (frame['age'] + 1)
    frame['total_stress'] = frame['liver_stress_test'] + frame['cardio_stress_test']


# Apply the identical transformation to both frames so their columns stay
# in sync (previously each formula was written out twice).
for _frame in (train_df, test_df):
    _add_engineered_features(_frame)

# Register the new columns exactly once. A bare `features += [...]` appends
# duplicates every time this cell is re-run, which would make
# `train_df[features]` return duplicated columns.
features += [name for name in ENGINEERED_FEATURES if name not in features]

In [13]:
# Target: the zeta_disease label cast to integer.
target_column = train_df['zeta_disease']
y = target_column.astype(int)

# Feature matrices for the labelled data and for the rows to be scored;
# both use the same column list so they line up column-for-column.
X = train_df[features]
X_test = test_df[features]

In [14]:
# Standardise the features: learn the scaling from the labelled matrix and
# apply the same fitted scaler to the prediction matrix.
# NOTE(review): the scaler is fitted on every row of X, i.e. before any
# train/validation split, so rows later used for validation contribute to
# the scaling statistics - confirm this is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_test_scaled = scaler.transform(X_test)

In [15]:
# Hold out 20% of the labelled rows for validation, stratified on the label.
# The split is done on the *unscaled* features and the scaler is then
# fitted on the training portion only, so the held-out rows cannot leak
# into the preprocessing statistics. The same random_state, test_size and
# stratify arguments are used as before, so the split itself is unchanged
# in configuration; only the scaling statistics differ.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit here so that every downstream matrix (train, validation and the
# prediction set) is scaled with statistics learned from X_train_raw alone.
# This supersedes any scaler previously fitted on the full matrix.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Hyper-parameter candidates for the random forest: 2 x 2 x 2 x 2 = 16
# combinations in total.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [17]:
# Base estimator: class-weighted random forest with a fixed seed.
rf = RandomForestClassifier(random_state=42, class_weight='balanced')

# Exhaustive search over param_grid, ranked by recall with 5-fold
# cross-validation on the training split; n_jobs=-1 uses all CPU cores.
grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='recall',
    cv=5,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(class_weight='balanced',
                                              random_state=42),
             n_jobs=-1,
             param_grid={'max_depth': [10, None], 'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 200]},
             scoring='recall')

In [23]:
# Take the best estimator found by the grid search and score the
# validation split with it.
best_model = grid.best_estimator_

# predict_proba returns one column per class; column 1 is kept as the
# score for the second class.
val_class_probabilities = best_model.predict_proba(X_val)
y_proba = val_class_probabilities[:, 1]
y_pred = best_model.predict(X_val)

In [24]:
# Report the selected hyper-parameters followed by validation metrics.
print("Best Parameters:", grid.best_params_)

# Metrics that compare hard label predictions against y_val.
for label, metric in (
    ("Accuracy:", accuracy_score),
    ("Recall:", recall_score),
    ("Precision:", precision_score),
    ("F1 Score:", f1_score),
):
    print(label, metric(y_val, y_pred))

# ROC AUC is computed from the predicted probabilities, not the labels.
print("ROC AUC Score:", roc_auc_score(y_val, y_proba))

print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))

Best Parameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.7625
Recall: 0.6428571428571429
Precision: 0.6666666666666666
F1 Score: 0.6545454545454545
ROC AUC Score: 0.8246909340659341
Confusion Matrix:
 [[86 18]
 [20 36]]
Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.83      0.82       104
           1       0.67      0.64      0.65        56

    accuracy                           0.76       160
   macro avg       0.74      0.73      0.74       160
weighted avg       0.76      0.76      0.76       160



In [None]:
# Score the scaled prediction set with the best estimator from the search.
# NOTE(review): this rebinds y_pred / y_proba, which earlier held the
# validation predictions; re-running the metrics cell after this one would
# compare y_val against prediction-set outputs - consider distinct names.
best_model = grid.best_estimator_

test_class_probabilities = best_model.predict_proba(X_test_scaled)
y_pred = best_model.predict(X_test_scaled)
y_proba = test_class_probabilities[:, 1]