# Fall‑risk prediction on the gait analysis data

This notebook demonstrates how to load the combined gait dataset, preprocess it, handle class imbalance by randomly oversampling the minority class in the training set, train multiple models (Logistic Regression, Random Forest and XGBoost) **optimized for ROC AUC**, and evaluate them on a hold‑out test set.

## Key Features:
- **Hyperparameter optimization using GridSearchCV/RandomizedSearchCV with ROC AUC as the scoring metric**
- 5-fold cross-validation with ROC AUC scoring
- Comprehensive evaluation including ROC curves and confusion matrices
- Model comparison based on ROC AUC performance

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the combined gait dataset (path is relative to the working directory)
df = pd.read_csv('combined_output.csv')

# Encode the target as binary: 'F' -> 1, 'NF' -> 0
y = df['Faller'].map({'F': 1, 'NF': 0})

# Series.map yields NaN for any value that is not a key of the mapping
# (including missing labels). Fail fast here with a clear message instead of
# letting NaN labels surface later as an obscure error during the split or fit.
unmapped = y.isna()
if unmapped.any():
    raise ValueError(
        "Unexpected values in 'Faller' column (expected 'F' or 'NF'): "
        f"{df.loc[unmapped, 'Faller'].unique().tolist()}"
    )

# Features: everything except the identifier and the label. Non-numeric
# entries are coerced to NaN and then imputed with the per-column median.
# NOTE(review): the medians are computed over all rows, i.e. before the
# train/test split performed later in the notebook, so test rows influence
# the imputation values.
X = df.drop(columns=['ID', 'Faller']).apply(pd.to_numeric, errors='coerce')
X = X.fillna(X.median())
print('Dataset shape:', X.shape)
print('Class distribution:', y.value_counts())

In [None]:
# Hold out 20% of the rows as a test set; stratify so both splits keep the
# original class proportions. The seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Balance the training set only (the test set is left untouched) by randomly
# oversampling the rarer class, with replacement, up to the size of the other.
train_df = X_train.copy()
train_df['label'] = y_train
majority = train_df[train_df['label'] == 0]
minority = train_df[train_df['label'] == 1]

# Do not assume label 1 is the rarer class: if it is actually more frequent,
# swap roles so that the smaller class is the one being oversampled.
if len(minority) > len(majority):
    majority, minority = minority, majority

# resample cannot draw from an empty frame; report the real problem instead.
if minority.empty:
    raise ValueError('Training split contains only one class; cannot oversample.')

minority_over = resample(minority, replace=True, n_samples=len(majority), random_state=42)
train_bal = pd.concat([majority, minority_over])
X_train_bal = train_bal.drop(columns=['label'])
y_train_bal = train_bal['label']

print('Balanced training set shape:', X_train_bal.shape)
print('Balanced class distribution:', y_train_bal.value_counts())

In [None]:
# Logistic Regression with GridSearchCV optimizing for ROC AUC

# Standardize features: fit the scaler on the balanced training data and
# apply the same transformation to the test set.
# NOTE(review): the scaler is fit once on the whole balanced training set,
# before cross-validation, so every CV fold shares the same scaling statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_bal)
X_test_scaled = scaler.transform(X_test)

# Define parameter grid for Logistic Regression
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'],
    'max_iter': [1000]
}

# GridSearchCV with ROC AUC scoring.
# random_state is pinned because the 'saga' and 'liblinear' solvers shuffle
# the data; without it the search is not reproducible across runs. The value
# matches the seed used by the other models in this notebook.
log_reg_cv = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid_lr,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)
log_reg_cv.fit(X_train_scaled, y_train_bal)

print('Best parameters for Logistic Regression:', log_reg_cv.best_params_)
print('Best cross-validation ROC AUC score:', log_reg_cv.best_score_)

# Use best model (refit on the full training data by GridSearchCV) for predictions
log_reg = log_reg_cv.best_estimator_
y_pred_lr = log_reg.predict(X_test_scaled)
# Column 1 of predict_proba is the probability of the positive class (label 1)
y_prob_lr = log_reg.predict_proba(X_test_scaled)[:,1]

# Threshold-based metrics use the hard predictions; ROC AUC uses probabilities
acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr, zero_division=0)
rec_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)
auc_lr = roc_auc_score(y_test, y_prob_lr)

print('\nLogistic Regression Test Set Results:')
print('Accuracy:', acc_lr)
print('Precision:', prec_lr)
print('Recall:', rec_lr)
print('F1:', f1_lr)
print('ROC AUC:', auc_lr)

# Confusion matrix (rows: true class, columns: predicted class)
cm_lr = confusion_matrix(y_test, y_pred_lr)
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', xticklabels=['Predicted NF','Predicted F'], yticklabels=['True NF','True F'])
plt.title('Logistic Regression Confusion Matrix')
plt.show()

In [None]:
# Random Forest: randomized hyperparameter search scored by ROC AUC.
# Candidate values sampled by the search:
param_dist_rf = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[None, 10, 20, 30, 40],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    bootstrap=[True, False],
    class_weight=['balanced', 'balanced_subsample', None],
)

# 50 sampled configurations, each evaluated with 5-fold CV; seeds are fixed
# for both the forest and the sampler.
rf_cv = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
rf_cv.fit(X_train_bal, y_train_bal)

print('Best parameters for Random Forest:', rf_cv.best_params_)
print('Best cross-validation ROC AUC score:', rf_cv.best_score_)

# Score the held-out test set with the best configuration found
rf = rf_cv.best_estimator_
y_pred_rf = rf.predict(X_test)
y_prob_rf = rf.predict_proba(X_test)[:, 1]  # probability of class 1

acc_rf = accuracy_score(y_test, y_pred_rf)
prec_rf = precision_score(y_test, y_pred_rf, zero_division=0)
rec_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)
auc_rf = roc_auc_score(y_test, y_prob_rf)

print('\nRandom Forest Test Set Results:')
for metric_label, metric_value in [
    ('Accuracy:', acc_rf),
    ('Precision:', prec_rf),
    ('Recall:', rec_rf),
    ('F1:', f1_rf),
    ('ROC AUC:', auc_rf),
]:
    print(metric_label, metric_value)

# Confusion matrix heatmap (rows: true class, columns: predicted class)
cm_rf = confusion_matrix(y_test, y_pred_rf)
rf_cm_ax = sns.heatmap(
    cm_rf,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted NF', 'Predicted F'],
    yticklabels=['True NF', 'True F'],
)
rf_cm_ax.set_title('Random Forest Confusion Matrix')
plt.show()

In [None]:
# XGBoost: randomized hyperparameter search scored by ROC AUC.
# Candidate values sampled by the search:
param_dist_xgb = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 4, 5, 6, 7],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.5, 1],
    min_child_weight=[1, 3, 5],
    scale_pos_weight=[1, 2, 3],  # extra weight on the positive class
)

# 50 sampled configurations, each evaluated with 5-fold CV; seeds are fixed
# for both the booster and the sampler.
# NOTE(review): use_label_encoder is reported as deprecated in recent XGBoost
# releases — confirm against the installed version before removing it.
xgb_cv = RandomizedSearchCV(
    estimator=XGBClassifier(random_state=42, eval_metric='auc', use_label_encoder=False),
    param_distributions=param_dist_xgb,
    n_iter=50,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
    random_state=42,
)
xgb_cv.fit(X_train_bal, y_train_bal)

print('Best parameters for XGBoost:', xgb_cv.best_params_)
print('Best cross-validation ROC AUC score:', xgb_cv.best_score_)

# Score the held-out test set with the best configuration found
xgb_model = xgb_cv.best_estimator_
y_pred_xgb = xgb_model.predict(X_test)
y_prob_xgb = xgb_model.predict_proba(X_test)[:, 1]  # probability of class 1

acc_xgb = accuracy_score(y_test, y_pred_xgb)
prec_xgb = precision_score(y_test, y_pred_xgb, zero_division=0)
rec_xgb = recall_score(y_test, y_pred_xgb)
f1_xgb = f1_score(y_test, y_pred_xgb)
auc_xgb = roc_auc_score(y_test, y_prob_xgb)

print('\nXGBoost Test Set Results:')
for metric_label, metric_value in [
    ('Accuracy:', acc_xgb),
    ('Precision:', prec_xgb),
    ('Recall:', rec_xgb),
    ('F1:', f1_xgb),
    ('ROC AUC:', auc_xgb),
]:
    print(metric_label, metric_value)

# Confusion matrix heatmap (rows: true class, columns: predicted class)
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
xgb_cm_ax = sns.heatmap(
    cm_xgb,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted NF', 'Predicted F'],
    yticklabels=['True NF', 'True F'],
)
xgb_cm_ax.set_title('XGBoost Confusion Matrix')
plt.show()

In [None]:
# Collect cross-validation and test-set metrics for every model into one
# table, ranked from best to worst test ROC AUC.
summary_columns = [
    'Model',
    'CV ROC AUC',
    'Test ROC AUC',
    'Test Accuracy',
    'Test Precision',
    'Test Recall',
    'Test F1',
]
summary_rows = [
    ('Logistic Regression', log_reg_cv.best_score_, auc_lr, acc_lr, prec_lr, rec_lr, f1_lr),
    ('Random Forest', rf_cv.best_score_, auc_rf, acc_rf, prec_rf, rec_rf, f1_rf),
    ('XGBoost', xgb_cv.best_score_, auc_xgb, acc_xgb, prec_xgb, rec_xgb, f1_xgb),
]
results = (
    pd.DataFrame(summary_rows, columns=summary_columns)
    .sort_values('Test ROC AUC', ascending=False)
    .reset_index(drop=True)
)
print('\n=== Model Performance Summary (Optimized for ROC AUC) ===')
# Bare last expression so the notebook renders the table with rich display
results

In [None]:
# Overlay the test-set ROC curves of the three models on a single figure.

# ROC points per model (the thresholds returned by roc_curve are not needed)
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_prob_xgb)

roc_fig, roc_ax = plt.subplots(figsize=(10, 8))

# One line per model, labelled with its test ROC AUC
roc_series = [
    (fpr_lr, tpr_lr, f'Logistic Regression (AUC = {auc_lr:.3f})'),
    (fpr_rf, tpr_rf, f'Random Forest (AUC = {auc_rf:.3f})'),
    (fpr_xgb, tpr_xgb, f'XGBoost (AUC = {auc_xgb:.3f})'),
]
for false_pos_rate, true_pos_rate, legend_text in roc_series:
    roc_ax.plot(false_pos_rate, true_pos_rate, label=legend_text, linewidth=2)

# Chance-level diagonal for reference
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.500)', linewidth=1)

roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate', fontsize=12)
roc_ax.set_ylabel('True Positive Rate', fontsize=12)
roc_ax.set_title('ROC Curves - Model Comparison', fontsize=14)
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(True, alpha=0.3)
plt.show()

# Text recap of the same AUC values at higher precision
print('\nROC AUC Summary:')
for summary_line in (
    f'Logistic Regression: {auc_lr:.4f}',
    f'Random Forest: {auc_rf:.4f}',
    f'XGBoost: {auc_xgb:.4f}',
):
    print(summary_line)