# Advanced Thyroid Cancer Prediction Model Training

This notebook implements an advanced machine learning pipeline for thyroid cancer prediction using multiple models and advanced feature engineering.

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report
from sklearn.ensemble import VotingClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import optuna
import joblib
from feature_engineering import create_features

## Data Loading and Preparation

In [None]:
print("Loading and preparing data...")

# Load the dataset
data = pd.read_csv('thyroid_cancer_risk_data.csv')

# Apply feature engineering
data = create_features(data)

# Split features and target ('Cancer' is the label column)
X = data.drop('Cancer', axis=1)
y = data['Cancer']

# Split into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Work on explicit copies: columns are assigned into both frames below, and
# doing that on frames derived from X can trigger pandas'
# SettingWithCopyWarning on some pandas versions.
X_train = X_train.copy()
X_test = X_test.copy()

# Initialize preprocessors
scaler = StandardScaler()
# `le` stays bound for the later save step; it remains unfitted only when the
# data has no categorical columns.
le = LabelEncoder()

# Get list of categorical and numerical columns (taken from X before any
# encoding, so encoded categorical columns are not scaled below)
categorical_cols = X.select_dtypes(include=['object', 'category']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Encode categorical variables with ONE ENCODER PER COLUMN.
# Re-fitting a single shared encoder for every column would leave only the
# last column's mapping available afterwards, which is not enough to encode
# new data at inference time.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    # NOTE: transform() raises ValueError if the test split contains a
    # category that never appeared in the training split.
    X_test[col] = le.transform(X_test[col])
    label_encoders[col] = le

# Persist the full column -> encoder mapping right away. The save step at the
# end of the notebook only writes `le`, i.e. the encoder of the last
# categorical column processed above.
joblib.dump(label_encoders, 'label_encoders.joblib')

# Scale numerical features (statistics are learned from the training split only)
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

## Feature Selection

In [None]:
print("\nPerforming feature selection...")

# Fit a default-parameter XGBoost classifier; it is used only to rank features.
feature_selector = xgb.XGBClassifier(random_state=42)
feature_selector.fit(X_train, y_train)

# One row per training column with its importance score, highest score first.
feature_importance = pd.DataFrame(
    {
        'feature': X_train.columns,
        'importance': feature_selector.feature_importances_,
    }
).sort_values('importance', ascending=False)

# Keep the 30 highest-ranked features (all of them if there are fewer than 30).
selected_features = feature_importance['feature'].head(30).tolist()

# Restrict both splits to the selected columns.
X_train, X_test = X_train[selected_features], X_test[selected_features]

## XGBoost Model Optimization

In [None]:
print("\nOptimizing XGBoost hyperparameters...")

def objective(trial):
    """Optuna objective: mean cross-validated ROC AUC of one XGBoost configuration.

    The score is computed with stratified 3-fold cross-validation on the
    training split only. The held-out test split is deliberately NOT used
    here: tuning against it would leak test information into the chosen
    hyperparameters and make the later test-set evaluation optimistic.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The mean ROC AUC across the cross-validation folds (higher is better).
    """
    # Local import: the notebook's import cell does not bring these names in.
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    model = xgb.XGBClassifier(**params, random_state=42)

    # Fixed, shuffled folds so every trial is scored on the same partitions.
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    fold_aucs = cross_val_score(model, X_train, y_train, cv=folds, scoring='roc_auc')
    return fold_aucs.mean()

# Seed the sampler so the hyperparameter search is reproducible, in line with
# the random_state=42 used for every estimator in this notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=35)

# Train XGBoost with best parameters found by the study
xgb_model = xgb.XGBClassifier(**study.best_params, random_state=42)
xgb_model.fit(X_train, y_train)

## LightGBM Model

In [None]:
# Fit a LightGBM classifier (only the seed is set explicitly) on the training split.
lgb_model = lgb.LGBMClassifier(random_state=42).fit(X_train, y_train)

## CatBoost Model

In [None]:
# Fit a CatBoost classifier on the training split; verbose=0 is passed to
# turn off its training log.
cb_model = cb.CatBoostClassifier(
    random_state=42,
    verbose=0,
)
cb_model.fit(X_train, y=y_train)

## Voting Ensemble

In [None]:
# Combine the three boosted models in a soft-voting ensemble and fit it on
# the training split.
voting_model = VotingClassifier(
    estimators=[('xgb', xgb_model), ('lgb', lgb_model), ('cb', cb_model)],
    voting='soft',
).fit(X_train, y_train)

## Model Evaluation

In [None]:
print("\nEvaluating models...")

def evaluate_model(model, name):
    """Print test-set metrics for a fitted model and return its ROC AUC.

    Uses the module-level ``X_test`` / ``y_test`` split. Probabilities are
    taken from column index 1 of ``predict_proba`` (presumably the positive
    class).

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        name: Label printed in the report header.

    Returns:
        The ROC AUC of the model on the test split.
    """
    class1_proba = model.predict_proba(X_test)[:, 1]
    hard_labels = model.predict(X_test)

    print(f"\n{name} Results:")

    # Computed once and reused for both the printout and the return value.
    roc_auc = roc_auc_score(y_test, class1_proba)
    print(f"ROC AUC: {roc_auc:.4f}")

    avg_precision = average_precision_score(y_test, class1_proba)
    print(f"Average Precision: {avg_precision:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, hard_labels))

    return roc_auc

# Score every model in a fixed order. Each entry is (result key, fitted
# model, label printed in the report); the mapping stores the returned ROC AUC.
scores = {
    key: evaluate_model(fitted, label)
    for key, fitted, label in (
        ('XGBoost', xgb_model, 'XGBoost'),
        ('LightGBM', lgb_model, 'LightGBM'),
        ('CatBoost', cb_model, 'CatBoost'),
        ('Voting', voting_model, 'Voting Ensemble'),
    )
}

## Save Best Model and Preprocessors

In [None]:
print("\nSaving best model and preprocessors...")

# The winner is the entry with the highest ROC AUC in `scores`; on a tie the
# first entry in insertion order wins.
best_model_name = max(scores, key=scores.get)
best_model = dict(
    XGBoost=xgb_model,
    LightGBM=lgb_model,
    CatBoost=cb_model,
    Voting=voting_model,
)[best_model_name]

# Persist the winning model together with everything needed to rebuild its inputs.
joblib.dump(value=best_model, filename='best_thyroid_model.joblib')
joblib.dump(value=scaler, filename='scaler.joblib')
# NOTE(review): `le` ends up holding only the encoder of the last categorical
# column processed during data preparation - confirm this is sufficient for
# whatever consumes 'label_encoder.joblib'.
joblib.dump(value=le, filename='label_encoder.joblib')
joblib.dump(value=selected_features, filename='selected_features.joblib')
# Full pre-selection column list of X, as opposed to `selected_features`.
joblib.dump(value=X.columns.tolist(), filename='feature_names.joblib')

print(f"\nBest model saved with ROC AUC: {scores[best_model_name]:.4f}")
print("Training complete!")