# Thyroid Cancer Prediction - Advanced Machine Learning

## Sections:
1. Data Preprocessing and Feature Engineering
2. Model Development with Optuna Optimization (Hyperparameter Tuning)
3. Ensemble Methods
4. Model Evaluation
5. Feature Importance Analysis
6. Prediction Analysis

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, PolynomialFeatures
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import optuna
import shap
import warnings
# Silence all warnings for the rest of the notebook (this also hides
# deprecation warnings raised by the libraries imported above).
warnings.filterwarnings('ignore')

# Set random seed
# Seeds NumPy's global random generator; the estimators below additionally
# receive their own explicit random_state values.
np.random.seed(42)

## 1. Data Preprocessing and Feature Engineering

In [None]:
# Load data
df = pd.read_csv('thyroid_cancer_risk_data.csv')

# Define features (raw columns read from the CSV)
categorical_vars = ['Gender', 'Country', 'Ethnicity', 'Family_History', 
                   'Radiation_Exposure', 'Iodine_Deficiency', 'Smoking', 
                   'Obesity', 'Diabetes']
numerical_vars = ['Age', 'TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size']

# Feature Engineering
# 1. Create risk score: weighted count of the 'Yes' risk-factor flags
#    (family history x2, radiation / iodine deficiency x1.5, the rest x1).
df['Risk_Score'] = (
    (df['Family_History'] == 'Yes').astype(int) * 2 +
    (df['Radiation_Exposure'] == 'Yes').astype(int) * 1.5 +
    (df['Iodine_Deficiency'] == 'Yes').astype(int) * 1.5 +
    (df['Smoking'] == 'Yes').astype(int) +
    (df['Obesity'] == 'Yes').astype(int) +
    (df['Diabetes'] == 'Yes').astype(int)
)

# 2. Create hormone ratios
df['TSH_T3_Ratio'] = df['TSH_Level'] / df['T3_Level']
df['TSH_T4_Ratio'] = df['TSH_Level'] / df['T4_Level']
df['T3_T4_Ratio'] = df['T3_Level'] / df['T4_Level']

# 3. Create age groups (right-closed bins: (0, 30], (30, 45], ... (75, 100])
df['Age_Group'] = pd.cut(df['Age'], bins=[0, 30, 45, 60, 75, 100], 
                        labels=['<30', '30-45', '45-60', '60-75', '>75'])

# 4. Create interaction features
df['Age_TSH'] = df['Age'] * df['TSH_Level']
df['Nodule_TSH'] = df['Nodule_Size'] * df['TSH_Level']

# Prepare features for modeling
# Add engineered features to the list
numerical_vars += ['Risk_Score', 'TSH_T3_Ratio', 'TSH_T4_Ratio', 'T3_T4_Ratio', 
                   'Age_TSH', 'Nodule_TSH']
categorical_vars += ['Age_Group']

# Prepare X and y (target is 1 for 'Malignant', 0 otherwise)
X = df[categorical_vars + numerical_vars].copy()
y = (df['Diagnosis'] == 'Malignant').astype(int)

# Encode categorical variables
# Fit a separate encoder per column and keep every one of them: refitting a
# single shared LabelEncoder would discard the mapping of all but the last
# column, making the encoding impossible to reproduce at inference time.
# `le` is still bound (to the last column's encoder) for code that uses it.
label_encoders = {}
for col in categorical_vars:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Split data (stratified so both splits keep the class balance of y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train[numerical_vars] = scaler.fit_transform(X_train[numerical_vars])
X_test[numerical_vars] = scaler.transform(X_test[numerical_vars])

## 2. Model Development with Optuna Optimization

In [None]:
def objective_xgb(trial):
    """Optuna objective for XGBoost: mean 5-fold cross-validated ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the 5 cross-validation folds of the module-level
        ``X_train`` / ``y_train``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float replaces the deprecated suggest_loguniform /
        # suggest_uniform; log=True samples the learning rate on a log scale.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
        'random_state': 42
    }
    
    model = XGBClassifier(**param)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return scores.mean()

# Optimize XGBoost
# Seed the sampler explicitly so repeated runs explore the same trials.
study_xgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective_xgb, n_trials=50)

# Get best XGBoost model
# best_params only contains the values suggested through the trial, so the
# fixed random_state used inside the objective has to be passed again here.
best_xgb = XGBClassifier(**study_xgb.best_params, random_state=42)
best_xgb.fit(X_train, y_train)

def objective_lgb(trial):
    """Optuna objective for LightGBM: mean 5-fold cross-validated ROC AUC.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean ROC AUC over the 5 cross-validation folds of the module-level
        ``X_train`` / ``y_train``.
    """
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        # suggest_float(log=True) replaces the deprecated suggest_loguniform.
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'random_state': 42
    }
    
    model = LGBMClassifier(**param)
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='roc_auc')
    return scores.mean()

# Optimize LightGBM
# Seed the sampler explicitly so repeated runs explore the same trials.
study_lgb = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_lgb.optimize(objective_lgb, n_trials=50)

# Get best LightGBM model
# best_params only contains the values suggested through the trial, so the
# fixed random_state used inside the objective has to be passed again here.
best_lgb = LGBMClassifier(**study_lgb.best_params, random_state=42)
best_lgb.fit(X_train, y_train)

# CatBoost is trained with a fixed, hand-picked configuration (it is not
# part of the Optuna search above).
catboost_params = {
    'iterations': 500,
    'learning_rate': 0.05,
    'depth': 6,
    'random_state': 42,
    'verbose': False,
}
catboost = CatBoostClassifier(**catboost_params)
catboost.fit(X_train, y_train)

## 3. Ensemble Methods

In [None]:
# Base learners shared by both ensembles, as (name, estimator) pairs
estimators = [
    ('xgb', best_xgb),
    ('lgb', best_lgb),
    ('catboost', catboost),
]

# Soft voting: combines the predicted class probabilities of the base learners
voting_clf = VotingClassifier(estimators=list(estimators), voting='soft')

# Stacking: a logistic regression is trained on the base learners'
# cross-validated (5-fold) predictions
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
)

# Fit ensemble models (voting first, then stacking)
for ensemble in (voting_clf, stacking_clf):
    ensemble.fit(X_train, y_train)

## 4. Model Evaluation

In [None]:
# Fitted estimators to compare, keyed by the label used in plots and reports
model_labels = ['XGBoost', 'LightGBM', 'CatBoost', 'Voting', 'Stacking']
fitted_models = [best_xgb, best_lgb, catboost, voting_clf, stacking_clf]
models = dict(zip(model_labels, fitted_models))

# Score every model on the held-out test split and draw its ROC curve
results = {}
plt.figure(figsize=(10, 8))

for name in models:
    model = models[name]

    # Hard labels and positive-class probabilities on the test split
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    # ROC curve points and the area under them
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    # Keep everything needed for the reports below
    results[name] = dict(
        predictions=y_pred,
        probabilities=y_pred_proba,
        classification_report=classification_report(y_test, y_pred),
        confusion_matrix=confusion_matrix(y_test, y_pred),
        roc_auc=roc_auc,
    )

    plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.3f})')

# Diagonal reference line (true positive rate equal to false positive rate)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Different Models')
plt.legend()
plt.show()

# Text report per model
separator = "-" * 50
for name, result in results.items():
    print(f"\n{name} Results:")
    print(separator)
    print(f"ROC AUC: {result['roc_auc']:.4f}")
    print("\nClassification Report:")
    print(result['classification_report'])

## 5. Feature Importance Analysis

In [None]:
# SHAP values for feature importance (computed for the tuned XGBoost model
# on the test split)
explainer = shap.TreeExplainer(best_xgb)
shap_values = explainer.shap_values(X_test)

# Plot SHAP summary
# show=False keeps the figure open so the title below lands on the SHAP plot
# (by default summary_plot calls plt.show() itself, and the title would end
# up on a new, empty figure). plot_size=None keeps the figsize set here.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False, plot_size=None)
plt.title('Feature Importance (SHAP values)')
plt.show()

# Plot detailed SHAP values
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test, show=False, plot_size=None)
plt.title('SHAP Value Distribution')
plt.show()

## 6. Prediction Analysis

In [None]:
# Pick the model with the highest test ROC AUC (first one wins on ties)
best_model_name = max(results, key=lambda model_name: results[model_name]['roc_auc'])
best_model = models[best_model_name]

# Create prediction function
# Raw fields a patient record must provide (the engineered features are
# derived from these inside the prediction function).
_RAW_PATIENT_FIELDS = (
    'Age', 'Gender', 'Country', 'Ethnicity', 'Family_History',
    'Radiation_Exposure', 'Iodine_Deficiency', 'Smoking', 'Obesity',
    'Diabetes', 'TSH_Level', 'T3_Level', 'T4_Level', 'Nodule_Size',
)

# Cache of per-column encoders fitted on the training data frame
_patient_encoders = {}


def _add_engineered_features(frame):
    """Return a copy of ``frame`` with the engineered columns used in training."""
    out = frame.copy()
    out['Risk_Score'] = (
        (out['Family_History'] == 'Yes').astype(int) * 2 +
        (out['Radiation_Exposure'] == 'Yes').astype(int) * 1.5 +
        (out['Iodine_Deficiency'] == 'Yes').astype(int) * 1.5 +
        (out['Smoking'] == 'Yes').astype(int) +
        (out['Obesity'] == 'Yes').astype(int) +
        (out['Diabetes'] == 'Yes').astype(int)
    )
    out['TSH_T3_Ratio'] = out['TSH_Level'] / out['T3_Level']
    out['TSH_T4_Ratio'] = out['TSH_Level'] / out['T4_Level']
    out['T3_T4_Ratio'] = out['T3_Level'] / out['T4_Level']
    out['Age_Group'] = pd.cut(out['Age'], bins=[0, 30, 45, 60, 75, 100],
                              labels=['<30', '30-45', '45-60', '60-75', '>75'])
    out['Age_TSH'] = out['Age'] * out['TSH_Level']
    out['Nodule_TSH'] = out['Nodule_Size'] * out['TSH_Level']
    return out


def _training_encoder(col):
    """Return a LabelEncoder fitted on training column ``col`` (cached)."""
    if col not in _patient_encoders:
        # Fitting on the full training frame reproduces the integer codes
        # the models were trained on (LabelEncoder sorts the unique labels).
        _patient_encoders[col] = LabelEncoder().fit(df[col])
    return _patient_encoders[col]


def predict_thyroid_cancer(patient_data):
    """Predict the malignancy risk for a single patient.

    Args:
        patient_data: Mapping with the raw patient fields listed in
            ``_RAW_PATIENT_FIELDS`` (same names and value conventions as
            the training CSV columns).

    Returns:
        Dict with ``'prediction'`` ('Malignant' when the probability is
        above 0.5, otherwise 'Benign'), ``'probability'`` (float) and
        ``'risk_level'`` ('High' above 0.7, 'Medium' above 0.3, else 'Low').

    Raises:
        KeyError: If a required raw field is missing.
        ValueError: If the age falls outside the (0, 100] bins or a
            categorical value was not present in the training data.
    """
    missing = [field for field in _RAW_PATIENT_FIELDS if field not in patient_data]
    if missing:
        raise KeyError(f"Missing patient fields: {missing}")

    # Derive the same engineered columns the models were trained on
    patient_df = _add_engineered_features(pd.DataFrame([patient_data]))
    if patient_df['Age_Group'].isna().any():
        raise ValueError("Age must be greater than 0 and at most 100")

    # Use the training column order: categorical first, then numerical
    model_input = patient_df[categorical_vars + numerical_vars].copy()

    # Encode with the training-time mappings. Refitting an encoder on this
    # single row would map every category to 0.
    for col in categorical_vars:
        encoder = _training_encoder(col)
        values = model_input[col].astype(object)
        try:
            model_input[col] = encoder.transform(values)
        except ValueError as err:
            raise ValueError(
                f"Unknown value {values.iloc[0]!r} for '{col}'; "
                f"expected one of {list(encoder.classes_)}"
            ) from err

    # Scale with the scaler fitted on the training split
    scaled = scaler.transform(model_input[numerical_vars].astype(float))
    for col, column_values in zip(numerical_vars, scaled.T):
        model_input[col] = column_values

    # Make prediction
    prob = float(best_model.predict_proba(model_input)[0, 1])
    prediction = 'Malignant' if prob > 0.5 else 'Benign'

    if prob > 0.7:
        risk_level = 'High'
    elif prob > 0.3:
        risk_level = 'Medium'
    else:
        risk_level = 'Low'

    return {
        'prediction': prediction,
        'probability': prob,
        'risk_level': risk_level
    }

# Save best model and preprocessing objects
import joblib
joblib.dump(best_model, 'best_thyroid_model.joblib')
joblib.dump(scaler, 'scaler.joblib')
# Kept for backward compatibility with existing consumers of this file.
joblib.dump(le, 'label_encoder.joblib')
# A single LabelEncoder cannot encode all categorical columns, so also
# persist one encoder per column, fitted on the raw training frame.
joblib.dump(
    {col: LabelEncoder().fit(df[col]) for col in categorical_vars},
    'label_encoders.joblib',
)

print(f"Best Model: {best_model_name}")
print(f"ROC AUC Score: {results[best_model_name]['roc_auc']:.4f}")

# Example prediction: raw fields only, named like the training CSV columns
example_patient = {
    'Age': 45,
    'Gender': 'Female',
    'Country': 'USA',
    'Ethnicity': 'Caucasian',
    'Family_History': 'No',
    'Radiation_Exposure': 'No',
    'Iodine_Deficiency': 'No',
    'Smoking': 'No',
    'Obesity': 'No',
    'Diabetes': 'No',
    'TSH_Level': 2.5,
    'T3_Level': 1.8,
    'T4_Level': 8.0,
    'Nodule_Size': 1.5
}

prediction_result = predict_thyroid_cancer(example_patient)
print("\nExample Prediction:")
print(prediction_result)