# Environment

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

# Load data

In [None]:
# Load the training CSV; the path is relative to the current working directory.
df = pd.read_csv('../dados/diabetes_train.csv')

In [None]:
# Load the column metadata table; later cells read its 'column' and 'type' fields
# to decide how each feature is preprocessed.
df_cols = pd.read_csv('../dados/diabetes_columns.csv')

In [None]:
# Display the column metadata for visual inspection.
df_cols

In [None]:
# Display the training frame for visual inspection.
df

# Feature engineering

In [None]:
# Name of the label column: it is removed from the feature set and used as y.
TARGET_COL = 'Diabetes'

In [None]:
# Keep only the metadata rows that describe features: the row for the target
# column must not end up in any feature group.
is_feature = df_cols['column'].ne(TARGET_COL)
df_cols = df_cols.loc[is_feature]

In [None]:
# 2. Group the feature names by the type declared in the column metadata
def _feats_of(kind):
    """Return the names of the columns whose declared 'type' equals *kind*."""
    selected = df_cols[df_cols['type'] == kind]
    return list(selected['column'].values)


numeric_feats = _feats_of('Float')
nominal_feats = _feats_of('Categorical')  # unordered categories
ordinal_feats = _feats_of('Ordinal')      # ordered categories
boolean_feats = _feats_of('Boolean')      # two-level flags

In [None]:
# 3. Define Transformers

# A. Nominal (unordered) features: one-hot encode. Categories that were not
# seen during fit are encoded as an all-zero row instead of raising.
nominal_transformer = OneHotEncoder(handle_unknown='ignore')

# B. Ordinal (ordered) features: integer-encode.
# NOTE: categories='auto' derives the order by sorting the unique values seen
# during fit. That matches the real ranking only for numeric or naturally
# sortable codes; for string levels, pass explicit per-column lists in
# ascending order instead.
# Unseen categories are mapped to -1 so that a rare level missing from a
# training fold does not abort cross-validation or inference (consistent with
# the one-hot encoder above, which also tolerates unknown categories).
ordinal_transformer = OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# C. Boolean features: integer-encode the two levels in sorted order
# (e.g. if the values are 'No'/'Yes' they become 0/1). Unknown values are
# mapped to -1 for the same reason as the ordinal features.
boolean_transformer = OrdinalEncoder(
    categories='auto',
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

# D. Numeric features: standardize to zero mean and unit variance
numeric_transformer = StandardScaler()

# 4. Create the ColumnTransformer
# Each entry is (name, transformer, columns). Columns not listed in any group
# are dropped, which is ColumnTransformer's default remainder behaviour.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_feats),
        ('nom', nominal_transformer, nominal_feats),
        ('ord', ordinal_transformer, ordinal_feats),
        ('bool', boolean_transformer, boolean_feats),
    ])

In [None]:
# 5. Assemble the end-to-end model: preprocessing followed by the classifier.
# Fitting the pipeline fits both steps, so the transformations are learned
# only from the data passed to fit().
rf_classifier = RandomForestClassifier(random_state=42)
model_pipeline = Pipeline(
    steps=[('preprocessor', preprocessor), ('classifier', rf_classifier)]
)

# Separate the label from the feature columns
y = df[TARGET_COL]
X = df.drop(columns=[TARGET_COL])

In [None]:
# Sanity check: dimensions of the feature frame and of the label vector.
X.shape, y.shape

# Train and test

In [None]:
# Hold out 20% of the rows for testing. stratify=y keeps the class ratio of
# the target identical in both parts, so the hold-out metrics are not skewed
# by an unlucky draw of a rare class (the backtest further down stratifies
# its folds for the same reason).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the pipeline: the preprocessor and the classifier are both learned from
# the training part only.
model_pipeline.fit(X_train, y_train)

# Predict on the rows the model has not seen
y_pred = model_pipeline.predict(X_test)

# Per-class precision / recall / F1 on the hold-out set
print("### Hold-out Test Evaluation ###")
print(classification_report(y_test, y_pred))

# Hyperparameter Optimization

In [None]:
import os

# Point joblib at a dedicated scratch directory (created on demand) by setting
# the JOBLIB_TEMP_FOLDER environment variable before any parallel work starts.
_joblib_tmp = './joblib_temp'
os.environ['JOBLIB_TEMP_FOLDER'] = _joblib_tmp
os.makedirs(_joblib_tmp, exist_ok=True)

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np
import joblib

# 1. Define the hyperparameter search space.
# The 'classifier__' prefix routes each parameter to the Pipeline step that
# was named 'classifier'.
param_dist = {
    'classifier__n_estimators': [100, 300, 500],
    'classifier__max_depth': [None, 10, 30],
    'classifier__min_samples_split': [2, 10, 20],
    'classifier__min_samples_leaf': [2, 4],
    'classifier__bootstrap': [True, False],
    'classifier__criterion': ['gini']
}

# 2. Initialize RandomizedSearchCV
# n_iter=10: number of random combinations sampled (raise it for a more
#            thorough but slower search)
# cv=2: every sampled combination is scored with 2-fold cross-validation
# scoring='f1_macro': unweighted mean of the per-class F1 scores, so a rare
#            class weighs as much as a frequent one
random_search = RandomizedSearchCV(
    estimator=model_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1_macro',
    cv=2,
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all available CPU cores
)

print("### Starting Hyperparameter Optimization ###")
# 3. Fit the search object (this takes time!)
# The candidates are run on joblib's thread-based backend.
# NOTE(review): presumably chosen to avoid the process-based backend's
# memory/temp-file overhead — confirm.
with joblib.parallel_backend('threading'):
    random_search.fit(X_train, y_train)

# 4. Results
print("\nBest Parameter Combination Found:")
print(random_search.best_params_)

print(f"\nBest Cross-Validation Score (F1 Macro): {random_search.best_score_:.4f}")

# 5. Keep the best pipeline found (refit by the search on all of X_train)
best_model_pipeline = random_search.best_estimator_

# Evaluate the tuned pipeline on the hold-out set
print("\nValidating on Test Set with Best Model...")
y_pred_optimized = best_model_pipeline.predict(X_test)
print(classification_report(y_test, y_pred_optimized))

# Backtest

In [None]:
from sklearn.base import clone

# Stratified K-fold cross-validation: every fold keeps the class ratio of the
# target. The fold count is defined once so the messages below stay in sync.
# NOTE: the hyperparameters of best_model_pipeline were selected on X_train,
# which is a subset of X, so these scores are not fully independent of the
# tuning step.
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_results = []

print(f"### Starting {n_splits}-Fold Backtest ###\n")

for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Split data for this fold
    X_train_fold, X_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # Clone the pipeline: same hyperparameters, but unfitted, so nothing
    # learned on a previous fold carries over
    fold_pipeline = clone(best_model_pipeline)

    # Train
    fold_pipeline.fit(X_train_fold, y_train_fold)

    # Predict
    y_pred_fold = fold_pipeline.predict(X_test_fold)

    # Calculate Metrics (macro average = unweighted mean over classes;
    # zero_division=0 scores a class with no predictions as 0 without warning)
    acc = accuracy_score(y_test_fold, y_pred_fold)
    prec = precision_score(y_test_fold, y_pred_fold, average='macro', zero_division=0)
    rec = recall_score(y_test_fold, y_pred_fold, average='macro', zero_division=0)
    f1 = f1_score(y_test_fold, y_pred_fold, average='macro', zero_division=0)

    # Store and Print
    fold_results.append({'Fold': i+1, 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1': f1})
    print(f"Fold {i+1}: Accuracy={acc:.2f} | Precision={prec:.2f} | Recall={rec:.2f} | F1={f1:.2f}")

# Average Performance
avg_acc = np.mean([res['Accuracy'] for res in fold_results])
print(f"\nAverage Accuracy across {n_splits} folds: {avg_acc:.2f}")

# Macro-F1 is the metric the hyperparameter search optimizes, so report its
# average too; accuracy alone can look good while a rare class is missed.
avg_f1 = np.mean([res['F1'] for res in fold_results])
print(f"Average F1 (macro) across {n_splits} folds: {avg_f1:.2f}")

# Feature importances

## RandomForest

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): this inspects `model_pipeline` (the pipeline fitted with the
# default RandomForest settings), not the tuned `best_model_pipeline` —
# confirm that is the intended model.

# Pull the fitted steps out of the pipeline by the names given at construction
fitted_steps = model_pipeline.named_steps
rf_model = fitted_steps['classifier']

# Importances are reported per transformed column (one-hot encoding expands
# the nominal features), so the names must come from the fitted preprocessor
feature_names = fitted_steps['preprocessor'].get_feature_names_out()
importances = rf_model.feature_importances_

# Tabulate name/importance pairs, most important first
feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of all features in ranked order
plt.figure(figsize=(10, 6))
sns.barplot(
    data=feature_importance_df,
    x='Importance',
    y='Feature',
    palette='viridis',
)
plt.title('Random Forest Feature Importances (Gini Importance)')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Text summary of the five highest-ranked features
top_features = feature_importance_df.head(5)
print("Top 5 Features:")
print(top_features.to_string(index=False))

# Save model

In [None]:
import joblib

# Persist the fitted pipeline (preprocessing + classifier) as a single
# artifact, so inference can reuse exactly the transformations learned during
# training.
# NOTE(review): this saves `model_pipeline` (fitted with the default
# RandomForest settings), not the tuned `best_model_pipeline` — confirm that
# is the intended model.

# Output path, relative to the current working directory
pipeline_filename = 'trained_classification_pipeline.joblib'

# Save the pipeline object
joblib.dump(model_pipeline, pipeline_filename)

# The emoji was previously mis-encoded (UTF-8 bytes read as cp1252).
print(f"✅ Pipeline successfully saved to {pipeline_filename}")

# Inference

In [None]:
# Load the rows to be scored; the path is relative to the current working directory.
df_predict = pd.read_csv('../dados/diabetes_predict.csv')

In [None]:
# Predict using the main fitted pipeline.
# Snapshot the input columns first, so the model is never fed its own output:
# previously predict_proba received a frame that already contained the
# 'prediction' column, which only worked because the preprocessor drops
# columns it was not fitted on.
features = df_predict.copy()

df_predict['prediction'] = model_pipeline.predict(features)

# One probability column per class, named prob_0, prob_1, ... in the order of
# model_pipeline.classes_. Reusing df_predict's index guarantees that concat
# aligns the probabilities row by row.
proba = pd.DataFrame(
    model_pipeline.predict_proba(features),
    index=df_predict.index,
).add_prefix('prob_')

df_predict_final = pd.concat([df_predict, proba], axis=1)