In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import (
    GradientBoostingClassifier, 
    RandomForestClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    BaggingClassifier
)
from sklearn.feature_selection import (
    SelectFromModel, 
    RFE, 
    SelectKBest, 
    f_classif,
    mutual_info_classif,
    VarianceThreshold
)
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
import time
from datetime import datetime

def _encode_object_columns(df):
    """Return a copy of ``df`` in which every object-dtype column is label-encoded.

    Values are cast to ``str`` before encoding, so missing values end up as an
    ordinary category instead of breaking the encoder.
    """
    df_encoded = df.copy()
    for column in df_encoded.select_dtypes(include=['object']).columns:
        df_encoded[column] = LabelEncoder().fit_transform(df_encoded[column].astype(str))
    return df_encoded


def _select_features(selector, X_train, y_train, X_test, feature_names):
    """Fit ``selector`` on the training split and reduce both splits with it.

    Returns ``(X_train_selected, X_test_selected, selected_names, elapsed_seconds)``.
    """
    started = time.time()
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    if hasattr(selector, 'get_support'):
        selected_names = feature_names[selector.get_support()].tolist()
    else:
        selected_names = feature_names[:X_train_selected.shape[1]].tolist()
    return X_train_selected, X_test_selected, selected_names, time.time() - started


def _positive_class_scores(clf, X):
    """Return continuous scores for the positive class, suitable for ROC AUC.

    Hard labels from ``predict`` collapse the ROC curve to a single operating
    point, so they are only used when the estimator offers nothing better.
    """
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)[:, 1]
    if hasattr(clf, "decision_function"):
        return clf.decision_function(X)
    return clf.predict(X)


def _positive_class_report(report):
    """Pick the positive-class entry out of a ``classification_report`` dict.

    Class ``'1'`` is preferred; otherwise the numerically largest label is used
    (binary classification is assumed).
    """
    if '1' in report:
        return report['1']
    summary_keys = {'accuracy', 'micro avg', 'macro avg', 'weighted avg', 'samples avg'}
    positive_class = str(max(map(int, report.keys() - summary_keys)))
    return report[positive_class]


def run_ml_pipeline(df, target_cols, n_features=30, target_col='target', positive_weight=1.6):
    """
    Run complete ML pipeline with multiple classifiers and feature selectors.

    Every combination of sampler x classifier x feature selector is trained on
    the resampled training split and evaluated on the untouched test split.
    Binary classification with a positive class labelled ``1`` is assumed.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input dataframe. Object-dtype columns are label-encoded on a copy;
        ``df`` itself is not modified.
    target_cols : list
        List of target columns to exclude from features
    n_features : int
        Number of features to select (default: 30). Clamped to the number of
        available feature columns. For the ``SelectFromModel`` selectors this
        is an upper bound: their default importance threshold may keep fewer.
    target_col : str
        Name of the label column (default: 'target'). It is always removed
        from the features, even if it is missing from ``target_cols``.
    positive_weight : float
        Sample weight given to rows of class ``1`` when fitting classifiers
        that receive ``sample_weight`` (default: 1.6); other rows get 1.

    Returns:
    --------
    pandas.DataFrame
        One row per successfully evaluated combination, with the columns
        classifier, selector, sampler, n_features, accuracy, precision,
        recall, f1, roc_auc, selected_features, training_time and
        confusion_matrix. ``training_time`` covers feature selection (measured
        once per sampler/selector pair) plus classifier fitting and evaluation.

    Side effects:
    -------------
    Prints progress and per-model reports, and writes the results to
    ``ml_pipeline_results_<timestamp>.csv`` in the current working directory.
    """
    if isinstance(target_cols, str):
        target_cols = [target_cols]

    # Prepare results dictionary (one list per output column)
    results = {
        'classifier': [],
        'selector': [],
        'sampler': [],
        'n_features': [],
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'roc_auc': [],
        'selected_features': [],
        'training_time': [],
        'confusion_matrix': []
    }

    # Encode categorical variables
    df_encoded = _encode_object_columns(df)

    # The label must never be available as a predictor, so it is dropped even
    # when the caller did not list it in target_cols.
    excluded_columns = list(dict.fromkeys([*target_cols, target_col]))
    X = df_encoded.drop(columns=excluded_columns, errors='ignore')
    y = df_encoded[target_col].astype(int)

    # Asking for more features than exist makes several selectors fail outright.
    k = min(n_features, X.shape[1])

    # Train-test split
    # NOTE(review): the split is not stratified. With a strongly imbalanced
    # target, consider stratify=y; left unchanged to keep results comparable.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define samplers for imbalanced data
    samplers = {
        'SMOTE': SMOTE(random_state=42),
        'ADASYN': ADASYN(random_state=42)
    }

    # Define classifiers with optimized parameters
    classifiers = {
        'GradientBoosting': GradientBoostingClassifier(n_estimators=170, learning_rate=0.1,
                                                      random_state=42, min_samples_split=8,
                                                      min_samples_leaf=2),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=42),
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
        'SVC': SVC(probability=True, random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42),
        'ExtraTrees': ExtraTreesClassifier(random_state=42),
        'KNeighbors': KNeighborsClassifier(n_neighbors=5),
        'GaussianNB': GaussianNB(),
        'DecisionTree': DecisionTreeClassifier(random_state=42),
        'RidgeClassifier': RidgeClassifier(random_state=42),
        'Bagging': BaggingClassifier(random_state=42)
    }

    # Define feature selectors
    base_estimator = DecisionTreeClassifier(random_state=42)
    selectors = {
        'SelectFromModel(GB)': SelectFromModel(
            GradientBoostingClassifier(n_estimators=100, random_state=42),
            max_features=k
        ),
        'SelectFromModel(RF)': SelectFromModel(
            RandomForestClassifier(n_estimators=100, random_state=42),
            max_features=k
        ),
        'SelectFromModel(XGB)': SelectFromModel(
            XGBClassifier(n_estimators=100, random_state=42),
            max_features=k
        ),
        'RFE': RFE(
            estimator=base_estimator,
            n_features_to_select=k
        ),
        'SelectKBest(f_classif)': SelectKBest(
            score_func=f_classif,
            k=k
        ),
        'SelectKBest(mutual_info)': SelectKBest(
            score_func=mutual_info_classif,
            k=k
        ),
        'VarianceThreshold': VarianceThreshold(threshold=0.01)
    }

    # Run all combinations
    for sampler_name, sampler in samplers.items():
        print(f"\nUsing {sampler_name} for imbalanced data handling")

        # Apply sampling; a sampler that cannot handle this data is skipped so
        # the results of the other samplers are still produced and saved.
        try:
            X_train_resampled, y_train_resampled = sampler.fit_resample(X_train, y_train)
        except Exception as e:
            print(f"Error applying {sampler_name}:")
            print(f"Detailed error: {str(e)}")
            continue

        # Check which classes are present after resampling
        print(f"Classes after resampling: {np.unique(y_train_resampled)}")

        # Sample weights depend only on the resampled labels
        weights = np.where(y_train_resampled == 1, positive_weight, 1)

        # Feature selection depends on the sampler but not on the classifier,
        # so each selector is fitted once here and reused for every classifier.
        selections = {}
        for selector_name, selector in selectors.items():
            print(f"\nSelecting features with {selector_name} and {sampler_name}")
            try:
                selection = _select_features(
                    selector, X_train_resampled, y_train_resampled, X_test, X.columns
                )
            except Exception as e:
                print(f"Error with {selector_name}:")
                print(f"Detailed error: {str(e)}")
                continue

            # Check if we have enough features
            if selection[0].shape[1] < 2:
                print(f"Warning: {selector_name} selected too few features. Skipping...")
                continue
            selections[selector_name] = selection

        for clf_name, clf in classifiers.items():
            for selector_name, selection in selections.items():
                X_train_selected, X_test_selected, selected_features, selection_time = selection
                print(f"\nRunning {clf_name} with {selector_name} and {sampler_name}")
                start_time = time.time()

                try:
                    # Training with sample weights (these three are fitted unweighted)
                    if isinstance(clf, (SVC, KNeighborsClassifier, GaussianNB)):
                        clf.fit(X_train_selected, y_train_resampled)
                    else:
                        clf.fit(X_train_selected, y_train_resampled, sample_weight=weights)

                    # Predictions
                    predictions = clf.predict(X_test_selected)
                    scores = _positive_class_scores(clf, X_test_selected)

                    # Get metrics
                    report = classification_report(y_test, predictions, output_dict=True)
                    conf_matrix = confusion_matrix(y_test, predictions)
                    roc_auc = roc_auc_score(y_test, scores)
                    positive = _positive_class_report(report)

                    # Build the complete row first: a failure anywhere above
                    # must not leave the result lists with different lengths.
                    record = {
                        'classifier': clf_name,
                        'selector': selector_name,
                        'sampler': sampler_name,
                        'n_features': len(selected_features),
                        'accuracy': report['accuracy'],
                        'precision': positive['precision'],
                        'recall': positive['recall'],
                        'f1': positive['f1-score'],
                        'roc_auc': roc_auc,
                        'selected_features': selected_features,
                        'training_time': selection_time + (time.time() - start_time),
                        'confusion_matrix': conf_matrix
                    }
                except Exception as e:
                    print(f"Error with {clf_name} and {selector_name}:")
                    print(f"Detailed error: {str(e)}")
                    continue

                # Store results
                for key in results:
                    results[key].append(record[key])

                # Print detailed report
                print(f"\nResults for {clf_name} with {selector_name} and {sampler_name}")
                print("="*80)
                print(f"Number of selected features: {len(selected_features)}")
                print(f"\nClassification Report:")
                print(classification_report(y_test, predictions))
                print(f"\nConfusion Matrix:")
                print(conf_matrix)
                print(f"\nROC AUC: {roc_auc:.4f}")
                print(f"\nTraining Time: {record['training_time']:.2f} seconds")
                print("\nTop 10 Selected Features:")
                print(selected_features[:10])
                print("="*80)

    # Create summary DataFrame
    results_df = pd.DataFrame(results)

    # Save results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_df.to_csv(f'ml_pipeline_results_{timestamp}.csv', index=False)

    # Print summary of best results
    print("\nTop 10 Models by ROC AUC:")
    print(results_df.sort_values('roc_auc', ascending=False).head(10)[
        ['classifier', 'selector', 'sampler', 'n_features', 'accuracy',
         'precision', 'recall', 'f1', 'roc_auc', 'training_time'
    ]])

    return results_df

# Run the pipeline
# Columns excluded from the feature matrix: the label itself plus the other
# columns listed here.
target_columns = [
    "target",
    "T_score_fn",
    "T_score_tf",
    "T_score_sp",
    "DXXSPN_DXXOSBMD",
    "DXXFEM_DXXNKBMD",
    "DXXFEM_DXXOFBMD",
]

# First, inspect the data: class balance of the label and the column dtypes
print(
    "Verificando os dados antes de executar o pipeline:",
    "\nDistribuição da variável target:",
    sep="\n",
)
print(df_final["target"].value_counts())
print("\nTipos de dados das colunas:", df_final.dtypes, sep="\n")

# Run the pipeline with the default number of selected features
results_df = run_ml_pipeline(df=df_final, target_cols=target_columns)

In [None]:
# Rank the results by different metrics
ranking_columns = ['classifier', 'selector', 'sampler', 'n_features', 'roc_auc', 'f1', 'training_time']

for metric_label, metric_column in (("ROC AUC", "roc_auc"), ("F1-Score", "f1")):
    print(f"\nTop 5 modelos por {metric_label}:")
    print(results_df.sort_values(metric_column, ascending=False).head()[ranking_columns])

# Build a composite metric as a weighted sum of ROC AUC and F1
ROC_AUC_WEIGHT = 0.7
F1_WEIGHT = 0.3
results_df['score_composto'] = (
    ROC_AUC_WEIGHT * results_df['roc_auc'] +
    F1_WEIGHT * results_df['f1']
)

# Sort once and reuse the ranking for both the top-5 table and the best model
ranked_by_composite = results_df.sort_values('score_composto', ascending=False)

print("\nTop 5 modelos por Score Composto:")
print(ranked_by_composite.head()[ranking_columns + ['score_composto']])

# Detailed analysis of the best model
if ranked_by_composite.empty:
    # The pipeline skips combinations that fail, so it can return no rows at
    # all; without this guard .iloc[0] would raise IndexError.
    melhor_modelo = None
    print("\nNenhum modelo foi avaliado com sucesso; não há melhor modelo para analisar.")
else:
    melhor_modelo = ranked_by_composite.iloc[0]
    print("\nAnálise detalhada do melhor modelo:")
    print(f"Classifier: {melhor_modelo['classifier']}")
    print(f"Selector: {melhor_modelo['selector']}")
    print(f"Sampler: {melhor_modelo['sampler']}")
    print(f"Número de features: {melhor_modelo['n_features']}")
    print(f"ROC AUC: {melhor_modelo['roc_auc']:.4f}")
    print(f"F1-Score: {melhor_modelo['f1']:.4f}")
    print(f"Tempo de treinamento: {melhor_modelo['training_time']:.2f} segundos")
    print("\nFeatures selecionadas:")
    print(melhor_modelo['selected_features'])