# In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from lightgbm import LGBMClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier
# Sub-frame of `df` restricted to the object/category columns; only its column
# labels are used by the encoding loop below.
categorical_cols = df.select_dtypes(include=['object', 'category'])

# Replace every categorical column in `df` with integer codes. Each fitted
# encoder is stored under its column name so the codes can be decoded later
# with `label_encoders[col].inverse_transform(...)`.
label_encoders = {}
for col in categorical_cols.columns:
    le = label_encoders[col] = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# Feature matrix: every column of `df` except the target and the columns
# listed here.
# NOTE(review): the dropped columns look like outcome- or age-derived fields
# that would leak the target -- confirm against the data dictionary.
X = df.drop(
    ["RISK_DEATH", "DTA NASC_ano", "DIAS NA INSTITUICAO", "OBITO", "INT DES DESTINO", "4_IDADE"],
    axis=1,
)
y = df["RISK_DEATH"]

# Hold out 30% of the rows for testing. The split is stratified on the target
# so the train and test partitions keep the same class proportions; with an
# imbalanced target an unstratified split can leave the test set with very few
# minority samples and make the reported metrics unstable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# Balance the classes with SMOTE. Only the training partition is resampled;
# the test partition is left untouched.
# NOTE(review): SMOTE interpolates between nearest neighbours and runs here on
# unscaled, label-encoded features -- consider scaling first and/or SMOTENC.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Min-max scale: statistics are learned from the resampled training data and
# then applied unchanged to the test data.
scaler = MinMaxScaler()
X_train_resampled_normalized = scaler.fit_transform(X_train_resampled)
X_test_normalized = scaler.transform(X_test)

# Feature selection by gradient-boosting importance. `threshold=-np.inf`
# disables the importance cut-off so that `max_features` alone decides how
# many features are kept (the 11 most important ones). The estimator is seeded
# so the selected subset is the same on every run.
selector = SelectFromModel(
    estimator=GradientBoostingClassifier(random_state=42),
    threshold=-np.inf,
    max_features=11,
)
selector.fit(X_train_resampled_normalized, y_train_resampled)
selected_features_indices = selector.get_support(indices=True)

# Keep only the selected columns in both partitions (positional indexing, as
# the scaler output is a plain NumPy array).
X_train_selected = X_train_resampled_normalized[:, selected_features_indices]
X_test_selected = X_test_normalized[:, selected_features_indices]
# Equal weights for both classes (the training data has already been balanced
# by oversampling, so no extra re-weighting is applied in the meta-learner).
class_weights = {0: 1, 1: 1}

# Level-0 learners of the stack. Every learner is seeded so that repeated runs
# produce the same fitted ensemble, consistent with the seeded split and SMOTE.
base_learners = [
    ('gbt', GradientBoostingClassifier(n_estimators=100, subsample=0.8, max_features=0.8, random_state=42)),
    ('DecisionTree', DecisionTreeClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42)),
    ('CatBoost', CatBoostClassifier(silent=True, random_seed=42)),
]

# Level-1 (meta) learner. `classifier` is kept as a second name for the same
# unfitted estimator because it is reused outside this block.
meta_classifier = classifier = LGBMClassifier(
    class_weight=class_weights,
    learning_rate=0.8,
    max_depth=6,
    n_estimators=20,
    num_leaves=20,
    random_state=42,
)

# The meta-learner is trained on the base learners' out-of-fold class
# probabilities, produced with 5-fold cross-validation.
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_classifier,
    cv=5,
    stack_method='predict_proba',
)

# Fit on the resampled, scaled, feature-selected training data, then score the
# held-out test set.
stacking_model.fit(X_train_selected, y_train_resampled)
predictions = stacking_model.predict(X_test_selected)
probabilities = stacking_model.predict_proba(X_test_selected)[:, 1]  # second column of predict_proba (binary case)

# Cast the held-out labels to int before handing them to the metric functions.
y_test = y_test.astype(int)

# ROC AUC is computed from the probabilities (second column of predict_proba),
# not from the hard class predictions.
auc_roc = roc_auc_score(y_test, probabilities)
print("AUC-ROC Score:", auc_roc)

# Header line, followed by the per-class precision/recall/F1 table and the
# confusion matrix, both based on the hard predictions.
print("Feature Selector: SelectFromModel_GB, Classifier: LGBM")
for summarize in (classification_report, confusion_matrix):
    print(summarize(y_test, predictions))

# K-fold cross-validation on the training partition only.
#
# The resampling, scaling and feature-selection steps are wrapped in an
# imbalanced-learn pipeline and cross-validated on the *raw* training data.
# Cross-validating on data that was already SMOTE-resampled lets synthetic
# points interpolated from validation-fold samples end up in the training
# folds (and lets the scaler/selector see the validation rows), which inflates
# the score. Inside the pipeline, every step is re-fitted on the training folds
# only, and SMOTE is never applied to the validation fold.
#
# Imported locally under an alias: the name `Pipeline` is already bound to
# scikit-learn's class by the imports at the top of the file.
from imblearn.pipeline import Pipeline as ImbPipeline

# cross_val_score clones the pipeline (and therefore each step) per fold, so
# the already-fitted `smote`, `scaler` and `selector` objects are not modified.
# NOTE(review): this validates `classifier` (the standalone LGBM), not the
# stacking ensemble; pass `stacking_model` as the last step to validate that.
cv_pipeline = ImbPipeline(steps=[
    ("smote", smote),
    ("scaler", scaler),
    ("selector", selector),
    ("classifier", classifier),
])
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5, scoring='roc_auc')

# Per-fold ROC AUC and its mean.
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))