## Trabalho 1 (Classificação) - Introdução ao Aprendizado de Máquina (EEL891)
> Nome: Danilo Davi Gomes Fróes
>
> DRE: 124026825

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# from skopt import BayesSearchCV

class ClassificationEvaluator:
    """Scores binary-classification predictions and saves confusion-matrix plots.

    Each plot is written to ``output_path`` as ``confusion_matrix_<key>.jpg``.
    """

    def __init__(self, output_path):
        """Store the directory where confusion-matrix images are saved."""
        self.output_path = output_path

    def _plot_confusion_matrix(self, key, y_true, y_pred, cross_val=False, all_conf_matrices=None):
        """Render and save one confusion matrix.

        Args:
            key: Identifier used in the output file name.
            y_true: True labels, or ``None`` when plotting a cross-validation average.
            y_pred: Predicted labels, or ``None`` when plotting a cross-validation average.
            cross_val: When True and both label arguments are ``None``, the
                element-wise mean of ``all_conf_matrices`` is plotted.
            all_conf_matrices: Per-fold confusion matrices to average.

        Raises:
            ValueError: If the averaged plot is requested without any matrix.
        """
        # Identity checks: `== None` on an array-like is evaluated element-wise,
        # which makes the `and` chain ambiguous.
        if cross_val and y_true is None and y_pred is None:
            if all_conf_matrices is None or len(all_conf_matrices) == 0:
                raise ValueError('all_conf_matrices must contain at least one confusion matrix')
            cm = np.mean(all_conf_matrices, axis=0)
        else:
            # Fixed labels keep the matrix 2x2 even when a class is absent,
            # matching the two display labels below.
            cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Não Inadimplente', 'Inadimplente'])
        disp.plot(cmap=plt.cm.Blues)

        os.makedirs(self.output_path, exist_ok=True)
        plt.savefig(os.path.join(self.output_path, f'confusion_matrix_{key}.jpg'))
        plt.close()  # release the figure so repeated calls do not accumulate open figures

    def evaluate(self, key, fold_results):
        """Compute the accuracy of every fold and save a confusion-matrix plot.

        Args:
            key: Identifier used in the output file name.
            fold_results: Non-empty list of dicts with ``'y_true'`` and ``'y_pred'``.

        Returns:
            A list with one ``{'accuracy': value}`` dict per fold.

        Raises:
            ValueError: If ``fold_results`` is empty.
        """
        if not fold_results:
            raise ValueError('fold_results must contain at least one fold')

        scores = []
        all_conf_matrices = []

        for fold in fold_results:
            y_true = fold['y_true']
            y_pred = fold['y_pred']

            scores.append({'accuracy': accuracy_score(y_true, y_pred)})
            # Collected inside the loop so the average covers every fold,
            # not only the last one.
            all_conf_matrices.append(confusion_matrix(y_true, y_pred, labels=[0, 1]))

        if len(fold_results) > 1:
            self._plot_confusion_matrix(key, None, None, cross_val=True, all_conf_matrices=all_conf_matrices)
        else:
            self._plot_confusion_matrix(key, y_true, y_pred)

        return scores

class ModeloML:
    def __init__(self):
        self.pipeline_registry = {}

    def load_data(self, path_test, path_train):

        df_train = pd.read_csv(path_train) # Carrega os dados de treino como DataFrames do Pandas
        df_train.replace([' ', 'N/A', '', '?'], np.nan, inplace=True)

        y_col = 'inadimplente'  # Coluna target
        cols_to_drop = ['id_solicitante', 'local_onde_reside', 'local_onde_trabalha']
        x_cols = df_train.drop(columns=[y_col] + cols_to_drop).columns  # Colunas de features

        # Divide os dados de treino em X (features) e y (target)
        self.y_train = df_train[y_col]
        self.X_train = df_train[x_cols]

        df_predict = pd.read_csv(path_test) # Carrega os dados de previsão como DataFrames do Pandas
        X_predict = df_predict.copy() # Cópia para evitar alterações no DataFrame original

    def holdout(self, test_size=0.2, random_state=42):

        holdout_results = [] # Lista para armazenar os resultados da divisão holdout

        # Divide os dados de treino em X e y
        X_train, X_test, y_train, y_test = train_test_split(self.X_train, self.y_train, test_size=test_size, random_state=random_state)

        # Armazena os resultados da divisão em um dicionário
        holdout_results.append({
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        })

        key = 'holdout' # Chave para armazenar os resultados

        # Armazena os resultados no registro de pipeline para poder ser acessado posteriormente
        self.pipeline_registry[key] = {
            'split_data' : holdout_results
        }

        return holdout_results
    
    def KFold_cross_validation(self, n_splits=5, shuffle=True, random_state=42):

        folds_results = [] # Lista para armazenar os folds criados

        kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) # Cria um objeto KFold para dividir os dados em k-folds

        # Divide os dados de treino em k-folds
        for train_index, test_index in kf.split(self.X_train, self.y_train):
            X_train, X_test = self.X_train.iloc[train_index], self.X_train.iloc[test_index]
            y_train, y_test = self.y_train.iloc[train_index], self.y_train.iloc[test_index]
            folds_results.append({
                'fold': len(folds_results) + 1,
                'X_train': X_train,
                'X_test': X_test,
                'y_train': y_train,
                'y_test': y_test
            })

        key = 'KFold' 

        self.pipeline_registry[key] = {
            'split_data': folds_results
        }

        return folds_results
    
    def KFold_cross_validation_stratified(self, n_splits=5, shuffle=True, random_state=42):

        folds_results = [] # Lista para armazenar os folds criados

        skf = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state) # Cria um objeto StratifiedKFold para dividir os dados em k-folds estratificados

        for train_index, test_index in skf.split(self.X_train, self.y_train):
            X_train, X_test = self.X_train[train_index], self.X_train[test_index]
            y_train, y_test = self.y_train[train_index], self.y_train[test_index]
            folds_results.append({
                'fold': len(folds_results) + 1,
                'X_train': X_train,
                'X_test': X_test,
                'y_train': y_train,
                'y_test': y_test
            })

        key = 'KFold_stratified'

        self.pipeline_registry[key] = {
            'split_data': folds_results
        }

        return folds_results

    def scale_data(self, num_cols, scalers = []):

        new_pipeline_registry = {}

        for key in self.pipeline_registry.keys():
            for scaler in scalers:
                num_scaler = scaler() # Cria uma instância do scaler

                key_scaler = f'scaled_{type(scaler).__name__}_{key}'
                new_pipeline_registry[key_scaler] = {
                    **self.pipeline_registry[key],
                    'scaler': num_scaler
                }

                for fold_data in self.pipeline_registry[key]['split_data']:
                    # for fold_data in data.values():
                    X_num_train = fold_data['X_train'][num_cols]
                    X_num_test = fold_data['X_test'][num_cols]

                    fold_data['X_train'][num_cols] = num_scaler.fit_transform(X_num_train)
                    fold_data['X_test'][num_cols] = num_scaler.transform(X_num_test)

        self.pipeline_registry = new_pipeline_registry # Atualiza o registro de pipeline com os dados escalados

    def encode_data(self, cat_cols, encoders = []):
            
        new_pipeline_registry = {}

        for key in self.pipeline_registry.keys():
            for encoder in encoders:
                cat_encoder = encoder() # Cria uma instância do encoder
                
                key_encoder = f'encoded_{type(encoder).__name__}_{key}'
                new_pipeline_registry[key_encoder] = {
                    **self.pipeline_registry[key],
                    'encoder': cat_encoder
                }

                for fold_data in self.pipeline_registry[key]['split_data']:
                    # for fold_data in data.values():
                    X_cat_train = fold_data['X_train'][cat_cols]
                    X_cat_test = fold_data['X_test'][cat_cols]

                    fold_data['X_train'][cat_cols] = cat_encoder.fit_transform(X_cat_train)
                    fold_data['X_test'][cat_cols] = cat_encoder.transform(X_cat_test)

        self.pipeline_registry = new_pipeline_registry # Atualiza o registro de pipeline com os dados codificados

    def impute_data(self, imputers = []):
        
        new_pipeline_registry = {}

        for key in self.pipeline_registry.keys():
            for imputer in imputers:
                num_imputer = imputer()

                key_imputer = f'imputed_{type(imputer).__name__}_{key}'
                new_pipeline_registry[key_imputer] = {
                    **self.pipeline_registry[key],
                    'imputer': num_imputer
                }

                for data in self.pipeline_registry[key]['split_data']:
                    for fold_data in data.values():
                        X_num_train = fold_data['X_train']
                        X_num_test = fold_data['X_test']

                        fold_data['X_train'] = num_imputer.fit_transform(X_num_train)
                        fold_data['X_test'] = num_imputer.transform(X_num_test)

        self.pipeline_registry = new_pipeline_registry # Atualiza o registro de pipeline com os dados imputados

    def select_model(self, models):

        new_pipeline_registry = {}
        fold_results = []
        summary_rows = []
        original_registry = self.pipeline_registry.copy()

        default_path = Path.cwd() / 'output'
        default_path.mkdir(parents=True, exist_ok=True)
        output_path = default_path

        for model_name, model_info in models.items():
            best_models = {}

            model = model_info['model']

            model = model_info['model']
            hyperparameters = model_info['hyperparameters'] if 'hyperparameters' in model_info else {}
            selection_method = model_info['selection_method'] if 'selection_method' in model_info else 'grid'
            scoring = model_info['scoring'] if 'scoring' in model_info else 'accuracy'
            cv = model_info['cv'] if 'cv' in model_info else 5
            n_iter = model_info['n_iter'] if 'n_iter' in model_info else 10
            random_state = model_info['random_state'] if 'random_state' in model_info else 0

            evaluator = ClassificationEvaluator(output_path)

            for key in original_registry.keys():
                key_model = f'{model_name}_{key}'
                new_pipeline_registry[key_model] = {
                    **original_registry[key],
                    'model': model,
                    'hyperparameters': hyperparameters,
                    'scoring': scoring,
                    'cv': cv
                }
                best_models[key_model] = []

                for data in original_registry[key]['split_data']:
                    X_train, y_train = data['X_train'], data['y_train']
                    X_test, y_test = data['X_test'], data['y_test']

                    X_train.colums = X_train.columns.astype(str)
                    X_test.columns = X_test.columns.astype(str)

                    if selection_method == 'grid':
                        search = GridSearchCV(model, X_train, y_train, hyperparameters=hyperparameters, scoring=scoring, cv=cv)
                    elif selection_method == 'random':
                        search = RandomizedSearchCV(model, X_train, y_train, hyperparameters=hyperparameters, scoring=scoring, cv=cv, n_iter=n_iter, random_state=random_state)
                    # elif selection_method == 'bayes':
                    #     search = BayesSearchCV(model, X_train, y_train, hyperparameters=hyperparameters, scoring=scoring, cv=cv, n_iter=n_iter, random_state=random_state)
                    
                    best_models[key_model].append(search.best_estimator_)

                    y_pred = search.predict(X_test)

                    fold_results.append({'y_true': y_test, 'y_pred': y_pred})

                score = evaluator.evaluate(key_model, fold_results)

                row = {'Key': key_model, 'Accuracy': score}
                summary_rows.append(row)

            self.pipeline_registry = new_pipeline_registry # Atualiza o registro de pipeline com os modelos selecionados

            df_models = pd.DataFrame(summary_rows).round(4)
            main_metric = 'accuracy'
            csv_path = self.output_path / "all_models.csv"
            df_models.to_csv(csv_path, index=False)
            try:
                from IPython.display import display
                print("\n--- Model Selection Summary ---")
                display(df_models)
            except ImportError:
                print("\n--- Model Selection Summary ---")
                print(df_models.to_string())


    

In [3]:
# Columns passed to scale_data as numeric features.
num_cols = [
    'idade', 'qtde_dependentes', 'meses_na_residencia',
    'renda_mensal_regular', 'renda_extra',
    'qtde_contas_bancarias', 'qtde_contas_bancarias_especiais',
    'valor_patrimonio_pessoal', 'meses_no_trabalho',
]

# Columns passed to encode_data as categorical features.
cat_cols = [
    'produto_solicitado', 'dia_vencimento', 'forma_envio_solicitacao',
    'tipo_endereco', 'sexo', 'estado_civil', 'grau_instrucao',
    'nacionalidade', 'estado_onde_nasceu', 'estado_onde_reside',
    'possui_telefone_residencial', 'codigo_area_telefone_residencial',
    'tipo_residencia', 'possui_telefone_celular', 'possui_email',
    'possui_cartao_visa', 'possui_cartao_mastercard',
    'possui_cartao_diners', 'possui_cartao_amex', 'possui_outros_cartoes',
    'possui_carro', 'vinculo_formal_com_empresa', 'estado_onde_trabalha',
    'possui_telefone_trabalho', 'codigo_area_telefone_trabalho',
    'profissao', 'ocupacao', 'profissao_companheiro',
    'grau_instrucao_companheiro',
]

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, MaxAbsScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, AdaBoostClassifier
# from xgboost import XGBClassifier

# Candidate preprocessing steps. Every entry is a transformer CLASS: the
# ModeloML preprocessing methods call each entry (e.g. `scaler()`) to build a
# fresh instance with default arguments.
scalers = [
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    MaxAbsScaler
]

encoders = [
    OneHotEncoder,
    OrdinalEncoder,
]

# Listed as a class like the other steps: an instance is not callable, so
# `imputer()` failed on `SimpleImputer(strategy='mean')`. The default strategy
# of SimpleImputer is already 'mean'.
imputers = [
    SimpleImputer
]

# Model search space consumed by ModeloML.select_model: maps a model name to a
# spec dict. 'model' is required; 'hyperparameters', 'selection_method',
# 'scoring', 'cv', 'n_iter' and 'random_state' are optional and default to
# {}, 'grid', 'accuracy', 5, 10 and 0 respectively.
# Only LogisticRegression is active; the commented-out entries are disabled
# candidates.
# NOTE(review): the disabled 'XGBoost', 'KNN' and 'DecisionTree' entries use
# classes that this cell does not import (the xgboost import is commented
# out) — add the imports before enabling them.
models = {
    # 'RandomForest': {
    #     'model': RandomForestClassifier(random_state=2),
    #     'hyperparameters': {
    #         'n_estimators': [100, 200],
    #         'max_depth': [5, 10, None],
    #         'min_samples_split': [2, 5, 10]
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy'
    # },

    # 'SVM': {
    #     'model': SVC(random_state=2),
    #     'hyperparameters': {
    #         'C': [0.1, 1, 10],
    #         'kernel': ['rbf', 'linear'],
    #         'gamma': ['scale', 'auto']
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy',
    #     'cv': 5
    # },

    'LogisticRegression': {
        'model': LogisticRegression(random_state=2),
        'hyperparameters': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'saga']
        }
    },

    # 'GradientBoosting': {
    #     'model': GradientBoostingClassifier(random_state=2),
    #     'hyperparameters': {
    #         'n_estimators': [100, 200],
    #         'learning_rate': [0.01, 0.1, 0.2],
    #         'max_depth': [3, 5, 7]
    #     }
    # },

    # 'XGBoost': {
    #     'model': XGBClassifier(random_state=2),
    #     'hyperparameters': {
    #         'n_estimators': [100, 200],
    #         'learning_rate': [0.01, 0.1, 0.3],
    #         'max_depth': [3, 6, 10]
    #     }
    # },

    # 'AdaBoost': {
    #     'model': AdaBoostClassifier(random_state=2),
    #     'hyperparameters': {
    #         'n_estimators': [50, 100, 200],
    #         'learning_rate': [0.01, 0.1, 1]
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy'
    # },

    # 'ExtraTrees': {
    #     'model': ExtraTreesClassifier(random_state=2),
    #     'hyperparameters': {
    #         'n_estimators': [100, 200],
    #         'max_depth': [5, 10, None],
    #         'min_samples_split': [2, 5, 10]
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy'
    # },

    # 'KNN': {
    #     'model': KNeighborsClassifier(),
    #     'hyperparameters': {
    #         'n_neighbors': [3, 5, 7],
    #         'weights': ['uniform', 'distance'],
    #         'metric': ['euclidean', 'manhattan']
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy'
    # },

    # 'DecisionTree': {
    #     'model': DecisionTreeClassifier(random_state=2),
    #     'hyperparameters': {
    #         'max_depth': [5, 10, None],
    #         'min_samples_split': [2, 5, 10],
    #         'criterion': ['gini', 'entropy']
    #     },
    #     'selection_method': 'grid',
    #     'scoring': 'accuracy'
    # }
}



In [None]:
# End-to-end run. The call order matters: each stage reads the pipeline
# registry left by the previous one.
classificador = ModeloML()

# Note the parameter order of load_data: path_test comes first.
classificador.load_data(path_test='conjunto_de_teste.csv', path_train='conjunto_de_treinamento.csv')

# Register three split strategies: 'holdout', 'KFold' and 'KFold_stratified'.
classificador.holdout()
classificador.KFold_cross_validation()
classificador.KFold_cross_validation_stratified()

# Each preprocessing stage replaces the registry with one new entry per
# (transformer, existing entry) combination.
classificador.scale_data(num_cols=num_cols, scalers=scalers)
classificador.encode_data(cat_cols=cat_cols, encoders=encoders)
classificador.impute_data(imputers=imputers)

# Hyperparameter search and evaluation for every model on every registry entry.
classificador.select_model(models=models)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_data['X_train'][num_cols] = num_scaler.fit_transform(X_num_train)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_data['X_test'][num_cols] = num_scaler.transform(X_num_test)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fold_data['X_train'][num_cols] = num_scaler.fit_transform(X_num_train

ValueError: Columns must be same length as key