### IMPORTANDO AS BIBLIOTECAS

In [49]:
from numpy import mean, std, arange
import time
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

### LEITURA DA BASE DE DADOS

In [62]:
# Load the pre-processed dataset from its parquet file.
DATA_PATH = '../data/processed/prototype3.parquet'
full_df = pd.read_parquet(DATA_PATH)

### VISUALIZAÇÃO DA BASE DE DADOS

In [63]:
# Split the rows by target class.
suicidio_0 = full_df[full_df['SUICIDIO'] == 0]
suicidio_1 = full_df[full_df['SUICIDIO'] == 1]

# Undersample class 0 down to the size of class 1 so both classes are equally
# represented. The fixed random_state makes the drawn sample (and therefore
# every downstream result) reproducible, matching the seed used elsewhere in
# this notebook.
# NOTE: sampling without replacement requires class 0 to have at least as many
# rows as class 1.
suicidio_0 = suicidio_0.sample(n=len(suicidio_1), replace=False, random_state=666)

# ignore_index=True already yields a fresh 0..n-1 index.
balanced_df = pd.concat((suicidio_0, suicidio_1), ignore_index=True)

# Sanity check: class proportions after balancing.
balanced_df["SUICIDIO"].value_counts(normalize=True)

SUICIDIO
0    0.5
1    0.5
Name: proportion, dtype: float64

### BALANCEAMENTO DA BASE DE DADOS EM CASOS DE SUICÍDIO

#### Definindo X e Y de acordo com a base de dados balanceada

In [64]:
# Feature columns, in the order the model receives them.
feature_columns = ['ESC', 'ESTCIV', 'IDADE', 'NATURAL', 'OCUP', 'RACACOR', 'MASCULINO', 'FEMININO']

# Plain NumPy arrays: feature matrix and target vector.
X = balanced_df[feature_columns].values
Y = balanced_df['SUICIDIO'].values

#### Divisão entre teste e treino 

In [65]:
# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=666,
)

### TREINANDO O MODELO
Modelo escolhido: centróide mais próximo (`NearestCentroid`).
Como nossa base de dados já apresenta o rótulo de suicídio (`SUICIDIO`), não foi necessária uma etapa de clusterização. Portanto, basta calcular um centróide por classe: cada centróide é o ponto central das amostras daquela classe e representa os seus aspectos gerais. Uma nova amostra é classificada de acordo com o centróide mais próximo.

In [68]:
from numpy import std

# Nearest-centroid classifier using Manhattan distance and a shrink threshold
# of 0.15.
modelo_prototipos = NearestCentroid(
    metric='manhattan',
    shrink_threshold=0.15,
)


### Cross-validation

In [69]:
from sklearn.metrics import recall_score

# Stratified 9-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=9, n_repeats=3, random_state=666)

# Precision of the model on every fold, computed in parallel on all cores.
scores = cross_val_score(
    modelo_prototipos,
    X,
    Y,
    scoring='precision',
    cv=cv,
    n_jobs=-1,
)

# Report mean and standard deviation across all folds and repeats.
precision_mean = mean(scores)
precision_std = std(scores)
print('Precision : %.3f (%.3f)' % (precision_mean, precision_std))

Precision : 0.780 (0.008)


### Hyperparameters

In [None]:
# Candidate shrink_threshold values. scikit-learn requires shrink_threshold to
# be strictly positive or None, so "no shrinkage" is expressed as None instead
# of 0 (a literal 0 fails parameter validation and that candidate would only
# produce failed fits). The remaining candidates are 0.05, 0.10, ..., 1.00.
grid = {'shrink_threshold': [None, *arange(0.05, 1.01, 0.05)]}

# Exhaustive search scored by precision, reusing the repeated stratified
# splitter `cv` defined in the cross-validation cell.
search = GridSearchCV(modelo_prototipos, grid, scoring='precision', cv=cv, n_jobs=-1)

# Run the search on the training split only, so the test split stays unseen.
results = search.fit(X_train, y_train)

# Summarize: best mean cross-validated precision and the parameters behind it.
print('Precision: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

In [70]:
# Fit the classifier on the training split.
modelo_prototipos.fit(X_train, y_train)

### PREVISÃO DO MODELO TREINADO E ACURÁCIA

In [71]:
# Predict the class of every row in the held-out test split.
y_pred = modelo_prototipos.predict(X_test)

In [72]:
# Confusion matrix and per-class report (as a nested dict) for the test split.
matriz = confusion_matrix(y_test, y_pred)
relatorio = classification_report(y_test, y_pred, output_dict=True)

In [73]:
# Show the classification report dict and the confusion matrix computed above.
print("Relatório de Classificação:\n", relatorio)
print("Matriz de Confusão:\n", matriz)

Relatório de Classificação:
 {'0': {'precision': 0.898474463851426, 'recall': 0.7508314241371665, 'f1-score': 0.8180445679086902, 'support': 27062.0}, '1': {'precision': 0.788055948451988, 'recall': 0.9161064016369482, 'f1-score': 0.8472703310072149, 'support': 27368.0}, 'accuracy': 0.8339334925592504, 'macro avg': {'precision': 0.843265206151707, 'recall': 0.8334689128870574, 'f1-score': 0.8326574494579526, 'support': 54430.0}, 'weighted avg': {'precision': 0.8429548252430884, 'recall': 0.8339334925592504, 'f1-score': 0.8327396016121703, 'support': 54430.0}}
Matriz de Confusão:
 [[20319  6743]
 [ 2296 25072]]


### VISUALIZAÇÃO DOS DADOS

In [None]:
import seaborn as sns

# Plotar gráficos de distribuição das features em relação à variável alvo
#sns.pairplot(balanced_df, hue='SUICIDIO', vars=['ESC', 'ESTCIV', 'IDADE', 'NATURAL', 'OCUP', 'RACACOR', 'MASCULINO', 'FEMININO'])

In [88]:
def _one_row_frame(**features):
    """Build a single-row DataFrame (index 0) with one column per keyword, in the given order."""
    return pd.DataFrame(features, index=[0])


# Hand-written example rows used to try the trained model on individual cases.
geovanna = _one_row_frame(
    ESC=5, ESTCIV=1, IDADE=22, NATURAL=53, OCUP=-1, RACACOR=4, MASCULINO=False, FEMININO=True,
)

arthur = _one_row_frame(
    ESC=5, ESTCIV=1, IDADE=31, NATURAL=52, OCUP=-1, RACACOR=1, MASCULINO=True, FEMININO=False,
)

chaves = _one_row_frame(
    ESC=4, ESTCIV=2, IDADE=25, NATURAL=35, OCUP=2410, RACACOR=1, MASCULINO=True, FEMININO=False,
)

doug = _one_row_frame(
    ESC=5, ESTCIV=2, IDADE=21, NATURAL=53, OCUP=241005, RACACOR=4, MASCULINO=True, FEMININO=False,
)

cozinheiro = _one_row_frame(
    ESC=4, ESTCIV=2, IDADE=34, NATURAL=29, OCUP=5132, RACACOR=4, MASCULINO=True, FEMININO=False,
)


In [89]:
# Stack the example rows into one frame with a fresh 0..n-1 index.
# NOTE(review): `arthur` is listed twice and the `doug` frame is never
# included — confirm whether the last entry was meant to be `doug`.
teste = pd.concat((cozinheiro, chaves, arthur, geovanna, arthur), ignore_index=True)

In [90]:
with open("../models/prototype.pkl", "rb") as f:
    std=pickle.load((f))

In [91]:
columns = ['ESC', 'ESTCIV', 'OCUP', 'IDADE', 'RACACOR', 'NATURAL']
teste[columns] = std.transform(teste[columns])

x = teste[['ESC', 'ESTCIV', 'IDADE', 'OCUP', 'RACACOR', 'NATURAL', 'MASCULINO', 'FEMININO']].values

In [93]:
# Predict the hand-written example rows. The result gets its own name so the
# test-set predictions in `y_pred` are not overwritten and the evaluation
# cells above can still be re-run against `y_test`.
y_pred_exemplos = modelo_prototipos.predict(x)
y_pred_exemplos


array([0, 0, 0, 0, 0])

### ML Flow

In [None]:
import mlflow

In [None]:
# Point MLflow at the tracking server on localhost port 8080.
mlflow.set_tracking_uri("http://127.0.0.1:8080/")

In [None]:
def generate_mlflow_experiment(version, training_run_name, name):
    """Fetch the MLflow experiment called ``name``, creating it when it is missing.

    Prints the experiment's id, artifact location, tags and lifecycle stage.

    Args:
        version: Value stored under the ``"VERSION"`` key of the result.
        training_run_name: Value stored under the ``"TRAINING_RUN_NAME"`` key.
        name: Name of the MLflow experiment to look up or create.

    Returns:
        dict with the keys ``"VERSION"``, ``"TRAINING_RUN_NAME"`` and
        ``"EXPERIMENT_ID"`` (the id of the experiment that was found or created).
    """
    experiment = mlflow.get_experiment_by_name(name)
    if not experiment:
        # Not there yet: create it, then look it up again to get the full
        # experiment object rather than just its id.
        mlflow.create_experiment(name)
        experiment = mlflow.get_experiment_by_name(name)

    print(f"Experiment_id: {experiment.experiment_id}")
    print(f"Artifact Location: {experiment.artifact_location}")
    print(f"Tags: {experiment.tags}")
    print(f"Lifecycle_stage: {experiment.lifecycle_stage}")

    return dict(
        VERSION=version,
        TRAINING_RUN_NAME=training_run_name,
        EXPERIMENT_ID=experiment.experiment_id,
    )


def generate_mlflow_params(mlflow_experiment, df):
    """Build the keyword arguments for ``mlflow.start_run`` from an experiment description.

    Args:
        mlflow_experiment: Mapping read with ``.get`` for the keys ``"VERSION"``,
            ``"TRAINING_RUN_NAME"`` and ``"EXPERIMENT_ID"``; a missing key
            yields ``None`` in the result.
        df: Any sized object; its length is recorded (as a string) in the
            ``num_samples`` tag.

    Returns:
        dict with the keys ``run_name``, ``experiment_id`` and ``tags``.
    """
    run_tags = {
        "release.version": mlflow_experiment.get("VERSION"),
        "model.name": "PROTOTYPE",
        "num_samples": str(len(df)),
    }

    return {
        "run_name": mlflow_experiment.get("TRAINING_RUN_NAME"),
        "experiment_id": mlflow_experiment.get("EXPERIMENT_ID"),
        "tags": run_tags,
    }


In [None]:
def plot_cm(y_test, y_pred):
    """Render the confusion matrix of ``y_pred`` against ``y_test`` and return the figure.

    Side effects: closes every open matplotlib figure and switches the
    matplotlib backend to ``"Agg"`` before plotting.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        The ``figure_`` attribute of the plotted ``ConfusionMatrixDisplay``.
    """
    cm_values = confusion_matrix(y_test, y_pred)

    # Start from a clean slate on the Agg backend
    # (presumably so the figure is only rendered to a file, not shown inline).
    plt.close("all")
    plt.switch_backend("Agg")

    # plot() returns the display itself, so the figure can be read in one chain.
    return ConfusionMatrixDisplay(cm_values).plot().figure_
def get_metrics(y_test, y_pred):
    """Flatten the classification report into a single-level dict and attach the confusion-matrix figure.

    Nested report entries become ``"<metric>_<label>"`` keys (for example
    ``"precision_0"``); scalar entries keep their original key. The figure
    returned by ``plot_cm`` is stored under ``"cm"``.

    Args:
        y_test: True labels.
        y_pred: Predicted labels.

    Returns:
        dict of flattened report values plus the ``"cm"`` figure.
    """
    report = classification_report(y_test, y_pred, output_dict=True)

    flat = {}
    for label, entry in report.items():
        if not isinstance(entry, dict):
            # Scalar entry: copy it through unchanged.
            flat[label] = entry
            continue
        # Per-label block: one flattened key per metric.
        flat.update({f'{stat}_{label}': score for stat, score in entry.items()})

    flat['cm'] = plot_cm(y_test, y_pred)
    return flat
    

In [None]:
def prototype_flow(prototype_params_grid, df):
    """Train and evaluate one NearestCentroid model per parameter set, logging each as an MLflow run.

    The data is split once (80% train / 20% test, ``random_state=666``) and the
    same split is reused for every parameter set. For each run the parameters
    (prefixed with ``prototype_``), the confusion-matrix figure (``cm.png``),
    the flattened classification-report metrics and two timings are logged.

    Args:
        prototype_params_grid: Iterable of dicts; each dict is passed as
            keyword arguments to ``NearestCentroid``.
        df: DataFrame containing the feature columns listed below and the
            ``SUICIDIO`` target column.

    Returns:
        None. Results are only recorded in MLflow.
    """
    name = "PROTOTYPE"
    run_name = "prototype_acc"
    version = "prototype-1.0"
    mlflow_experiment = generate_mlflow_experiment(version, run_name, name)
    mlflow_params = generate_mlflow_params(mlflow_experiment, df)

    feature_columns = ['ESC', 'ESTCIV', 'IDADE', 'NATURAL', 'OCUP', 'RACACOR', 'MASCULINO', 'FEMININO']
    X = df[feature_columns].values
    Y = df['SUICIDIO'].values
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=666)

    for prototype_params in prototype_params_grid:
        with mlflow.start_run(**mlflow_params):
            mlflow.log_params({f"prototype_{key}": value for key, value in prototype_params.items()})
            modelo_prototipos = NearestCentroid(**prototype_params)

            # perf_counter is monotonic and high-resolution, so the measured
            # durations cannot be distorted by system clock adjustments.
            tic = time.perf_counter()
            modelo_prototipos.fit(X_train, y_train)
            fit_time = time.perf_counter() - tic

            y_pred = modelo_prototipos.predict(X_test)

            # The evaluate time covers metric computation and confusion-matrix
            # rendering; the prediction step above is not timed.
            tic = time.perf_counter()
            metrics = get_metrics(y_test, y_pred)
            evaluate_time = time.perf_counter() - tic

            # The figure is logged as an artifact, not a metric: take it out of
            # the dict, log it, then close it to release its memory.
            cm_figure = metrics.pop('cm')
            mlflow.log_figure(cm_figure, 'cm.png')
            plt.close(cm_figure)

            model_metrics = {
                **metrics,
                "prototype_fit_time": fit_time,
                "prototype_evaluate_time": evaluate_time
            }
            mlflow.log_metrics(model_metrics)



In [None]:
%%time
from sklearn.model_selection import ParameterGrid
# Search space: both distance metrics crossed with shrink thresholds starting
# at 0.01 in steps of 0.10 (upper bound 1.01, exclusive).
prototype_grid = dict(
    metric=['manhattan', 'euclidean'],
    shrink_threshold=np.arange(0.01, 1.01, 0.10),
)

# Expand the grid into individual parameter dicts and log one MLflow run each.
prototype_params_grid = ParameterGrid(prototype_grid)
prototype_flow(prototype_params_grid, balanced_df)


Experiment_id: 391844426102414961
Artifact Location: mlflow-artifacts:/391844426102414961
Tags: {}
Lifecycle_stage: active
CPU times: user 10.6 s, sys: 1.96 s, total: 12.6 s
Wall time: 34.1 s
