### IMPORTANDO AS BIBLIOTECAS

In [1]:
from numpy import mean, std, arange
import time
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, GridSearchCV
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import pickle

### LEITURA DA BASE DE DADOS

In [90]:
df = pd.read_parquet('../data/processed/sim_2006_2017_pre_processed.parquet')
suicidio_0 = df[df['SUICIDIO'] == 0]
suicidio_1 = df[df['SUICIDIO'] == 1]
suicidio_0_sample = suicidio_0.sample(n=len(suicidio_1)*5, replace=False)
balanced_df = pd.concat([suicidio_0_sample, suicidio_1], ignore_index=True).reset_index(drop=True)
df

Unnamed: 0,ESC,ESTCIV,IDADE,NATURAL,OCUP,RACACOR,MASCULINO,FEMININO,SUICIDIO
0,-1.732328,0.809986,-0.187545,-1,0.844239,-0.484485,True,False,0
1,-0.226709,-0.329698,0.753377,-1,-1.066134,3.994021,True,False,0
2,-0.226709,0.240144,1.380658,-1,-0.337181,-0.484485,False,True,0
3,-0.226709,-0.899540,1.171564,-2,-1.066134,-0.484485,True,False,0
4,-1.732328,0.240144,0.701103,-1,-1.587212,-0.484485,False,True,0
...,...,...,...,...,...,...,...,...,...
3045770,-0.979518,-0.899540,-1.337560,35,0.844239,3.994021,False,True,0
3045771,-0.000199,3.659197,-0.082998,35,-1.587212,-0.484485,True,False,0
3045772,-0.226709,-0.899540,-0.605732,29,0.844239,1.194955,True,False,0
3045773,-0.226709,0.240144,1.067017,35,0.844239,-0.484485,True,False,0


### VISUALIZAÇÃO DA BASE DE DADOS

In [91]:
balanced_df

Unnamed: 0,ESC,ESTCIV,IDADE,NATURAL,OCUP,RACACOR,MASCULINO,FEMININO,SUICIDIO
0,0.526100,0.240144,0.648830,35,0.844239,-0.484485,False,True,0
1,0.526100,-0.329698,0.648830,29,0.844239,3.994021,True,False,0
2,-0.000199,-0.899540,-0.501185,35,-1.025031,1.194955,False,True,0
3,-0.000199,3.659197,-0.135271,27,0.844239,3.994021,False,True,0
4,-0.226709,-0.329698,0.910197,29,0.844239,-0.484485,True,False,0
...,...,...,...,...,...,...,...,...,...
141487,-0.226709,-0.899540,-2.435302,35,-0.338883,1.194955,False,True,1
141488,2.031719,-0.899540,-1.912568,35,-0.822745,1.194955,True,False,1
141489,-0.226709,-0.329698,0.962471,35,0.844239,1.194955,True,False,1
141490,-1.732328,-0.899540,-0.762552,35,-1.587212,-0.484485,True,False,1


#### Definindo X e Y de acordo com a base de dados balanceada

In [92]:
X = balanced_df[['ESC', 'ESTCIV', 'IDADE' , 'NATURAL' , 'OCUP', 'RACACOR' ,'MASCULINO',	'FEMININO'	]].values
Y = balanced_df['SUICIDIO'].values

#### Divisão entre teste e treino 

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=666)

### TREINANDO O MODELO
Modelo escolhido: Centróides.
Como nossa base de dados já apresenta os dados de suicídio, não foi necessária a clusterização. Portanto, os centróides de clusters podem ser utilizados para identificar os pontos centrais. Cada centróide representa os aspectos gerais de uma determinada classe.

In [98]:
from numpy import std

modelo_prototipos = NearestCentroid(metric='manhattan', shrink_threshold= 0.95)


### Cross-validation

In [95]:
from sklearn.metrics import recall_score
cv = RepeatedStratifiedKFold(n_splits=9, n_repeats=3, random_state=666)
scores = cross_val_score(modelo_prototipos, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)

print('accuracy : %.3f (%.3f)' % (mean(scores), std(scores)))

Precision : 0.192 (0.003)


### Hyperparameters

In [96]:
grid = dict()
grid['shrink_threshold'] = arange(0, 1.01, 0.05)
grid['metric'] = ['manhattan', 'euclidean']
# define search
search = GridSearchCV(modelo_prototipos, grid, scoring='accuracy', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(X_train, y_train)
# summarize
print('accuracy: %.3f' % results.best_score_)
print('Config: %s' % results.best_params_)

accuracy: 0.557
Config: {'metric': 'manhattan', 'shrink_threshold': 0.9500000000000001}


54 fits failed out of a total of 1134.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
54 fits failed with the following error:
Traceback (most recent call last):
  File "/mnt/c/Users/arthu/Desktop/Grupo/Aprendizado de Maquina/Apoio-vida/apoio-venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/mnt/c/Users/arthu/Desktop/Grupo/Aprendizado de Maquina/Apoio-vida/apoio-venv/lib/python3.10/site-packages/sklearn/base.py", line 1467, in wrapper
    estimator._validate_params()
  File "/mnt/c/Users/arthu/Desktop/Grupo/Aprendizado de Maquina/Apoio-vida/apoio-venv/lib/python3.10/site-packages/sklearn/base.py", line 666, in _validate_params

In [99]:
modelo_prototipos.fit(X_train, y_train)

### PREVISÃO DO MODELO TREINADO E ACURÁCIA

In [100]:
y_pred = modelo_prototipos.predict(X_test)

In [101]:
relatorio = classification_report(y_test, y_pred, output_dict=True)
matriz = confusion_matrix(y_test, y_pred)

In [102]:
print("Relatório de Classificação:\n", relatorio)
print("Matriz de Confusão:\n", matriz)

Relatório de Classificação:
 {'0': {'precision': 0.8592961530899796, 'recall': 0.5542311268915378, 'f1-score': 0.673844335380425, 'support': 23658.0}, '1': {'precision': 0.19125766871165645, 'recall': 0.537384184443008, 'f1-score': 0.2821107403427408, 'support': 4641.0}, 'accuracy': 0.5514682497614757, 'macro avg': {'precision': 0.5252769109008181, 'recall': 0.5458076556672729, 'f1-score': 0.4779775378615829, 'support': 28299.0}, 'weighted avg': {'precision': 0.7497386914835695, 'recall': 0.5514682497614757, 'f1-score': 0.6096005241302079, 'support': 28299.0}}
Matriz de Confusão:
 [[13112 10546]
 [ 2147  2494]]


### VISUALIZAÇÃO DOS DADOS

In [None]:
import seaborn as sns

# Plotar gráficos de distribuição das features em relação à variável alvo
#sns.pairplot(balanced_df, hue='SUICIDIO', vars=['ESC', 'ESTCIV', 'IDADE', 'NATURAL', 'OCUP', 'RACACOR', 'MASCULINO', 'FEMININO'])

In [103]:
geovanna = pd.DataFrame({'ESC': 5,    'ESTCIV': 1,    'IDADE': 22,    'NATURAL': 53,    'OCUP': -1,    'RACACOR': 4,    'MASCULINO': False,    'FEMININO': True}, index=[0])

arthur = pd.DataFrame({'ESC': 5,    'ESTCIV': 2,    'IDADE': 21,    'NATURAL': 52,    'OCUP': -1,    'RACACOR': 1,    'MASCULINO': True,    'FEMININO': False}, index=[0])

iranildo = pd.DataFrame({'ESC': 2,    'ESTCIV': 3,    'IDADE': 57,    'NATURAL': 23,    'OCUP': 999993,    'RACACOR': 1,    'MASCULINO': True,    'FEMININO': False}, index=[0])

genivaldo = pd.DataFrame({'ESC': 3,    'ESTCIV': 3,    'IDADE': 37,    'NATURAL': 35,    'OCUP': 2410,    'RACACOR': 1,    'MASCULINO': True,    'FEMININO': False}, index=[0])

doug = pd.DataFrame({'ESC': 5,    'ESTCIV': 2,    'IDADE': 21,    'NATURAL': 53,    'OCUP': 2410,    'RACACOR': 4,    'MASCULINO': True,    'FEMININO': False}, index=[0])

cozinheiro = pd.DataFrame({'ESC': 3,    'ESTCIV': 2,    'IDADE': 34,    'NATURAL': 29,    'OCUP': 5132,    'RACACOR': 4,    'MASCULINO': True,    'FEMININO': False}, index=[0])

In [104]:
teste = pd.concat((iranildo, genivaldo, arthur, cozinheiro, geovanna, doug), ignore_index=True)

In [105]:
with open("../models/prototype.pkl", "rb") as f:
    std=pickle.load((f))

In [106]:
columns = ['ESC', 'ESTCIV', 'OCUP', 'IDADE', 'RACACOR', 'NATURAL']
teste[columns] = std.transform(teste[columns])

x = teste[['ESC', 'ESTCIV', 'IDADE', 'OCUP', 'RACACOR', 'NATURAL', 'MASCULINO', 'FEMININO']].values

In [107]:
y_pred = modelo_prototipos.predict(x)
y_pred


array([0, 0, 0, 0, 0, 0])

### ML Flow

In [None]:
import mlflow

In [None]:
mlflow.set_tracking_uri("http://127.0.0.1:8080/")

In [None]:
def generate_mlflow_experiment(version, training_run_name, name):
    experiment = mlflow.get_experiment_by_name(name)
    if not experiment:
        mlflow.create_experiment(name)
        experiment = mlflow.get_experiment_by_name(name)

    mlflow_experiment = {
        "VERSION": version,
        "TRAINING_RUN_NAME": training_run_name,
        "EXPERIMENT_ID": experiment.experiment_id
    }

    print(f"Experiment_id: {experiment.experiment_id}")
    print(f"Artifact Location: {experiment.artifact_location}")
    print(f"Tags: {experiment.tags}")
    print(f"Lifecycle_stage: {experiment.lifecycle_stage}")

    return mlflow_experiment


def generate_mlflow_params(mlflow_experiment, df):
    mlflow_tags = {
        "release.version": mlflow_experiment.get("VERSION"),
        "model.name": "PROTOTYPE", 
        "num_samples": str(len(df))
    }

    mlflow_params = dict(
        run_name=mlflow_experiment.get("TRAINING_RUN_NAME"),
        experiment_id=mlflow_experiment.get("EXPERIMENT_ID"),
        tags=mlflow_tags
    )

    return mlflow_params


In [None]:
def plot_cm(y_test, y_pred):
    matriz = confusion_matrix(y_test, y_pred)
    plt.close("all")
    plt.switch_backend("Agg")
    fig = ConfusionMatrixDisplay(matriz)
    fig.plot()
    return fig.figure_
def get_metrics(y_test, y_pred):
    relatorio = classification_report(y_test, y_pred, output_dict=True)
    
    metrics = {}

    for key, value in relatorio.items():
        if isinstance(value, dict):
            for inner_key, inner_value in value.items():
                metrics[f'{inner_key}_{key}'] = inner_value
        else:
            metrics[key] = value

    metrics['cm'] = plot_cm(y_test, y_pred)
    return metrics
    

In [None]:
def prototype_flow(prototype_params_grid, df):
    name = "PROTOTYPE"
    run_name = "prototype_acc"
    version = "prototype-1.0"
    mlflow_experiment = generate_mlflow_experiment(version, run_name, name)
    mlflow_params = generate_mlflow_params(mlflow_experiment , df)
    
    X = df[['ESC', 'ESTCIV', 'IDADE' , 'NATURAL' , 'OCUP', 'RACACOR' ,'MASCULINO',	'FEMININO'	]].values
    Y = df['SUICIDIO'].values
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=666)

    for prototype_params in prototype_params_grid:
        with mlflow.start_run(**mlflow_params) as run:
            mlflow.log_params({f"prototype_{key}": value for key, value in prototype_params.items()})
            modelo_prototipos = NearestCentroid(**prototype_params)

            tic = time.time()
            modelo_prototipos.fit(X_train, y_train)
            tac = time.time()
            fit_time = tac-tic

            y_pred = modelo_prototipos.predict(X_test)

            tic = time.time()
            metrics = get_metrics(y_test, y_pred)
            tac = time.time()
            evaluate_time = tac-tic

            mlflow.log_figure(metrics['cm'], 'cm.png')
            del metrics['cm']

            model_metrics = {
                **metrics,
                "prototype_fit_time": fit_time,
                "prototype_evaluate_time": evaluate_time
            }
            # for key, value in metrics.items():
            #     print(f"{key}: {value}")
            # break
            mlflow.log_metrics(model_metrics)



In [None]:
%%time
from sklearn.model_selection import ParameterGrid
prototype_grid = {
    'metric': ['manhattan', 'euclidean'], 
    'shrink_threshold': np.arange(0.01, 1.01, 0.10)
}
prototype_params_grid = ParameterGrid(prototype_grid)
prototype_flow(prototype_params_grid, balanced_df)


Experiment_id: 391844426102414961
Artifact Location: mlflow-artifacts:/391844426102414961
Tags: {}
Lifecycle_stage: active
CPU times: user 10.6 s, sys: 1.96 s, total: 12.6 s
Wall time: 34.1 s
