In [106]:
import pandas as pd
import numpy as np

# Load the dataset from the relative path ./data/diabetes.csv into a DataFrame.
df = pd.read_csv("./data/diabetes.csv")

In [107]:
# Label column; every remaining column of df is treated as a numeric predictor.
target = "Outcome"
num_cols = df.columns.drop([target])  # "X"

# Pipeline

In [108]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Numeric predictors go through a single standardisation step.
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Route every predictor column listed in num_cols through the numeric pipeline.
preprocessor = ColumnTransformer(
    transformers=[("num", num_pipeline, num_cols)],
)

# Methods - Prepare

In [109]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

def stratify(df: pd.DataFrame, col_name: str = ""):
    """Split ``df`` into stratified train/test predictors and labels.

    A single ``StratifiedShuffleSplit`` with ``test_size=0.2`` and
    ``random_state=42`` is drawn, stratified on the module-level ``target``
    column.

    Parameters
    ----------
    df : pd.DataFrame
        Full dataset; must contain the ``target`` column.
    col_name : str, optional
        Not used by the split. It is accepted only so that existing calls
        such as ``stratify(df, '')`` keep working.

    Returns
    -------
    tuple
        ``(train_set, train_labels, test_set, test_labels)``. Every element
        has its index reset to 0..n-1.
    """
    columns_predictors = df.drop(columns=[target])  # "X"
    labels = df[target]  # "Y"

    splitter = StratifiedShuffleSplit(
        n_splits=1,
        test_size=0.2,
        random_state=42,  # seed
    )

    # With n_splits=1 the generator yields exactly one (train, test) pair, so
    # take it directly instead of looping.
    train_index, test_index = next(splitter.split(df, labels))

    # The splitter returns *positional* row numbers, so they must be used with
    # .iloc. Looking them up with .loc treats them as index labels, which
    # fails or selects the wrong rows whenever df does not carry a default
    # 0..n-1 index (for example after filtering rows).
    train_set = columns_predictors.iloc[train_index].reset_index(drop=True)
    test_set = columns_predictors.iloc[test_index].reset_index(drop=True)

    train_labels = labels.iloc[train_index].reset_index(drop=True)
    test_labels = labels.iloc[test_index].reset_index(drop=True)

    return train_set, train_labels, test_set, test_labels

def prepare_train(df: pd.DataFrame):
    """Fit the module-level ``preprocessor`` on ``df`` and return the result.

    Parameters
    ----------
    df : pd.DataFrame
        Training predictors; must contain the columns listed in ``num_cols``.

    Returns
    -------
    tuple
        ``(df_transformed, all_feature_names)`` where ``df_transformed`` is the
        transformed data as a DataFrame and ``all_feature_names`` is
        ``num_cols``, the column labels used for that frame.
    """
    # fit_transform both learns the preprocessing parameters and applies them,
    # so this must only ever be called on training data.
    prepared_data = preprocessor.fit_transform(df)

    all_feature_names = num_cols

    # Keep the caller's row index, as prepare_predict does, so the transformed
    # rows stay aligned with the input frame.
    df_transformed = pd.DataFrame(
        prepared_data,
        columns=all_feature_names,
        index=df.index,
    )

    return df_transformed, all_feature_names

def prepare_predict(df: pd.DataFrame, all_feature_names: list):
    """Transform ``df`` with the module-level ``preprocessor`` without refitting.

    Parameters
    ----------
    df : pd.DataFrame
        Predictors to transform.
    all_feature_names : list
        Column labels to assign to the transformed data.

    Returns
    -------
    pd.DataFrame
        Transformed values labelled with ``all_feature_names`` and carrying the
        same row index as ``df``.
    """
    # Only transform() is called here, so the preprocessor has to be fitted
    # beforehand (prepare_train does that via fit_transform).
    transformed_values = preprocessor.transform(df)

    return pd.DataFrame(
        data=transformed_values,
        index=df.index,
        columns=all_feature_names,
    )

# Methods - Evaluation (Accuracy, Precision, Recall, F1, Confusion Matrix)

In [110]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

def evaluate(y_true, y_pred):
    """Print classification metrics comparing ``y_pred`` against ``y_true``.

    Prints accuracy, precision, recall and F1 (each with four decimals),
    followed by the confusion matrix. Nothing is returned.
    """
    print("Avaliação do Modelo:")

    accuracy = accuracy_score(y_true, y_pred)
    print(f"Acurácia: {accuracy:.4f}")

    precision = precision_score(y_true, y_pred)
    print(f"Precisão: {precision:.4f}")

    recall = recall_score(y_true, y_pred)
    print(f"Recall: {recall:.4f}")

    f1 = f1_score(y_true, y_pred)
    print(f"F1-Score: {f1:.4f}")

    matrix = confusion_matrix(y_true, y_pred)
    print("\nMatriz de Confusão:\n", matrix)

In [111]:
# Stratified train/test split; the second argument is not used inside stratify().
train_set, train_labels, test_set, test_labels = stratify(df, '')

In [112]:
# Fit the preprocessor on the training predictors only and keep the feature names for later use.
train_prepared, all_feature_names = prepare_train(train_set)

# Models

In [114]:
from sklearn.linear_model import LogisticRegression
def logisticRegressionModel():
    """Fit a default ``LogisticRegression`` and print its test-set metrics.

    Reads the notebook-level ``train_prepared``, ``train_labels``, ``test_set``,
    ``test_labels`` and ``all_feature_names``. Returns nothing; the metrics are
    printed by ``evaluate``.
    """
    # fit() returns the estimator itself, so construction and training chain.
    classifier = LogisticRegression().fit(train_prepared, train_labels)

    # Run the held-out predictors through prepare_predict before predicting.
    test_features = prepare_predict(test_set, all_feature_names)

    evaluate(test_labels, classifier.predict(test_features))

# Train the model and print its metrics on the held-out test set.
logisticRegressionModel()

Avaliação do Modelo:
Acurácia: 0.7143
Precisão: 0.6087
Recall: 0.5185
F1-Score: 0.5600

Matriz de Confusão:
 [[82 18]
 [26 28]]
