# Métricas de Pré-Treino


In [1]:
import numpy as np
import pandas as pd
# Read datasets/heart.csv into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='datasets/heart.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


# Class Imbalance

In [2]:
def class_imbalance(data: pd.DataFrame, attribute: str, privileged_class) -> float:
    """Compute the Class Imbalance (CI) of ``attribute`` with respect to one class.

    CI = (n_a - n_d) / (n_a + n_d), where ``n_a`` is the number of rows whose
    ``attribute`` equals ``privileged_class`` and ``n_d`` is the number of all
    other (non-missing) rows. The result lies in [-1, 1]: positive values mean
    the privileged class is over-represented, negative values the opposite.

    Args:
        data: Frame containing the ``attribute`` column.
        attribute: Name of the column to measure.
        privileged_class: Value of ``attribute`` treated as the privileged class.

    Returns:
        The class imbalance as a float. If ``privileged_class`` never occurs in
        the column the result is -1.0.

    Raises:
        ValueError: If the column has no non-missing values, in which case the
            ratio is undefined.
    """
    # value_counts() ignores missing values, so they count towards neither group.
    class_counts = data[attribute].value_counts()

    n_total = class_counts.sum()
    if n_total == 0:
        raise ValueError(
            f"Cannot compute class imbalance: column {attribute!r} has no values."
        )

    # A privileged class that is absent from the data simply has zero members.
    n_a = class_counts.get(privileged_class, 0)
    n_d = n_total - n_a

    return float((n_a - n_d) / n_total)

In [3]:
# Build a 10-row frame with 8 rows where sex == 1 and 2 rows where sex == 0,
# so the expected class imbalance for class 1 is (8 - 2) / 10 = 0.6.
# A fixed random_state makes the sampled rows reproducible on every run.
df_ci = pd.concat(
    [
        df[df['sex'] == 1].sample(8, random_state=42),
        df[df['sex'] == 0].sample(2, random_state=42),
    ],
    ignore_index=True,
)

class_imbalance(df_ci, "sex", 1)

0.6

# KL Divergence

In [4]:
def kl_divergence_from_df(data: pd.DataFrame, target, protected_attr, p, d) -> float:
    """Compute the KL divergence between the label distributions of two groups.

    For each group, the probability of ``target == 1`` and of ``target == 0`` is
    estimated as the fraction of that group's rows carrying the label. The
    result is KL(P_p || P_d) = sum over y in {1, 0} of
    P_p(y) * log(P_p(y) / P_d(y)), using the natural logarithm.

    Args:
        data: Frame containing the ``target`` and ``protected_attr`` columns.
        target: Name of the label column; only the values 1 and 0 are counted.
        protected_attr: Name of the column that defines the two groups.
        p: Value of ``protected_attr`` selecting the first group (distribution P_p).
        d: Value of ``protected_attr`` selecting the second group (distribution P_d).

    Returns:
        The divergence as a float. A term with P_p(y) == 0 contributes 0
        (the 0 * log 0 = 0 convention); if P_p(y) > 0 while P_d(y) == 0 the
        divergence is ``inf``.

    Raises:
        ZeroDivisionError: If either group has no rows in ``data``.
    """

    def _label_probabilities(group_value):
        # Fractions of the group's rows whose target is 1 and 0, respectively.
        group_target = data.loc[data[protected_attr] == group_value, target]
        group_size = group_target.shape[0]
        if group_size == 0:
            raise ZeroDivisionError(
                f"No rows with {protected_attr!r} == {group_value!r}; "
                "label probabilities are undefined."
            )
        prob_y1 = int((group_target == 1).sum()) / group_size
        prob_y0 = int((group_target == 0).sum()) / group_size
        return prob_y1, prob_y0

    def _kl_term(prob_p, prob_d):
        # One summand of the KL divergence, with the limiting cases made explicit.
        if prob_p == 0:
            return 0.0
        if prob_d == 0:
            return float("inf")
        return prob_p * np.log(prob_p / prob_d)

    p_p_y1, p_p_y0 = _label_probabilities(p)
    p_d_y1, p_d_y0 = _label_probabilities(d)

    # Sum the contributions of both label values.
    return float(_kl_term(p_p_y1, p_d_y1) + _kl_term(p_p_y0, p_d_y0))

In [5]:
# Split the dataset into two frames by the value of the 'sex' column
# (1 is labelled "male" and 0 "female" by the variable names used here).
df_only_male = df[df['sex'].eq(1)]
df_only_female = df[df['sex'].eq(0)]


In [6]:
# Build a 20-row frame with a deliberately skewed label distribution per group:
# sex == 0 -> 7 rows with target 1 and 3 with target 0;
# sex == 1 -> 2 rows with target 1 and 8 with target 0.
# A fixed random_state makes the sampled rows reproducible on every run.
df_mixed = pd.concat(
    [
        df_only_female.loc[df_only_female["target"] == 1].sample(7, random_state=42),
        df_only_female.loc[df_only_female["target"] == 0].sample(3, random_state=42),
        df_only_male.loc[df_only_male["target"] == 1].sample(2, random_state=42),
        df_only_male.loc[df_only_male["target"] == 0].sample(8, random_state=42),
    ]
)

In [7]:
# KL divergence of the label distribution of sex == 1 relative to sex == 0.
kl_divergence_from_df(
    data=df_mixed,
    target='target',
    protected_attr='sex',
    p=1,
    d=0,
)

0.5341108087103075

# Post-Training

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Features are every column except the label column.
X = df.drop(columns='target')
y = df['target']

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42
)

model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)

# Boolean masks over the test rows, one per value of the 'sex' column.
male_indices = X_test['sex'].eq(1)
female_indices = X_test['sex'].eq(0)

# labels=[1, 0] orders rows and columns as [positive, negative], so each
# per-group matrix is laid out as
#   [[TP, FN],
#    [FP, TN]]
cm_male = confusion_matrix(
    y_test[male_indices], y_pred[male_indices], labels=[1, 0]
)
cm_female = confusion_matrix(
    y_test[female_indices], y_pred[female_indices], labels=[1, 0]
)

print("Confusion Matrix for Male:")
print(cm_male)
print("\nConfusion Matrix for Female:")
print(cm_female)

# Confusion-matrix cells for the sex == 1 group (suffix "p").
(TPp, FNp), (FPp, TNp) = cm_male
recall_male = TPp / (TPp + FNp)
for label, count in (("TPp: ", TPp), ("FNp:", FNp), ("FPp:", FPp), ("TNp:", TNp)):
    print(label, count)

# Confusion-matrix cells for the sex == 0 group (suffix "d").
(TPd, FNd), (FPd, TNd) = cm_female
for label, count in (("TPd: ", TPd), ("FNd:", FNd), ("FPd:", FPd), ("TNd:", TNd)):
    print(label, count)
recall_female = TPd / (TPd + FNd)

print("Recall for Female: ", recall_female)
print("Recall for Male: ", recall_male)

# Recall difference: recall of the "p" group minus recall of the "d" group.
RD = recall_male - recall_female
print("Recall Diference: ", RD)

# DAR: difference in TP / (TP + FP) between the "p" and "d" groups.
precision_male = TPp / (TPp + FPp)
precision_female = TPd / (TPd + FPd)
DAR = precision_male - precision_female
print("DAR: ", DAR)

# DRR: difference in TN / (TN + FN) between the "d" and "p" groups.
rejection_rate_female = TNd / (TNd + FNd)
rejection_rate_male = TNp / (TNp + FNp)
DRR = rejection_rate_female - rejection_rate_male
print("DRR: ", DRR)

Confusion Matrix for Male:
[[33 10]
 [ 9 36]]

Confusion Matrix for Female:
[[26  1]
 [ 0  7]]
TPp:  33
FNp: 10
FPp: 9
TNp: 36
TPd:  26
FNd: 1
FPd: 0
TNd: 7
Recall for Female:  0.9629629629629629
Recall for Male:  0.7674418604651163
Recall Diference:  -0.1955211024978466
DAR:  -0.2142857142857143
DRR:  0.09239130434782605
