1. Data Preprocessing

In [10]:


import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df=pd.read_csv("data/raw/adult.csv")

# Drop rows with missing values
df=df.dropna()

# Encode categorial variables
le = LabelEncoder()
for col in df.select_dtypes(include="object"):
    df[col] = le.fit_transform(df[col])

# Split features an target
X = df.drop("income", axis=1)
y = df["income"]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

X_train.shape, X_test.shape



((22792, 14), (9769, 14))

2. Train the Base Model

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression(solver='saga', max_iter=5000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nReport:\n", classification_report(y_test, y_pred))

Accuracy: 0.7951683898044836

Report:
               precision    recall  f1-score   support

           0       0.80      0.97      0.88      7429
           1       0.70      0.25      0.37      2340

    accuracy                           0.80      9769
   macro avg       0.75      0.61      0.62      9769
weighted avg       0.78      0.80      0.76      9769



In [2]:
# === Model 1 — Logistic Regression (Baseline) ===
import os
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

BASE_DIR = os.path.dirname(os.path.abspath(os.getcwd())) if "ipykernel" in globals() else os.getcwd()
DATA_PATH  = os.path.normpath(os.path.join(BASE_DIR, "data", "raw", "adult.csv"))
MODELS_DIR = os.path.normpath(os.path.join(BASE_DIR, "models"))
os.makedirs(MODELS_DIR, exist_ok=True)

df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["sex", "race", "income"])
df["income"] = df["income"].apply(lambda x: 1 if ">50K" in str(x) else 0)

# --- Encode categóricas ---
def encode_dataframe(df_):
    df_ = df_.copy()
    encoders = {}
    for col in df_.select_dtypes(include="object"):
        if col != "income":
            le = LabelEncoder()
            df_[col] = le.fit_transform(df_[col].astype(str))
            encoders[col] = le
    return df_, encoders

df_enc, encoders = encode_dataframe(df)

X = df_enc.drop(columns=["income"])
y = df_enc["income"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# --- Padronização ---
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

# --- Modelo ---
model_lr = LogisticRegression(solver="saga", max_iter=5000, random_state=42)
model_lr.fit(X_train_sc, y_train)

# --- Avaliação ---
proba = model_lr.predict_proba(X_test_sc)[:, 1]
pred = (proba >= 0.5).astype(int)

acc = accuracy_score(y_test, pred)
f1  = f1_score(y_test, pred)
auc = roc_auc_score(y_test, proba)

print(f"[Test] Accuracy: {acc:.4f} | F1: {f1:.4f} | ROC-AUC: {auc:.4f}")

# --- Salvar artefatos ---
joblib.dump(model_lr, os.path.join(MODELS_DIR, "logistic_regression.joblib"))
joblib.dump(scaler,    os.path.join(MODELS_DIR, "scaler.joblib"))
joblib.dump(encoders,  os.path.join(MODELS_DIR, "encoders_label.joblib"))

print("✅ Logistic Regression salva em:", MODELS_DIR)


[Test] Accuracy: 0.8253 | F1: 0.5572 | ROC-AUC: 0.8541
✅ Logistic Regression salva em: C:\Users\diego\OneDrive\Área de Trabalho\TÓKIO\TAREFAS\Inteligencia Artificial\ai-ethics-dashboard\models


In [1]:
# === Model 2 — RandomForest + Tuning (CV) ===
import os
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# -----------------------------
# Configs & paths
# -----------------------------
BASE_DIR = os.path.dirname(os.path.abspath(os.getcwd())) if "ipykernel" in globals() else os.getcwd()
# Ajuste se necessário para apontar corretamente ao projeto; exemplo comum:
# BASE_DIR = os.path.dirname(os.path.abspath(__file__))  # se rodar como script
DATA_PATH  = os.path.normpath(os.path.join(BASE_DIR, "data", "raw", "adult.csv"))
MODELS_DIR = os.path.normpath(os.path.join(BASE_DIR, "models"))
os.makedirs(MODELS_DIR, exist_ok=True)

print("Using data:", DATA_PATH)
print("Saving to :", MODELS_DIR)

# -----------------------------
# Load & prepare (mesma lógica do dashboard)
# -----------------------------
df = pd.read_csv(DATA_PATH)
df = df.dropna(subset=["sex", "race", "income"])
df["income"] = df["income"].apply(lambda x: 1 if ">50K" in str(x) else 0)

def encode_dataframe(df_: pd.DataFrame):
    df_ = df_.copy()
    encoders = {}
    for col in df_.select_dtypes(include="object"):
        if col != "income":  # não re-encodar o alvo
            le = LabelEncoder()
            df_[col] = le.fit_transform(df_[col].astype(str))
            encoders[col] = le
    return df_, encoders

df_enc, encoders = encode_dataframe(df)

X = df_enc.drop(columns=["income"])
y = df_enc["income"]

# split fixo para reprodutibilidade e comparabilidade com o dashboard
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

# padronização (dashboard usa StandardScaler antes da Logistic Regression)
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc  = scaler.transform(X_test)

# -----------------------------
# Tuning: RandomForest (GridSearchCV)
# -----------------------------
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier(random_state=42, n_jobs=-1)
param_grid = {
    "n_estimators": [150, 250, 350],
    "max_depth": [None, 8, 12, 16],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2"]
}

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring="f1",    # pode trocar para "roc_auc" se preferir
    cv=cv,
    n_jobs=-1,
    verbose=1
)
grid.fit(X_train_sc, y_train)

best_rf = grid.best_estimator_
best_params = grid.best_params_
best_cv_f1 = grid.best_score_

print("Best params:", best_params)
print("Best CV F1:", round(best_cv_f1, 4))

# -----------------------------
# Avaliação em holdout (teste)
# -----------------------------
proba = best_rf.predict_proba(X_test_sc)[:, 1]
pred  = (proba >= 0.5).astype(int)

acc  = accuracy_score(y_test, pred)
f1   = f1_score(y_test, pred)
auc  = roc_auc_score(y_test, proba)

print(f"[Test] Accuracy: {acc:.4f} | F1: {f1:.4f} | ROC-AUC: {auc:.4f}")

# -----------------------------
# Salvar artefatos
# -----------------------------
joblib.dump(best_rf, os.path.join(MODELS_DIR, "random_forest.joblib"))
# Se você já tiver salvo a LR, mantenha; aqui salvamos/atualizamos o scaler
joblib.dump(scaler,  os.path.join(MODELS_DIR, "scaler.joblib"))
# (opcional) salvar os encoders de categorias para reuso
joblib.dump(encoders, os.path.join(MODELS_DIR, "encoders_label.joblib"))

print("Artifacts saved to:", MODELS_DIR)


Using data: C:\Users\diego\OneDrive\Área de Trabalho\TÓKIO\TAREFAS\Inteligencia Artificial\ai-ethics-dashboard\data\raw\adult.csv
Saving to : C:\Users\diego\OneDrive\Área de Trabalho\TÓKIO\TAREFAS\Inteligencia Artificial\ai-ethics-dashboard\models
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 350}
Best CV F1: 0.6896
[Test] Accuracy: 0.8583 | F1: 0.6757 | ROC-AUC: 0.9110
Artifacts saved to: C:\Users\diego\OneDrive\Área de Trabalho\TÓKIO\TAREFAS\Inteligencia Artificial\ai-ethics-dashboard\models
