### Centralizado

#### Importações

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, f1_score
from sklearn.model_selection import RandomizedSearchCV
from ctgan import CTGAN
import numpy as np

#### Carrega e pré processa os dados

In [3]:
# Column names for the dataset
columns = [
    "age", "workclass", "fnlwgt", "education", "education.num",
    "marital.status", "occupation", "relationship", "race", "sex",
    "capital.gain", "capital.loss", "hours.per.week", "native.country", "income"
]

# Load the Adult dataset
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

# Load training data
train_data = pd.read_csv(train_url, header=None, names=columns, na_values=["?"], skipinitialspace=True)

# Load test data
test_data = pd.read_csv(test_url, header=None, names=columns, na_values=["?"], skipinitialspace=True, skiprows=1)
test_data["income"] = test_data["income"].str.strip(".")  # Clean income column in test set

# Remove the 'fnlwgt' column
train_data = train_data.drop(columns=['fnlwgt', 'education.num'])
test_data = test_data.drop(columns=['fnlwgt', 'education.num'])

In [4]:
# Retirando numeros sem sentido para criacao de dados sinteticos
train_data['capital.gain'] = train_data['capital.gain'].astype(int)
test_data['capital.gain'] = test_data['capital.gain'].astype(int)
train_data = train_data[train_data['capital.gain'] != 99999]
test_data = test_data[test_data['capital.gain'] != 99999]

#### Geração de dados sintéticos

In [5]:
discrete_columns = train_data.select_dtypes(include=["object"]).columns

In [6]:
ctgan = CTGAN(epochs=1)
ctgan.fit(train_data, discrete_columns)

In [7]:
syndata = ctgan.sample(20000)

In [8]:
# Remove 1000 random samples from train_data
train_data = train_data.drop(train_data.sample(n=20000).index)

#### Transformações para modelagem

In [9]:
# Combine train and test data for consistent preprocessing
data = pd.concat([train_data, syndata, test_data], axis=0, ignore_index=True) #adicione e remova train_data e syndata para treinar ou nao com dados reais e sinteticos

# Combinar categorias da variável 'native.country'
data['native.country'] = data['native.country'].replace({
    country: 'Other' for country in data['native.country'].unique() if country != 'United-States'
})

# Encode the target variable
label_encoder = LabelEncoder()
data["income"] = label_encoder.fit_transform(data["income"])  # Encodes '<=50K' as 0 and '>50K' as 1


# Split features and target
X = data.drop(columns=["income"])
y = data["income"]

# Label Encoding para education, sex e race
for col in ["education", "sex", "native.country"]:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Define categorical and numerical columns
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),  # Scale numerical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),  # One-hot encode categorical features
    ],
    remainder="passthrough"
)

# Split back into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(test_data), shuffle=False)

#### Pipeline para Regressão Logística

In [60]:
# Create a pipeline with preprocessing and classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

#### Pipeline para Random Forest

In [62]:
# Create a pipeline with preprocessing and Random Forest classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
])

#### Treina o modelo

In [63]:
# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("ROC AUC:", roc_auc_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))


Accuracy: 0.8468153061851238

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90     12435
        >50K       0.71      0.60      0.65      3846

    accuracy                           0.85     16281
   macro avg       0.79      0.76      0.78     16281
weighted avg       0.84      0.85      0.84     16281

ROC AUC: 0.7621508913432532
F1 Score: 0.6498174670036506


#### Hyperparameter tunning Random Forest

In [None]:
# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

# Create the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Fit the model
random_search.fit(X_train_preprocessed, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_preprocessed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

#### Treina XGBoost

##### Usando XGBClassifier (mais alto nível)

In [10]:
# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train the XGBoost model directly
model = XGBClassifier(eval_metric="logloss", n_jobs=-1)
model.fit(X_train_preprocessed, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_preprocessed)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test_preprocessed)[:, 1]))


Accuracy: 0.8603976290442085

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.94      0.91     12435
        >50K       0.76      0.58      0.66      3761

    accuracy                           0.86     16196
   macro avg       0.82      0.76      0.79     16196
weighted avg       0.85      0.86      0.85     16196

ROC-AUC: 0.911107832518514


##### Usando xgb.train (mais baixo nível)

In [11]:
import xgboost as xgb

In [12]:
train_dmatrix = xgb.DMatrix(data=X_train_preprocessed, label=y_train)

In [13]:
test_dmatrix = xgb.DMatrix(data=X_test_preprocessed, label=y_test)

In [14]:
model = xgb.train(
    params={"objective": "binary:logistic",  # Classificação binária
    "eval_metric": "auc",        # Métrica de avaliação
    "eta": 0.1,                      # Taxa de aprendizado (equivalente a learning_rate)
    "max_depth": 6,                   # Profundidade das árvores
    "nthread": -1},
    dtrain=train_dmatrix,
    num_boost_round=100,
    evals=[(test_dmatrix, "Test")]
)

[0]	Test-auc:0.84761
[1]	Test-auc:0.86934
[2]	Test-auc:0.88136
[3]	Test-auc:0.88562
[4]	Test-auc:0.88871
[5]	Test-auc:0.89105
[6]	Test-auc:0.89183
[7]	Test-auc:0.89255
[8]	Test-auc:0.89285
[9]	Test-auc:0.89345
[10]	Test-auc:0.89401
[11]	Test-auc:0.89720
[12]	Test-auc:0.89747
[13]	Test-auc:0.89778
[14]	Test-auc:0.89825
[15]	Test-auc:0.89867
[16]	Test-auc:0.89887
[17]	Test-auc:0.89970
[18]	Test-auc:0.89989
[19]	Test-auc:0.90016
[20]	Test-auc:0.90072
[21]	Test-auc:0.90112
[22]	Test-auc:0.90229
[23]	Test-auc:0.90258
[24]	Test-auc:0.90274
[25]	Test-auc:0.90295
[26]	Test-auc:0.90386
[27]	Test-auc:0.90411
[28]	Test-auc:0.90427
[29]	Test-auc:0.90504
[30]	Test-auc:0.90574
[31]	Test-auc:0.90608
[32]	Test-auc:0.90618
[33]	Test-auc:0.90622
[34]	Test-auc:0.90635
[35]	Test-auc:0.90695
[36]	Test-auc:0.90713
[37]	Test-auc:0.90719
[38]	Test-auc:0.90729
[39]	Test-auc:0.90768
[40]	Test-auc:0.90784
[41]	Test-auc:0.90792
[42]	Test-auc:0.90831
[43]	Test-auc:0.90854
[44]	Test-auc:0.90866
[45]	Test-auc:0.9087

In [15]:
y_pred = model.predict(test_dmatrix)
y_pred_binary = np.where(y_pred > 0.5, 1, 0)

AUC sem usar funcao do sklear

In [16]:
eval_results = model.eval_set(
            evals=[(test_dmatrix, "valid")],
            iteration=model.num_boosted_rounds() - 1,
        )
auc = round(float(eval_results.split("\t")[1].split(":")[1]), 4)

In [18]:
auc

0.9113

AUC com sklearn

In [17]:
roc_auc_score(y_test, y_pred)

0.9112973209158777

In [19]:
print("Accuracy:", accuracy_score(y_test, y_pred_binary))
print("\nClassification Report:\n", classification_report(y_test, y_pred_binary, target_names=label_encoder.classes_))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))
print("f1_score_macro:", f1_score(y_test, y_pred_binary, average='macro'))
print("f1_score_micro:", f1_score(y_test, y_pred_binary, average='micro'))
print("f1_score_weighted:", f1_score(y_test, y_pred_binary, average='weighted'))
print("f1_score_none:", f1_score(y_test, y_pred_binary, average=None))

Accuracy: 0.859718448999753

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.88      0.95      0.91     12435
        >50K       0.77      0.56      0.65      3761

    accuracy                           0.86     16196
   macro avg       0.83      0.75      0.78     16196
weighted avg       0.85      0.86      0.85     16196

ROC-AUC: 0.9112973209158777
f1_score_macro: 0.7805966228253182
f1_score_micro: 0.859718448999753
f1_score_weighted: 0.8511603407193186
f1_score_none: [0.91235244 0.6488408 ]


#### Hyperparameter tunning XGBoost

In [None]:
# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.5],
    "reg_alpha": [0, 0.01, 0.1, 1],
    "reg_lambda": [0.1, 1, 10]
}

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

class SklearnXGBClassifier(XGBClassifier):
    def __sklearn_tags__(self):
        return {}

# Create the model
xgb = SklearnXGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all processors
)


# Fit RandomizedSearchCV on training data
random_search.fit(X_train_preprocessed, y_train)

# Best parameters and model evaluation
print("Best Parameters:", random_search.best_params_)

# Make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_preprocessed)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


### Federated

#### Importacoes

In [None]:
from flwr_datasets.partitioner import IidPartitioner, DirichletPartitioner
from flwr_datasets import FederatedDataset
import xgboost as xgb
from sklearn.preprocessing import OrdinalEncoder
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score
import warnings
from sklearn.exceptions import UndefinedMetricWarning

  from .autonotebook import tqdm as notebook_tqdm


#### Carregamento e preprocessamento dos dados

In [21]:
num_clients = 4

In [22]:
#Se for particionar de maneira IID
partitioner = IidPartitioner(num_partitions=num_clients)

In [23]:
#Se for particionar de maneira Nao IID
partitioner = DirichletPartitioner(
                num_partitions=num_clients,
                partition_by="income",
                alpha=0.1,
                min_partition_size=30,
                self_balancing=False
                )

In [24]:
# Create a FederatedDataset
fds = FederatedDataset(
            dataset="scikit-learn/adult-census-income",
            partitioners={"train": partitioner},
        )

In [25]:
# Load the partitions
partitions = [fds.load_partition(i, split="train") for i in range(num_clients)]

In [27]:
# Split each partition into train and test sets
splits = [partition.train_test_split(test_size=0.2, seed=42) for partition in partitions]

# Separate train and test splits into lists
train_partitions = [split["train"] for split in splits]
test_partitions = [split["test"] for split in splits]


In [28]:
# Convert each partition to a pandas DataFrame
train_partitions_df = [partition.to_pandas() for partition in train_partitions]
test_partitions_df = [partition.to_pandas() for partition in test_partitions]

In [29]:
# Combine train and test dataframes, para que o encoder capte todas categorias
combined_data_dfs = [pd.concat([train, test]) for train, test in zip(train_partitions_df, test_partitions_df)]

In [47]:
# Para teste em desempenho global
test_data_g = pd.read_csv(test_url, header=None, names=columns, skipinitialspace=True, skiprows=1)
test_data_g["income"] = test_data_g["income"].str.strip(".")

In [30]:
# Retirando numeros provavelmente incorretos
for df in combined_data_dfs:
    df['capital.gain'] = df['capital.gain'].astype(int)
    df = df[df['capital.gain'] != 99999]

In [48]:
test_data_g["capital.gain"] = test_data_g["capital.gain"].astype(int)
test_data_g = test_data_g[test_data_g["capital.gain"] != 99999]

In [33]:
categorical_cols = combined_data_dfs[0].select_dtypes(include=["object"]).columns

In [35]:
# Gera dados sinteticos para cada particao
ctgan = CTGAN(epochs=1)
syndata = []
for data in train_partitions_df:
    ctgan.fit(data, categorical_cols)
    syndata.append(ctgan.sample(1000))

In [36]:
# Remove random samples from train_data e adiciona sintetico
for i, df in enumerate(train_partitions_df):
    df.drop(df.sample(n=1000).index)
    df = pd.concat([df, syndata[i]])

In [37]:
ordinal_encoder = OrdinalEncoder()

In [38]:
# Encode categorical columns em cada particao de treino e teste
for i in range(num_clients):
    combined_data_dfs[i][categorical_cols] = ordinal_encoder.fit_transform(combined_data_dfs[i][categorical_cols])
    train_partitions_df[i][categorical_cols] = ordinal_encoder.transform(train_partitions_df[i][categorical_cols])
    test_partitions_df[i][categorical_cols] = ordinal_encoder.transform(test_partitions_df[i][categorical_cols])

In [49]:
# encode no teste global
test_data_g[categorical_cols] = ordinal_encoder.transform(test_data_g[categorical_cols])

In [40]:
# Create DMatrixes for each partition
train_dmatrixes = []
test_dmatrixes = []

for partition in train_partitions_df:
    X_train = partition.drop(columns=["income"]).values
    y_train = partition["income"].values
    train_dmatrix = xgb.DMatrix(data=X_train, label=y_train)
    train_dmatrixes.append(train_dmatrix)

for partition in test_partitions_df:
    X_test = partition.drop(columns=["income"]).values
    y_test = partition["income"].values
    test_dmatrix = xgb.DMatrix(data=X_test, label=y_test)
    test_dmatrixes.append(test_dmatrix)

In [50]:
# dmatrix para teste global
X_test_g = test_data_g.drop(columns=["income"]).values
y_test_g = test_data_g["income"].values
test_dmatrix_g = xgb.DMatrix(data=X_test_g, label=y_test_g)

#### Treinamento

In [None]:
models = [xgb.train(
      params={"objective": "binary:logistic",  # Classificação binária
            "eval_metric": "auc",        # Métrica de avaliação
            "eta": 0.1,                      # Taxa de aprendizado (equivalente a learning_rate)
            "max_depth": 6,                   # Profundidade das árvores
            "nthread": -1},
      dtrain=train_dmatrix,
      num_boost_round=100,
      evals=[(test_dmatrix, "validate")],
  ) for train_dmatrix, test_dmatrix in zip(train_dmatrixes, test_dmatrixes)]

In [43]:
# Estatisticas basicas de predicao
for i, (model, test_dmatrix) in enumerate(zip(models, test_dmatrixes)):
    y_pred = model.predict(test_dmatrix)
    print("\n🔹 Estatísticas Básicas de y_pred")
    print("-" * 50)
    print("📌 Média:", np.mean(y_pred))
    print("📌 Mediana:", np.median(y_pred))
    print("📌 Q1 (1º Quartil - 25%):", np.percentile(y_pred, 25))
    print("📌 Q2 (Mediana - 50%):", np.percentile(y_pred, 50))
    print("📌 Q3 (3º Quartil - 75%):", np.percentile(y_pred, 75))
    print("📌 Desvio Padrão:", np.std(y_pred))
    print("📌 Variância:", np.var(y_pred))
    print("📌 Mínimo:", np.min(y_pred))
    print("📌 Máximo:", np.max(y_pred))


🔹 Estatísticas Básicas de y_pred
--------------------------------------------------
📌 Média: 0.00023408809
📌 Mediana: 0.00023408807
📌 Q1 (1º Quartil - 25%): 0.0002340880746487528
📌 Q2 (Mediana - 50%): 0.0002340880746487528
📌 Q3 (3º Quartil - 75%): 0.0002340880746487528
📌 Desvio Padrão: 1.4551915e-11
📌 Variância: 2.1175824e-22
📌 Mínimo: 0.00023408807
📌 Máximo: 0.00023408807

🔹 Estatísticas Básicas de y_pred
--------------------------------------------------
📌 Média: 0.0019344116
📌 Mediana: 0.00017763772
📌 Q1 (1º Quartil - 25%): 8.198329305741936e-05
📌 Q2 (Mediana - 50%): 0.0001776377175701782
📌 Q3 (3º Quartil - 75%): 0.0005713249993277714
📌 Desvio Padrão: 0.017157607
📌 Variância: 0.0002943835
📌 Mínimo: 3.740869e-05
📌 Máximo: 0.57996666

🔹 Estatísticas Básicas de y_pred
--------------------------------------------------
📌 Média: 0.00015968108
📌 Mediana: 0.00015968109
📌 Q1 (1º Quartil - 25%): 0.00015968109073583037
📌 Q2 (Mediana - 50%): 0.00015968109073583037
📌 Q3 (3º Quartil - 75%): 0.0

In [51]:
# Estatisticas basicas de predicao no teste global
y_pred_g = model.predict(test_dmatrix_g)
print("\n🔹 Estatísticas Básicas de y_pred_g")
print("-" * 50)
print("📌 Média:", np.mean(y_pred_g))
print("📌 Mediana:", np.median(y_pred_g))
print("📌 Q1 (1º Quartil - 25%):", np.percentile(y_pred_g, 25))
print("📌 Q2 (Mediana - 50%):", np.percentile(y_pred_g, 50))
print("📌 Q3 (3º Quartil - 75%):", np.percentile(y_pred_g, 75))
print("📌 Desvio Padrão:", np.std(y_pred_g))
print("📌 Variância:", np.var(y_pred_g))
print("📌 Mínimo:", np.min(y_pred_g))
print("📌 Máximo:", np.max(y_pred_g))


🔹 Estatísticas Básicas de y_pred_g
--------------------------------------------------
📌 Média: 0.08053482
📌 Mediana: 0.0055859834
📌 Q1 (1º Quartil - 25%): 0.00105914322193712
📌 Q2 (Mediana - 50%): 0.005585983628407121
📌 Q3 (3º Quartil - 75%): 0.043701413087546825
📌 Desvio Padrão: 0.19939682
📌 Variância: 0.039759092
📌 Mínimo: 0.00012255205
📌 Máximo: 0.9950789


In [None]:
# Ignore UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

# Desempenho dos modelos
for i, (model, test_dmatrix) in enumerate(zip(models, test_dmatrixes)):
    y_pred = model.predict(test_dmatrix)
    y_pred_binary = np.where(y_pred > 0.5, 1, 0)
    y_test = test_dmatrix.get_label()

    print(f"\n🔹 **Resultados do Modelo {i+1}**")
    print("-" * 60)

    print("✅ **Accuracy:**", accuracy_score(y_test, y_pred_binary))

    print("\n📊 **Classification Report:**\n",
          classification_report(y_test, y_pred_binary,
                                target_names=ordinal_encoder.categories_[-1],
                                labels=[0., 1.])
    )

    # Tratar erro do ROC-AUC quando há apenas uma classe presente
    try:
        roc_auc = roc_auc_score(y_test, y_pred)
    except ValueError:
        roc_auc = "Indefinido (apenas uma classe presente)"

    print("📈 **ROC-AUC:**", roc_auc)

    print("⚡ **F1 Scores:**")
    print("  - Macro:", f1_score(y_test, y_pred_binary, average='macro'))
    print("  - Micro:", f1_score(y_test, y_pred_binary, average='micro'))
    print("  - Weighted:", f1_score(y_test, y_pred_binary, average='weighted'))
    print("  - None:", f1_score(y_test, y_pred_binary, average=None))

    print("-" * 60)  # Separador entre iterações do loop



🔹 **Resultados do Modelo 1**
------------------------------------------------------------
✅ **Accuracy:** 1.0

📊 **Classification Report:**
               precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      1059
        >50K       0.00      0.00      0.00         0

    accuracy                           1.00      1059
   macro avg       0.50      0.50      0.50      1059
weighted avg       1.00      1.00      1.00      1059

📈 **ROC-AUC:** nan
⚡ **F1 Scores:**
  - Macro: 1.0
  - Micro: 1.0
  - Weighted: 1.0
  - None: [1.]
------------------------------------------------------------

🔹 **Resultados do Modelo 2**
------------------------------------------------------------
✅ **Accuracy:** 0.9979973297730307

📊 **Classification Report:**
               precision    recall  f1-score   support

       <=50K       1.00      1.00      1.00      2991
        >50K       0.00      0.00      0.00         5

    accuracy                           1.00      2

In [55]:
for i, model in enumerate(models):
    y_pred_g = model.predict(test_dmatrix_g)
    y_pred_g_binary = np.where(y_pred_g > 0.5, 1, 0)
    y_test_g = test_dmatrix_g.get_label()
    print(f"\n🔹 **Resultados do Modelo {i+1} no Teste Global**")
    print("-" * 60)

    print("✅ **Accuracy:**", accuracy_score(y_test_g, y_pred_g_binary))

    print("\n📊 **Classification Report:**\n",
        classification_report(y_test_g, y_pred_g_binary,
                                target_names=ordinal_encoder.categories_[-1],
                                labels=[0., 1.])
    )

    # Tratar erro do ROC-AUC quando há apenas uma classe presente
    try:
        roc_auc = roc_auc_score(y_test_g, y_pred_g)
    except ValueError:
        roc_auc = "Indefinido (apenas uma classe presente)"

    print("📈 **ROC-AUC:**", roc_auc)

    print("⚡ **F1 Scores:**")
    print("  - Macro:", f1_score(y_test_g, y_pred_g_binary, average='macro'))
    print("  - Micro:", f1_score(y_test_g, y_pred_g_binary, average='micro'))
    print("  - Weighted:", f1_score(y_test_g, y_pred_g_binary, average='weighted'))
    print("  - None:", f1_score(y_test_g, y_pred_g_binary, average=None))

    print("-" * 60)  # Separador entre iterações do loop



🔹 **Resultados do Modelo 1 no Teste Global**
------------------------------------------------------------
✅ **Accuracy:** 0.767782168436651

📊 **Classification Report:**
               precision    recall  f1-score   support

       <=50K       0.77      1.00      0.87     12435
        >50K       0.00      0.00      0.00      3761

    accuracy                           0.77     16196
   macro avg       0.38      0.50      0.43     16196
weighted avg       0.59      0.77      0.67     16196

📈 **ROC-AUC:** 0.5
⚡ **F1 Scores:**
  - Macro: 0.43431944395934474
  - Micro: 0.767782168436651
  - Weighted: 0.6669254489546125
  - None: [0.86863889 0.        ]
------------------------------------------------------------

🔹 **Resultados do Modelo 2 no Teste Global**
------------------------------------------------------------
✅ **Accuracy:** 0.7796986910348234

📊 **Classification Report:**
               precision    recall  f1-score   support

       <=50K       0.78      1.00      0.87     1

#### Modelo pre-treinado

In [29]:
# Load saved model
with open("global_model_round1.json", "r") as f:  # Change round number as needed
    model_str = f.read()
    
# Convert string back to bytes
model_bytes = model_str.encode("utf-8")

# Define XGBoost parameters as dictionary
xgb_params = {
    'objective': 'binary:logistic',
    'eta': 0.1,
    'max_depth': 8,
    'eval_metric': 'auc',
    'nthread': 16,
    'num_parallel_tree': 1,
    'subsample': 1,
    'tree_method': 'hist'
}

# Create new booster and load model
bst = xgb.Booster(params=xgb_params)
bst.load_model(bytearray(model_bytes))

In [30]:
# Calcula metricas para cada particao
total_weighted_acc = 0
total_weighted_f1 = 0
len_test_partitions = 0
for test_dmatrix in test_dmatrixes:
    eval_results = bst.eval_set(
        evals=[(test_dmatrix, "valid")],
        iteration=bst.num_boosted_rounds() - 1,
    )
    auc = round(float(eval_results.split("\t")[1].split(":")[1]), 4)
    print(f"AUC: {auc}")
    acc = accuracy_score(
                test_dmatrix.get_label(),
                bst.predict(test_dmatrix) > 0.5,
            )
    print(f"Accuracy: {acc}")
    f1 = f1_score(
                test_dmatrix.get_label(),
                bst.predict(test_dmatrix) > 0.5,
                average="macro"
            )
    print(f"F1 Score: {f1}")
    total_weighted_acc += acc * len(test_dmatrix.get_label())
    total_weighted_f1 += f1 * len(test_dmatrix.get_label())
    print(f"Partition weight: {len(test_dmatrix.get_label())}")
    len_test_partitions += len(test_dmatrix.get_label())



AUC: nan
Accuracy: 1.0
F1 Score: 1.0
Partition weight: 1059
AUC: 0.8942
Accuracy: 0.9983311081441922
F1 Score: 0.4995824285952898
Partition weight: 2996
AUC: nan
Accuracy: 1.0
F1 Score: 1.0
Partition weight: 1526
AUC: 0.8733
Accuracy: 0.962486602357985
F1 Score: 0.4904423812124522
Partition weight: 933




In [31]:
#Calcula media das metricas de cada particao
avg_acc = total_weighted_acc / len_test_partitions
avg_f1 = total_weighted_f1 / len_test_partitions

print(f"Average Accuracy: {avg_acc}")
print(f"Average F1 Score: {avg_f1}")

Average Accuracy: 0.9938593797973595
Average F1 Score: 0.6968577982411278


In [32]:
for combined_data_df in combined_data_dfs:
    print(f"Class distribution: {combined_data_df['income'].value_counts()}")

Class distribution: income
0.0    5294
Name: count, dtype: int64
Class distribution: income
0.0    14946
1.0       30
Name: count, dtype: int64
Class distribution: income
0.0    7627
Name: count, dtype: int64
Class distribution: income
0.0    4480
1.0     184
Name: count, dtype: int64
