In [None]:
import pandas as pd
import numpy as np
import random
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# 1. Carregar dados
file_path = 'data/glass.data'
columns = ['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type_of_glass']
df = pd.read_csv(file_path, header=None, names=columns)
df.drop(columns=['Id'], inplace=True)

# 2. Backup original
df_original = df.copy()

# 3. Inserir valores ausentes na coluna categ√≥rica
random.seed(100)
missing_indices = df.sample(frac=0.2, random_state=42).index
df.loc[missing_indices, 'Type_of_glass'] = np.nan

# 4. Separar dados com e sem valores ausentes
df_train = df.dropna(subset=['Type_of_glass']).copy()
df_test = df[df['Type_of_glass'].isna()].copy()

X_train = df_train.drop(columns='Type_of_glass')
X_test = df_test.drop(columns='Type_of_glass')

# Codificar classes para inteiros consecutivos
le = LabelEncoder()
y_train = le.fit_transform(df_train['Type_of_glass'].astype(int))

# 5. Treinar XGBoost
model = XGBClassifier(
    objective='multi:softmax',
    num_class=len(le.classes_),  # n√∫mero correto de classes
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)
model.fit(X_train, y_train)

# 6. Prever e decodificar
y_pred_encoded = model.predict(X_test)
y_pred = le.inverse_transform(y_pred_encoded)

# 7. Substituir valores imputados
df_imputed = df.copy()
df_imputed.loc[df_test.index, 'Type_of_glass'] = y_pred

# 8. Avaliar taxa de acerto
true_values = df_original.loc[df_test.index, 'Type_of_glass']
acertos = (y_pred == true_values).sum()
taxa_acerto = acertos / len(true_values)

print(f"Taxa de acerto da imputa√ß√£o com XGBoost: {taxa_acerto:.2%}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Taxa de acerto da imputa√ß√£o com XGBoost: 83.72%


‚úÖ XGBoost (Extreme Gradient Boosting)
√Årvores treinadas em sequ√™ncia: cada nova √°rvore tenta corrigir os erros da anterior, focando nos exemplos mais dif√≠ceis.

- Usa gradiente descendente para minimizar a perda (como erro de classifica√ß√£o).

- Possui v√°rias t√©cnicas integradas:

- Regulariza√ß√£o (L1 e L2) para evitar overfitting.

- Poda inteligente de √°rvores.

- Import√¢ncia de features.

- Aprendizado mais fino (taxa de aprendizado, shrinkage).

üîç Consequ√™ncia no caso:

Os dados t√™m rela√ß√µes complexas e sutis entre as vari√°veis qu√≠micas e a classe (Type_of_glass).

O XGBoost aprende gradualmente essas rela√ß√µes, resultando em previs√µes mais precisas para as classes ausentes.

‚úÖ Random Forest
- Treina muitas √°rvores em paralelo, cada uma em um subset aleat√≥rio dos dados e features (bagging).

- Previs√£o final = maioria das √°rvores (vota√ß√£o).

- N√£o "aprende com os erros" como no XGBoost, apenas tira proveito da diversidade entre √°rvores.

üîç Consequ√™ncia no caso:

Como n√£o foca nos erros, ele n√£o ajusta t√£o bem em casos dif√≠ceis ou classes menos representadas.

Funciona bem com dados tabulares em geral, mas n√£o alcan√ßa o mesmo refinamento que o XGBoost.



In [2]:
import pandas as pd
import numpy as np
import random
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

# -------------------------------------------------------------
# Fun√ß√£o que executa 1 experimento de imputa√ß√£o com XGBoost
# -------------------------------------------------------------
def run_experiment_xgb(seed, missing_frac=0.2):

    # 1. Carregar dados
    file_path = 'data/glass.data'
    columns = ['Id','RI','Na','Mg','Al','Si','K','Ca','Ba','Fe','Type_of_glass']
    df = pd.read_csv(file_path, header=None, names=columns)
    df.drop(columns=['Id'], inplace=True)

    df_original = df.copy()

    # 2. Inserir valores ausentes MCAR
    random.seed(seed)
    n_missing = int(missing_frac * len(df))
    missing_indices = random.sample(list(df.index), n_missing)
    df.loc[missing_indices, 'Type_of_glass'] = np.nan

    # 3. Separar dados completos e incompletos
    df_train = df.dropna(subset=['Type_of_glass']).copy()
    df_test = df[df['Type_of_glass'].isna()].copy()

    X_train = df_train.drop(columns='Type_of_glass')
    X_test = df_test.drop(columns='Type_of_glass')

    # 4. Codificar classes (necess√°rio para o XGBoost)
    le = LabelEncoder()
    y_train = le.fit_transform(df_train['Type_of_glass'].astype(int))

    # 5. Treinar modelo XGBoost
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=len(le.classes_),
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42,
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1
    )

    model.fit(X_train, y_train)

    # 6. Prever classes ausentes
    y_pred_encoded = model.predict(X_test)
    y_pred = le.inverse_transform(y_pred_encoded)

    # 7. Avaliar acur√°cia
    true_values = df_original.loc[df_test.index, 'Type_of_glass']
    acc = accuracy_score(true_values, y_pred)

    return acc


# -------------------------------------------------------------
# Rodar os experimentos com v√°rias seeds
# -------------------------------------------------------------
seeds = list(range(10, 40))  # 30 execu√ß√µes
results = []

for s in seeds:
    print(f"Rodando seed {s}...")
    acc = run_experiment_xgb(s)
    results.append({'seed': s, 'accuracy_xgb': acc})

df_results_xgb = pd.DataFrame(results)

# Resumo estat√≠stico
summary_xgb = df_results_xgb.describe().loc[['mean', 'std']]
print("\nResumo dos resultados (XGBoost):")
print(summary_xgb)


Rodando seed 10...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 11...
Rodando seed 12...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 13...
Rodando seed 14...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 15...
Rodando seed 16...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 17...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 18...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 19...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 20...
Rodando seed 21...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 22...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 23...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 24...
Rodando seed 25...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 26...
Rodando seed 27...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 28...
Rodando seed 29...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 30...
Rodando seed 31...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 32...
Rodando seed 33...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 34...
Rodando seed 35...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 36...
Rodando seed 37...
Rodando seed 38...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Rodando seed 39...

Resumo dos resultados (XGBoost):
           seed  accuracy_xgb
mean  24.500000      0.740476
std    8.803408      0.049767


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
