In [5]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer  # necessário para ativar o IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.metrics import accuracy_score


In [8]:

# 1. Load the data.
#    The file is read with header=None, so the column names are supplied
#    explicitly; skipinitialspace drops the blank that follows each delimiter.
file_path = 'adult_data/adult.data'
columns = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]
df = pd.read_csv(
    file_path,
    names=columns,
    header=None,
    skipinitialspace=True,
)

# 2. Keep an untouched copy of the loaded frame; it is the ground truth the
#    imputed values are compared against later.
df_original = df.copy(deep=True)

# 3. Artificially blank out 10% of the observed 'education' entries.
#    No random_state is passed to `sample`, so the draw is governed by NumPy's
#    global RNG, which is seeded right here to make the selection repeatable.
np.random.seed(42)
missing_mask = df['education'].notna()
observed_rows = df.loc[missing_mask]
missing_indices = observed_rows.sample(frac=0.10).index

# Work on a copy so `df` itself keeps every original value.
df_missing = df.copy()
df_missing.loc[missing_indices, 'education'] = np.nan

# 4. Encode every non-numeric column as numeric category codes.
#    `category_mappings[col]` keeps the category labels for each encoded column
#    so that codes can be translated back to labels afterwards.
df_encoded = df_missing.copy()
category_mappings = {}

for col in df_encoded.columns:
    # Select by "not numeric" instead of `dtype == 'object'`: text columns are
    # not always object dtype (pandas' dedicated string dtype, or columns that
    # are already categorical), and any of those left unencoded would reach
    # the imputer as raw strings.
    if pd.api.types.is_numeric_dtype(df_encoded[col]):
        continue

    as_category = df_encoded[col].astype('category')
    category_mappings[col] = as_category.cat.categories

    # `.cat.codes` is a small integer Series that uses -1 for missing entries.
    # Cast to float64 first so the column can hold NaN without relying on an
    # implicit upcast, then turn the -1 sentinel back into NaN so the imputer
    # recognises those cells as missing.
    df_encoded[col] = as_category.cat.codes.astype('float64').replace(-1, np.nan)
        

# 5. MICE-style imputation with a RandomForestClassifier as the per-column model.
# NOTE(review): 'education-num' looks like a numeric encoding of 'education' and
# is used as a predictor here — confirm; if so, the accuracy measured below
# says little about imputing 'education' when that column is unavailable.
mice_estimator = IterativeImputer(
    estimator=RandomForestClassifier(n_estimators=5, max_depth=5, random_state=0),
    max_iter=10,
    # Only model columns that actually contain NaN. With the default
    # (skip_complete=False) a classifier is also fitted, every round, for each
    # complete column — including numeric ones with many distinct values —
    # and its predictions are never used because there is nothing to fill in.
    skip_complete=True,
    random_state=0
)

df_imputed_array = mice_estimator.fit_transform(df_encoded)

# fit_transform returns a bare ndarray; restore both the column labels and the
# row index so that label-based lookups (e.g. `.loc[missing_indices]`) keep
# pointing at the same rows even if the input index is not a default RangeIndex.
df_imputed = pd.DataFrame(
    df_imputed_array,
    columns=df_encoded.columns,
    index=df_encoded.index,
)

# 6. Convert the float 'education' column back to integer category codes.
#    Values are rounded to the nearest integer and then clipped so that every
#    code falls inside the valid range [0, n_categories - 1].
n_categories = len(category_mappings['education'])
education_codes = (
    df_imputed['education']
    .round()
    .astype(int)
    .clip(lower=0, upper=n_categories - 1)
)

# 7. Decode the integer codes back into the original 'education' labels.
df_imputed['education'] = pd.Categorical.from_codes(
    education_codes,
    categories=category_mappings['education'],
)

# 8. Score the imputation on the artificially removed entries only:
#    compare the original labels with the imputed labels for those same rows.
true_values = df_original['education'].loc[missing_indices]
imputed_values = df_imputed['education'].loc[missing_indices]
accuracy = accuracy_score(y_true=true_values, y_pred=imputed_values)

print(f"\n✅ Acurácia do MICE com RandomForestClassifier: {accuracy * 100:.2f}%")





✅ Acurácia do MICE com RandomForestClassifier: 77.49%
