#### Importações

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from ctgan import CTGAN

#### Carrega e pré processa os dados

In [82]:
# Column names for the dataset
columns = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# Load the Adult dataset
train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"

# Load training data
train_data = pd.read_csv(train_url, header=None, names=columns, na_values=["?"], skipinitialspace=True)

# Load test data
test_data = pd.read_csv(test_url, header=None, names=columns, na_values=["?"], skipinitialspace=True, skiprows=1)
test_data["income"] = test_data["income"].str.strip(".")  # Clean income column in test set

# Remove the 'fnlwgt' column
train_data = train_data.drop(columns=['fnlwgt', 'education-num'])
test_data = test_data.drop(columns=['fnlwgt', 'education-num'])

In [83]:
train_data['capital-gain'] = train_data['capital-gain'].astype(int)
test_data['capital-gain'] = test_data['capital-gain'].astype(int)

#### Geração de dados sintéticos

In [23]:
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income'
]
ctgan = CTGAN(epochs=1)
ctgan.fit(train_data, discrete_columns)

In [121]:
syndata = ctgan.sample(30000)

In [79]:
# Remove 1000 random samples from train_data
train_data = train_data.drop(train_data.sample(n=20000).index)

#### Transformações para modelagem

In [122]:
# Combine train and test data for consistent preprocessing
data = pd.concat([syndata, test_data], axis=0, ignore_index=True) #adicione e remova train_data e syndata para treinar ou nao com dados reais e sinteticos

# Combinar categorias da variável 'native-country'
data['native-country'] = data['native-country'].replace({
    country: 'Other' for country in data['native-country'].unique() if country != 'United-States'
})

# Encode the target variable
label_encoder = LabelEncoder()
data["income"] = label_encoder.fit_transform(data["income"])  # Encodes '<=50K' as 0 and '>50K' as 1

# Split features and target
X = data.drop(columns=["income"])
y = data["income"]

# Label Encoding para education, sex e race
for col in ["education", "sex", "native-country"]:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])

# Define categorical and numerical columns
categorical_columns = X.select_dtypes(include=["object"]).columns
numerical_columns = X.select_dtypes(include=["int64", "float64"]).columns

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_columns),  # Scale numerical features
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),  # One-hot encode categorical features
    ],
    remainder="passthrough"
)

# Split back into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=len(test_data), shuffle=False)

#### Pipeline para Regressão Logística

In [123]:
# Create a pipeline with preprocessing and classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", LogisticRegression(max_iter=1000)),
])

#### Pipeline para Random Forest

In [125]:
# Create a pipeline with preprocessing and Random Forest classifier
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)),
])

#### Treina o modelo

In [126]:
# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.7637123026841103

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.76      1.00      0.87     12435
        >50K       0.48      0.00      0.01      3846

    accuracy                           0.76     16281
   macro avg       0.62      0.50      0.44     16281
weighted avg       0.70      0.76      0.66     16281



#### Hyperparameter tunning Random Forest

In [None]:
# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 10, 20, 30, 40],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}

# Create the Random Forest classifier
rf = RandomForestClassifier(random_state=42)

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all available processors
)

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Fit the model
random_search.fit(X_train_preprocessed, y_train)

# Best parameters
print("Best Parameters:", random_search.best_params_)

# Evaluate the best model
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_preprocessed)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

#### Treina XGBoost

In [127]:
# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Train the XGBoost model directly
model = XGBClassifier(eval_metric="logloss", n_jobs=-1)
model.fit(X_train_preprocessed, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_preprocessed)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("ROC-AUC:", roc_auc_score(y_test, model.predict_proba(X_test_preprocessed)[:, 1]))


Accuracy: 0.7638351452613475

Classification Report:
               precision    recall  f1-score   support

       <=50K       0.77      0.99      0.87     12435
        >50K       0.50      0.02      0.05      3846

    accuracy                           0.76     16281
   macro avg       0.63      0.51      0.46     16281
weighted avg       0.70      0.76      0.67     16281

ROC-AUC: 0.4967441721392218


#### Hyperparameter tunning XGBoost

In [None]:
# Define the parameter grid
param_grid = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [3, 5, 7, 10],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "gamma": [0, 0.1, 0.2, 0.5],
    "reg_alpha": [0, 0.01, 0.1, 1],
    "reg_lambda": [0.1, 1, 10]
}

# Preprocess the data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

class SklearnXGBClassifier(XGBClassifier):
    def __sklearn_tags__(self):
        return {}

# Create the model
xgb = SklearnXGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state=42)

# Use RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=50,  # Number of random combinations to try
    scoring="accuracy",
    cv=3,  # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1  # Use all processors
)


# Fit RandomizedSearchCV on training data
random_search.fit(X_train_preprocessed, y_train)

# Best parameters and model evaluation
print("Best Parameters:", random_search.best_params_)

# Make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test_preprocessed)

# Evaluate the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
