In [None]:
#!pip install umap-learn
#!pip install sdv

In [None]:
from UMAPSMOTENC import UMAPSMOTENC
from sdv.single_table import TVAESynthesizer, CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sdv.sampling import Condition
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import time

In [38]:
#Define helper functions to load and handle the data
def binnarize(data, cat_cols):
    """One-hot encode the categorical columns of a DataFrame.

    Every column named in ``cat_cols`` is expanded with ``pd.get_dummies``
    into indicator columns named ``"<column>_<category>"``. The indicator
    columns are appended after the existing columns (in ``cat_cols`` order)
    and the original categorical columns are removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing at least the columns listed in ``cat_cols``.
    cat_cols : list of str
        Names of the columns to expand.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.
    """
    pieces = [data]
    for column in cat_cols:
        indicators = pd.get_dummies(data[column])
        # Prefix each category with its source column so names stay unique.
        indicators.columns = [column + "_" + str(level) for level in indicators.columns]
        pieces.append(indicators)

    combined = pd.concat(pieces, axis=1)
    kept = [name for name in combined.columns if name not in cat_cols]
    return combined.loc[:, kept]


def preprocess_adult_dataset(random_seed=42):
    """Download the UCI Adult dataset and return a scaled train/test split.

    Steps performed:

    1. Read ``adult.data`` from the UCI repository (requires network access).
    2. Encode ``income`` as 1 when the value is ``">50K"`` and 0 otherwise.
    3. Cast the categorical columns to ``str``.
    4. Split 80/20 into train and test using ``random_seed``.
    5. Min-max scale the numeric columns; the scaler is fitted on the
       training split only and then applied to the test split.

    Parameters
    ----------
    random_seed : int, default 42
        Passed to ``train_test_split`` as ``random_state``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, cat_cols, num_cols)`` where the
        first four items are the split features/labels and the last two are
        the lists of categorical and numeric column names.
    """
    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

    column_names = [
        "age", "workclass", "fnlwgt", "education", "education-num",
        "marital-status", "occupation", "relationship", "race", "sex",
        "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
    ]

    # The raw file separates fields with ", " so, without skipinitialspace,
    # every string value keeps a leading blank (e.g. " Private"). Stripping it
    # at read time keeps category labels and derived dummy names clean.
    data = pd.read_csv(url, header=None, names=column_names, skipinitialspace=True)

    # Binary target: 1 for ">50K", 0 for anything else.
    data["income"] = data["income"].apply(lambda x: 1 if x.strip() == ">50K" else 0)

    num_cols = ["age", "capital-gain", "capital-loss", "hours-per-week", "fnlwgt"]
    cat_cols = [
        "workclass", "education", "education-num", "marital-status",
        "occupation", "relationship", "race", "sex", "native-country"
    ]

    # "education-num" is numeric in the file but is handled as a category here.
    data[cat_cols] = data[cat_cols].astype(str)

    X = data.drop('income', axis=1)
    y = data['income']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

    # Work on explicit copies so the scaled values are written into frames we
    # own (avoids SettingWithCopyWarning on the slices returned by the split).
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Fit on train only so no information from the test split leaks in.
    scaler = MinMaxScaler()
    X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test[num_cols] = scaler.transform(X_test[num_cols])

    # Both splits come from the same frame, so their columns already match;
    # no column alignment is needed at this stage.
    return X_train, X_test, y_train, y_test, cat_cols, num_cols




In [None]:
# Load the split, then build the training table with the label column
# appended to the feature columns.
X_train, X_test, y_train, y_test, cat_cols, num_cols = preprocess_adult_dataset()
target = "income"
train = X_train.assign(**{target: y_train})

In [39]:
# Generate a new dataset using UMAPSMOTENC and report the wall-clock time.

start_time = time.time()

# The column lists are passed as fresh copies so the originals stay intact.
umap_smote_nc = UMAPSMOTENC(
    list(num_cols),
    target,
    list(cat_cols),
    seed=22,
)

X_train_umapsmotenc = umap_smote_nc.fit_transform(train.copy())

# Cast every generated column to the dtype it has in the real training table.
for column_name, column_dtype in train.dtypes.items():
    X_train_umapsmotenc[column_name] = X_train_umapsmotenc[column_name].astype(column_dtype)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


Time taken: 163.69 seconds


In [40]:
# Generate a new dataset using CTGAN and report the wall-clock time.
# Using T4 GPU

start_time = time.time()

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(train)

synthesizer = CTGANSynthesizer(metadata)
synthesizer.fit(train)

# Ask for as many synthetic rows of each class as the real training table has.
n_positive = int((train["income"] == 1).sum())
n_negative = int((train["income"] == 0).sum())

positive_class = Condition(num_rows=n_positive, column_values={'income': 1})
negative_class = Condition(num_rows=n_negative, column_values={'income': 0})

X_train_gan = synthesizer.sample_from_conditions(
    conditions=[positive_class, negative_class],
)

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

  pid = os.fork()
Sampling conditions: 100%|██████████| 26048/26048 [00:02<00:00, 11446.26it/s]

Time taken: 534.83 seconds





In [43]:
# One-hot encode each table with the same helper and drop fnlwgt in one step.
# While fnlwgt introduces diversity in the sampling process, it does not help in
# the classification task and is typically removed from the adult dataset in
# such settings.
X_train_binnarize = binnarize(X_train.copy(), cat_cols).drop(columns="fnlwgt")
X_test_binnarize = binnarize(X_test.copy(), cat_cols).drop(columns="fnlwgt")
X_train_gan_binnarize = binnarize(X_train_gan.copy(), cat_cols).drop(columns="fnlwgt")
X_train_umapsmotenc_binnarize = binnarize(X_train_umapsmotenc.copy(), cat_cols).drop(columns="fnlwgt")

In [45]:
# Align the encoded test set with the real training set: categories that never
# occur in the test split get an all-False indicator column, and the columns
# are put in the training order.
X_test_binnarize = X_test_binnarize.reindex(columns=X_train_binnarize.columns, fill_value=False)

clf = RandomForestClassifier(random_state=22)

# (description, training features, training labels) for each experiment.
# The synthetic tables carry their own "income" column, which is split off here.
experiments = [
    (
        "original train dataset",
        X_train_binnarize,
        y_train,
    ),
    (
        "UMAPSMOTE-NC synthetic train set",
        X_train_umapsmotenc_binnarize.drop(columns="income"),
        X_train_umapsmotenc_binnarize["income"],
    ),
    (
        "CTGAN synthetic train set",
        X_train_gan_binnarize.drop(columns="income"),
        X_train_gan_binnarize["income"],
    ),
]

for description, features, labels in experiments:
    clf.fit(features, labels)
    # A synthetic table may contain a different set of indicator columns than
    # the real one; reindex so the test frame always matches what the model
    # was fitted on instead of raising KeyError on a missing column.
    test_features = X_test_binnarize.reindex(columns=features.columns, fill_value=False)
    score = f1_score(y_test, clf.predict(test_features))
    print(f'F1 Score achieved with the {description}: {score}')

F1 Score achieved with the original train dataset: 0.6747149564050972
F1 Score achieved with the UMAPSMOTE-NC synthetic train set: 0.6459930313588851
F1 Score achieved with the CTGAN synthetic train set: 0.5972323379461035
