# Experts - IDS Experiment

This experiment uses an imbalanced data set with 2,327 records (826 positive cases and 1,501 negative cases) and 26 attributes. Because the original data set contained far more negative than positive congenital syphilis cases (40,936 negative cases versus 826 positive cases), we applied random undersampling to reduce the difference between the two classes. The sampling ratio was set to 0.55, i.e., after resampling, the number of samples in the minority class (positive cases) is 55% of the number of samples in the majority class (negative cases).

In [None]:
import os
import sys
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
# load_dotenv() returns False when no .env file was found or the file defined
# nothing, so report what actually happened instead of always claiming success.
if load_dotenv():
    print('ENV variables loaded successfully!')
else:
    print('WARNING: no variables were loaded from a .env file; relying on the existing environment.')

# Put the directory three levels above the current working directory on the
# import path so later cells can import the project-local `lib` package
# (presumably it lives there - confirm against the repository layout).
module_path = os.path.abspath('../../..')
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from lib.env_var_keys import EnvVarKeys

# Resolve the dataset location from the environment; the variable's name is
# held by the EnvVarKeys enum.
dataset_path_key = EnvVarKeys.PRE_PROCESSED_DATASET_PATH_KEY.value
pre_processed_dataset_path = os.getenv(dataset_path_key)
if not pre_processed_dataset_path:
    # Fail fast with an actionable message: passing None on to pd.read_csv
    # would only produce an obscure "invalid file path or buffer" error.
    raise ValueError(
        f'Environment variable {dataset_path_key!r} is not set; '
        'define it (e.g. in the .env file) with the path to the pre-processed dataset CSV.'
    )

# low_memory=False makes pandas read the file in one pass rather than in
# chunks, which avoids mixed-type inference per column.
df = pd.read_csv(pre_processed_dataset_path, sep=',', low_memory=False)

print(f'Pre-processed dataset shape: {df.shape}')

# Seed passed to the undersampler and to train_test_split in later cells so
# that repeated runs produce the same resample and split.
RANDOM_STATE = 28

In [None]:
from lib.dataframe_helper import vdrl_count

# Inspect the loaded dataset before any column selection or resampling.
# NOTE(review): vdrl_count is a project helper whose body is not visible here;
# presumably it reports how many records fall into each VDRL_RESULT class -
# confirm in lib/dataframe_helper.
vdrl_count(df)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from lib.dataframe_helper import fill_nan

# Columns retained for this experiment. 'VDRL_RESULT' is the column that the
# next cell separates out as the label; the remaining entries become features.
expert_attributes = [
    'VDRL_RESULT',
    'HAS_PREG_RISK',
    'NUM_ABORTIONS',
    'PLAN_PREGNANCY',
    'MARITAL_STATUS',
    'FOOD_INSECURITY',
    'NUM_LIV_CHILDREN',
    'NUM_PREGNANCIES',
    'FAM_PLANNING',
    'LEVEL_SCHOOLING',
    'FAM_INCOME',
    'AGE',
]

# Report the dimensions of the full dataset before touching it.
print(f'Shape before: {df.shape}')

# NOTE(review): fill_nan is a project helper (lib.dataframe_helper); presumably
# it replaces missing values - its exact strategy is not visible here.
df = fill_nan(df)

# Keep only the expert-selected columns, in the order listed above.
df = df[expert_attributes]

print(f'Shape after: {df.shape}')

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

# Separate the predictors from the label column and remember the predictor
# names, which the classifier helper receives in a later cell.
target_column = 'VDRL_RESULT'
features_df = df.drop(target_column, axis=1)
feature_names = features_df.columns.to_list()

X = features_df.to_numpy()
y = df[target_column].to_numpy()

# Randomly drop majority-class rows until minority/majority == 0.55.
undersampler = RandomUnderSampler(sampling_strategy=0.55, random_state=RANDOM_STATE)
X, y = undersampler.fit_resample(X, y)

# The "+ 1" accounts for the label column, so the printed shape is comparable
# with the DataFrame shapes printed earlier.
n_rows, n_features = X.shape
print(f'\nShape after undersampling: ({n_rows}, {n_features + 1})')

# Hold out 20% of the resampled data for testing.
# NOTE(review): the split is not stratified, so the class proportions of the
# train and test partitions may drift from the resampled ratio - consider
# whether stratify=y is wanted before changing it, since it alters results.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# np.unique returns the labels sorted, so index 0 / 1 of the counts line up
# with the two classes as long as both are present in each partition.
train_unique, train_counts = np.unique(y_train, return_counts=True)
test_unique, test_counts = np.unique(y_test, return_counts=True)
train_summary = f'train={X_train.shape} [0 = {train_counts[0]}, 1 = {train_counts[1]}]'
test_summary = f'test={X_test.shape} [0 = {test_counts[0]}, 1 = {test_counts[1]}]'
print(f'Shape after splitting: {train_summary} | {test_summary}')

In [None]:
from lib.classifier_helper import ClassifierHelper

# Hand the train/test partitions and the feature names to the project's
# classifier helper.
# NOTE(review): the meaning of the trailing positional False is not visible
# here - confirm against ClassifierHelper's constructor in lib/classifier_helper.
clf_helper = ClassifierHelper(X_train, X_test, y_train, y_test, feature_names, False)

# Run each classifier in turn on the same split. The exec_* methods are defined
# in ClassifierHelper; presumably each one trains and evaluates the model named
# in the method and reports its metrics - TODO confirm.
clf_helper.exec_random_forest()
clf_helper.exec_knn()
clf_helper.exec_decision_tree()
clf_helper.exec_ada_boost()
clf_helper.exec_gradient_boosting()
clf_helper.exec_svm()
clf_helper.exec_logistic_regression()
clf_helper.exec_xgboost()