# Creation of BlackBox Models for the Adult dataset

In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce

In [2]:
# ADD OTHER FOLDERS TO THIS LIST TO ADD THEM TO THE sys.path
# Every entry is joined onto the parent of the current working directory, so
# the empty string stands for that parent directory itself.
modules_to_add = [""]

# abspath('') resolves to the current working directory.
this_file = os.path.abspath('')

for module in modules_to_add:
    p = Path(this_file).parent / module
    if not p.exists():
        print(f"ERROR: {p} doesn't exist")
    elif str(p) in sys.path:
        # Re-running this cell must not pile up duplicate sys.path entries.
        print(f"ALREADY IN sys.path: {p}")
    else:
        sys.path.append(str(p))
        print(f"ADDED: {p}")

ADDED: /home/gerardozinno/Desktop/Tesi/Code/mlem


In [3]:
# Debug aid: dump the module search path so the effect of any sys.path changes can be checked by eye.
print(sys.path)

['/home/gerardozinno/Desktop/Tesi/Code/mlem/notebooks', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python39.zip', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9', '/home/gerardozinno/.pyenv/versions/3.9.9/lib/python3.9/lib-dynload', '', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages', '/home/gerardozinno/.pyenv/versions/3.9.9/envs/ml-environment/lib/python3.9/site-packages/IPython/extensions', '/home/gerardozinno/.ipython', '/home/gerardozinno/Desktop/Tesi/Code/mlem']


In [4]:
# Remote location of the raw Adult data file.
DATASET_URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'

# Path of the whole-dataset CSV; the train and test CSVs sit in sibling
# "train" and "test" sub-folders of the same directory.
WHOLE_DATASET_PATH = Path("../data/adult/adult.csv")
TRAIN_PATH = WHOLE_DATASET_PATH.with_name("train") / "train.csv"
TEST_PATH = WHOLE_DATASET_PATH.with_name("test") / "test.csv"

In [5]:
# Fraction of the rows that goes into the training set.
TRAIN_SPLIT = 0.8
# Seed used to make the shuffled train/test split reproducible.
RAND_SEED = 1234

## Dataset creation and cleaning

In [6]:
# Step 1: build the cleaned, target-encoded dataset CSV if it is not cached on disk yet.
if not WHOLE_DATASET_PATH.exists():
    print(f"downloading dataset from {DATASET_URL}")
    # The remote file has no header row, so the column names are supplied here.
    columns = ['Age', 'Workclass', 'Fnlwgt', 'Education', 'Education-num', 'Marital-status', 'Occupation', 'Relationship', 'Race', 'Sex', 'Capital-gain', 'Capital-loss', 'Hours-per-week', 'Native-country', 'Target']
    df = pd.read_csv(DATASET_URL, names=columns)
    print("dataset downloaded")

    print("Cleaning and preprocessing dataset:")
    print("\tdropping duplicates")
    df = df.drop_duplicates()

    print("\ttrimming strings")
    df_str = df.select_dtypes(['object'])
    # assign() returns a new frame, so no in-place write happens on a filtered frame.
    df = df.assign(**{name: df_str[name].str.strip() for name in df_str.columns})

    print("\tremoving rows with missing values (?)")
    # Keep only the rows where no string column holds the '?' placeholder.
    df = df[(df[df_str.columns] != '?').all(axis=1)]

    print("Target Encoding the dataset")
    # The last column is the label, everything before it is a feature.
    feat = df.iloc[:, :-1]
    targ = df.iloc[:, -1]
    map_targ = {
        '<=50K': 0,
        '>50K': 1
    }
    targ = targ.map(map_targ)
    # map() turns any label missing from map_targ into NaN; fail loudly instead
    # of silently writing a corrupted target column.
    if targ.isna().any():
        raise ValueError(f"Unexpected target labels found; expected only {list(map_targ)}")

    # NOTE(review): the encoder is fitted on the full dataset, i.e. before the
    # train/test split below, so the targets of the future test rows contribute
    # to the encoded feature values. Confirm this is intended.
    targenc = ce.TargetEncoder(verbose=1, return_df=True)
    df = targenc.fit_transform(feat, targ)
    df['Target'] = targ

    # The previous row labels become a regular "index" column, which the
    # read_csv(..., index_col=0) calls use as the index again.
    df = df.reset_index()
    # parents=True so that a fresh checkout without the data folder still works.
    WHOLE_DATASET_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(WHOLE_DATASET_PATH, index=False)


# Step 2: create the train/test CSVs if at least one of them is missing.
if not (TRAIN_PATH.exists() and TEST_PATH.exists()):
    print(f"Couldn't find the train and/or test dataset(s) in:\n\t{TRAIN_PATH}\n\t{TEST_PATH}\n")
    if not WHOLE_DATASET_PATH.exists():
        print(f"ERROR: Couldn't even find {WHOLE_DATASET_PATH}")
        raise FileNotFoundError("Can't find dataset")

    # TRAIN_SPLIT is a fraction, so it is rendered with the % format type
    # rather than by appending a literal percent sign to the raw fraction.
    print(f"Creating train and test sets with a split of {TRAIN_SPLIT:.0%} - {1-TRAIN_SPLIT:.0%} and {RAND_SEED} as random seed")
    print('The dataset is split "as is", without preprocessing. The selection of the right columns is made by the respective Dataloader')
    df = pd.read_csv(WHOLE_DATASET_PATH, index_col=0)
    train, test = train_test_split(df, train_size=TRAIN_SPLIT, shuffle=True, random_state=RAND_SEED)
    TRAIN_PATH.parent.mkdir(parents=True, exist_ok=True)
    TEST_PATH.parent.mkdir(parents=True, exist_ok=True)
    # The index is not written to the split files.
    train.to_csv(TRAIN_PATH, index=False)
    test.to_csv(TEST_PATH, index=False)
    print("train and test datasets created")

downloading dataset from https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
dataset downloaded
Cleaning and preprocessing dataset:
	dropping duplicates
	trimming strings
	removing rows with missing values (?)
Target Encoding the dataset
Couldn't find the train and/or test dataset(s) in:
	../data/adult/train/train.csv
	../data/adult/test/test.csv

Creating train and test sets with a split of 0.8% - 0.20% and 1234 as random seed
The dataset is split "as is", without preprocessing. The selection of the right columns is made by the respective Dataloader
train and test datasets created


## Loading the datasets

In [8]:
# Load the two splits and the whole dataset back from disk.
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
df = pd.read_csv(WHOLE_DATASET_PATH, index_col=0)

# Sanity check: the two splits together hold exactly as many rows as the whole dataset.
assert len(df) == len(train) + len(test)

Unnamed: 0_level_0,Age,Workclass,Fnlwgt,Education,Education-num,Marital-status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Target
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,39,0.268960,77516,0.421658,13,0.048399,0.133907,0.106689,0.263855,0.313955,2174,0,40,0.254411,0
1,50,0.285829,83311,0.421658,13,0.455011,0.485342,0.455728,0.263855,0.313955,0,0,13,0.254411,0
2,38,0.218918,215646,0.164328,9,0.107312,0.061527,0.106689,0.263855,0.313955,0,0,40,0.254411,0
3,53,0.218918,234721,0.056298,7,0.455011,0.061527,0.455728,0.129972,0.313955,0,0,40,0.254411,0
4,28,0.218918,338409,0.421658,13,0.455011,0.448686,0.493599,0.129972,0.113783,0,0,40,0.271739,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,0.218918,257302,0.253968,12,0.455011,0.305159,0.493599,0.263855,0.113783,0,0,38,0.254411,0
32557,40,0.218918,154374,0.164328,9,0.455011,0.124236,0.455728,0.263855,0.313955,0,0,40,0.254411,1
32558,58,0.218918,151910,0.164328,9,0.096735,0.133907,0.066334,0.263855,0.113783,0,0,40,0.254411,0
32559,22,0.218918,201490,0.164328,9,0.048399,0.133907,0.014343,0.263855,0.313955,0,0,20,0.254411,0


True