In [1]:
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer

In [2]:
original_data = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/adult-all.csv', na_values='?').reset_index(drop=True)
original_data.columns = ["age","workclass","fnlwgt","education",
    "education.num","marital.status","occupation","relationship",
    "race","sex","capital.gain","capital.loss",
    "hours.per.week","native.country","income"]

In [3]:
original_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [4]:
preprocessed_ds = original_data.copy(deep=True)

In [5]:
preprocessed_ds = preprocessed_ds.dropna()

In [6]:
preprocessed_ds.columns = [c.replace('.', '') for c in preprocessed_ds.columns]

In [7]:
preprocessed_ds.to_pickle(Path.joinpath(Path.cwd().resolve(), 'data', 'raw', 'adult_original.pkl'))

In [8]:
preprocessed_ds.drop(columns=['education'], inplace=True)

In [9]:
na_fill = {c:preprocessed_ds[c].mode()[0] for c in preprocessed_ds.select_dtypes('O')}
preprocessed_ds = preprocessed_ds.fillna(value= na_fill)

In [10]:
label_encoder = LabelEncoder()
preprocessed_ds['income'] = label_encoder.fit_transform(preprocessed_ds['income'])

In [11]:
# Rearranging columns by dtype
numeric_cols =  [col for col in preprocessed_ds.columns if preprocessed_ds[col].dtype != object]
categoric_cols = [col for col in preprocessed_ds.columns if preprocessed_ds[col].dtype == object]

preprocessed_ds = preprocessed_ds[numeric_cols + categoric_cols]

In [12]:
transf = ColumnTransformer([
    ('minmax', MinMaxScaler(), numeric_cols),
    ('onehot', OneHotEncoder(categories='auto'), categoric_cols),
])

# Preprocess the dataset
X = transf.fit_transform(preprocessed_ds).astype(float)

In [13]:
transformed_categoric_cols = transf.transformers_[1][1].get_feature_names_out(categoric_cols).tolist()

In [14]:
transformed_categoric_cols

['workclass_Federal-gov',
 'workclass_Local-gov',
 'workclass_Private',
 'workclass_Self-emp-inc',
 'workclass_Self-emp-not-inc',
 'workclass_State-gov',
 'workclass_Without-pay',
 'maritalstatus_Divorced',
 'maritalstatus_Married-AF-spouse',
 'maritalstatus_Married-civ-spouse',
 'maritalstatus_Married-spouse-absent',
 'maritalstatus_Never-married',
 'maritalstatus_Separated',
 'maritalstatus_Widowed',
 'occupation_Adm-clerical',
 'occupation_Armed-Forces',
 'occupation_Craft-repair',
 'occupation_Exec-managerial',
 'occupation_Farming-fishing',
 'occupation_Handlers-cleaners',
 'occupation_Machine-op-inspct',
 'occupation_Other-service',
 'occupation_Priv-house-serv',
 'occupation_Prof-specialty',
 'occupation_Protective-serv',
 'occupation_Sales',
 'occupation_Tech-support',
 'occupation_Transport-moving',
 'relationship_Husband',
 'relationship_Not-in-family',
 'relationship_Other-relative',
 'relationship_Own-child',
 'relationship_Unmarried',
 'relationship_Wife',
 'race_Amer-Indi

In [15]:
transformed_categoric_cols = [c.replace('&', '').replace('(', '').replace(')', '').replace('-', '') for c in transformed_categoric_cols] 

In [16]:
# Convert to DataFrame
newdata = pd.DataFrame(data=X.todense(), columns=numeric_cols + transformed_categoric_cols)

In [17]:
newdata.head()

Unnamed: 0,age,fnlwgt,educationnum,capitalgain,capitalloss,hoursperweek,income,workclass_Federalgov,workclass_Localgov,workclass_Private,...,nativecountry_Portugal,nativecountry_PuertoRico,nativecountry_Scotland,nativecountry_South,nativecountry_Taiwan,nativecountry_Thailand,nativecountry_TrinadadTobago,nativecountry_UnitedStates,nativecountry_Vietnam,nativecountry_Yugoslavia
0,0.452055,0.047274,0.8,0.0,0.0,0.122449,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.287671,0.136877,0.533333,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.493151,0.149792,0.4,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.150685,0.219998,0.8,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.273973,0.183552,0.866667,0.0,0.0,0.397959,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
newdata.to_pickle(Path.joinpath(Path.cwd().resolve(), 'data', 'processed', 'adult_minmax.pkl'))