In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow import keras
import keras_tuner as kt

In [2]:
# create column names
column_names = [
    'age','workclass','fnlwgt','education','education-num',
    'marital-status','occupation','relationship','race','sex',
    'capital-gain','capital-loss','hours-per-week','native-country','salary'
]
# read in the training data
df = pd.read_csv(
    'adult.data',
    names=column_names,
    na_values='?',
    skipinitialspace=True
)

In [3]:
print("Missingness (%):\n", (df.isna().mean()*100).sort_values(ascending=False), "\n")

numeric_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
print(df.groupby('salary')[numeric_cols].agg(['mean','std','count']), "\n")

# Correlation of numeric variables vs salary
df['salary_bin'] = df['salary'].map({'<=50K':0,'>50K':1})
corrs = df[numeric_cols + ['salary_bin']].corr()['salary_bin'].drop('salary_bin').abs().sort_values(ascending=False)
print("Numeric → salary correlation:\n", corrs, "\n")

# Chi-squared for categorical variables vs salary
cat_cols = [
    'workclass','education','marital-status','occupation',
    'relationship','race','sex','native-country'
]
chi2 = []
for c in cat_cols:
    tbl = pd.crosstab(df[c], df['salary'])
    stat,p,_,_ = chi2_contingency(tbl)
    chi2.append((c, stat, p))
chi2_df = pd.DataFrame(chi2, columns=['var','chi2','p']).sort_values('p')
print("Categorical → salary (Chi2 p-values):\n", chi2_df, "\n")

Missingness (%):
 occupation        5.660146
workclass         5.638647
native-country    1.790486
age               0.000000
fnlwgt            0.000000
education         0.000000
education-num     0.000000
marital-status    0.000000
relationship      0.000000
race              0.000000
sex               0.000000
capital-gain      0.000000
capital-loss      0.000000
hours-per-week    0.000000
salary            0.000000
dtype: float64 

              age                          fnlwgt                        \
             mean        std  count          mean            std  count   
salary                                                                    
<=50K   36.783738  14.020088  24720  190340.86517  106482.271195  24720   
>50K    44.249841  10.519028   7841  188005.00000  102541.775472   7841   

       education-num                  capital-gain                       \
                mean       std  count         mean           std  count   
salary                            

In [4]:
# drop columns that we classify as noise
df.drop(columns=['fnlwgt','native-country'], inplace=True)

# generate net-capital columns
df['net-capital'] = df['capital-gain'] - df['capital-loss']
df.drop(columns=['capital-loss'], inplace=True)

# generate age bins
df['age_bin'] = pd.cut(
    df['age'],
    bins=[16,25,35,45,55,65,100],
    labels=['17–25','26–35','36–45','46–55','56–65','65+']
)

# generate married flag
df['married'] = (
    df['marital-status'].str.startswith('Married') |
    df['relationship'].isin(['Husband','Wife'])
).astype(int)

# generate education groupings
def map_edu(x):
    if x=='Bachelors': return 'Bachelors'
    if x in ['Masters','Prof-school','Doctorate']: return 'Advanced'
    if x in ['HS-grad','Some-college','Assoc-acdm','Assoc-voc']: return 'High-school'
    return 'Less-than-HS'
df['education_group'] = df['education'].map(map_edu)

# drop original variables
df.drop(columns=['education','marital-status','relationship'], inplace=True)

# impute numeric variables
numeric_cols = ['age','education-num','hours-per-week','net-capital','married']
num_imp = SimpleImputer(strategy='median')
df[numeric_cols] = num_imp.fit_transform(df[numeric_cols])

# impute categorical variables with other
cat_cols = ['workclass','occupation','race','sex','age_bin','education_group']
for c in cat_cols:
    vals = df[c].dropna().unique()
    fill = 'Other' if 'Other' in vals else df[c].mode()[0]
    df[c].fillna(fill, inplace=True)
    freqs = df[c].value_counts(normalize=True)
    rare = freqs[freqs<0.01].index
    df[c] = df[c].replace(rare, 'Other')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(fill, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(fill, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.



In [5]:
# create target variable for classification
df['salary_bin'] = df['salary'].str.rstrip('.').map({'<=50K':0,'>50K':1})
y = df['salary_bin'].values

# encode cats
encoders = {}
X_cat = {}
cardinalities = {}
for c in cat_cols:
    le = LabelEncoder().fit(df[c])
    encoders[c] = le
    X_cat[c] = le.transform(df[c])
    cardinalities[c] = len(le.classes_) + 1

# numeric matrix
X_num = df[numeric_cols].values

# train/val split
idx = np.arange(len(y))
train_idx, val_idx = train_test_split(idx, test_size=0.2,
                                       random_state=42, stratify=y)
X_train_num = X_num[train_idx]; X_val_num = X_num[val_idx]
y_train     = y[train_idx];     y_val     = y[val_idx]
X_train_cat = {c: X_cat[c][train_idx] for c in cat_cols}
X_val_cat   = {c: X_cat[c][val_idx]   for c in cat_cols}

In [6]:
# stack cats into matrix
X_train_cat_arr = np.stack([X_train_cat[c] for c in cat_cols], axis=1)
X_train_combined = np.hstack([X_train_num, X_train_cat_arr])

# use smote to increase the cases of the minority class in a balanced way
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_train_combined, y_train)

# split back into numeric & categorical variables
n_num = X_train_num.shape[1]
X_train_num = X_res[:, :n_num]
cat_res = X_res[:, n_num:].astype(int)
X_train_cat = {c: cat_res[:, i] for i, c in enumerate(cat_cols)}
y_train = y_res



In [10]:
# Compute class weights on the resampled data
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
cw = dict(enumerate(weights))

In [11]:
# build model using hyperband optimisation
def build_model(hp):
    inputs, embeds = [], []
    # categorical embeddings
    for c in cat_cols:
        inp   = keras.Input(shape=(1,), name=c)
        vocab = cardinalities[c]
        emb_dim = hp.Int(f"{c}_emb", min_value=5, max_value=100, step=5)
        emb   = keras.layers.Embedding(input_dim=vocab, output_dim=emb_dim)(inp)
        flat  = keras.layers.Flatten()(emb)
        inputs.append(inp); embeds.append(flat)
    # numeric input
    num_inp = keras.Input(shape=(len(numeric_cols),), name='numeric')
    bn      = keras.layers.BatchNormalization()(num_inp)
    inputs.append(num_inp); embeds.append(bn)
    # concatenate
    x = keras.layers.Concatenate()(embeds)
    # tunable dense stack
    for i in range(hp.Int("num_layers", 1, 5)):
        units = hp.Int(f"units_{i}", min_value=32, max_value=1024, step=32)
        x = keras.layers.Dense(units, activation='relu')(x)
        x = keras.layers.Dropout(hp.Float(f"dropout_{i}", 0.0, 0.5, step=0.1))(x)
    out = keras.layers.Dense(1, activation='sigmoid')(x)

    lr = hp.Float("lr", 1e-5, 1e-2, sampling="log")
    model = keras.Model(inputs, out)
    model.compile(
        optimizer=keras.optimizers.Adam(lr),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model


In [12]:
# tune the hyperband optimiser to find the best hyperparameters
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=30,
    factor=3,
    directory='hyperband_dir',
    project_name='adult_income_no_focal'
)
early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(
    {**X_train_cat, 'numeric': X_train_num}, y_train,
    epochs=30,
    validation_data=({**X_val_cat, 'numeric': X_val_num}, y_val),
    callbacks=[early],
    class_weight=cw
)

best_hps = tuner.get_best_hyperparameters(1)[0]
print("Best hyperparameters:", best_hps.values)

Trial 90 Complete [00h 01m 19s]
val_accuracy: 0.8294180631637573

Best val_accuracy So Far: 0.8415476679801941
Total elapsed time: 00h 39m 08s
Best hyperparameters: {'workclass_emb': 90, 'occupation_emb': 85, 'race_emb': 65, 'sex_emb': 60, 'age_bin_emb': 15, 'education_group_emb': 95, 'num_layers': 4, 'units_0': 320, 'dropout_0': 0.4, 'lr': 0.0005941838658383296, 'units_1': 704, 'dropout_1': 0.0, 'units_2': 288, 'dropout_2': 0.1, 'units_3': 416, 'dropout_3': 0.0, 'units_4': 128, 'dropout_4': 0.1, 'tuner/epochs': 4, 'tuner/initial_epoch': 0, 'tuner/bracket': 2, 'tuner/round': 0}


In [13]:
# build the final model
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    {**X_train_cat, 'numeric': X_train_num}, y_train,
    validation_data=({**X_val_cat, 'numeric': X_val_num}, y_val),
    epochs=30,
    batch_size=64,
    callbacks=[early],
    class_weight=cw
)

Epoch 1/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.8124 - loss: 0.4104 - val_accuracy: 0.8116 - val_loss: 0.3773
Epoch 2/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8414 - loss: 0.3554 - val_accuracy: 0.8187 - val_loss: 0.3692
Epoch 3/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8428 - loss: 0.3499 - val_accuracy: 0.8277 - val_loss: 0.3658
Epoch 4/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8408 - loss: 0.3526 - val_accuracy: 0.8228 - val_loss: 0.3777
Epoch 5/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.8453 - loss: 0.3442 - val_accuracy: 0.8214 - val_loss: 0.3615
Epoch 6/30
[1m618/618[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.8476 - loss: 0.3393 - val_accuracy: 0.8033 - val_loss: 0.3839
Epoch 7/30
[1m618/618[0m 

In [14]:
# read in test data
test = pd.read_csv(
    'adult.test',
    names=column_names,
    na_values='?',
    skipinitialspace=True,
    skiprows=1
)
# same preprocessing on test data as training data
test.drop(columns=['fnlwgt','native-country'], inplace=True)
test['net-capital'] = test['capital-gain'] - test['capital-loss']
test.drop(columns=['capital-loss'], inplace=True)
test['age_bin'] = pd.cut(
    test['age'], bins=[16,25,35,45,55,65,100],
    labels=['17–25','26–35','36–45','46–55','56–65','65+']
)
test['married'] = (
    test['marital-status'].str.startswith('Married') |
    test['relationship'].isin(['Husband','Wife'])
).astype(int)
test['education_group'] = test['education'].map(map_edu)
test.drop(columns=['education','marital-status','relationship'], inplace=True)
test[numeric_cols] = num_imp.transform(test[numeric_cols])
for c in cat_cols:
    vals = test[c].dropna().unique()
    fill = 'Other' if 'Other' in vals else test[c].mode()[0]
    test[c].fillna(fill, inplace=True)
    freqs = test[c].value_counts(normalize=True)
    rare = freqs[freqs < 0.01].index
    test[c] = test[c].replace(rare, 'Other')
test['salary_bin'] = test['salary'].str.rstrip('.').map({'<=50K':0,'>50K':1})
y_test     = test['salary_bin'].values
X_test_num = test[numeric_cols].values
X_test_cat = {c: encoders[c].transform(test[c]) for c in cat_cols}

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[c].fillna(fill, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[c].fillna(fill, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a cop

In [15]:
y_pred_proba = model.predict({**X_test_cat, 'numeric': X_test_num})
y_pred = (y_pred_proba > 0.5).astype(int).flatten()

[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step


In [16]:
# print performance metrics of the model
print(classification_report(y_test,y_pred,target_names=['<=50K','>50K']))
print("F1:", f1_score(y_test,y_pred))
print("Precision:", precision_score(y_test,y_pred))
print("Recall:", recall_score(y_test,y_pred))
print("Accuracy:", model.evaluate({**X_test_cat,'numeric':X_test_num}, y_test)[1])

              precision    recall  f1-score   support

       <=50K       0.93      0.82      0.87     12435
        >50K       0.58      0.79      0.67      3846

    accuracy                           0.82     16281
   macro avg       0.75      0.81      0.77     16281
weighted avg       0.85      0.82      0.82     16281

F1: 0.66996699669967
Precision: 0.5806636155606407
Recall: 0.7917316692667706
[1m509/509[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.8135 - loss: 0.3643
Accuracy: 0.8157361149787903
