# Deep Learning with Keras

### Step 1: Install Keras

In [40]:
# INSTALLS
!pip install -q scikeras tensorflow requests certifi


### Step 2: Imports

In [41]:
import pandas as pd
import numpy as np
import requests, certifi
from io import StringIO

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score, RocCurveDisplay
from sklearn.pipeline import Pipeline

from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier


### Step 3: Load the data

In [42]:
# Load the Adult data into a Pandas Dataframe.
# Ensure the dataset has properly named columns. Display the first five rows.

DATA_PATH = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
columns = [
    'age','workclass','fnlwgt','education','education-num',
    'marital-status','occupation','relationship','race','sex',
    'capital-gain','capital-loss','hours-per-week','native-country','income'
]


### Step 4: Load data into a dataframe

In [None]:
# SSL FIX: download with verified certificate
response = requests.get(DATA_PATH, verify=certifi.where())
response.raise_for_status()
df = pd.read_csv(StringIO(response.text), header=None, names=columns, na_values=' ?', skipinitialspace=True)

df.head()

URLError: <urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1002)>

### Step 5: Show first rows of the data

In [None]:
print(df.head())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country income  
0          2174             0              40  United-States  <=50K  
1             0             0             

### Step 6: Basic EDA

In [None]:
print("Shape:", df.shape)
print("\nNumber of NULLs per column:\n", df.isnull().sum())
print("\nTotal NULL values:", df.isnull().sum().sum())
print("\nPercentage of positive income cases (>50K):")
print((df['income'].value_counts(normalize=True) * 100))

Shape: (32561, 15)

Number of NULLs per column:
 age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

Total NULL values: 0

Percentage of positive income cases (>50K):
income
<=50K    75.919044
>50K     24.080956
Name: proportion, dtype: float64


### Step 7: Find all NULL values and drop them

In [None]:
df.dropna(inplace=True)
print("\nShape after dropping NULLs:", df.shape)


Shape after dropping NULLs: (32561, 15)


### Step 8: Use the LabelEncoder and convert the income column with a data type str to a binary variable

In [None]:
encoder = LabelEncoder()
df['income'] = encoder.fit_transform(df['income'])
print("\nUnique encoded values for income:", df['income'].unique())


Unique encoded values for income: [0 1]


### Step 9: Split the dataset into training / test sets

In [None]:
# Separate features (X) and label (y)
X = df.drop('income', axis=1)
y = df['income']

# Convert categorical columns into dummy variables
X = pd.get_dummies(X, drop_first=True)

# Split into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)
print("Positive label distribution in y_train:", y_train.mean().round(3))

Training set shape: (26048, 100)
Test set shape: (6513, 100)
Positive label distribution in y_train: 0.241


### Step 10: Use Scikit-Learn for ROC AUC Score

In [None]:
# Determine the majority class (0 or 1)
majority_class = y_train.mode()[0]

# Predict that class for all test samples
y_pred_majority = np.full_like(y_test, fill_value=majority_class)

# Compute the AUC score
auc_majority = roc_auc_score(y_test, y_pred_majority)

print("Majority class:", majority_class)
print("Baseline AUC score:", auc_majority)

Majority class: 0
Baseline AUC score: 0.5


### Step 11: ColumnTransformer to apply one hot encoding

In [None]:
X = df.drop('income', axis=1)
y = df['income']

categorical = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country'
]
numerical = [
    'age', 'fnlwgt', 'education-num', 'capital-gain',
    'capital-loss', 'hours-per-week'
]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

ct = ColumnTransformer([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical),
    ('scaler', MinMaxScaler(), numerical)
])

X_train_transformed = ct.fit_transform(X_train)
X_test_transformed = ct.transform(X_test)

print("Transformed columns:", X_train_transformed.shape[1])

Transformed columns: 108


### Step 12: How many cols will the dataframe have after these transformations?

In [None]:
# Get feature names after transformation
ohe = ct.named_transformers_['onehot']
num_ohe_features = len(ohe.get_feature_names_out(categorical))
num_numeric_features = len(numerical)

total_columns = num_ohe_features + num_numeric_features
print("Total columns after transformation:", total_columns)

Total columns after transformation: 108


### Step 13: Define the Keras model

In [None]:
def build_classifier():
    model = Sequential([
        Input(shape=(X_train.shape[1],)),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=1e-3),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model

### Step 14: Create the Keras Classifier

In [None]:

clf = KerasClassifier(
    model=build_classifier,
    epochs=3,            # short test run; can increase later
    batch_size=256,
    verbose=1
)

clf.fit(X_train, y_train)
print("Test Accuracy:", clf.score(X_test, y_test))

ValueError: could not convert string to float: 'Private'