In [1]:
import pandas as pd

# Load the census dataset from the project-relative CSV file
data = pd.read_csv("data/census.csv")

# Inspect the data
print(data.head())  # First few rows
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
data.info()  # Data types and non-null values
print(data.describe())  # Statistical summary for numeric columns


   age         workclass   fnlgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country salary  
0          2174             0              40  United-States  <=50K  
1             0             0             

In [2]:
# Columns handed to process_data as `categorical_features`
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [3]:
# Name of the target column; passed as `label` to process_data
label_column = "salary"


In [4]:
from ml.data import process_data

# Run process_data in training mode; besides the feature and label arrays it
# returns the `encoder` and `lb` objects that later cells reuse
training_outputs = process_data(
    data,
    categorical_features=categorical_features,
    label=label_column,
    training=True,
)
X, y, encoder, lb = training_outputs

# Report the array dimensions first, then a five-row sample of each array
for caption, value in (
    ("Processed Features Shape:", X.shape),
    ("Processed Labels Shape:", y.shape),
    ("Sample Processed Features (First 5 Rows):\n", X[:5]),
    ("Sample Processed Labels (First 5 Rows):\n", y[:5]),
):
    print(caption, value)


Processed Features Shape: (32561, 108)
Processed Labels Shape: (32561,)
Sample Processed Features (First 5 Rows):
 [[3.90000e+01 7.75160e+04 1.30000e+01 2.17400e+03 0.00000e+00 4.00000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+

In [5]:
# Add up how many categories the encoder recorded for each encoded column
category_sizes = [len(levels) for levels in encoder.categories_]
num_categories = sum(category_sizes)
print(f"Number of one-hot encoded features: {num_categories}")

Number of one-hot encoded features: 102


In [6]:
# Re-run preprocessing with training=False, passing in the encoder and lb
# obtained from the earlier training-mode call
inference_outputs = process_data(
    data,
    categorical_features=categorical_features,
    label=label_column,
    training=False,
    encoder=encoder,
    lb=lb,
)
X_inference, y_inference, _, _ = inference_outputs

print("Inference Features Shape:", X_inference.shape)
print("Sample Inference Features (First 5 Rows):\n", X_inference[:5])


Inference Features Shape: (32561, 108)
Sample Inference Features (First 5 Rows):
 [[3.90000e+01 7.75160e+04 1.30000e+01 2.17400e+03 0.00000e+00 4.00000e+01
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00

In [7]:
# encoder.categories_ is indexed in step with categorical_features, so the
# position in one gives the column name in the other
for position, levels in enumerate(encoder.categories_):
    feature_name = categorical_features[position]
    print(f"Categorical column {feature_name}: {len(levels)} unique values")


Categorical column workclass: 9 unique values
Categorical column education: 16 unique values
Categorical column marital-status: 7 unique values
Categorical column occupation: 15 unique values
Categorical column relationship: 6 unique values
Categorical column race: 5 unique values
Categorical column sex: 2 unique values
Categorical column native-country: 42 unique values


In [10]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Columns of the census frame that hold continuous values
numerical_columns = [
    "age",
    "fnlgt",
    "education-num",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

# Pull the raw continuous values out of the original dataset
X_continuous = data[numerical_columns].values

# Standardize the continuous block
scaler = StandardScaler()
X_continuous_scaled = scaler.fit_transform(X_continuous)

# Swap the leading len(numerical_columns) columns of X for their scaled
# counterparts and keep the remaining columns of X unchanged
# NOTE(review): assumes X stores the continuous columns first, in this order — confirm
n_continuous = len(numerical_columns)
X_encoded_tail = X[:, n_continuous:]
X_combined = np.concatenate([X_continuous_scaled, X_encoded_tail], axis=1)

# Verify the result
print("Scaled Continuous Features (First 5 Rows):\n", X_continuous_scaled[:5])
print("Combined Features Shape:", X_combined.shape)


Scaled Continuous Features (First 5 Rows):
 [[ 0.03067056 -1.06361075  1.13473876  0.1484529  -0.21665953 -0.03542945]
 [ 0.83710898 -1.008707    1.13473876 -0.14592048 -0.21665953 -2.22215312]
 [-0.04264203  0.2450785  -0.42005962 -0.14592048 -0.21665953 -0.03542945]
 [ 1.05704673  0.42580136 -1.19745882 -0.14592048 -0.21665953 -0.03542945]
 [-0.77576787  1.40817572  1.13473876 -0.14592048 -0.21665953 -0.03542945]]
Combined Features Shape: (32561, 108)


In [11]:
from sklearn.preprocessing import StandardScaler

def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None, scaler=None
):
    """Convert a raw DataFrame into a numeric feature array and a label array.

    Categorical columns are one-hot encoded, all remaining (non-label)
    columns are standardized, and the two blocks are concatenated with the
    scaled continuous columns first.

    Parameters
    ----------
    X : pandas.DataFrame
        Input rows. Contains the feature columns and, when `label` is given,
        the label column as well.
    categorical_features : list of str
        Names of the columns to one-hot encode. The default list is only read,
        never mutated, so sharing it between calls is harmless.
    label : str or None
        Name of the label column. When None, `y` is returned as an empty array.
    training : bool
        True: create and fit a new encoder, label binarizer and scaler (any
        objects passed in are ignored). False: reuse the objects passed in.
    encoder, lb, scaler
        Previously fitted transformers; only used when `training` is False.

    Returns
    -------
    X : numpy.ndarray
        Scaled continuous columns followed by the one-hot encoded columns.
    y : numpy.ndarray or pandas.Series
        Binarized, flattened labels. An empty array when `label` is None; the
        untouched label Series when `training` is False and `lb` is None.
    encoder, lb, scaler
        The transformers that were fitted (training) or passed in (inference).

    Raises
    ------
    AttributeError
        In training mode when `label` is None (there are no labels to fit the
        binarizer on), or in inference mode when `encoder` or `scaler` is None.
    """
    # Split label from features
    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    # Separate continuous and categorical features
    X_categorical = X[categorical_features].values
    X_continuous = X.drop(columns=categorical_features).values

    if training:
        # The surrounding cell only imports StandardScaler; without this local
        # import the two names below are undefined and training mode fails
        # with NameError.
        from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

        # Initialize encoders and scaler
        encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        scaler = StandardScaler()

        # Fit and transform categorical and continuous features
        X_categorical = encoder.fit_transform(X_categorical)
        X_continuous = scaler.fit_transform(X_continuous)
        y = lb.fit_transform(y.values).ravel()
    else:
        # Transform using existing encoders and scaler
        X_categorical = encoder.transform(X_categorical)
        X_continuous = scaler.transform(X_continuous)
        # Binarize labels only when there are labels and a binarizer to apply.
        # An explicit check (instead of swallowing AttributeError) keeps the
        # "no label" / "no lb" cases best-effort while letting genuine errors
        # raised inside lb.transform surface.
        if label is not None and lb is not None:
            y = lb.transform(y.values).ravel()

    # Combine scaled continuous and one-hot encoded categorical features
    X = np.concatenate([X_continuous, X_categorical], axis=1)

    return X, y, encoder, lb, scaler


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_arrays = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

print("Training Features Shape:", X_train.shape)
print("Testing Features Shape:", X_test.shape)


Training Features Shape: (26048, 108)
Testing Features Shape: (6513, 108)


In [30]:
from ml.model import (
    train_model,
    inference,
    save_model,
    load_model,
    performance_on_categorical_slice,
    compute_model_metrics
)
from ml.data import process_data
import pandas as pd



In [14]:
# Read the census CSV into a DataFrame
data = pd.read_csv("data/census.csv")

# Target column, and the columns to treat as categorical
label_column = "salary"
categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Preprocess in training mode
X, y, encoder, lb = process_data(
    data,
    categorical_features=categorical_features,
    label=label_column,
    training=True,
)


In [15]:
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)



In [17]:
# Train the model on the training split (train_model is imported from ml.model)
model = train_model(X_train, y_train)
print("Model trained successfully!")


Model trained successfully!


In [18]:
# Round-trip the trained model through disk: write it out, then read it back
model_path = "model.pkl"
save_model(model, model_path)
loaded_model = load_model(model_path)
print("Model loaded successfully!")


Model loaded successfully!


In [19]:
# Predict on the test split using the model that was read back from disk
y_preds = inference(loaded_model, X_test)

# Score the predictions against the test labels
metrics = compute_model_metrics(y_test, y_preds)
precision, recall, fbeta = metrics
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F1-Score: {fbeta:.2f}")


Precision: 0.74, Recall: 0.64, F1-Score: 0.69


In [33]:
# Metrics for the rows where education == "Bachelors".
# NOTE(review): `data` is the full dataset, including the rows used to fit the
# model, so these slice metrics are presumably not held-out numbers — confirm.
slice_arguments = dict(
    data=data,
    column_name="education",
    slice_value="Bachelors",
    categorical_features=categorical_features,
    label=label_column,
    encoder=encoder,
    lb=lb,
    model=loaded_model,
)
performance_on_categorical_slice(**slice_arguments)


Metrics for education = Bachelors
Precision: 0.95, Recall: 0.95, F1-Score: 0.95


(np.float64(0.9510647938377889),
 np.float64(0.945069788383611),
 np.float64(0.9480578139114725))

In [32]:
import importlib

# Bind the module under its own alias: `from ml import model` would rebind the
# notebook-level name `model`, which holds the trained model returned by
# train_model() in an earlier cell.
import ml.model as ml_model_module

# Re-execute the ml.model module so edits to its source are picked up
importlib.reload(ml_model_module)

# reload() does not refresh names that were copied out earlier with
# `from ml.model import ...`, so import the function again after reloading
from ml.model import performance_on_categorical_slice
