In [1]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd


# Load the census CSV. skipinitialspace=True skips the blanks that follow each
# delimiter, so string values are not read with a leading space.
data = pd.read_csv(
    "./../data/census.csv",
    skipinitialspace=True,
)
# Show five randomly chosen rows as a quick sanity check.
data.sample(n=5)

Unnamed: 0,age,workclass,fnlgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
18124,32,Private,328060,9th,5,Separated,Other-service,Unmarried,Other,Female,0,0,40,Mexico,<=50K
25799,46,Federal-gov,344415,Masters,14,Married-civ-spouse,Armed-Forces,Husband,White,Male,0,1887,40,United-States,>50K
23893,43,Private,91949,HS-grad,9,Divorced,Machine-op-inspct,Not-in-family,Black,Female,0,0,40,United-States,<=50K
21950,34,Self-emp-inc,209538,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,50,United-States,>50K
21195,55,Private,196126,Bachelors,13,Separated,Craft-repair,Not-in-family,White,Male,0,0,40,?,<=50K


In [2]:
# Hyphenated column names mapped to their snake_case replacements. Only these
# four columns are renamed; every other column keeps its current name.
hyphen_to_snake = {
    "education-num": "education_num",
    "marital-status": "marital_status",
    "hours-per-week": "hours_per_week",
    "native-country": "native_country",
}
# Applied in place, so `data` itself is modified by this cell.
data.rename(columns=hyphen_to_snake, inplace=True)

In [3]:
# Display the frame to confirm the column names after the rename above.
data

Unnamed: 0,age,workclass,fnlgt,education,education_num,marital_status,occupation,relationship,race,sex,capital-gain,capital-loss,hours_per_week,native_country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


## Removing extra whitespace from the whole DataFrame with custom code

## Creating a ydata_profiling report

In [None]:
from ydata_profiling import ProfileReport
# Build a profiling report for `data` and render it as notebook widgets.
report_title = "Pandas Profiling Report"
profile = ProfileReport(data, title=report_title)
profile.to_widgets()


# Data Preprocessing

In [5]:
def process_data(
    X, categorical_features=[], label=None, training=True, encoder=None, lb=None
):
    """ Process the data used in the machine learning pipeline.

    Processes the data using one hot encoding for the categorical features and a
    label binarizer for the labels. This can be used in either training or
    inference/validation.

    Note: depending on the type of model used, you may want to add in functionality that
    scales the continuous data.

    Inputs
    ------
    X : pd.DataFrame
        Dataframe containing the features and (optionally) the label. Columns
        listed in `categorical_features` are one hot encoded; every other
        remaining column is passed through unchanged.
    categorical_features: list[str]
        List containing the names of the categorical features (default=[])
    label : str
        Name of the label column in `X`. If None, then an empty array will be returned
        for y (default=None). Only allowed when training=False.
    training : bool
        Indicator if training mode or inference/validation mode.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained sklearn OneHotEncoder, only used if training=False.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained sklearn LabelBinarizer, only used if training=False.

    Returns
    -------
    X : np.array
        Processed data: the pass-through columns followed by the one hot
        encoded categorical columns.
    y : np.array
        Processed labels if `label` was given, otherwise empty np.array.
    encoder : sklearn.preprocessing._encoders.OneHotEncoder
        Trained OneHotEncoder if training is True, otherwise returns the encoder passed
        in.
    lb : sklearn.preprocessing._label.LabelBinarizer
        Trained LabelBinarizer if training is True, otherwise returns the binarizer
        passed in.

    Raises
    ------
    ValueError
        If training is True and no `label` is given, or if training is not
        True and `encoder` or `lb` is missing.
    """

    if label is not None:
        y = X[label]
        X = X.drop([label], axis=1)
    else:
        y = np.array([])

    X_categorical = X[categorical_features].values
    X_continuous = X.drop(categorical_features, axis=1)

    if training is True:
        if label is None:
            # Fitting the binarizer needs labels; fail with a clear message
            # instead of an AttributeError on the empty placeholder array.
            raise ValueError("A label column must be provided in training=True mode.")
        try:
            # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`.
            encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
        except TypeError:
            # Older scikit-learn releases only know the original `sparse` keyword.
            encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
        lb = LabelBinarizer()
        X_categorical = encoder.fit_transform(X_categorical)
        y = lb.fit_transform(y.values).ravel()
    else:
        if encoder is None or lb is None:
            raise ValueError("Encoder and LabelBinarizer must be provided in training=False mode.")
        X_categorical = encoder.transform(X_categorical)
        # Without a label (pure inference) y stays the empty placeholder array.
        if label is not None:
            y = lb.transform(y.values).ravel()

    X = np.concatenate([X_continuous, X_categorical], axis=1)
    return X, y, encoder, lb


# Model Evaluation

In [6]:
def evaluate_model(model, x_test, y_test):
    """Score a fitted classifier on held-out data.

    Inputs
    ------
    model
        Fitted estimator exposing `predict` and `predict_proba`. Column 1 of
        `predict_proba` is used as the score for the ROC curve and AUC, so this
        presumably expects a binary classifier -- verify against callers.
    x_test
        Features to score.
    y_test
        True labels for `x_test`.

    Returns
    -------
    dict
        Keys 'acc', 'prec', 'rec', 'f1', 'kappa', 'fpr', 'tpr', 'auc' and 'cm'
        (accuracy, precision, recall, F1, Cohen's kappa, ROC false/true
        positive rates, ROC AUC and the confusion matrix).
    """
    from sklearn import metrics

    # Hard class predictions feed the label-based scores.
    predicted_labels = model.predict(x_test)

    # Second predict_proba column feeds the ROC curve and the AUC.
    positive_scores = model.predict_proba(x_test)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = metrics.roc_curve(y_test, positive_scores)

    return {
        'acc': metrics.accuracy_score(y_test, predicted_labels),
        'prec': metrics.precision_score(y_test, predicted_labels),
        'rec': metrics.recall_score(y_test, predicted_labels),
        'f1': metrics.f1_score(y_test, predicted_labels),
        'kappa': metrics.cohen_kappa_score(y_test, predicted_labels),
        'fpr': false_pos_rate,
        'tpr': true_pos_rate,
        'auc': metrics.roc_auc_score(y_test, positive_scores),
        'cm': metrics.confusion_matrix(y_test, predicted_labels),
    }

# Model Training + Prediction + Model Metrics

In [7]:
def train_model(X_train, y_train):
    """
    Fit a decision tree classifier on the training set and return it.

    Inputs
    ------
    X_train : np.array
        Training data.
    y_train : np.array
        Labels.
    Returns
    -------
    model
        The fitted DecisionTreeClassifier.
    """
    # The fixed random_state keeps the fitted tree reproducible between runs.
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)
    return tree

    
def inference(model, X):
    """ Predict with a trained model.

    Inputs
    ------
    model : object with a `predict` method
        Trained machine learning model.
    X : np.array
        Data used for prediction.
    Returns
    -------
    preds : np.array
        Whatever `model.predict(X)` returns.
    """
    preds = model.predict(X)
    return preds



def compute_model_metrics(y, preds):
    """
    Score predictions against known labels with precision, recall, and F1.

    Inputs
    ------
    y : np.array
        Known labels, binarized.
    preds : np.array
        Predicted labels, binarized.
    Returns
    -------
    precision : float
    recall : float
    fbeta : float
        F-beta score with beta=1, i.e. F1.

    Note the order of the returned tuple: (precision, recall, fbeta).
    """
    # zero_division=1 is the value sklearn reports when a score's denominator
    # is zero, instead of emitting a warning.
    scores = (
        precision_score(y, preds, zero_division=1),
        recall_score(y, preds, zero_division=1),
        fbeta_score(y, preds, beta=1, zero_division=1),
    )
    return scores




# Saving Trained Model

In [None]:
from joblib import dump

### Assuming you have a trained model named 'model'
### Save the model to a file
### dump(model, './../model/model.joblib')
def save_trained_model(trained_model_clf, path_to_save: str):
    """Write a trained model to disk with joblib.

    Inputs
    ------
    trained_model_clf
        The model object to persist.
    path_to_save : str
        Destination file path handed to `joblib.dump`.
    """
    dump(value=trained_model_clf, filename=path_to_save)

# Executing all steps together:

In [8]:
from sklearn.metrics import fbeta_score, precision_score, recall_score

data = pd.read_csv("testdataset_unittest.csv")

# Drop duplicate rows, then remove the 2 unneeded columns. Chained instead of
# in-place so that re-running this cell always starts from the freshly read file.
data = data.drop_duplicates().drop(['capital-gain', 'capital-loss'], axis=1)

# Optional enhancement, use K-fold cross validation instead of a train-test split.
# A fixed random_state keeps the split, and every metric below, reproducible.
train, test = train_test_split(data, test_size=0.20, random_state=42)
#expected_train_or_test_shape = 13
#assert train.shape[1] == expected_train_or_test_shape
#assert test.shape[1] == expected_train_or_test_shape

# Categorical columns that process_data one hot encodes.
cat_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Process the training data; this fits the encoder and the label binarizer.
X_train, y_train, encoder, lb = process_data(
    train, categorical_features=cat_features, label="salary", training=True)

training = train_model(X_train=X_train, y_train=y_train)

# Testing inference from model code: the test split reuses the encoder and
# binarizer fitted on the training split.
X_test, y_test, _, _ = process_data(
    test, categorical_features=cat_features, label="salary", training=False, encoder=encoder, lb=lb)
y_pred = inference(training, X_test)
expected_pred = [0, 0]  # not used in this cell

# Testing compute_model_metrics from model code.
# compute_model_metrics returns (precision, recall, fbeta) in that order.
precision, recall, fbeta = compute_model_metrics(y_test, y_pred)




# Performance of the model on slices of the data

In [9]:
from model import train_model
from model import inference
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


def calculate_data_slice_performance(train_model, inference, data, slice_features, output_file="slice_output.txt"):

    """
    Calculates the performance of a model on slices of categorical features.

    For each feature in `slice_features`, a model is trained to predict that
    feature from the one-hot encoded remaining columns, and its accuracy is
    measured separately on the rows of each distinct feature value. The scores
    are written to `output_file` and printed.

    NOTE(review): the prediction target here is the slice feature itself, not a
    separate label column, and the model is scored on rows it was trained on.
    Within one slice the true value is constant, so scores of 1.0 are to be
    expected. Confirm this is the intended metric.

    Parameters:
    - train_model: Callable `train_model(X, y)` returning a trained model.
    - inference: Callable `inference(model, X)` returning predictions.
    - data: The input data as a pandas DataFrame.
    - slice_features: A list of categorical features to slice on.
    - output_file: Path of the text report to write (default "slice_output.txt").

    Returns:
    - A dictionary mapping each categorical feature to its slice performance dictionary.
    """

    slice_performance = {}

    # Input features are every column that is not sliced on.
    X = data.drop(slice_features, axis=1)

    # One-hot encode all remaining columns (the transformer is given every
    # column of X, so nothing is left for the passthrough remainder).
    ct = ColumnTransformer([('encoder', OneHotEncoder(), list(X.columns))], remainder='passthrough')
    X_encoded = ct.fit_transform(X)

    for feature in slice_features:
        # The model depends only on the feature, not on the slice value, so it
        # is trained once per feature rather than once per value.
        model = train_model(X_encoded, data[feature])

        per_value = {}
        for value in data[feature].unique():
            # Rows belonging to the current slice.
            sliced_data = data[data[feature] == value]

            # Encode the slice with the transformer fitted on the full data.
            X_slice_encoded = ct.transform(sliced_data.drop(slice_features, axis=1))

            # Accuracy of the predictions on this slice.
            y_pred = inference(model, X_slice_encoded)
            per_value[value] = accuracy_score(sliced_data[feature], y_pred)

        slice_performance[feature] = per_value

    # Write the report to the file and echo the same lines to stdout.
    with open(output_file, 'w') as f:
        for feature, performance_dict in slice_performance.items():
            header = f"Performance for slices of '{feature}':"
            f.write(header + "\n")
            print(header)
            for value, performance in performance_dict.items():
                line = f" - Slice value '{value}': {performance}"
                f.write(line + "\n")
                print(line)

    return slice_performance


# Categorical columns whose distinct values define the slices.
slice_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

# Report performance for every slice; the trailing semicolon keeps the
# notebook from echoing whatever the call returns.
calculate_data_slice_performance(train_model, inference, data, slice_features);


Performance for slices of 'workclass':
 - Slice value 'Private': 1.0
 - Slice value 'Local-gov': 1.0
 - Slice value 'Federal-gov': 1.0
 - Slice value 'Self-emp-not-inc': 1.0
 - Slice value 'State-gov': 1.0
 - Slice value 'Self-emp-inc': 1.0
 - Slice value '?': 1.0
Performance for slices of 'education':
 - Slice value 'Some-college': 1.0
 - Slice value 'Bachelors': 1.0
 - Slice value 'HS-grad': 1.0
 - Slice value '9th': 1.0
 - Slice value '12th': 1.0
 - Slice value '10th': 1.0
 - Slice value '7th-8th': 1.0
 - Slice value 'Assoc-voc': 1.0
 - Slice value 'Prof-school': 1.0
 - Slice value 'Assoc-acdm': 1.0
 - Slice value 'Masters': 1.0
 - Slice value '11th': 1.0
Performance for slices of 'marital-status':
 - Slice value 'Never-married': 1.0
 - Slice value 'Married-civ-spouse': 1.0
 - Slice value 'Married-spouse-absent': 1.0
 - Slice value 'Divorced': 1.0
 - Slice value 'Separated': 1.0
 - Slice value 'Widowed': 1.0
Performance for slices of 'occupation':
 - Slice value 'Sales': 1.0
 - Slic