In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [80]:
## Preprocessing

# Enable and disable features

# Read the CSV and drop the 'trial_id' column in a single chained expression.
data = (
    pd.read_csv('VDL_Norfolk_Island_Penal_Colony_Cleaned_No_Outliers.csv')
    .drop(columns=['trial_id'])
)

# Name of the column the classifier is trained to predict.
TARGET_VARIABLE = 'pp_sentence_years'

# Maps each feature name to a boolean indicating whether the feature is enabled. Do not use enabled_features.
# ENABLED_FEATURE_MAPPING = {
#     'offence_pp_general': True,
#     'offence_pp': True,
#     'pp_sentence_years': True,
#     'trial_month': True,
#     'trial_year': True,
#     'trial_place': True,
#     'pris_ht_pp': True,
#     'def_age_pp': True,
#     'def_literacy': True,
#     'def_religion_pp': True,
#     'marital_status_pp': True,
#     'children_nr': False,
#     'cash_sav_pp': False,
#     'occupation_pp': True,
#     'coloffence_info': True,
#     'offence_ni': True,
#     'death_in_custody_pp': True,
#     'length_of_stay_until_probat': False,
#     'length_of_stay_until_tl': False,
#     'previous_convictions': True
# }
# # 4 disabled features


# ENABLED_FEATURE_MAPPING = {
#     'offence_pp_general': True,
#     'offence_pp': False,
#     'pp_sentence_years': True,
#     'trial_month': False,
#     'trial_year': False,
#     'trial_place': False,
#     'pris_ht_pp': False,
#     'def_age_pp': True,
#     'def_literacy': True,
#     'def_religion_pp': True,
#     'marital_status_pp': True,
#     'children_nr': False,
#     'cash_sav_pp': False,
#     'occupation_pp': False,
#     'coloffence_info': True,
#     'offence_ni': True,
#     'death_in_custody_pp': True,
#     'length_of_stay_until_probat': False,
#     'length_of_stay_until_tl': False,
#     'previous_convictions': True
# }
# # offence_pp_general, def_age_pp, def_literacy, def_religion_pp, marital_status_pp, coloffence_info, offence_ni, death_in_custody_pp, previous_convictions


# Feature names listed here are switched off; every other feature is enabled.
# The set is empty, so all features are currently enabled.
_DISABLED_FEATURES = frozenset()

# feature name -> True when the feature is enabled.
# Key order follows the tuple below.
ENABLED_FEATURE_MAPPING = {
    feature: feature not in _DISABLED_FEATURES
    for feature in (
        'offence_pp_general',
        'offence_pp',
        'pp_sentence_years',
        'trial_month',
        'trial_year',
        'trial_place',
        'pris_ht_pp',
        'def_age_pp',
        'def_literacy',
        'def_religion_pp',
        'marital_status_pp',
        'children_nr',
        'cash_sav_pp',
        'occupation_pp',
        'coloffence_info',
        'offence_ni',
        'death_in_custody_pp',
        'length_of_stay_until_probat',
        'length_of_stay_until_tl',
        'previous_convictions',
    )
}



# Maps each feature name to a boolean: True when the feature is numerical,
# False when it is categorical.
_NUMERICAL, _CATEGORICAL = True, False

FEATURE_TYPE_MAPPING = dict([
    ('offence_pp_general', _CATEGORICAL),
    ('offence_pp', _CATEGORICAL),
    ('pp_sentence_years', _NUMERICAL),
    ('trial_month', _NUMERICAL),
    ('trial_year', _NUMERICAL),
    ('trial_place', _CATEGORICAL),
    ('pris_ht_pp', _NUMERICAL),
    ('def_age_pp', _NUMERICAL),
    ('def_literacy', _NUMERICAL),
    ('def_religion_pp', _CATEGORICAL),
    ('marital_status_pp', _CATEGORICAL),
    ('children_nr', _NUMERICAL),
    ('cash_sav_pp', _NUMERICAL),
    ('occupation_pp', _CATEGORICAL),
    ('coloffence_info', _NUMERICAL),
    ('offence_ni', _NUMERICAL),
    ('death_in_custody_pp', _NUMERICAL),
    ('length_of_stay_until_probat', _NUMERICAL),
    ('length_of_stay_until_tl', _NUMERICAL),
    ('previous_convictions', _NUMERICAL),
])

# Keep the columns whose feature flag is on; the target column is retained as well.
enabled_columns = [
    col for col in data.columns
    if ENABLED_FEATURE_MAPPING[col] or col == TARGET_VARIABLE
]
data = data[enabled_columns]

# Put the target column last.
feature_columns = [col for col in data.columns if col != TARGET_VARIABLE]
data = data[feature_columns + [TARGET_VARIABLE]]

# Binarise the target: 1 where pp_sentence_years equals 99, 0 everywhere else.
data[TARGET_VARIABLE] = (data[TARGET_VARIABLE] == 99).astype('int64')

# Final column layout: numerical features, then categorical features, then the target.
numerical_features = [col for col in feature_columns if FEATURE_TYPE_MAPPING[col]]
categorical_features = [col for col in feature_columns if not FEATURE_TYPE_MAPPING[col]]
data = data[numerical_features + categorical_features + [TARGET_VARIABLE]]

# Normalize the numerical features
def normalize_data(data, feature_types=None, target=None):
    """Z-score the numerical feature columns and return the result as a new frame.

    Each numerical column other than the target is replaced by
    ``(column - mean) / std``, using pandas' default sample standard
    deviation. Categorical columns and the target are passed through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalise. It is not modified; a copy is returned.
    feature_types : mapping, optional
        Column name -> True for numerical, False for categorical.
        Defaults to the module-level ``FEATURE_TYPE_MAPPING``.
    target : str, optional
        Name of the target column, which is never normalised.
        Defaults to the module-level ``TARGET_VARIABLE``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the numerical feature columns standardised.

    Raises
    ------
    KeyError
        If a non-target column is missing from ``feature_types``.

    Notes
    -----
    A column whose standard deviation is zero or undefined (constant column,
    single row) is only centred. Dividing would fill it with NaN/inf.

    NOTE(review): the mean and std come from whatever rows are passed in. If
    this is called on the full dataset before a train/validation/test split,
    the statistics include the held-out rows.
    """
    if feature_types is None:
        feature_types = FEATURE_TYPE_MAPPING
    if target is None:
        target = TARGET_VARIABLE

    # Work on a copy so the caller's frame is left as it was.
    normalized = data.copy()
    for col in normalized.columns:
        if col == target or not feature_types[col]:
            continue
        centered = normalized[col] - normalized[col].mean()
        spread = normalized[col].std()
        # `spread > 0` is False for both 0 and NaN, so degenerate columns
        # skip the division and stay centred.
        normalized[col] = centered / spread if spread > 0 else centered
    return normalized
data = normalize_data(data)

# Expected width after one-hot encoding:
#   number of numerical features
#   + sum over categorical features of their distinct category counts
#   + 1 for the target column.
# nunique() ignores missing values, matching pd.get_dummies' default of not
# creating an indicator column for NaN; len(unique()) would count NaN as an
# extra category and make the dimension assertion below fail spuriously.
dim_expected = (
    len(numerical_features)
    + sum(data[col].nunique() for col in categorical_features)
    + 1
)
print(f"Expected dimension: {dim_expected}")

# Backup data before one-hot encoding
data_backup = data.copy()

# For each categorical feature, convert it to a one-hot encoding
def one_hot_encode(data, feature_types=None, target=None):
    """Replace every categorical feature column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is not modified.
    feature_types : mapping, optional
        Column name -> True for numerical, False for categorical.
        Defaults to the module-level ``FEATURE_TYPE_MAPPING``.
    target : str, optional
        Name of the target column, which is never encoded.
        Defaults to the module-level ``TARGET_VARIABLE``.

    Returns
    -------
    pandas.DataFrame
        The non-categorical columns in their original order, followed by the
        indicator columns (named ``<column>_<category>``), grouped by source
        column in the order those columns appear in ``data``. When there is
        nothing to encode, ``data`` itself is returned.

    Raises
    ------
    KeyError
        If a non-target column is missing from ``feature_types``.
    """
    if feature_types is None:
        feature_types = FEATURE_TYPE_MAPPING
    if target is None:
        target = TARGET_VARIABLE

    categorical = [
        col for col in data.columns
        if col != target and not feature_types[col]
    ]
    if not categorical:
        return data

    # One get_dummies call replaces the per-column drop + concat loop. It
    # keeps the remaining columns first and appends each column's indicators
    # in the order given by `columns`, prefixed with the source column name.
    return pd.get_dummies(data, columns=categorical)
data = one_hot_encode(data)

# Sanity check: the encoded frame must have exactly the width computed earlier.
assert data.shape[1] == dim_expected, f"Data shape {data.shape[1]} does not match expected dimension {dim_expected}"

# Encoding appended the indicator columns after the target, so move the
# target back to the last position, then cast everything to float and
# hand the values over as a NumPy array.
encoded_feature_columns = [col for col in data.columns if col != TARGET_VARIABLE]
data = (
    data[encoded_feature_columns + [TARGET_VARIABLE]]
    .astype(float)
    .to_numpy()
)

# 60/20/20 train/validation/test split.
# The last column of the array is the target; everything before it is a feature.
X, y = data[:, :-1], data[:, -1]

# First hold out 40% of the rows, then cut that hold-out in half to get
# equally sized validation and test sets.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

Expected dimension: 119
X_train shape: (2705, 118)
X_val shape: (902, 118)
X_test shape: (902, 118)
y_train shape: (2705,)
y_val shape: (902,)
y_test shape: (902,)


In [81]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# Hidden-layer layouts to compare on the validation split.
architectures = [
    (75, 50),
    # (100,),
    # (100, 50)
]
max_iter = 200

# Decision threshold applied to the predicted probability of class 1.
# It is loop-invariant, so it is set once here rather than on every iteration;
# that also keeps it defined when `architectures` is empty. The spelling of
# the name is kept as-is because code outside this cell may refer to it.
thredold = 0.5

for layers in architectures:
    clf = MLPClassifier(
        hidden_layer_sizes=layers,
        activation='relu',
        solver='adam',
        max_iter=max_iter,
        random_state=42,
        verbose=True,
        early_stopping=True,
        validation_fraction=0.1 # fraction of training data to set aside as validation set for early stopping
    )
    # Train
    clf.fit(X_train, y_train)

    # Score the validation split: column 1 of predict_proba is the
    # probability of class 1, which is then thresholded into hard labels.
    y_prob = clf.predict_proba(X_val)[:, 1]
    y_pred = (y_prob >= thredold).astype(int)

    # Accuracy and F1 use the hard labels; ROC-AUC uses the probabilities.
    acc   = accuracy_score(y_val, y_pred)
    f1    = f1_score(y_val, y_pred)
    roc   = roc_auc_score(y_val, y_prob)

    print(f"Arc={layers}:  ACC={acc:.3f},  F1={f1:.3f},  ROC-AUC={roc:.3f}, Threshold={thredold:.3f}, max_iter={max_iter}, actual_iter={clf.n_iter_}")

Iteration 1, loss = 0.65745890
Validation score: 0.671587
Iteration 2, loss = 0.60577685
Validation score: 0.686347
Iteration 3, loss = 0.57358782
Validation score: 0.715867
Iteration 4, loss = 0.54818015
Validation score: 0.738007
Iteration 5, loss = 0.52510220
Validation score: 0.749077
Iteration 6, loss = 0.50520661
Validation score: 0.760148
Iteration 7, loss = 0.48789970
Validation score: 0.782288
Iteration 8, loss = 0.47370368
Validation score: 0.782288
Iteration 9, loss = 0.45906174
Validation score: 0.793358
Iteration 10, loss = 0.44864933
Validation score: 0.797048
Iteration 11, loss = 0.43810366
Validation score: 0.797048
Iteration 12, loss = 0.42845278
Validation score: 0.800738
Iteration 13, loss = 0.41784337
Validation score: 0.793358
Iteration 14, loss = 0.40924785
Validation score: 0.808118
Iteration 15, loss = 0.39904125
Validation score: 0.804428
Iteration 16, loss = 0.39218320
Validation score: 0.797048
Iteration 17, loss = 0.38304346
Validation score: 0.808118
Iterat

In [82]:
# Final evaluation on test set
best_architecture = architectures[0]
max_iter = 200
model = MLPClassifier(
    hidden_layer_sizes=best_architecture,
    activation='relu',
    solver='adam',
    max_iter=max_iter,
    random_state=42,
    verbose=True,
    early_stopping=True,
    validation_fraction=0.1 # fraction of training data to set aside as validation set for early stopping
)
model.fit(X_train, y_train)

# Use predict_proba, not predict: predict returns hard 0/1 labels, which
# would make the threshold below a no-op and make ROC-AUC score labels
# instead of ranking probabilities. Column 1 is the probability of class 1.
y_prob = model.predict_proba(X_test)[:, 1]
# `thredold` is the decision threshold set in the architecture-comparison cell,
# so test and validation metrics use the same cut-off.
y_pred = (y_prob >= thredold).astype(int) # convert probabilities to binary predictions

# Accuracy and F1 use the hard labels; ROC-AUC uses the probabilities.
acc   = accuracy_score(y_test, y_pred)
f1    = f1_score(y_test, y_pred)
roc   = roc_auc_score(y_test, y_prob)

print(f"Final evaluation: ACC={acc:.3f}, F1={f1:.3f}, ROC-AUC={roc:.3f}, Threshold={thredold:.3f}, max_iter={max_iter}, actual_iter={model.n_iter_}")

Iteration 1, loss = 0.65745890
Validation score: 0.671587
Iteration 2, loss = 0.60577685
Validation score: 0.686347
Iteration 3, loss = 0.57358782
Validation score: 0.715867
Iteration 4, loss = 0.54818015
Validation score: 0.738007
Iteration 5, loss = 0.52510220
Validation score: 0.749077
Iteration 6, loss = 0.50520661
Validation score: 0.760148
Iteration 7, loss = 0.48789970
Validation score: 0.782288
Iteration 8, loss = 0.47370368
Validation score: 0.782288
Iteration 9, loss = 0.45906174
Validation score: 0.793358
Iteration 10, loss = 0.44864933
Validation score: 0.797048
Iteration 11, loss = 0.43810366
Validation score: 0.797048
Iteration 12, loss = 0.42845278
Validation score: 0.800738
Iteration 13, loss = 0.41784337
Validation score: 0.793358
Iteration 14, loss = 0.40924785
Validation score: 0.808118
Iteration 15, loss = 0.39904125
Validation score: 0.804428
Iteration 16, loss = 0.39218320
Validation score: 0.797048
Iteration 17, loss = 0.38304346
Validation score: 0.808118
Iterat