In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
## Preprocessing

# Read the cleaned dataset and discard the trial_id column in one step
data = pd.read_csv('VDL_Norfolk_Island_Penal_Colony_Cleaned_No_Outliers.csv').drop(columns=['trial_id'])

# Name of the column the models predict
TARGET_VARIABLE = 'pp_sentence_years'

# Feature toggles: maps every column name to True (keep the column) or False (drop it).
# The target column is kept by the selection step below even when its flag is False.
# Every column present in the dataframe must have an entry here; a missing key raises KeyError
# during column selection.
# NOTE(review): the original comment said "do not use enabled_features" -- presumably an
# older variable name; confirm nothing else still refers to it.

# Preset 1 (inactive): 4 disabled features
# ENABLED_FEATURE_MAPPING = {
#     'offence_pp_general': True,
#     'offence_pp': True,
#     'pp_sentence_years': True,
#     'trial_month': True,
#     'trial_year': True,
#     'trial_place': True,
#     'pris_ht_pp': True,
#     'def_age_pp': True,
#     'def_literacy': True,
#     'def_religion_pp': True,
#     'marital_status_pp': True,
#     'children_nr': False,
#     'cash_sav_pp': False,
#     'occupation_pp': True,
#     'coloffence_info': True,
#     'offence_ni': True,
#     'death_in_custody_pp': True,
#     'length_of_stay_until_probat': False,
#     'length_of_stay_until_tl': False,
#     'previous_convictions': True
# }


# Preset 2 (inactive): reduced feature set
# ENABLED_FEATURE_MAPPING = {
#     'offence_pp_general': True,
#     'offence_pp': False,
#     'pp_sentence_years': True,
#     'trial_month': False,
#     'trial_year': False,
#     'trial_place': False,
#     'pris_ht_pp': False,
#     'def_age_pp': True,
#     'def_literacy': True,
#     'def_religion_pp': True,
#     'marital_status_pp': True,
#     'children_nr': False,
#     'cash_sav_pp': False,
#     'occupation_pp': False,
#     'coloffence_info': True,
#     'offence_ni': True,
#     'death_in_custody_pp': True,
#     'length_of_stay_until_probat': False,
#     'length_of_stay_until_tl': False,
#     'previous_convictions': True
# }
# Features kept by preset 2 (besides the target): offence_pp_general, def_age_pp, def_literacy,
# def_religion_pp, marital_status_pp, coloffence_info, offence_ni, death_in_custody_pp,
# previous_convictions


# Active configuration: every feature enabled
ENABLED_FEATURE_MAPPING = {
    'offence_pp_general': True,
    'offence_pp': True,
    'pp_sentence_years': True,
    'trial_month': True,
    'trial_year': True,
    'trial_place': True,
    'pris_ht_pp': True,
    'def_age_pp': True,
    'def_literacy': True,
    'def_religion_pp': True,
    'marital_status_pp': True,
    'children_nr': True,
    'cash_sav_pp': True,
    'occupation_pp': True,
    'coloffence_info': True,
    'offence_ni': True,
    'death_in_custody_pp': True,
    'length_of_stay_until_probat': True,
    'length_of_stay_until_tl': True,
    'previous_convictions': True
}



# Maps every column name to True when the column is treated as numerical and False when it
# is treated as categorical. Numerical feature columns are z-score normalized by
# normalize_data; categorical ones are expanded by one_hot_encode. The target column's own
# entry is looked up but the target itself is never normalized or encoded.
# Every column that survives feature selection must have an entry here, otherwise the
# lookups below raise KeyError.
FEATURE_TYPE_MAPPING = {
    'offence_pp_general': False,
    'offence_pp': False,
    'pp_sentence_years': True,
    'trial_month': True,
    'trial_year': True,
    'trial_place': False,
    'pris_ht_pp': True,
    'def_age_pp': True,
    'def_literacy': True,
    'def_religion_pp': False,
    'marital_status_pp': False,
    'children_nr': True,
    'cash_sav_pp': True,
    'occupation_pp': False,
    'coloffence_info': True,
    'offence_ni': True,
    'death_in_custody_pp': True,
    'length_of_stay_until_probat': True,
    'length_of_stay_until_tl': True,
    'previous_convictions': True
}

# Columns that survive the feature toggles (the target is always retained)
enabled_columns = [col for col in data.columns if ENABLED_FEATURE_MAPPING[col] or col == TARGET_VARIABLE]

# Partition the surviving feature columns by type, preserving their original order
numerical_features = []
categorical_features = []
for col in enabled_columns:
    is_numerical = FEATURE_TYPE_MAPPING[col]
    if col == TARGET_VARIABLE:
        continue
    if is_numerical:
        numerical_features.append(col)
    else:
        categorical_features.append(col)

# Final layout: numerical features, then categorical features, then the target last
data = data[numerical_features + categorical_features + [TARGET_VARIABLE]]

# Normalize the numerical features
def normalize_data(data, feature_types=None, target=None):
    """Z-score the numerical feature columns of ``data``.

    Each numerical column other than the target is replaced by
    ``(column - mean) / std`` using pandas' default sample standard deviation.
    A column whose standard deviation is zero or undefined (constant column,
    single row) is only centred, so it does not turn into NaN from a 0/0
    division.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to normalize. Every column must be a key of ``feature_types``.
    feature_types : mapping, optional
        Column name -> truthy when the column is numerical. Defaults to the
        module-level ``FEATURE_TYPE_MAPPING``.
    target : str, optional
        Name of the column that is left untouched. Defaults to the
        module-level ``TARGET_VARIABLE``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns; the input frame is not modified.

    Raises
    ------
    KeyError
        If a column of ``data`` has no entry in ``feature_types``.
    """
    if feature_types is None:
        feature_types = FEATURE_TYPE_MAPPING
    if target is None:
        target = TARGET_VARIABLE

    # Work on a copy so the caller's frame (possibly a slice of another frame) is untouched
    normalized = data.copy()
    for col in normalized.columns:
        if not feature_types[col] or col == target:
            continue
        centered = normalized[col] - normalized[col].mean()
        spread = normalized[col].std()
        # `spread > 0` is False for both 0 and NaN, which are the cases where dividing is meaningless
        normalized[col] = centered / spread if spread > 0 else centered
    return normalized
# NOTE(review): the mean/std used for scaling are computed on the full dataset, i.e. before
# the train/validation/test split further down, so held-out rows influence the scaling.
data = normalize_data(data)

# Expected width after one-hot encoding: one column per numerical feature, plus one column
# per distinct non-missing value of each categorical feature, plus 1 for the target.
# nunique() ignores NaN, matching pd.get_dummies, which by default emits no NaN column;
# counting NaN (as unique() does) would make the shape assertion fail on missing values.
dim_expected = len(numerical_features) + sum(data[col].nunique() for col in categorical_features) + 1
print(f"Expected dimension: {dim_expected}")

# Backup data before one-hot encoding
data_backup = data.copy()

# For each categorical feature, convert it to a one-hot encoding
def one_hot_encode(data, feature_types=None, target=None):
    """Replace every categorical feature column with indicator columns.

    Categorical columns are removed and their indicator columns, named
    ``<column>_<value>``, are appended to the right of the remaining columns
    in the order the categorical columns appeared. The target column is never
    encoded. All columns are encoded in a single ``pd.get_dummies`` call
    instead of growing the frame with one ``concat`` per column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. Every column must be a key of ``feature_types``.
    feature_types : mapping, optional
        Column name -> truthy when the column is numerical (falsy means
        categorical). Defaults to the module-level ``FEATURE_TYPE_MAPPING``.
    target : str, optional
        Name of the column that is never encoded. Defaults to the
        module-level ``TARGET_VARIABLE``.

    Returns
    -------
    pandas.DataFrame
        The encoded frame; ``data`` itself when it has no categorical columns.

    Raises
    ------
    KeyError
        If a column of ``data`` has no entry in ``feature_types``.
    """
    if feature_types is None:
        feature_types = FEATURE_TYPE_MAPPING
    if target is None:
        target = TARGET_VARIABLE

    categorical = [col for col in data.columns if not feature_types[col] and col != target]
    if not categorical:
        # Nothing to encode: hand the frame back unchanged
        return data
    # With explicit `columns`, get_dummies keeps the other columns in place and appends the
    # indicator columns per encoded column, using the column name as prefix.
    return pd.get_dummies(data, columns=categorical)
# Expand every categorical column into indicator columns
data = one_hot_encode(data)

# Sanity check: the encoded width must equal the width computed beforehand
assert data.shape[1] == dim_expected, f"Data shape {data.shape[1]} does not match expected dimension {dim_expected}"

# Indicator columns were appended to the right, so put the target back in the last position
non_target_columns = data.columns.drop(TARGET_VARIABLE).tolist()
data = data[non_target_columns + [TARGET_VARIABLE]]

# Cast everything to float and hand over a plain numpy array
data = data.astype(float).to_numpy()

# Features are all columns but the last; the last column is the target
X, y = data[:, :-1], data[:, -1]

# 60/20/20 split: hold out 40% first, then cut the held-out part in half
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Report the shape of every split
print(f"X_train shape: {X_train.shape}")
print(f"X_val shape: {X_val.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_val shape: {y_val.shape}")
print(f"y_test shape: {y_test.shape}")

Expected dimension: 119
X_train shape: (2705, 118)
X_val shape: (902, 118)
X_test shape: (902, 118)
y_train shape: (2705,)
y_val shape: (902,)
y_test shape: (902,)


In [3]:
from sklearn.neural_network import MLPRegressor
from scipy.stats import ttest_rel

# Candidate hidden-layer layouts; each tuple lists the neuron count of every hidden layer
architectures = [
    (75, 50), # 2 hidden layers × 75 and 50 neurons
    # (100,), # 1 hidden layer × 100 neurons
    # (100, 50) # 2 hidden layers × 100 and 50 neurons
]

# NOTE(review): `scoring` and `i` are not referenced in this cell; kept in case other cells use them
scoring = 'neg_mean_squared_error'

# Validation scores, one entry per architecture, in the order of `architectures`
score_list_mse = []
score_list_mae = []

i = 0

# Upper bound on training iterations for every candidate
max_iter = 100
for architecture in architectures:
    # Fresh model per candidate so no weights carry over between architectures
    model = MLPRegressor(
        hidden_layer_sizes=architecture,
        activation='relu',
        solver='adam',
        max_iter=max_iter,
        random_state=42,
        verbose=True,
    )
    # Fit on the training split, then score on the validation split
    model.fit(X_train, y_train)
    y_val_pred = model.predict(X_val)
    mse = np.mean((y_val - y_val_pred) ** 2)
    mae = np.mean(np.abs(y_val - y_val_pred))

    # Record the scores so architectures can be compared after the loop
    score_list_mse.append(mse)
    score_list_mae.append(mae)

    print(f"Architecture: {architecture}, MSE: {mse}, MAE: {mae} , max_iter: {max_iter}, actual_iter: {model.n_iter_}")

Iteration 1, loss = 2013.07742874
Iteration 2, loss = 1944.85201284
Iteration 3, loss = 1843.66849028
Iteration 4, loss = 1679.70640512
Iteration 5, loss = 1432.81659880
Iteration 6, loss = 1135.63022139
Iteration 7, loss = 905.78470340
Iteration 8, loss = 804.77447541
Iteration 9, loss = 774.07542671
Iteration 10, loss = 749.47948041
Iteration 11, loss = 729.84086098
Iteration 12, loss = 714.11093639
Iteration 13, loss = 701.19573955
Iteration 14, loss = 688.59374852
Iteration 15, loss = 678.18512009
Iteration 16, loss = 668.11627251
Iteration 17, loss = 659.14931736
Iteration 18, loss = 650.63519767
Iteration 19, loss = 643.20840954
Iteration 20, loss = 635.58699817
Iteration 21, loss = 629.04146994
Iteration 22, loss = 623.13133976
Iteration 23, loss = 617.75534603
Iteration 24, loss = 613.43237686
Iteration 25, loss = 608.13835560
Iteration 26, loss = 603.40296126
Iteration 27, loss = 599.15413863
Iteration 28, loss = 596.12504536
Iteration 29, loss = 593.01872960
Iteration 30, los



In [4]:
# Final evaluation on test set
# NOTE(review): the first entry of `architectures` is taken as the best one without
# comparing validation scores; revisit once more than one architecture is enabled.
best_architecture = architectures[0]
model = MLPRegressor(
    hidden_layer_sizes=best_architecture,
    activation='relu',
    solver='adam',
    # Reuse the iteration budget from the validation run so the two models cannot drift apart
    max_iter=max_iter,
    random_state=42,
    verbose=True,
)
# Refit on the training split and score on the untouched test split
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
mse = np.mean((y_test - y_test_pred) ** 2)
mae = np.mean(np.abs(y_test - y_test_pred))
print(f"Final evaluation on test set, Architecture: {best_architecture}, MSE: {mse}, MAE: {mae}")


Iteration 1, loss = 2013.07742874
Iteration 2, loss = 1944.85201284
Iteration 3, loss = 1843.66849028
Iteration 4, loss = 1679.70640512
Iteration 5, loss = 1432.81659880
Iteration 6, loss = 1135.63022139
Iteration 7, loss = 905.78470340
Iteration 8, loss = 804.77447541
Iteration 9, loss = 774.07542671
Iteration 10, loss = 749.47948041
Iteration 11, loss = 729.84086098
Iteration 12, loss = 714.11093639
Iteration 13, loss = 701.19573955
Iteration 14, loss = 688.59374852
Iteration 15, loss = 678.18512009
Iteration 16, loss = 668.11627251
Iteration 17, loss = 659.14931736
Iteration 18, loss = 650.63519767
Iteration 19, loss = 643.20840954
Iteration 20, loss = 635.58699817
Iteration 21, loss = 629.04146994
Iteration 22, loss = 623.13133976
Iteration 23, loss = 617.75534603
Iteration 24, loss = 613.43237686
Iteration 25, loss = 608.13835560
Iteration 26, loss = 603.40296126
Iteration 27, loss = 599.15413863
Iteration 28, loss = 596.12504536
Iteration 29, loss = 593.01872960
Iteration 30, los

