## Processing - build and run the model

In [1]:
# Import our dependencies
!pip install keras_tuner
from google.colab import drive
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import keras_tuner as kt
import warnings
import time

Collecting keras_tuner
  Downloading keras_tuner-1.4.6-py3-none-any.whl (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.9/128.9 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras_tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras_tuner
Successfully installed keras_tuner-1.4.6 kt-legacy-1.0.5


In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
uploaded = files.upload()

Saving preprocessed.csv to preprocessed.csv


In [4]:
start_time = time.time()

In [5]:
df = pd.read_csv('/content/preprocessed.csv')

In [6]:
# convert categorical variable into dummy/indicator variables
dummy_df = pd.get_dummies(df)
dummy_df.head(1)

Unnamed: 0,ID,Residence_PUMA,Gang_Affiliated,Supervision_Risk_Score_First,Dependents,Prior_Arrest_Episodes_Felony,Prior_Arrest_Episodes_Misd,Prior_Arrest_Episodes_Violent,Prior_Arrest_Episodes_Property,Prior_Arrest_Episodes_Drug,...,Prison_Offense_0,Prison_Offense_Drug,Prison_Offense_Other,Prison_Offense_Property,Prison_Offense_Violent/Non-Sex,Prison_Offense_Violent/Sex,Prison_Years_1-2 years,Prison_Years_Greater than 2 to 3 years,Prison_Years_Less than 1 year,Prison_Years_More than 3 years
0,1,16,False,3,3,6,6,1,3,3,...,0,1,0,0,0,0,0,0,0,1


In [7]:
# Split our preprocessed data into our features and target arrays
X = dummy_df.drop('Recidivism_Within_3years', axis = 1)
y = dummy_df['Recidivism_Within_3years']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Compile, Train and Evaluate the Model

In [8]:
def model_builder(hp):
    model = keras.Sequential()
    model.add(keras.layers.Flatten(input_shape=(X_train_scaled.shape[1],)))

    # Tune the number of units in the first Dense layer
    hp_units1 = hp.Int('units1', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units1, activation='relu'))

    # Add another Dense layer
    hp_units2 = hp.Int('units2', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units2, activation='relu'))

    # Add another Dense layer
    hp_units3 = hp.Int('units3', min_value=32, max_value=512, step=32)
    model.add(keras.layers.Dense(units=hp_units3, activation='relu'))

    model.add(keras.layers.Dense(units=30, activation='sigmoid'))
    model.add(keras.layers.Dense(units=1, activation='sigmoid'))

    # Compile the model
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
                  loss="binary_crossentropy",
                  metrics=["accuracy"])

    return model

In [9]:
# Instantiate the tuner
tuner = kt.Hyperband(model_builder,
                     objective='val_accuracy',
                     max_epochs=10,
                     factor=3)

In [10]:
# Perform hypertuning
tuner.search(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test))

Trial 30 Complete [00h 00m 43s]
val_accuracy: 0.7583217024803162

Best val_accuracy So Far: 0.7721009254455566
Total elapsed time: 00h 12m 15s


In [11]:
# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

In [12]:
# Build the model with the optimal hyperparameters
model = tuner.hypermodel.build(best_hps)

In [13]:
# Define the early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

In [14]:
# Train the model with EarlyStopping
history = model.fit(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [15]:
val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Best epoch: 2


In [16]:
# Re-instantiate the hypermodel and train it with the optimal number of epochs from above.
hypermodel = tuner.hypermodel.build(best_hps)
hypermodel.fit(X_train_scaled, y_train, epochs=best_epoch, validation_data=(X_test_scaled, y_test))

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7b8956de4c10>

In [17]:
# Evaluate the model using the test data
model_loss, model_accuracy = hypermodel.evaluate(X_test_scaled,y_test,verbose=0)

In [18]:
# Get predictions from the model
y_pred = hypermodel.predict(X_test_scaled)



In [19]:
# Convert predictions to binary values (0 or 1)
y_pred_binary = [int(round(p[0])) for p in y_pred]

In [20]:
# Create the confusion matrix
cm = confusion_matrix(y_test, y_pred_binary)

In [21]:
# Print the confusion matrix
print(cm)

print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

[[2088  658]
 [ 824 2889]]
Loss: 0.4232397973537445, Accuracy: 0.7705526947975159


In [22]:
X_list_predicted = []

X_predicted = hypermodel.predict(X_test_scaled)

for x in range(len(X_predicted)):
  X_list_predicted.append(int(round(X_predicted[x][0],0)))

X_series=pd.DataFrame(X_list_predicted, index=y_test.index)

# Create a dataframe of your predictions
predictions_df = X_series.join(y_test, on=y_test.index)

# Print the combined dataframe
predictions_df = predictions_df.rename(columns={
    0 : "Predicted",
    "Recidivism_Within_3years" : "Actual"
})



In [None]:
# Set the index of predictions_df as a new column called 'ID'
predictions_df['ID'] = predictions_df.index

# Merge the dataframes using the 'ID' column as the key
df = df.merge(predictions_df, on='ID')

# Drop the duplicate 'ID' column from the merged dataframe
df = df.drop(columns=['ID'])

In [None]:
copy_df = df

In [None]:
# Define the filename
filename = f"predictions.csv"

# Download the DataFrame as a CSV file
df.to_csv(filename, index=False)

# Download the file to your local machine
files.download(filename)

# Direct the user where to find the file
print(f"{filename} has been exported to your Downloads folder.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

predictions.csv has been exported to your Downloads folder.


In [None]:
# Disable the SettingWithCopyWarning
pd.options.mode.chained_assignment = None

# Define the filename
filename = f"model.keras"

#cut the noise
#with warnings.catch_warnings():
#    warnings.simplefilter("ignore")
# Save the neural network model with the constructed filename
hypermodel.save(filename)

# Download the file
files.download(filename)

# Direct the user where to find the file
print(f"{filename} has been exported to your Downloads folder.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

model.keras has been exported to your Downloads folder.


In [None]:
print(f"This process took {time.time() - start_time} seconds to run.")