<a href="https://colab.research.google.com/github/gohigh19/EembeddedBoard/blob/master/Prediction_with_police_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Training Script
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import joblib
import os

# Folder that holds the input CSV and, later, the saved model artifacts.
base_path = '/content/drive/My Drive/Police Data AI/'
file_path = os.path.join(base_path, "combined_data_stop_and_search.csv")

# --- Load Data ---
df = pd.read_csv(file_path)
print(f"Loaded data with {df.shape[0]} rows and {df.shape[1]} columns")

# --- Filling the Missing Values for Specific Columns ---
# Placeholder written into each optional column wherever its value is missing.
# NOTE(review): the "Removal ..." column is filled with the *string* "False";
# if read_csv parsed that column as booleans it will then hold both bool and
# str values -- confirm against the CSV.
missing_placeholders = {
    "Outcome linked to object of search": "unknown",
    "Removal of more than just outer clothing": "False",
    "Gender": "Not specified",
    "Age range": "Not specified",
    "Self-defined ethnicity": "Not stated",
    "Officer-defined ethnicity": "Not stated",
}
for column, placeholder in missing_placeholders.items():
    df[column] = df[column].fillna(placeholder)

# --- Process Date Column ---
# Unparseable dates become NaT (errors='coerce'), which in turn gives NaN for
# Year / Month_num; those rows are removed by the dropna below.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
df['Date'] = parsed_dates
df['Year'] = parsed_dates.dt.year
df['Month_num'] = parsed_dates.dt.month

# --- Drop Rows Missing Essential Information ---
essential_cols = ["County", "Year", "Month_num", "Legislation", "Object of search", "Outcome"]
df_before = len(df)
df = df.dropna(subset=essential_cols)
df_after = len(df)
print(f"Dropped {df_before - df_after} rows missing essential data (County, Year, Month_num, Legislation, Object of search, Outcome)")

# --- Defining Input and Output Columns ---
input_cols = ["Type", "Longitude", "Latitude", "Gender", "Age range", "Self-defined ethnicity",
              "Officer-defined ethnicity", "Legislation", "Object of search", "County",
              "Removal of more than just outer clothing", "Year", "Month_num"]
# Outputs:
output_cols = ["Outcome", "Outcome linked to object of search"]

# --- Prepare Inputs ---
# Numerical features are standardised; every other input column is one-hot encoded.
num_cols = ["Longitude", "Latitude", "Year", "Month_num"]
cat_cols = [col for col in input_cols if col not in num_cols]

# For safety, if Longitude or Latitude are missing, fill with 0.0.
df["Longitude"] = df["Longitude"].fillna(0.0)
df["Latitude"] = df["Latitude"].fillna(0.0)

# One-hot encode categorical features and place the numerical columns first.
X_cat = pd.get_dummies(df[cat_cols], prefix=cat_cols)
X_num = df[num_cols]
X = pd.concat([X_num, X_cat], axis=1)

# Converting to float32.
X = X.astype('float32')

# Save training feature columns order to use during inference.
X_columns = X.columns
print("Final feature set shape:", X_columns.shape)

# --- Prepare Outputs ---
# Outcome: multi-class target, label-encoded then one-hot encoded.
le_outcome = LabelEncoder()
df["Outcome_enc"] = le_outcome.fit_transform(df["Outcome"].astype(str))
n_outcome_classes = df["Outcome_enc"].nunique()
y_outcome = to_categorical(df["Outcome_enc"], num_classes=n_outcome_classes)

# Outcome linked: categorical target, encoded the same way.
le_linked = LabelEncoder()
df["Outcome_linked_enc"] = le_linked.fit_transform(df["Outcome linked to object of search"].astype(str))
n_linked_classes = df["Outcome_linked_enc"].nunique()
y_linked = to_categorical(df["Outcome_linked_enc"], num_classes=n_linked_classes)

print(f"Number of Outcome classes: {n_outcome_classes}")
print(f"Number of Outcome linked classes: {n_linked_classes}")

# --- Train/Test Split ---
# Split row positions first so the scaler below can be fitted on training rows
# only. Splitting np.arange(n) with the same test_size / random_state selects
# the same rows as splitting the arrays themselves.
train_idx, test_idx = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

# --- Scale numerical features ---
# Fit on the training rows only: fitting on the full data set would let the
# mean/variance of the held-out rows leak into the preprocessing.
scaler = StandardScaler()
scaler.fit(X[num_cols].iloc[train_idx])
X[num_cols] = scaler.transform(X[num_cols])

# Materialise the splits as plain arrays for Keras.
X_values = X.values
X_train, X_test = X_values[train_idx], X_values[test_idx]
del X_values  # drop the extra reference to the full matrix

y_outcome_train, y_outcome_test = y_outcome[train_idx], y_outcome[test_idx]
y_linked_train, y_linked_test = y_linked[train_idx], y_linked[test_idx]

# --- Build Multi-Output Neural Network ---
input_dim = X_train.shape[1]
print("Input dimension:", input_dim)
inp = Input(shape=(input_dim,))

# Shared trunk: (units, dropout rate) per hidden layer; None means the layer
# is not followed by dropout.
x = inp
for units, drop_rate in ((256, 0.2), (128, 0.1), (64, None)):
    x = Dense(units, activation='relu')(x)
    if drop_rate is not None:
        x = Dropout(drop_rate)(x)

# Two softmax heads on top of the shared trunk:
# one for Outcome, one for "Outcome linked to object of search".
out1 = Dense(n_outcome_classes, activation='softmax', name='outcome')(x)
out2 = Dense(n_linked_classes, activation='softmax', name='outcome_linked')(x)

model = Model(inputs=inp, outputs=[out1, out2])

# Both heads use the same loss and metric, keyed by output-layer name.
head_names = ('outcome', 'outcome_linked')
model.compile(
    optimizer='adam',
    loss={head: 'categorical_crossentropy' for head in head_names},
    metrics={head: 'accuracy' for head in head_names},
)

model.summary()

# --- Train the Model ---
# NOTE(review): the test split is also used as validation data and drives
# early stopping, so the test metrics reported below are not fully independent
# of training -- consider a separate validation split.
history = model.fit(
    X_train, {'outcome': y_outcome_train, 'outcome_linked': y_linked_train},
    validation_data=(X_test, {'outcome': y_outcome_test, 'outcome_linked': y_linked_test}),
    epochs=30,
    batch_size=64,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
)

# --- Evaluate the Model ---
# Ask for a dict and look metrics up by name: the position of each value in
# the list returned by evaluate() is not a stable contract for multi-output
# models, so positional unpacking can silently report a loss as an accuracy.
test_metrics = model.evaluate(
    X_test, {'outcome': y_outcome_test, 'outcome_linked': y_linked_test},
    return_dict=True
)
loss = test_metrics['loss']
loss_outcome = test_metrics['outcome_loss']
loss_linked = test_metrics['outcome_linked_loss']
acc_outcome = test_metrics['outcome_accuracy']
acc_linked = test_metrics['outcome_linked_accuracy']
print("Test Outcome Accuracy:", acc_outcome)
print("Test Outcome Linked Accuracy:", acc_linked)

# --- Save Model and Preprocessing Objects ---
model_save_path = os.path.join(base_path, 'ai_model.keras')
model.save(model_save_path)

# Everything the inference script reloads, keyed by file name inside base_path.
# X_columns is stored as a plain list of the training feature names.
artifacts = {
    'ai_scaler.pkl': scaler,
    'ai_outcome_encoder.pkl': le_outcome,
    'ai_linked_encoder.pkl': le_linked,
    'ai_X_columns.pkl': X_columns.tolist(),
    'ai_input_cols.pkl': input_cols,
    'ai_num_cols.pkl': num_cols,
    'ai_cat_cols.pkl': cat_cols,
}
for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(base_path, filename))

print("Training completed and model saved successfully.")


Loaded data with 3157084 rows and 14 columns
Dropped 381777 rows missing essential data (County, Year, Month_num, Legislation, Object of search, Outcome)
Final feature set shape: (125,)
Number of Outcome classes: 7
Number of Outcome linked classes: 3
Input dimension: 125


Epoch 1/30
[1m34692/34692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 4ms/step - loss: 1.1702 - outcome_accuracy: 0.7407 - outcome_linked_accuracy: 0.8500 - outcome_linked_loss: 0.3435 - outcome_loss: 0.8267 - val_loss: 1.1009 - val_outcome_accuracy: 0.7424 - val_outcome_linked_accuracy: 0.8650 - val_outcome_linked_loss: 0.3018 - val_outcome_loss: 0.7990
Epoch 2/30
[1m34692/34692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 4ms/step - loss: 1.1106 - outcome_accuracy: 0.7413 - outcome_linked_accuracy: 0.8643 - outcome_linked_loss: 0.3053 - outcome_loss: 0.8053 - val_loss: 1.0951 - val_outcome_accuracy: 0.7424 - val_outcome_linked_accuracy: 0.8669 - val_outcome_linked_loss: 0.2990 - val_outcome_loss: 0.7961
Epoch 3/30
[1m34692/34692[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m161s[0m 4ms/step - loss: 1.1056 - outcome_accuracy: 0.7417 - outcome_linked_accuracy: 0.8655 - outcome_linked_loss: 0.3027 - outcome_loss: 0.8029 - val_loss: 1.0949 - val_outcome_accura

In [2]:
# Inference Script
import pandas as pd
import numpy as np
import joblib
from tensorflow.keras.models import load_model

# Folder containing the model and preprocessing files written by the training script.
base_path = '/content/drive/My Drive/Police Data AI/'

# --- Load Saved Model and Preprocessing Objects ---
model = load_model(base_path + 'ai_model.keras')

# The pickled helpers are loaded in the order of the names on the left-hand
# side; X_columns is the list of training feature names.
scaler, le_outcome, le_linked, X_columns, input_cols, num_cols, cat_cols = (
    joblib.load(base_path + filename)
    for filename in (
        'ai_scaler.pkl',
        'ai_outcome_encoder.pkl',
        'ai_linked_encoder.pkl',
        'ai_X_columns.pkl',
        'ai_input_cols.pkl',
        'ai_num_cols.pkl',
        'ai_cat_cols.pkl',
    )
)

# --- Define Expected Input Keys and Default Values ---
# The expected keys are exactly the input columns used for training.
expected_keys = input_cols

# Fallback used for any expected key that is absent or NaN in a request.
# Keys without an entry here ("Legislation", "Object of search") fall back to None.
default_values = {
    "Type": "unknown",
    "Gender": "Not specified",
    "Age range": "Not specified",
    "Self-defined ethnicity": "Not stated",
    "Officer-defined ethnicity": "Not stated",
    "County": "unknown",  # County is essential.
    "Removal of more than just outer clothing": "False",
    # Numerical fields default to 0.0 when missing.
    "Longitude": 0.0,
    "Latitude": 0.0,
    "Year": 2021,
    "Month_num": 1
}

def fill_missing_inputs(input_dict):
    """Return a copy of ``input_dict`` with every expected key populated.

    For each key in ``expected_keys`` that is absent from ``input_dict`` or
    whose value is NaN/None, the value from ``default_values`` is used; keys
    with no configured default are set to ``None``.

    The caller's dictionary is not modified: the filling is done on a shallow
    copy, which is what gets returned.

    Args:
        input_dict: Mapping of feature name to raw value for one record.

    Returns:
        A new ``dict`` containing all entries of ``input_dict`` plus the
        filled-in defaults.
    """
    # Copy first so repeated calls never alter the caller's data.
    filled = dict(input_dict)
    for key in expected_keys:
        if key not in filled or pd.isna(filled[key]):
            filled[key] = default_values.get(key, None)
    return filled

def prepare_input(input_dict):
    """Turn one raw input dictionary into a model-ready single-row DataFrame.

    Steps:
      - Fill in missing keys via ``fill_missing_inputs`` (on a copy, so the
        caller's dictionary is left untouched).
      - Build a one-row DataFrame.
      - One-hot encode the categorical columns (``cat_cols``).
      - Reindex to the training column order ``X_columns``; columns absent
        from the input are filled with 0.
      - Cast to float32 and scale the numerical columns (``num_cols``) with
        the fitted ``scaler``.

    A category value that produces a one-hot column not present in
    ``X_columns`` is dropped by the reindex step, i.e. that feature is encoded
    as all zeros. A ``UserWarning`` listing such columns is emitted so the
    situation is not silent (typical cause: a spelling or case mismatch).

    Args:
        input_dict: Mapping of feature name to raw value for one record.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are ``X_columns``.
    """
    import warnings  # local import: the inference cell has no top-level import for it

    # Fill missing keys on a copy of the caller's dictionary.
    filled = fill_missing_inputs(dict(input_dict))
    df_input = pd.DataFrame([filled])

    # Split into numerical and categorical parts.
    X_num_input = df_input[num_cols]
    X_cat_input = pd.get_dummies(df_input[cat_cols], prefix=cat_cols)

    # Report categories the reindex below would silently discard.
    known_columns = set(X_columns)
    unseen = [col for col in X_cat_input.columns if col not in known_columns]
    if unseen:
        warnings.warn(
            "Categories not seen during training are encoded as all zeros: "
            + ", ".join(map(str, unseen)),
            stacklevel=2,
        )

    X_input = pd.concat([X_num_input, X_cat_input], axis=1)

    # Reindex so that X_input has exactly the same columns as used in training.
    X_input = X_input.reindex(columns=X_columns, fill_value=0)

    # Force type float32.
    X_input = X_input.astype('float32')

    # Scale numerical features with the scaler fitted at training time.
    X_input[num_cols] = scaler.transform(X_input[num_cols])

    return X_input

# --- Sample Inference ---
# Any expected key left out of this dictionary is filled from default_values by
# prepare_input; in particular "Longitude"/"Latitude" fall back to 0.0.
sample_input = {
    "Type": "Person search",
    # Optional coordinates -- uncomment to supply them instead of the 0.0 defaults:
    #"Longitude": -0.471383,
    #"Latitude": 52.138572,
    "Gender": "Male",
    "Age range": "25-34",
    "Self-defined ethnicity": "White - English/Welsh/Scottish/Northern Irish/British",
    "Officer-defined ethnicity": "White",
    "Legislation": "Police and Criminal Evidence Act 1984 (section 1)",
    "Object of search": "Offensive weapons",
    "County": "bedfordshire",   # Essential; the default "unknown" is used when omitted.
    "Removal of more than just outer clothing": "False",
    "Year": 2021,
    "Month_num": 1
}

# Encode and scale the sample exactly as the training features were.
X_sample = prepare_input(sample_input)

# --- Run Prediction ---
# The model has two heads, so predict() yields one probability array per head.
preds = model.predict(X_sample)
pred_outcome_probs = preds[0]   # Outcome probabilities (multi-class)
pred_linked_probs = preds[1]    # Outcome linked probabilities (categorical)

# Most likely class index for the single input row, per head.
pred_outcome_class = pred_outcome_probs.argmax(axis=1)[0]
pred_linked_class = pred_linked_probs.argmax(axis=1)[0]

# Map the class indices back to their original string labels.
pred_outcome_label = le_outcome.inverse_transform([pred_outcome_class])[0]
pred_linked_label = le_linked.inverse_transform([pred_linked_class])[0]

# Top 3 outcomes, most probable first: walk the ascending argsort backwards
# and stop after three entries.
top_outcome_indices = np.argsort(pred_outcome_probs[0])[:-4:-1]
top_outcomes = [
    {"outcome": label, "probability": float(pred_outcome_probs[0][idx])}
    for idx, label in zip(top_outcome_indices,
                          le_outcome.inverse_transform(top_outcome_indices))
]

# --- Displaying the Results ---
print("Predicted Outcome:", pred_outcome_label)
print("Predicted Outcome linked:", pred_linked_label)
print("Outcome Probabilities (softmax):", pred_outcome_probs)
print("Outcome Linked Probabilities (softmax):", pred_linked_probs)
print("\nTop predicted outcomes:")
for outcome in top_outcomes:
    print(f"- {outcome['outcome']}: {outcome['probability']:.4f}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 382ms/step
Predicted Outcome: A no further action disposal
Predicted Outcome linked: unknown
Outcome Probabilities (softmax): [[0.8187613  0.10885515 0.00333715 0.04322225 0.00419729 0.00154502
  0.02008178]]
Outcome Linked Probabilities (softmax): [[0.09788176 0.08816856 0.8139497 ]]

Top predicted outcomes:
- A no further action disposal: 0.8188
- Arrest: 0.1089
- Community resolution: 0.0432
