# Machine Learning Model Experiments

Based on the features:
1. Classify what type of crime it is
2. Regression prediction on where the crime took place
3. Regression prediction on when the crime took place

Try out these models:
1) K Nearest Neighbors
2) Decision Tree
3) Random Forest
4) Logistic Regression
5) Support Vector Machine
6) Multi Layer Perceptron
7) XGBoost

In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, mean_squared_error, 
mean_absolute_error, r2_score, f1_score, precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import GridSearchCV
import joblib
from joblib import dump

In [2]:
data = pd.read_csv("data\data_full_features.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,crime_type,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,street_name,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,17539,Robbery Firearm,-75.07261,40.041574,2023,10,13,23,54,4,0,BLOCK LARGE,6600,0.017699,65.5,51.5,57.2,0.0,0.0,5.4
1,17540,Other Assaults,-75.172952,39.999995,2023,10,13,23,27,4,0,BLOCK N,2900,0.019584,65.5,51.5,57.2,0.0,0.0,5.4
2,17541,Thefts,-75.108462,39.994303,2023,10,13,23,27,4,0,BLOCK JOYCE,3200,0.028199,65.5,51.5,57.2,0.0,0.0,5.4
3,17542,Thefts,-75.136074,39.99189,2023,10,13,23,24,4,0,BLOCK N,2700,0.029554,65.5,51.5,57.2,0.0,0.0,5.4
4,17543,Robbery Firearm,-75.107092,40.032184,2023,10,13,23,19,4,0,BLOCK ADAMS,700,0.038528,65.5,51.5,57.2,0.0,0.0,5.4


In [4]:
feature_data_pipeline = ColumnTransformer([
    ("ordinal", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ["street_name"]),
    ('passthrough', 'passthrough', ['IsWeekend']), 
    ("numerical", StandardScaler(), [
        "block_number", "distance_to_nearest_police_station",
        "temperature_2m_max (°F)", "temperature_2m_min (°F)",
        "temperature_2m_mean (°F)", "precipitation_sum (mm)", 
        "precipitation_hours (h)", "Unemployment Rate of a Population"])
])

# Model Training

### Classifying Crime Type

In [5]:
#Crime Classification:
X_train, X_test, y_train, y_test = train_test_split(data.drop("crime_type", axis=1), data["crime_type"].values.reshape(-1, 1), test_size=0.2)
crimetype_encoder = LabelEncoder()
y_train_encoded = crimetype_encoder.fit_transform(y_train)
y_test_encoded = crimetype_encoder.transform(y_test)

X_train_encoded = feature_data_pipeline.fit_transform(X_train)
X_test_encoded = feature_data_pipeline.transform(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [19]:
X_train_encoded.shape, y_train_encoded.shape, X_test_encoded.shape, y_test_encoded.shape

((2276076, 10), (2276076,), (569019, 10), (569019,))

In [27]:
classifiers = [KNeighborsClassifier()] #, DecisionTreeClassifier(), RandomForestClassifier(), MLPClassifier(), LogisticRegression(), SVC(), XGBClassifier()

classification_results = []

for model in classifiers:
    print(model)
    model.fit(X_train_encoded, y_train_encoded)
    dump(model, f'models/classification/{str(model)}_crime_type.joblib')
    y_pred = model.predict(X_test_encoded)
    accuracy = accuracy_score(y_test_encoded, y_pred)
    precision, recall, fscore, support = precision_recall_fscore_support(y_test_encoded, y_pred)
    f1score = f1_score(y_test_encoded, y_pred, average='weighted')
    roc_auc = roc_auc_score(y_test_encoded, y_pred)
    
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1score)
    print("ROC AUC:", roc_auc)
    classification_results.append([str(model), accuracy, precision, recall, f1score, roc_auc])
    break

KNeighborsClassifier()
Accuracy: 0.21508244891646852
Precision: [0.06899887 0.05795276 0.28282153 0.01453711 0.12076173 0.084142
 0.11767449 0.15453471 0.03658537 0.01949318 0.12079725 0.26490066
 0.01666667 0.31121281 0.09365559 0.20581578 0.2614841  0.2000826
 0.14627995 0.33333333 0.06010929 0.03504043 0.11267606 0.02893309
 0.03542234 0.18579838 0.35411891 0.2183908  0.14349789 0.05840708]
Recall: [0.14615822 0.11018302 0.52479661 0.01157831 0.11261912 0.08505443
 0.07749002 0.12416882 0.0162413  0.00786164 0.09539169 0.16877637
 0.00525394 0.24438455 0.04805342 0.1637318  0.11044776 0.18852624
 0.02913109 0.30350438 0.01314217 0.00387597 0.01179941 0.00515464
 0.00664905 0.12660608 0.32232813 0.09669211 0.08721448 0.00958327]
F1 Score: 0.19662402454284883
DecisionTreeClassifier()
Accuracy: 0.24826236030782803
Precision: [0.16520154 0.10252697 0.43368439 0.03259533 0.15671209 0.11124417
 0.11835299 0.23388133 0.05355715 0.03241491 0.14183563 0.28636364
 0.05088063 0.31709402 0.0834

MemoryError: could not allocate 251658240 bytes

Weighted Precision= ∑(Number of true instances per class × Precision of each class)/Total number of instances

Weighted Recall = ∑(Number of true instances per class×Recall of each class)/ Total number of instances

Micro Precision=Micro Recall= (∑(TP of all classes)+∑(FP of all classes))/∑(TP of all classes)

### Predicting Crime Location

In [None]:
regressors = [KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), MLPRegressor(), XGBRegressor()]
regression_location_results = []
for model in regressors:    
    X_train, X_test, y_train, y_test = train_test_split(transformed, np.argmax(transformed[:, :2], axis=1), test_size=0.2)
    model.fit(X_train, y_train)
    dump(model, f'models/regression/crime_location/{str(model)}_crime_location.joblib')
    
    
    mean_absolute_error
    mean_squared_error
    np.sqrt(mean_squared_error)
    r2_score

### Predicting Crime Time

In [None]:
regressors = [KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), MLPRegressor(), XGBRegressor()]
regression_time_results = []
for model in regressors:    
    X_train, X_test, y_train, y_test = train_test_split(transformed, np.argmax(transformed[:, 9:15], axis=1), test_size=0.2)
    model.fit(X_train, y_train)
    dump(model, f'models/regression/crime_time/{str(model)}_crime_time.joblib')

# Final Model Hyperparameter Tuning

After selecting best model for each of the 3 tasks, we tune them to maximize performance

In [None]:
#gridsearchcv
#cross fold validation
#     print("Classification Report:\n", classification_report(y_test, y_pred))
#     print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
#random forest feature importances

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
sample_weight = (y_train_pred != y_train)
plt.rc('font', size=10)  # extra code
ConfusionMatrixDisplay.from_predictions(y_train, y_train_pred,
                                        sample_weight=sample_weight,
                                        normalize="true", values_format=".0%")
plt.show()

In [None]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Initialize the KNN classifier
knn = KNeighborsClassifier()

# Setup GridSearchCV
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train_encoded, y_train_encoded)

# Retrieve the best KNN model
best_knn = grid_search.best_estimator_

# Making predictions with the best model
y_pred = best_knn.predict(X_test_encoded)

# Evaluate the best model
accuracy = accuracy_score(y_test_encoded, y_pred)
precision, recall, fscore, support = precision_recall_fscore_support(y_test_encoded, y_pred)
f1score = f1_score(y_test_encoded, y_pred, average='weighted')
roc_auc = roc_auc_score(y_test_encoded, y_pred)

# Displaying the results
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1score)
print("ROC AUC:", roc_auc)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
