# Machine Learning Model Experiments

Using the engineered features, we tackle three prediction tasks:
1. Classify what type of crime it is
2. Regression prediction on where the crime took place
3. Regression prediction on when the crime took place

Try out these models:
1) K-Nearest Neighbors
2) Decision Tree
3) Random Forest
4) Logistic Regression
5) Support Vector Machine
6) Multi-Layer Perceptron
7) XGBoost

In [12]:
import os
from datetime import datetime

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, mean_squared_error, 
mean_absolute_error, r2_score, f1_score, precision_recall_fscore_support)

In [22]:
# Load the full feature table. A forward slash is used as the separator: it is
# accepted on every OS and avoids the invalid "\d" escape sequence that a
# backslash would introduce in a normal (non-raw) string literal.
data = pd.read_csv("data/data_full_features.csv")

In [23]:
# Preview the first rows of the loaded table to sanity-check columns and values.
data.head()

Unnamed: 0.1,Unnamed: 0,crime_type,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,street_name,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,0,Thefts,-75.232271,39.972757,2023,10,4,22,58,2,0,BLOCK N,1300,0.010203,82.0,60.2,69.9,0.0,0.0,5.4
1,1,DRIVING UNDER THE INFLUENCE,-75.178605,40.030809,2023,10,4,22,42,2,0,BLOCK W,200,0.00752,82.0,60.2,69.9,0.0,0.0,5.4
2,2,All Other Offenses,-75.226049,39.971074,2023,10,4,22,25,2,0,BLOCK N,600,0.014742,82.0,60.2,69.9,0.0,0.0,5.4
3,3,All Other Offenses,-75.121491,40.001767,2023,10,4,22,23,2,0,BLOCK ARBOR,3400,0.007706,82.0,60.2,69.9,0.0,0.0,5.4
4,4,Other Assaults,-75.143587,40.044345,2023,10,4,22,17,2,0,BLOCK N,5900,0.000326,82.0,60.2,69.9,0.0,0.0,5.4


In [24]:
# Columns routed to each preprocessing step. Columns that are not listed here
# are not passed to any transformer (ColumnTransformer's default is
# remainder="drop").
one_hot_columns = ["crime_type"]
scaled_columns = [
    "point_x", "point_y",
    "Year", "Month", "Day", "Hour", "Minute", "Weekday",
    "distance_to_nearest_police_station",
    "temperature_2m_max (°F)", "temperature_2m_min (°F)", "temperature_2m_mean (°F)",
    "precipitation_sum (mm)", "precipitation_hours (h)",
    "Unemployment Rate of a Population",
    "block_number",
]
ordinal_columns = ["street_name"]

# The output columns follow the order of the steps below:
# one-hot block, then the standardized numeric block, then the ordinal code.
data_pipeline = ColumnTransformer(
    [
        ("categorical", OneHotEncoder(), one_hot_columns),
        ("numerical", StandardScaler(), scaled_columns),
        ("ordinal", OrdinalEncoder(), ordinal_columns),
    ]
)

In [18]:
def train_model(model, data, targetStart, targetEnd, test_size=0.2, random_state=None):
    """Fit ``model`` on a random training split of ``data`` and return it.

    ``targetStart:targetEnd`` is a positional column slice into the *encoded*
    feature matrix. The selected columns become the target; every other
    column is used as a predictor, so the target never leaks into the inputs.

    Parameters
    ----------
    model : estimator
        Any object exposing ``fit(X, y)``.
    data : pandas.DataFrame or 2-D array-like
        A DataFrame is treated as the raw table: it is split first and the
        training rows are then encoded with the module-level
        ``data_pipeline`` (so encoders/scalers are fitted on training rows
        only). Any other input is treated as an already-encoded matrix and
        used as is.
    targetStart, targetEnd : int or None
        Bounds of the target column slice (negative values are allowed).
    test_size : float, default 0.2
        Fraction of rows held out. The held-out rows are not returned.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split`` for a reproducible split.

    Returns
    -------
    The same ``model`` object, fitted.

    Raises
    ------
    ValueError
        If the slice selects no column.
    """
    train_rows, _ = train_test_split(data, test_size=test_size, random_state=random_state)

    if isinstance(data, pd.DataFrame):
        matrix = data_pipeline.fit_transform(train_rows)
    else:
        matrix = train_rows

    # ColumnTransformer may hand back a scipy sparse matrix; densify so the
    # positional column selection below behaves the same for every input.
    if hasattr(matrix, "toarray"):
        matrix = matrix.toarray()
    matrix = np.asarray(matrix)

    column_ids = np.arange(matrix.shape[1])
    target_columns = column_ids[targetStart:targetEnd]
    if target_columns.size == 0:
        raise ValueError(
            f"target slice [{targetStart}:{targetEnd}] selects no column "
            f"of a matrix with {matrix.shape[1]} columns"
        )
    feature_columns = np.setdiff1d(column_ids, target_columns)

    model.fit(matrix[:, feature_columns], matrix[:, target_columns])
    return model

### Classifying Crime Type

In [19]:
from joblib import dump

# Estimators fitted through train_model with the target column slice 16:-1.
multiclass_classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
]

for model in multiclass_classifiers:
    # Echo the estimator so progress is visible while the loop runs.
    print(model)
    model = train_model(model, data, 16, -1)
    # Persist each fitted estimator; its string form is part of the file name.
    dump(model, f'models/classification{model!s}_crime_type.joblib')

KNeighborsClassifier()
DecisionTreeClassifier()
RandomForestClassifier()


KeyboardInterrupt: 

In [None]:
# Classifiers fitted on one integer class id per sample (the argmax of the
# one-hot crime-type block) instead of the one-hot matrix itself.
singleclass = [LogisticRegression(solver='lbfgs', max_iter=1000), SVC(), XGBClassifier()]

# NOTE(review): `transformed` is not defined in any visible cell; presumably it
# is the encoded feature matrix with the one-hot crime-type block in columns
# [16:-1] -- confirm the layout before running.
column_ids = np.arange(transformed.shape[1])
target_columns = column_ids[16:-1]
# Keep the one-hot label columns out of the predictors; otherwise the answer
# is handed to the model as an input.
feature_columns = np.setdiff1d(column_ids, target_columns)

class_ids = np.asarray(np.argmax(transformed[:, 16:-1], axis=1)).ravel()

# One seeded split shared by all models so that their results are comparable.
X_train, X_test, y_train, y_test = train_test_split(
    transformed[:, feature_columns], class_ids, test_size=0.2, random_state=42
)

for model in singleclass:
    model.fit(X_train, y_train)
    # Use the class name in the file name: the full repr of some estimators
    # (e.g. XGBoost) spans several lines and is not a usable file name.
    # These are classifiers, so the artifact is labelled "classification".
    dump(model, f'models/classification{type(model).__name__}_crime_type.joblib')

### Predicting Crime Location

In [None]:
regressors = [KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), MLPRegressor(), XGBRegressor()]

# NOTE(review): `transformed` is not defined in any visible cell; presumably it
# is the dense encoded feature matrix with the coordinates (point_x, point_y)
# in columns [:2] -- confirm the layout before running.
column_ids = np.arange(transformed.shape[1])
target_columns = column_ids[:2]
# The coordinates are the target, so they must not also be predictors.
feature_columns = np.setdiff1d(column_ids, target_columns)

# Regress on the coordinate values themselves. Taking np.argmax over these
# columns would reduce the target to a 0/1 column index, not a location.
# The target has two columns; XGBRegressor needs a version with multi-output
# support for this.
X_train, X_test, y_train, y_test = train_test_split(
    transformed[:, feature_columns], transformed[:, target_columns], test_size=0.2, random_state=42
)

output_dir = 'models/regression/crime_location'
os.makedirs(output_dir, exist_ok=True)  # joblib.dump does not create missing directories

for model in regressors:
    model.fit(X_train, y_train)
    # Class name instead of the full repr: the repr of some estimators
    # (e.g. XGBoost) spans several lines and is not a usable file name.
    dump(model, f'{output_dir}/{type(model).__name__}_crime_location.joblib')

### Predicting Crime Time

In [None]:
regressors = [KNeighborsRegressor(), DecisionTreeRegressor(), RandomForestRegressor(), LinearRegression(), MLPRegressor(), XGBRegressor()]

# NOTE(review): `transformed` is not defined in any visible cell; presumably it
# is the dense encoded feature matrix with the time features in columns
# [9:15] -- confirm the layout before running.
column_ids = np.arange(transformed.shape[1])
target_columns = column_ids[9:15]
# The time columns are the target, so they must not also be predictors.
feature_columns = np.setdiff1d(column_ids, target_columns)

# Regress on the time values themselves. Taking np.argmax over these columns
# would reduce the target to the index of the largest column, not a time.
# The target has several columns; XGBRegressor needs a version with
# multi-output support for this.
X_train, X_test, y_train, y_test = train_test_split(
    transformed[:, feature_columns], transformed[:, target_columns], test_size=0.2, random_state=42
)

output_dir = 'models/regression/crime_time'
os.makedirs(output_dir, exist_ok=True)  # joblib.dump does not create missing directories

for model in regressors:
    model.fit(X_train, y_train)
    # Class name instead of the full repr: the repr of some estimators
    # (e.g. XGBoost) spans several lines and is not a usable file name.
    dump(model, f'{output_dir}/{type(model).__name__}_crime_time.joblib')

### Evaluating Models

In [1]:
import os

# List whatever has been saved under models/ so far.
os.listdir('models/')


# Metric reporting sketch, left commented out: it needs y_test / y_pred, which
# no cell in this notebook produces yet.
# NOTE(review): the precision/recall line compares y_train with y_pred while
# the other lines use y_test -- presumably y_test was intended; confirm.
#     print("Accuracy:", accuracy_score(y_test, y_pred))
#     precision,recall,fscore,support = precision_recall_fscore_support(y_train, y_pred)
#     print("Precision:", precision)
#     print("Recall:", recall)
#     print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

[]

# Final Model Hyperparameter Tuning

After selecting the best model for each of the three tasks, we tune its hyperparameters to maximize performance.

In [None]:
# TODO: hyperparameter search with GridSearchCV
# TODO: cross-fold validation
# Reporting sketch to enable once predictions are available:
#     print("Classification Report:\n", classification_report(y_test, y_pred))
#     print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
# TODO: inspect random forest feature importances