In [1]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Your runtime has 17.1 gigabytes of available RAM



In [2]:
!nvidia-smi

Thu Mar 28 13:19:39 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.23                 Driver Version: 536.23       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1080      WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   37C    P8               9W / 200W |    432MiB /  8192MiB |      2%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# Machine Learning Model Experiments

Based on the features:
1. Classify what type of crime it is
2. Regression prediction on where the crime took place
3. Regression prediction on when the crime took place

Try out these models:
1) K Nearest Neighbors
2) Decision Tree
3) Random Forest
4) Logistic Regression
5) Support Vector Machine
6) Multi Layer Perceptron
7) XGBoost

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, mean_squared_error,
mean_absolute_error, r2_score, f1_score, precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import GridSearchCV
import joblib
from joblib import dump

In [4]:
data = pd.read_csv("data_full_features.csv")

In [5]:
data.drop("Unnamed: 0", axis=1, inplace=True)
data.head()

Unnamed: 0,crime_type,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,street_name,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,Robbery Firearm,-75.07261,40.041574,2023,10,13,23,54,4,0,BLOCK LARGE,6600,0.017699,65.5,51.5,57.2,0.0,0.0,5.4
1,Thefts,-75.108462,39.994303,2023,10,13,23,27,4,0,BLOCK JOYCE,3200,0.028199,65.5,51.5,57.2,0.0,0.0,5.4
2,Thefts,-75.136074,39.99189,2023,10,13,23,24,4,0,BLOCK N,2700,0.029554,65.5,51.5,57.2,0.0,0.0,5.4
3,Robbery Firearm,-75.107092,40.032184,2023,10,13,23,19,4,0,BLOCK ADAMS,700,0.038528,65.5,51.5,57.2,0.0,0.0,5.4
4,Vandalism/Criminal Mischief,-75.234274,39.925604,2023,10,13,23,12,4,0,BLOCK W,6400,0.000292,65.5,51.5,57.2,0.0,0.0,5.4


In [6]:
# Largest prior
data["crime_type"].value_counts()[0]/data["crime_type"].value_counts().sum()

0.20655217949451754

In [7]:
feature_data_pipeline = ColumnTransformer([
    ('passthrough', 'passthrough', ['IsWeekend', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Weekday']),
    ("numerical", StandardScaler(), ["point_x", "point_y",
        "block_number", "distance_to_nearest_police_station",
        "temperature_2m_max (°F)", "temperature_2m_min (°F)",
        "temperature_2m_mean (°F)", "precipitation_sum (mm)",
        "precipitation_hours (h)", "Unemployment Rate of a Population"])
])

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Using SMOTE; We have to adjust target column
# Assuming 'data' is your DataFrame and 'target' is the column with class labels; what is the target column for our DataFrame
X = data.drop(['crime_type', 'street_name'], axis=1)  # Replace 'target' with the name of your target column
y = data['crime_type']  # Replace 'target' with the name of your target column

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying SMOTE
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Model Training

### Classifying Crime Type

In [9]:
#Crime Classification:
crimetype_encoder = LabelEncoder()
y_train_encoded_smoted = crimetype_encoder.fit_transform(y_train_smote)
y_test_encoded = crimetype_encoder.transform(y_test)

X_train_encoded_smoted = feature_data_pipeline.fit_transform(X_train_smote)
X_test_encoded = feature_data_pipeline.transform(X_test)

In [10]:
X_train_encoded_smoted.shape, y_train_encoded_smoted.shape, X_test_encoded.shape, y_test_encoded.shape

((9052036, 17), (9052036,), (391357, 17), (391357,))

In [11]:
X_train_smote

Unnamed: 0,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,-75.243824,39.916949,2019,6,13,17,5,3,0,7200,0.018497,71.000000,60.200000,64.200000,25.200000,13.000000,5.700000
1,-75.041910,40.080531,2022,11,30,12,56,2,0,9200,0.020388,56.400000,37.500000,46.700000,16.100000,10.000000,5.000000
2,-75.210140,39.969813,2011,1,11,8,18,1,0,4300,0.017745,33.500000,22.300000,27.300000,4.600000,6.000000,10.600000
3,-75.079761,40.042664,2022,10,17,16,10,0,0,1200,0.025939,67.300000,54.900000,59.700000,2.900000,10.000000,5.000000
4,-75.152438,40.007778,2016,7,19,11,39,1,0,3600,0.015529,87.800000,73.400000,80.200000,0.000000,0.000000,7.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9052031,-75.072586,40.007133,2018,1,26,5,14,0,0,6500,0.025251,35.289818,25.435890,30.837424,0.000000,0.000000,6.094049
9052032,-75.115686,39.999963,2012,3,7,19,33,2,0,3400,0.015315,60.565999,39.550102,47.913281,1.579076,1.973845,6.955333
9052033,-75.225634,39.893700,2016,6,15,14,7,3,0,8000,0.043467,75.607596,64.648996,68.852631,6.357298,6.096039,6.783776
9052034,-75.099273,40.031859,2020,9,26,20,4,5,1,6700,0.016317,56.317303,40.984195,49.456539,0.000000,0.000000,12.344824


In [None]:
from joblib import dump

classifiers = [RandomForestClassifier(n_estimators=10, verbose=2, n_jobs=-1), MultinomialNB(), DecisionTreeClassifier(), KNeighborsClassifier(), LogisticRegression(), SVC(), MLPClassifier(), XGBClassifier()]

classification_results = []

for model in classifiers:
    try:
        print(model)
        model.fit(X_train_encoded_smoted, y_train_encoded_smoted)
        model_name = str(model).split("(")[0]
        dump(model, f'models/classification/{model_name}_crime_type.joblib')
        y_pred = model.predict(X_test_encoded)
        accuracy = accuracy_score(y_test_encoded, y_pred)
        precision, recall, fscore, support = precision_recall_fscore_support(y_test_encoded, y_pred)
        f1score = f1_score(y_test_encoded, y_pred, average='weighted')

        print("Accuracy:", accuracy)
        print("Precision:", precision)
        print("Recall:", recall)
        print("F1 Score:", f1score)

        classification_results.append([model_name, accuracy, precision, recall, f1score])
    except Exception as e:
        print(f"Error: {e} on model {str(model)}")
        continue

with open('classification_results.txt', 'w') as file:
    for result in classification_results:
        file.write(f'Model: {result[0]}, Accuracy: {result[1]}, Precision: {result[2]}, Recall: {result[3]}, F1 Score: {result[4]}\n')


RandomForestClassifier(n_estimators=10, n_jobs=-1, verbose=2)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.


building tree 1 of 10building tree 2 of 10
building tree 3 of 10
building tree 4 of 10building tree 5 of 10

building tree 6 of 10

building tree 7 of 10
building tree 8 of 10
building tree 9 of 10building tree 10 of 10



[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:  2.9min remaining:  1.2min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  4.7min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   7 out of  10 | elapsed:    4.3s remaining:    1.8s
[Parallel(n_jobs=8)]: Done  10 out of  10 | elapsed:    4.7s finished


Accuracy: 0.22278124576793057
Precision: [0.09416597 0.0999856  0.01686122 0.11212631 0.11586225 0.14768461
 0.2025639  0.02940294 0.02004076 0.19618821 0.18298969 0.81538462
 0.22795276 0.15081261 0.36189484 0.17955326 0.06843048 0.44283088
 0.064231   0.03494495 0.04766444 0.07054288 0.05198648 0.25712935
 0.47252747 0.26339969 0.23033954 0.10255921]
Recall: [0.20823272 0.14735055 0.04615385 0.21889269 0.14178338 0.26642036
 0.34725635 0.07555898 0.04678826 0.20341685 0.34299517 0.97695853
 0.52350814 0.19389414 0.35403995 0.30690162 0.0847082  0.7625831
 0.1297619  0.04376499 0.07507508 0.07198299 0.04205488 0.18152421
 0.35190565 0.45502646 0.12149234 0.07919186]
F1 Score: 0.22964055991971016
MultinomialNB()
Error: Negative values in data passed to MultinomialNB (input X) on model MultinomialNB()
DecisionTreeClassifier()
Accuracy: 0.1857792245954461
Precision: [0.09497166 0.09730978 0.01200229 0.09625809 0.10588592 0.12443161
 0.17774002 0.02260605 0.01231587 0.18024935 0.10950081 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy: 0.050685180027442975
Precision: [0.00478469 0.05555556 0.0123353  0.03914943 0.08135169 0.06048252
 0.02433425 0.00731806 0.00481195 0.14829659 0.00140273 0.29577465
 0.00770066 0.10464571 0.16934242 0.00814694 0.01737194 0.02633912
 0.00495848 0.07142857 0.00400013 0.01798879 0.         0.16121212
 0.25801258 0.00670466 0.18707811 0.02883625]
Recall: [1.14982178e-04 5.30419562e-05 5.20710059e-02 1.63670091e-01
 2.66973344e-03 1.47680624e-01 7.23450723e-03 8.32690825e-02
 3.01348136e-02 2.22576473e-03 1.88405797e-01 1.00000000e+00
 4.04159132e-01 1.34280725e-02 4.75470067e-02 8.07635830e-02
 9.60354592e-03 3.85565052e-01 1.94047619e-01 2.99760192e-04
 1.80180180e-01 1.29718235e-02 0.00000000e+00 3.32964763e-02
 1.06983472e-01 1.58730159e-01 3.09898653e-02 4.12918449e-03]
F1 Score: 0.05909054871679943
SVC()


In [None]:
with open('classification_results.txt', 'w') as file:
    for result in classification_results:
        file.write(f'Model: {result[0]}, Accuracy: {result[1]}, Precision: {result[2]}, Recall: {result[3]}, F1 Score: {result[4]}\n')


Weighted Precision= ∑(Number of true instances per class × Precision of each class)/Total number of instances

Weighted Recall = ∑(Number of true instances per class×Recall of each class)/ Total number of instances

Micro Precision=Micro Recall= (∑(TP of all classes)+∑(FP of all classes))/∑(TP of all classes)

In [None]:
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)