In [1]:
from google.colab import drive

# Attach the user's Google Drive to the Colab VM's filesystem at this location.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import pandas as pd
from google.colab import userdata

# The CSV location is looked up under the 'dataPath' key via Colab's `userdata`
# store, so the actual path is never written into the notebook.
csv_path = userdata.get('dataPath')
data = pd.read_csv(csv_path)

In [3]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from datetime import datetime
import matplotlib.pyplot as plt

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier, XGBRegressor

from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, mean_squared_error,
mean_absolute_error, r2_score, f1_score, precision_recall_fscore_support, roc_auc_score)
from sklearn.model_selection import GridSearchCV
import joblib
from joblib import dump

In [4]:
# Remove the "Unnamed: 0" column (presumably a row index that was saved into the
# CSV -- confirm against the export step).
# Re-binding the result instead of using `inplace=True`, together with
# `errors="ignore"`, keeps this cell idempotent: running it a second time, after
# the column is already gone, no longer raises a KeyError.
data = data.drop(columns="Unnamed: 0", errors="ignore")
data.head()

Unnamed: 0,crime_type,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,street_name,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,Robbery Firearm,-75.07261,40.041574,2023,10,13,23,54,4,0,BLOCK LARGE,6600,0.017699,65.5,51.5,57.2,0.0,0.0,5.4
1,Thefts,-75.108462,39.994303,2023,10,13,23,27,4,0,BLOCK JOYCE,3200,0.028199,65.5,51.5,57.2,0.0,0.0,5.4
2,Thefts,-75.136074,39.99189,2023,10,13,23,24,4,0,BLOCK N,2700,0.029554,65.5,51.5,57.2,0.0,0.0,5.4
3,Robbery Firearm,-75.107092,40.032184,2023,10,13,23,19,4,0,BLOCK ADAMS,700,0.038528,65.5,51.5,57.2,0.0,0.0,5.4
4,Vandalism/Criminal Mischief,-75.234274,39.925604,2023,10,13,23,12,4,0,BLOCK W,6400,0.000292,65.5,51.5,57.2,0.0,0.0,5.4


In [5]:
# Largest prior: the fraction of rows that belong to the most frequent crime type,
# i.e. the accuracy obtained by always predicting the majority class.
# `value_counts` sorts in descending order, so the first entry is the largest class.
# `.iloc[0]` selects it explicitly by position; plain `[0]` on this label-indexed
# Series only worked through pandas' deprecated integer-position fallback.
data["crime_type"].value_counts(normalize=True).iloc[0]

0.20655217949451754

In [6]:
# Columns handed to the model unchanged (weekend flag and calendar/time fields).
passthrough_columns = ['IsWeekend', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Weekday']

# Columns that are standardised with StandardScaler.
scaled_columns = [
    "point_x",
    "point_y",
    "block_number",
    "distance_to_nearest_police_station",
    "temperature_2m_max (°F)",
    "temperature_2m_min (°F)",
    "temperature_2m_mean (°F)",
    "precipitation_sum (mm)",
    "precipitation_hours (h)",
    "Unemployment Rate of a Population",
]

# Output column order follows the transformer order: passthrough columns first,
# then the scaled ones. Columns not listed in either group are not part of the
# transformer lists.
feature_data_pipeline = ColumnTransformer(
    transformers=[
        ('passthrough', 'passthrough', passthrough_columns),
        ("numerical", StandardScaler(), scaled_columns),
    ]
)

In [7]:
from imblearn.over_sampling import SMOTE

# Features: every column except the target ('crime_type') and the string-valued
# 'street_name' column. `train_test_split` comes from the notebook's import cell.
X = data.drop(['crime_type', 'street_name'], axis=1)
# Target: the crime category to predict.
y = data['crime_type']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Oversample with SMOTE on the training split only, so the held-out rows stay
# untouched real data. SMOTE draws random synthetic samples; seeding it makes the
# resampled training set identical across Restart & Run All.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [8]:
# Display the resampled training features as this cell's output for a quick visual check.
X_train_smote

Unnamed: 0,point_x,point_y,Year,Month,Day,Hour,Minute,Weekday,IsWeekend,block_number,distance_to_nearest_police_station,temperature_2m_max (°F),temperature_2m_min (°F),temperature_2m_mean (°F),precipitation_sum (mm),precipitation_hours (h),Unemployment Rate of a Population
0,-75.243824,39.916949,2019,6,13,17,5,3,0,7200,0.018497,71.000000,60.200000,64.200000,25.200000,13.000000,5.700000
1,-75.041910,40.080531,2022,11,30,12,56,2,0,9200,0.020388,56.400000,37.500000,46.700000,16.100000,10.000000,5.000000
2,-75.210140,39.969813,2011,1,11,8,18,1,0,4300,0.017745,33.500000,22.300000,27.300000,4.600000,6.000000,10.600000
3,-75.079761,40.042664,2022,10,17,16,10,0,0,1200,0.025939,67.300000,54.900000,59.700000,2.900000,10.000000,5.000000
4,-75.152438,40.007778,2016,7,19,11,39,1,0,3600,0.015529,87.800000,73.400000,80.200000,0.000000,0.000000,7.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9052031,-75.207890,39.996530,2010,6,26,1,38,1,0,200,0.018828,84.386528,67.854874,75.969223,0.000000,0.000000,6.268784
9052032,-75.130112,39.992425,2006,11,27,22,40,4,0,2800,0.018930,50.749295,32.726678,40.797349,0.000000,0.000000,6.430570
9052033,-75.210656,39.955109,2018,3,29,18,25,4,0,1300,0.019027,35.838358,27.700000,31.708524,0.000000,0.000000,6.229834
9052034,-75.225113,39.969085,2016,5,7,20,18,5,1,5100,0.017484,58.560243,47.361349,52.735582,1.001963,3.722698,6.487116


In [9]:
# Crime classification: convert features and labels into model-ready arrays.

# Features: the column transformer is fitted on the resampled training features,
# then the already-fitted transformer is applied to the test features.
X_train_encoded_smoted = feature_data_pipeline.fit_transform(X_train_smote)
X_test_encoded = feature_data_pipeline.transform(X_test)

# Labels: map each crime type to an integer id. The encoder is fitted on the
# resampled training labels and reused as-is for the test labels.
crimetype_encoder = LabelEncoder()
y_train_encoded_smoted = crimetype_encoder.fit_transform(y_train_smote)
y_test_encoded = crimetype_encoder.transform(y_test)

In [10]:
# Shapes of the four arrays, in (train X, train y, test X, test y) order.
tuple(
    array.shape
    for array in (X_train_encoded_smoted, y_train_encoded_smoted, X_test_encoded, y_test_encoded)
)

((9052036, 17), (9052036,), (391357, 17), (391357,))

In [11]:
# Largest encoded label. LabelEncoder numbers classes from 0, so a maximum of 27
# means 28 classes. `ndarray.max()` is vectorised; the builtin `max()` would walk
# the ~9M-element array one Python object at a time.
y_train_encoded_smoted.max()

27

In [12]:
import tensorflow as tf
from tensorflow import keras

# Number of target classes, read from the fitted label encoder instead of being
# hard-coded, so the one-hot width always matches the labels actually present.
num_classes = len(crimetype_encoder.classes_)

# One-hot encode the integer labels for use with a categorical cross-entropy loss.
# Only the training labels were oversampled by SMOTE; the test labels are not,
# despite the "_smoted" suffix, which is kept because later cells use this name.
y_train_one_hot_encoded_smoted = keras.utils.to_categorical(y_train_encoded_smoted, num_classes=num_classes)
y_test_one_hot_encoded_smoted = keras.utils.to_categorical(y_test_encoded, num_classes=num_classes)

In [15]:
from keras import layers
from keras.models import Sequential

# Take the layer sizes from the prepared arrays rather than hard-coding them, so the
# network cannot silently drift out of sync with the feature pipeline or label set.
n_features = X_train_encoded_smoted.shape[1]        # width of the transformed feature matrix
n_classes = y_train_one_hot_encoded_smoted.shape[1]  # width of the one-hot targets

# Define the model architecture: two hidden blocks of Dense(64, relu) ->
# BatchNormalization -> Dropout(0.5), followed by a softmax output layer.
model = Sequential([
  layers.Dense(64, activation='relu', input_shape=(n_features,)),  # first hidden layer; also declares the input width
  layers.BatchNormalization(),
  layers.Dropout(0.5),
  layers.Dense(64, activation='relu'),
  layers.BatchNormalization(),
  layers.Dropout(0.5),
  layers.Dense(n_classes, activation='softmax')  # output layer: one probability per class
])

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # matches the one-hot encoded targets
              metrics=['accuracy'])

# Model summary
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 64)                1152      
                                                                 
 batch_normalization_2 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_2 (Dropout)         (None, 64)                0         
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 batch_normalization_3 (Bat  (None, 64)                256       
 chNormalization)                                                
                                                                 
 dropout_3 (Dropout)         (None, 64)               

In [16]:
# Training configuration.
# The resampled training set has ~9.05M rows. With the previous batch size of 32 that
# is ~283k optimizer steps per epoch; the run was interrupted by hand and `history`
# was never assigned. 1024 rows per batch brings it down to ~8.8k steps per epoch.
BATCH_SIZE = 1024
EPOCHS = 10

# NOTE(review): validation_data is the held-out test split, so the validation curves
# describe test performance; do not tune hyper-parameters against them.
history = model.fit(
    X_train_encoded_smoted,
    y_train_one_hot_encoded_smoted,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_test_encoded, y_test_one_hot_encoded_smoted),
)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [17]:
# Show the per-epoch metrics recorded during training as a table (one row per epoch,
# one column per metric). Printing the History object itself only shows its repr.
# Requires the training cell to have run to completion so that `history` exists.
pd.DataFrame(history.history).rename_axis("epoch")

NameError: name 'history' is not defined

In [None]:
import matplotlib.pyplot as plt

# `history` is the object returned by model.fit(); its `.history` dict is indexed
# below by metric name to get the training and validation curves.

# One entry per panel: (training key, validation key, panel title, y-axis label).
panels = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
]

fig = plt.figure(figsize=(14, 5))

# Draw the accuracy panel on the left and the loss panel on the right.
for position, (train_key, val_key, title, y_label) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, position)
    ax.plot(history.history[train_key], label='Train')
    ax.plot(history.history[val_key], label='Validation')
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(loc='upper left')

fig.tight_layout()
plt.show()