In [22]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced


In [23]:
columns = ["CCN", "SHIFT", "OFFENSE", "METHOD", "VOTING_PRECINCT", "XBLOCK", "YBLOCK", "crimetype"]
target = ["crimetype"]

In [24]:
#Load data
file_path = Path('dc_crime_add_vars.csv')
dc_crime_df = pd.read_csv(file_path, skiprows=0)[:-2]

dc_crime_df = dc_crime_df.loc[:, columns].copy()
dc_crime_df = dc_crime_df.dropna(axis='columns', how='all')

#drop the null rows:
dc_crime_df = dc_crime_df.dropna()

dc_crime_df



Unnamed: 0,CCN,SHIFT,OFFENSE,METHOD,VOTING_PRECINCT,XBLOCK,YBLOCK,crimetype
0,8123749,EVENING,THEFT/OTHER,OTHERS,Precinct 6,-77.070209,38.913633,Non-Violent
1,8123824,MIDNIGHT,MOTOR VEHICLE THEFT,OTHERS,Precinct 14,-77.045323,38.908564,Non-Violent
2,8123835,MIDNIGHT,THEFT/OTHER,OTHERS,Precinct 129,-77.027045,38.899057,Non-Violent
3,8127848,DAY,THEFT/OTHER,OTHERS,Precinct 15,-77.040089,38.909638,Non-Violent
4,8120153,EVENING,MOTOR VEHICLE THEFT,OTHERS,Precinct 143,-77.019377,38.894573,Non-Violent
...,...,...,...,...,...,...,...,...
342860,8004375,DAY,THEFT/OTHER,OTHERS,Precinct 40,-77.038870,38.931950,Non-Violent
342861,8004384,EVENING,THEFT/OTHER,OTHERS,Precinct 25,-77.044170,38.920120,Non-Violent
342862,8037546,DAY,MOTOR VEHICLE THEFT,OTHERS,Precinct 37,-77.020640,38.919610,Non-Violent
342863,8037554,EVENING,THEFT/OTHER,OTHERS,Precinct 40,-77.036470,38.932120,Non-Violent


In [25]:
# get data types
result = dc_crime_df.dtypes
result

CCN                  int64
SHIFT               object
OFFENSE             object
METHOD              object
VOTING_PRECINCT     object
XBLOCK             float64
YBLOCK             float64
crimetype           object
dtype: object

In [26]:
# create features
X = pd.get_dummies(dc_crime_df, columns=['SHIFT', 'OFFENSE', 'METHOD', 'VOTING_PRECINCT']).drop('crimetype', axis=1)

# create target
y = dc_crime_df['crimetype']
X.head()

Unnamed: 0,CCN,XBLOCK,YBLOCK,SHIFT_DAY,SHIFT_EVENING,SHIFT_MIDNIGHT,OFFENSE_ARSON,OFFENSE_ASSAULT W/DANGEROUS WEAPON,OFFENSE_BURGLARY,OFFENSE_HOMICIDE,...,VOTING_PRECINCT_Precinct 90,VOTING_PRECINCT_Precinct 91,VOTING_PRECINCT_Precinct 92,VOTING_PRECINCT_Precinct 93,VOTING_PRECINCT_Precinct 94,VOTING_PRECINCT_Precinct 95,VOTING_PRECINCT_Precinct 96,VOTING_PRECINCT_Precinct 97,VOTING_PRECINCT_Precinct 98,VOTING_PRECINCT_Precinct 99
0,8123749,-77.070209,38.913633,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8123824,-77.045323,38.908564,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8123835,-77.027045,38.899057,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8127848,-77.040089,38.909638,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8120153,-77.019377,38.894573,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
X.describe()

Unnamed: 0,CCN,XBLOCK,YBLOCK,SHIFT_DAY,SHIFT_EVENING,SHIFT_MIDNIGHT,OFFENSE_ARSON,OFFENSE_ASSAULT W/DANGEROUS WEAPON,OFFENSE_BURGLARY,OFFENSE_HOMICIDE,...,VOTING_PRECINCT_Precinct 90,VOTING_PRECINCT_Precinct 91,VOTING_PRECINCT_Precinct 92,VOTING_PRECINCT_Precinct 93,VOTING_PRECINCT_Precinct 94,VOTING_PRECINCT_Precinct 95,VOTING_PRECINCT_Precinct 96,VOTING_PRECINCT_Precinct 97,VOTING_PRECINCT_Precinct 98,VOTING_PRECINCT_Precinct 99
count,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,...,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0,342781.0
mean,12653050.0,-77.008137,38.905972,0.384733,0.424478,0.190789,0.000951,0.068356,0.091289,0.0036,...,0.008352,0.013487,0.004942,0.00552,0.005076,0.006074,0.005082,0.004919,0.005059,0.005928
std,2845898.0,0.036328,0.031598,0.486533,0.494264,0.392924,0.030824,0.252355,0.28802,0.059892,...,0.091008,0.115347,0.070125,0.074089,0.071066,0.077698,0.071107,0.06996,0.070944,0.076765
min,100060.0,-77.113642,38.81347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10124920.0,-77.03238,38.890958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,13030810.0,-77.013111,38.90643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,15102670.0,-76.98552,38.92532,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,99438990.0,-76.91001,38.994901,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
# check balance of target values
y.value_counts()

Non-Violent    280457
Violent         62324
Name: crimetype, dtype: int64

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(Counter(y_train))
print(Counter(y_test))

Counter({'Non-Violent': 210501, 'Violent': 46584})
Counter({'Non-Violent': 69956, 'Violent': 15740})


In [35]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brf.fit(X_train, y_train)
y_pred = brf.predict(X_test)

In [36]:
#Calculate balanced accuracy score
balanced_accuracy_score(y_test,y_pred)

0.9999785579507119

In [37]:
#confusion matrix
y_pred = brf.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[69953,     3],
       [    0, 15740]], dtype=int64)

In [38]:
# Print the imbalanced classification report


print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

Non-Violent       1.00      1.00      1.00      1.00      1.00      1.00     69956
    Violent       1.00      1.00      1.00      1.00      1.00      1.00     15740

avg / total       1.00      1.00      1.00      1.00      1.00      1.00     85696



In [39]:
importances = brf.feature_importances_
sorted(zip(brf.feature_importances_, X.columns), reverse=True)

[(0.2732158377421571, 'OFFENSE_ROBBERY'),
 (0.14533721372935046, 'OFFENSE_ASSAULT W/DANGEROUS WEAPON'),
 (0.12718571616838176, 'OFFENSE_THEFT/OTHER'),
 (0.10967688182010973, 'METHOD_OTHERS'),
 (0.08521495702057029, 'OFFENSE_THEFT F/AUTO'),
 (0.058939846632250235, 'METHOD_GUN'),
 (0.040489474797855, 'OFFENSE_MOTOR VEHICLE THEFT'),
 (0.03990786790537317, 'OFFENSE_BURGLARY'),
 (0.028512073806671832, 'METHOD_KNIFE'),
 (0.026824831569189894, 'OFFENSE_SEX ABUSE'),
 (0.015547780789905387, 'SHIFT_MIDNIGHT'),
 (0.01166818085536839, 'XBLOCK'),
 (0.00904402894133255, 'SHIFT_DAY'),
 (0.007309994437039723, 'YBLOCK'),
 (0.00654061565481621, 'CCN'),
 (0.00323524538551977, 'OFFENSE_HOMICIDE'),
 (0.0023650203635963807, 'SHIFT_EVENING'),
 (0.0007901122177962098, 'VOTING_PRECINCT_Precinct 129'),
 (0.0004484323829337516, 'OFFENSE_ARSON'),
 (0.00044188259432517934, 'VOTING_PRECINCT_Precinct 6'),
 (0.0003956632632491246, 'VOTING_PRECINCT_Precinct 17'),
 (0.00030721111287123335, 'VOTING_PRECINCT_Precinct 31'