In [81]:
# Imports 
import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering

In [82]:
columns = ["CCN", "SHIFT", "OFFENSE", "METHOD", "NEIGHBORHOOD_CLUSTER", "PSA"]

In [83]:
#Load data
file_path = Path('Crime_Incidents_in_2021 (1).csv')
dc_crime_df = pd.read_csv(file_path, skiprows=0)[:-2]

dc_crime_df = dc_crime_df.loc[:, columns].copy()
dc_crime_df = dc_crime_df.dropna(axis='columns', how='all')

#drop the null rows:
dc_crime_df = dc_crime_df.dropna()

dc_crime_df

Unnamed: 0,CCN,SHIFT,OFFENSE,METHOD,NEIGHBORHOOD_CLUSTER,PSA
0,20139232,MIDNIGHT,HOMICIDE,GUN,Cluster 18,407.0
1,11048862,EVENING,THEFT F/AUTO,OTHERS,Cluster 15,204.0
2,10008883,EVENING,THEFT/OTHER,OTHERS,Cluster 33,604.0
3,11124202,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0
4,12113231,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0
...,...,...,...,...,...,...
28314,21186813,DAY,ASSAULT W/DANGEROUS WEAPON,KNIFE,Cluster 23,507.0
28315,21186822,DAY,THEFT F/AUTO,OTHERS,Cluster 17,401.0
28316,21186824,DAY,THEFT/OTHER,OTHERS,Cluster 39,705.0
28317,21186834,DAY,THEFT/OTHER,OTHERS,Cluster 8,209.0


In [84]:
for column in dc_crime_df.columns:
    print(f"Column {column} has {dc_crime_df[column].isnull().sum()} null values")

Column CCN has 0 null values
Column SHIFT has 0 null values
Column OFFENSE has 0 null values
Column METHOD has 0 null values
Column NEIGHBORHOOD_CLUSTER has 0 null values
Column PSA has 0 null values


In [85]:
dc_crime_df.set_index("CCN", inplace=True)
dc_crime_df.index.name=None
dc_crime_df

Unnamed: 0,SHIFT,OFFENSE,METHOD,NEIGHBORHOOD_CLUSTER,PSA
20139232,MIDNIGHT,HOMICIDE,GUN,Cluster 18,407.0
11048862,EVENING,THEFT F/AUTO,OTHERS,Cluster 15,204.0
10008883,EVENING,THEFT/OTHER,OTHERS,Cluster 33,604.0
11124202,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0
12113231,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0
...,...,...,...,...,...
21186813,DAY,ASSAULT W/DANGEROUS WEAPON,KNIFE,Cluster 23,507.0
21186822,DAY,THEFT F/AUTO,OTHERS,Cluster 17,401.0
21186824,DAY,THEFT/OTHER,OTHERS,Cluster 39,705.0
21186834,DAY,THEFT/OTHER,OTHERS,Cluster 8,209.0


In [86]:
#Create new data fram that holds only the PSA
#method_df = pd.DataFrame(dc_crime_df.METHOD)
#method_df

In [87]:
# Categorize data from method column into violent or non-violent
def categorise (row):
    if row["METHOD"]=='GUN':
        return 'VIOLENT'
    elif row["METHOD"]=='KNIFE':
        return "VIOLENT"
    elif row["METHOD"]=='OTHERS':
        return "NON-VIOLENT"
    return "NON-VIOLENT"

In [88]:
# Create new colum with crimetype - violent or non-violent
dc_crime_df["crimetype"] = dc_crime_df.apply(lambda row: categorise (row), axis=1)
dc_crime_df

Unnamed: 0,SHIFT,OFFENSE,METHOD,NEIGHBORHOOD_CLUSTER,PSA,crimetype
20139232,MIDNIGHT,HOMICIDE,GUN,Cluster 18,407.0,VIOLENT
11048862,EVENING,THEFT F/AUTO,OTHERS,Cluster 15,204.0,NON-VIOLENT
10008883,EVENING,THEFT/OTHER,OTHERS,Cluster 33,604.0,NON-VIOLENT
11124202,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0,NON-VIOLENT
12113231,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0,NON-VIOLENT
...,...,...,...,...,...,...
21186813,DAY,ASSAULT W/DANGEROUS WEAPON,KNIFE,Cluster 23,507.0,VIOLENT
21186822,DAY,THEFT F/AUTO,OTHERS,Cluster 17,401.0,NON-VIOLENT
21186824,DAY,THEFT/OTHER,OTHERS,Cluster 39,705.0,NON-VIOLENT
21186834,DAY,THEFT/OTHER,OTHERS,Cluster 8,209.0,NON-VIOLENT


In [89]:
result = dc_crime_df.dtypes
result

SHIFT                    object
OFFENSE                  object
METHOD                   object
NEIGHBORHOOD_CLUSTER     object
PSA                     float64
crimetype                object
dtype: object

In [90]:
#Use get_dummies() to create variables for test features. 
X=pd.get_dummies(dc_crime_df, columns=['SHIFT', 'OFFENSE', 'METHOD', 'NEIGHBORHOOD_CLUSTER', 'crimetype'])
X.head()

Unnamed: 0,PSA,SHIFT_DAY,SHIFT_EVENING,SHIFT_MIDNIGHT,OFFENSE_ARSON,OFFENSE_ASSAULT W/DANGEROUS WEAPON,OFFENSE_BURGLARY,OFFENSE_HOMICIDE,OFFENSE_MOTOR VEHICLE THEFT,OFFENSE_ROBBERY,...,NEIGHBORHOOD_CLUSTER_Cluster 44,NEIGHBORHOOD_CLUSTER_Cluster 45,NEIGHBORHOOD_CLUSTER_Cluster 46,NEIGHBORHOOD_CLUSTER_Cluster 5,NEIGHBORHOOD_CLUSTER_Cluster 6,NEIGHBORHOOD_CLUSTER_Cluster 7,NEIGHBORHOOD_CLUSTER_Cluster 8,NEIGHBORHOOD_CLUSTER_Cluster 9,crimetype_NON-VIOLENT,crimetype_VIOLENT
20139232,407.0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
11048862,204.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
10008883,604.0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
11124202,506.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
12113231,506.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [91]:
#Standardize the data with StandardScaler()
crime_X_scaled = StandardScaler().fit_transform(X)
crime_X_scaled

array([[ 0.19726806, -0.80005246, -0.86549581, ..., -0.12906912,
        -2.99255978,  2.99255978],
       [-0.90792779, -0.80005246,  1.1554071 , ..., -0.12906912,
         0.33416208, -0.33416208],
       [ 1.26979803, -0.80005246,  1.1554071 , ..., -0.12906912,
         0.33416208, -0.33416208],
       ...,
       [ 1.81967381,  1.24991803, -0.86549581, ..., -0.12906912,
         0.33416208, -0.33416208],
       [-0.88070622,  1.24991803, -0.86549581, ..., -0.12906912,
         0.33416208, -0.33416208],
       [ 1.80878518,  1.24991803, -0.86549581, ..., -0.12906912,
         0.33416208, -0.33416208]])

In [109]:
# Using PCA to reduce dimension to three principal components
pca = PCA(n_components=3)
crime_pca = pca.fit_transform(crime_X_scaled)
crime_pca

array([[ 6.95936001,  0.2462992 ,  0.90635139],
       [-1.09435348,  0.95222855, -0.1542948 ],
       [-0.37735119, -0.36516656, -2.31031616],
       ...,
       [-0.20157879, -2.13545126, -0.41277847],
       [-1.26906919,  1.24068444,  1.36383051],
       [-0.17517628, -3.09635832, -1.14634645]])

In [110]:
#Create a DataFrame with the three principal components
pca_df = pd.DataFrame(data=crime_pca, columns=["PC1", "PC2", "PC3"], index=dc_crime_df.index)
pca_df

Unnamed: 0,PC1,PC2,PC3
20139232,6.959360,0.246299,0.906351
11048862,-1.094353,0.952229,-0.154295
10008883,-0.377351,-0.365167,-2.310316
11124202,-0.784391,-1.076054,0.385156
12113231,-0.784391,-1.076054,0.385156
...,...,...,...
21186813,6.296595,0.049071,1.791659
21186822,-0.930533,-1.461034,1.640248
21186824,-0.201579,-2.135451,-0.412778
21186834,-1.269069,1.240684,1.363831


In [111]:
# Create an elbow curve to find the best value for K.
# YOUR CODE HERE
from sklearn.cluster import KMeans
inertia = []
k = list(range(1, 11))

# Calculate the inertia for the range of K values
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(pca_df)
    inertia.append(km.inertia_)

# Create the elbow curve
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [112]:
#initialize the K-Means model. 
model = KMeans(n_clusters=3, random_state=1)
# Fit the model
# YOUR CODE HERE
model.fit(X)

# Predict clusters

predictions = model.predict(X)
predictions

array([0, 2, 1, ..., 1, 2, 1])

In [113]:
# Create a new DataFrame including predicted clusters and crime features
# Concatenate the dc_crime_df and pca_df DataFrames on the same columns. 
clustered_crime_df = pd.concat([dc_crime_df, pca_df], axis=1, join='inner')

# Add a new column, 'Class' to the clustered_crime_df DataFrame that holds the predictions. 
clustered_crime_df["Class"] = model.labels_

#Print the shape of the clustered_crime_df
print(clustered_crime_df.shape)
clustered_crime_df.head()

(28134, 10)


Unnamed: 0,SHIFT,OFFENSE,METHOD,NEIGHBORHOOD_CLUSTER,PSA,crimetype,PC1,PC2,PC3,Class
20139232,MIDNIGHT,HOMICIDE,GUN,Cluster 18,407.0,VIOLENT,6.95936,0.246299,0.906351,0
11048862,EVENING,THEFT F/AUTO,OTHERS,Cluster 15,204.0,NON-VIOLENT,-1.094353,0.952229,-0.154295,2
10008883,EVENING,THEFT/OTHER,OTHERS,Cluster 33,604.0,NON-VIOLENT,-0.377351,-0.365167,-2.310316,1
11124202,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0,NON-VIOLENT,-0.784391,-1.076054,0.385156,1
12113231,DAY,THEFT/OTHER,OTHERS,Cluster 23,506.0,NON-VIOLENT,-0.784391,-1.076054,0.385156,1


In [115]:
# Creating a 3D-Scatter with the PCA data and the clusters
import plotly.express as px
fig = px.scatter_3d(
    clustered_crime_df,
    x="PC1",
    y="PC2",
    z="PC3",
    hover_name="PSA",
    hover_data=["OFFENSE"],
    color="crimetype",
    symbol="Class"
)
fig.show()

In [124]:
clustered_crime_df.hvplot.scatter(
    x="OFFENSE",
    y="NEIGHBORHOOD_CLUSTER",
    hover_cols=["OFFENSE"],
    by="crimetype"
)
    