# Chicago Crime Clustering Analysis 

## Importing libraries and loading dataset

In [146]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as colors
import folium




In [51]:
df = pd.read_csv('Chicago_Crime_Main.csv')


In [6]:
df.head(5)

Unnamed: 0,ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location,Historical Wards 2003-2015,Zip Codes,Community Areas,Census Tracts,Wards,Boundaries - ZIP Codes,Police Districts,Police Beats
0,5741943,HN549294,2007-08-25 09:22:18,074XX N ROGERS AVE,560,ASSAULT,SIMPLE,OTHER,False,False,2422,24.0,49.0,1.0,08A,,,2007,2015-08-17 15:03:40,,,,,,,,,,,
1,1930689,HH109118,2002-01-05 21:24:00,007XX E 103 ST,820,THEFT,$500 AND UNDER,GAS STATION,True,False,512,5.0,,,06,,,2002,2016-02-04 06:33:39,,,,,,,,,,,
2,13203321,JG415333,2023-09-06 17:00:00,002XX N Wells st,1320,CRIMINAL DAMAGE,TO VEHICLE,PARKING LOT / GARAGE (NON RESIDENTIAL),False,False,122,1.0,42.0,32.0,14,1174694.0,1901831.0,2023,2023-11-04 15:40:18,41.886018,-87.633938,"(41.886018055, -87.633937881)",22.0,14309.0,38.0,92.0,36.0,46.0,22.0,79.0
3,13210088,JG423627,2023-08-31 12:00:00,023XX W JACKSON BLVD,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,STREET,False,False,1225,12.0,27.0,28.0,11,1160870.0,1898642.0,2023,2023-09-16 15:41:56,41.877565,-87.684791,"(41.877565108, -87.68479102)",48.0,21184.0,29.0,766.0,46.0,28.0,15.0,139.0
4,13210004,JG422532,2023-07-24 21:45:00,073XX S JEFFERY BLVD,281,CRIMINAL SEXUAL ASSAULT,NON-AGGRAVATED,APARTMENT,False,False,333,3.0,7.0,43.0,02,1190812.0,1856743.0,2023,2023-09-16 15:41:56,41.761919,-87.576209,"(41.7619185, -87.576209245)",32.0,22538.0,39.0,419.0,37.0,24.0,18.0,89.0


## Cleaning and analyzing data 

In [52]:
#Reducing data down to just 2022. Dataset to large for clustering entire dataset

#Convert to datetime
df['Date'] = pd.to_datetime(df['Date'])

# Filter for the year 2022
data_2022 = df[df['Date'].dt.year == 2022]

# Drop rows with NaN values in 'Latitude' or 'Longitude'
data_2022 = data_2022.dropna(subset=['Latitude', 'Longitude'])

#Writing this to new csv
data_2022.to_csv('data_2022.csv', index=False)



In [53]:
#Filtering for crimes by type of crimes
unique_crime_types = data_2022['Primary Type'].unique()

unique_crime_types

array(['SEX OFFENSE', 'OTHER OFFENSE', 'OFFENSE INVOLVING CHILDREN',
       'WEAPONS VIOLATION', 'BATTERY', 'THEFT', 'CRIMINAL SEXUAL ASSAULT',
       'ROBBERY', 'BURGLARY', 'ASSAULT', 'MOTOR VEHICLE THEFT',
       'CRIMINAL DAMAGE', 'CRIMINAL TRESPASS', 'HOMICIDE', 'ARSON',
       'DECEPTIVE PRACTICE', 'INTIMIDATION', 'NARCOTICS',
       'PUBLIC PEACE VIOLATION', 'STALKING', 'PROSTITUTION',
       'INTERFERENCE WITH PUBLIC OFFICER', 'LIQUOR LAW VIOLATION',
       'KIDNAPPING', 'CONCEALED CARRY LICENSE VIOLATION',
       'HUMAN TRAFFICKING', 'OTHER NARCOTIC VIOLATION', 'OBSCENITY',
       'GAMBLING', 'NON-CRIMINAL', 'PUBLIC INDECENCY'], dtype=object)

## Organizing data into clusters for visualization 

In [105]:
# Assuming the columns are named 'Latitude' and 'Longitude'
coords = data_2022[['Latitude', 'Longitude']]

# Earth's radius in kilometers
earth_radius = 6371

# Initialize DBSCAN with spherical distance metric (Haversine)
dbscan = DBSCAN(eps=.15/earth_radius, min_samples=10, algorithm='ball_tree', metric='haversine')

# Convert latitude and longitude to radians for use in Haversine formula
coords_in_radians = np.radians(coords)

# Fit the model
clusters = dbscan.fit_predict(coords_in_radians)

# Add the cluster labels back to your original dataframe
data_2022['Cluster'] = clusters


In [97]:
# Count of crimes in each cluster
print(data_2022['Cluster'].value_counts())



Cluster
 0      215002
-1        3425
 10       1317
 4        1157
 24        980
         ...  
 163         6
 222         6
 217         2
 157         1
 239         1
Name: count, Length: 248, dtype: int64


In [62]:
# Create a map centered around an average location in your data
map_center = [data_2022['Latitude'].mean(), data_2022['Longitude'].mean()]

# Create a folium map object
m = folium.Map(location=map_center, zoom_start=12)


# Add points to the map
for idx, row in data_2022.iterrows():
    # Skip noise points
    if row['Cluster'] == -1:
        continue
    
    # Create a marker with the appropriate color
    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=3,
        color='blue' if row['Cluster'] == 0 else 'red',  # Change colors as needed
        fill=True
    ).add_to(m)


In [98]:
#Run the map 
"""
m

"""

'\nm\n\n'

#### No matter what I did with adjusting parameters (eps and min_samples), I could not get a good clustering visualization. 
#### Changing these would always lead to a large amount of -1's (noise) or a very large cluster encompassing most of the data points that would extend across almost the entire city

#### Will try to look more closely at specific crimes

## Attempting clustering with specific crime types

In [100]:
assault_data = data_2022[data_2022['Primary Type'] == 'ASSAULT']

In [201]:
#.1 eps and min_samples 15 seems to give the best visualization but a lot of the datapoints get filtered out as noise (lots of -1)
#.3 eps and min_samples 10 reduces a lot of noise but then a lot of points get assigned to the first cluster because the allowed distance
#between points is too high


coords = assault_data[['Latitude', 'Longitude']]

# Earth's radius in kilometers
earth_radius = 6371

# Initialize DBSCAN with spherical distance metric (Haversine)
dbscan = DBSCAN(eps=.1/earth_radius, min_samples=15, algorithm='ball_tree', metric='haversine')

# Convert latitude and longitude to radians for use in Haversine formula
coords_in_radians = np.radians(coords)

# Fit the model
clusters = dbscan.fit_predict(coords_in_radians)

# Add the cluster labels back to your original dataframe
assault_data['Cluster'] = clusters


print(assault_data['Cluster'].value_counts())


Cluster
-1     18957
 0        89
 2        72
 9        58
 4        49
       ...  
 68       15
 46       15
 56       15
 62       15
 69        6
Name: count, Length: 73, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  assault_data['Cluster'] = clusters


In [206]:
# Initialize map centered around the mean latitude and longitude of the assault data
map_center = [assault_data['Latitude'].mean(), assault_data['Longitude'].mean()]
m = folium.Map(location=map_center, zoom_start=12)

# Generate a larger color palette with a color for each cluster
unique_clusters = sorted(assault_data['Cluster'].unique())
n_clusters = len(unique_clusters)
colors_array = cm.rainbow(np.linspace(0, 1, n_clusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# Calculate cluster centroids, excluding noise (-1)
cluster_centroids = assault_data[assault_data['Cluster'] != -1].groupby('Cluster')[['Latitude', 'Longitude']].mean()

# Add each point to the map with a color based on its cluster, excluding noise
for idx, row in assault_data.iterrows():
    cluster_num = row['Cluster']
    # Skip points labeled as noise (-1)
    if cluster_num == -1:
        continue

    folium.CircleMarker(
        location=[row['Latitude'], row['Longitude']],
        radius=3,
        color=rainbow[unique_clusters.index(cluster_num)],
        fill=True
    ).add_to(m)

# Add cluster centroids to the map, excluding noise
for cluster_num, centroid in cluster_centroids.iterrows():
    folium.CircleMarker(
        location=[centroid['Latitude'], centroid['Longitude']],
        radius=5,
        color=rainbow[unique_clusters.index(cluster_num)],
        fill=True,
        fill_color=rainbow[unique_clusters.index(cluster_num)],
        fill_opacity=0.7
    ).add_to(m)

# Display the map
m



## Analysis
#### - The dataset seems to be inherently noisy and crimes are geographically dense which make it very difficult to organize the geospatial data. Altering the eps and min_values parameters allows either of the following, but not both:

#### a. Clearly defined and well visualized clusters, but with a significant number of data points being filtered out as noise.
#### b. Few datapoints filtered out as noise, but one large cluster that extends over most of the city because the allowed distance between points (eps) is too large)

#### Clustering analysis with this current model may not be viable or helpful. 
#### Can try something like a Clustered Continuous Conditional Random Field model but this is a more advanced model that would take more effort to do