<a href="https://colab.research.google.com/github/claralin1222/Chicago_Crime_Analysis/blob/main/crime_map_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

crime = pd.read_csv('/content/drive/MyDrive/Chicago Crime Analysis/Crime_Clean_150M.csv')
chicago_attractions = pd.read_csv('/content/drive/MyDrive/Chicago Crime Analysis/chicago_attractions_clean.csv')

In [3]:
# Create a mapping dictionary to combine similar crime types
crime_type_mapping = {
    'CRIM SEXUAL ASSAULT': 'CRIMINAL SEXUAL ASSAULT',
    'OTHER NARCOTIC VIOLATION': 'NARCOTICS',
    'NON-CRIMINAL (SUBJECT SPECIFIED)': 'NON-CRIMINAL',
}

In [4]:
def standardize_crime_type(crime_type, mapping):
    return mapping.get(crime_type, crime_type)

crime['Standardized Primary Type'] = crime['Primary Type'].apply(standardize_crime_type, args=(crime_type_mapping,))

# Drop rows with missing location data
crime = crime.dropna(subset=['Longitude', 'Latitude'])

# Convert the date column to datetime format
crime['Date'] = pd.to_datetime(crime['Date'], format='%m/%d/%Y %I:%M:%S %p')

# Extract year and month from the date column
crime['Year'] = crime['Date'].dt.year
crime['Month'] = crime['Date'].dt.month

# Normalize the geographic and temporal data
scaler = StandardScaler()
crime[['Latitude', 'Longitude', 'Year', 'Month']] = scaler.fit_transform(crime[['Latitude', 'Longitude', 'Year', 'Month']])

# Group the data by primary type, year, and month
grouped_data = crime.groupby(['Standardized Primary Type', 'Year', 'Month'])

In [5]:
crime = crime[['ID','Date','Standardized Primary Type','Latitude', 'Longitude']]

In [6]:
# Function to cluster data within each group
def cluster_and_average(group, n_clusters=50):
    if len(group) < n_clusters:
        return pd.DataFrame({
            'Latitude': [group['Latitude'].mean()],
            'Longitude': [group['Longitude'].mean()],
            'Standardized Primary Type': group['Standardized Primary Type'].iloc[0],
            'Year': group['Year'].iloc[0],
            'Month': group['Month'].iloc[0],
            'Crime Count': [len(group)]
        })

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    group['Cluster'] = kmeans.fit_predict(group[['Latitude', 'Longitude', 'Year', 'Month']])
    clustered_group = group.groupby('Cluster').agg({
        'Latitude': 'mean',
        'Longitude': 'mean',
        'Standardized Primary Type': 'first',
        'Year': 'first',
        'Month': 'first',
        'ID': 'count'
    }).rename(columns={'ID': 'Crime Count'}).reset_index(drop=True)

    return clustered_group

In [None]:
# Apply clustering to each group
clustered_data = grouped_data.apply(cluster_and_average).reset_index(drop=True)

# Denormalize the data for visualization
clustered_data[['Latitude', 'Longitude', 'Year', 'Month']] = scaler.inverse_transform(clustered_data[['Latitude', 'Longitude', 'Year', 'Month']])

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super().

In [None]:
import plotly.express as px
import plotly.io as pio

# Create a scatter plot
fig = px.scatter_mapbox(
    clustered_data,
    lat='Latitude',
    lon='Longitude',
    color='Standardized Primary Type',
    size='Crime Count',
    hover_name='Standardized Primary Type',
    hover_data={'Latitude': True, 'Longitude': True, 'Year': True, 'Month': True, 'Crime Count': True},
    title='Crime Clusters in Chicago',
    mapbox_style="open-street-map",           #mapbox_style = "carto-positron"
)

# Update layout
fig.update_layout(
    margin={"r":0,"t":0,"l":0,"b":0},
    mapbox=dict(
        style="open-street-map",
        zoom=10,
        center={"lat": 41.8781, "lon": -87.6298},
    ),
    legend=dict(
        title="Layers",
        itemsizing='constant'
    )
)

# Show the plot
fig.show()


In [None]:
# Save the plot to an HTML file
pio.write_html(fig, file='clustered_data.html', auto_open=True)

# **Statistics**

In [7]:
import pandas as pd
from sklearn.neighbors import KDTree
import statsmodels.api as sm

In [22]:
# Define crime categories
personal_safety_crimes = ['BATTERY', 'OTHER OFFENSE', 'ASSAULT', 'CRIMINAL SEXUAL ASSAULT',
                          'OFFENSE INVOLVING CHILDREN', 'ROBBERY', 'HOMICIDE',
                          'SEX OFFENSE', 'INTIMIDATION', 'STALKING']

property_safety_crimes = ['THEFT', 'CRIMINAL DAMAGE', 'MOTOR VEHICLE THEFT', 'BURGLARY']

In [23]:
# Filter the crime data for personal and property safety crimes
personal_safety_data = crime[crime['Standardized Primary Type'].isin(personal_safety_crimes)]
property_safety_data = crime[crime['Standardized Primary Type'].isin(property_safety_crimes)]

In [24]:
# Load tourist attractions data
attraction_coords = chicago_attractions[['Latitude', 'Longitude']].values

# Create KDTree for tourist attractions
kdtree = KDTree(attraction_coords, leaf_size=30, metric='euclidean')

In [25]:
def calculate_distances(chunk):
    crime_coords = chunk[['Latitude', 'Longitude']].values
    distances, _ = kdtree.query(crime_coords, k=1)
    chunk['distance_to_attraction'] = distances
    return chunk

In [26]:
# Process personal safety crimes in chunks
chunk_size = 10000
chunks = [personal_safety_data[i:i + chunk_size] for i in range(0, personal_safety_data.shape[0], chunk_size)]
processed_chunks = [calculate_distances(chunk) for chunk in chunks]
personal_safety_processed = pd.concat(processed_chunks)

# Aggregate personal safety crime data
personal_safety_aggregated = personal_safety_processed.groupby(['Latitude', 'Longitude']).size().reset_index(name='crime_count')

# Merge with distance data
personal_safety_final = pd.merge(personal_safety_aggregated, personal_safety_processed[['Latitude', 'Longitude', 'distance_to_attraction']].drop_duplicates(), on=['Latitude', 'Longitude'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [27]:
# Process property safety crimes in chunks
chunks = [property_safety_data[i:i + chunk_size] for i in range(0, property_safety_data.shape[0], chunk_size)]
processed_chunks = [calculate_distances(chunk) for chunk in chunks]
property_safety_processed = pd.concat(processed_chunks)

# Aggregate property safety crime data
property_safety_aggregated = property_safety_processed.groupby(['Latitude', 'Longitude']).size().reset_index(name='crime_count')

# Merge with distance data
property_safety_final = pd.merge(property_safety_aggregated, property_safety_processed[['Latitude', 'Longitude', 'distance_to_attraction']].drop_duplicates(), on=['Latitude', 'Longitude'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['distance_to_attraction'] = distances
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [28]:
# Define the response variable and predictors for personal safety crimes
y_personal = personal_safety_final['crime_count']
X_personal = personal_safety_final[['distance_to_attraction']]

# Add a constant to the predictors (intercept term)
X_personal = sm.add_constant(X_personal)

# Fit the Poisson regression model
poisson_model_personal = sm.GLM(y_personal, X_personal, family=sm.families.Poisson()).fit()
print("Personal Safety Related Crimes Model Summary:")
print(poisson_model_personal.summary())

Personal Safety Related Crimes Model Summary:
                 Generalized Linear Model Regression Results                  
Dep. Variable:            crime_count   No. Observations:               180665
Model:                            GLM   Df Residuals:                   180663
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -5.7664e+05
Date:                Thu, 15 Aug 2024   Deviance:                   6.9671e+05
Time:                        19:55:37   Pearson chi2:                 2.32e+06
No. Iterations:                     5   Pseudo R-squ. (CS):            0.02393
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------

In [30]:
# Define the response variable and predictors for property safety crimes
y_property = property_safety_final['crime_count']
X_property = property_safety_final[['distance_to_attraction']]

# Add a constant to the predictors (intercept term)
X_property = sm.add_constant(X_property)

# Fit the Poisson regression model
poisson_model_property = sm.GLM(y_property, X_property, family=sm.families.Poisson()).fit()
print("Property Safety Related Crimes Model Summary:")
print(poisson_model_property.summary())

Property Safety Related Crimes Model Summary:
                 Generalized Linear Model Regression Results                  
Dep. Variable:            crime_count   No. Observations:               215879
Model:                            GLM   Df Residuals:                   215877
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -7.0779e+05
Date:                Thu, 15 Aug 2024   Deviance:                   8.8722e+05
Time:                        20:06:18   Pearson chi2:                 7.04e+06
No. Iterations:                     6   Pseudo R-squ. (CS):            0.02202
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------