Add EDA code here; we'll convert it for the Streamlit app later.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import geopandas as gpd
from shapely import wkt
import json


In [None]:
from os import makedirs

# Create the folder that the exported figure JSON files are written to.
# exist_ok=True makes re-running this cell harmless when the folder exists.
makedirs("../jsonvis/",exist_ok=True)

In [None]:
# Load the raw crime CSV into a DataFrame and preview its first rows.
file_path = "../ProjectData/ChicagoCrimes(20152025).csv"
df = pd.read_csv(file_path)
df.head()

## Prepare Date and Time

Convert the 'Date' column to datetime objects with the specified format and `errors='coerce'` to handle potential parsing issues, and display the first few rows of the DataFrame to confirm the changes.




In [None]:
# Parse the raw timestamp strings once. Values that do not match the
# 12-hour "MM/DD/YYYY HH:MM:SS AM/PM" layout become NaT (errors='coerce').
parsed_datetime = pd.to_datetime(
    df['Date'],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)

# Keep the full timestamp under 'Datetime' (in the original column position)
# and append separate calendar-date and clock-time columns.
df = df.rename(columns={"Date": "Datetime"})
df['Datetime'] = parsed_datetime
df['Date'] = parsed_datetime.dt.date
df['Time'] = parsed_datetime.dt.time  # datetime.time objects, 24-hour clock
df.head()

## Understanding Dataset

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Columns that hold codes or labels rather than quantities.
categorical_columns = [
    'IUCR', 'Primary Type', 'Description', 'Location Description',
    'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area',
]

# Cast each of them to the pandas category dtype, one column at a time.
for column_name in categorical_columns:
    df[column_name] = df[column_name].astype('category')

# Summarise only the categorical columns.
df[categorical_columns].describe()

## Analyze Missing Data Over Time

Calculate the number of missing values for each column, grouped by year, to understand the trend of missing data over different years. This will provide insights into when certain data points started to be missing or were more prevalent.


In [None]:
# Count number of cases per year (one row of df per reported case);
# the result is a Series indexed by Year.
cases_per_year = df.groupby("Year").size()
print(cases_per_year)

In [None]:
# Count missing values per column within each year.
# The boolean null-mask is grouped by Year directly instead of going through
# groupby.apply with a lambda: this is vectorised and does not depend on the
# deprecated behaviour of apply() operating on the grouping column.
missing_by_year = df.isnull().groupby(df["Year"]).sum()

# Heatmap with one row per column and one column per year.
# The annotated cells are raw counts (fmt="d"), so the colour bar is labelled
# as a count rather than a fraction.
plt.figure(figsize=(12, 6))
sns.heatmap(
    missing_by_year.T,
    cmap="Reds",
    annot=True,
    fmt="d",
    cbar_kws={'label': 'Missing Count'},
)
plt.title("Heatmap of Missing Values by Column and Year")
plt.xlabel("Year")
plt.ylabel("Columns")
plt.tight_layout()
plt.show()

## Analyse Duplicate Records

In [None]:
# Audit the 'Case Number' column for repeated values.
case_numbers = df['Case Number']

print("Duplicate Case Numbers Check:")
print("=" * 50)

# Rows that repeat a Case Number already seen earlier in the frame
repeat_mask = case_numbers.duplicated()
duplicate_count = repeat_mask.sum()
print(f"Total duplicate Case Numbers: {duplicate_count}")

has_duplicates = repeat_mask.any()
print(f"Has duplicates: {has_duplicates}")

# Total rows versus distinct Case Numbers
total_rows = len(df)
unique_case_numbers = case_numbers.nunique()
print(f"\nTotal rows: {total_rows}")
print(f"Unique Case Numbers: {unique_case_numbers}")
print(f"Duplicate rows: {total_rows - unique_case_numbers}")

# Every row whose Case Number occurs more than once (all occurrences kept)
duplicate_case_numbers = df[case_numbers.duplicated(keep=False)]
print(f"\nRows with duplicate Case Numbers: {len(duplicate_case_numbers)}")

# Occurrence count per repeated Case Number, most frequent first
occurrences = case_numbers.value_counts()
duplicate_summary = occurrences[occurrences > 1]
print(f"\nNumber of Case Numbers appearing more than once: {len(duplicate_summary)}")
print("\nTop 10 most duplicated Case Numbers:")
print(duplicate_summary.head(10))

# Show a few of the duplicated rows next to each other
print("\nSample of duplicate rows:")
print(duplicate_case_numbers.sort_values('Case Number').head(10))

## Remove Missing Data, 2026 Data and Duplicates

The dataset has 2,755,021 entries in total. The fraction of rows with missing data is small, so removing them will not distort the EDA. Dropping them also simplifies the workflow by avoiding the complexity of imputing values, which can introduce bias if not done carefully. Plots and summaries will not be cluttered by NaN values either.

Also removed 2026 data such that EDA will focus on years 2015 - 2025.

Also removed duplicate records, to retain only one entry per Case Number.


In [None]:
print("Original entries:", len(df))

# Step 1: keep every year except 2026
not_2026 = df["Year"].ne(2026)
df_clean = df.loc[not_2026]
print("After dropping 2026 data:", len(df_clean))

# Step 2: discard rows that have a missing value in any column
df_clean = df_clean.dropna()
print("After dropping missing data:", len(df_clean))

# Step 3: keep one row per Case Number. Rows are ordered by 'ID' descending
# first, so the row that survives is the one with the highest ID.
df_clean = (
    df_clean
    .sort_values('ID', ascending=False)
    .drop_duplicates(subset='Case Number', keep='first')
)
print("After dropping duplicates based on Case Number:", len(df_clean))

## Time Series Plot by Date, Week, Month and Year

In [None]:
# Ensure notebook renders Plotly charts
pio.renderers.default = "notebook"

# Derive the calendar buckets from the parsed 'Datetime' column.
# Week and Month hold the start timestamp of their period.
incident_time = df_clean["Datetime"]
df_clean["Date"] = incident_time.dt.date
df_clean["Week"] = incident_time.dt.to_period("W").dt.start_time
df_clean["Month"] = incident_time.dt.to_period("M").dt.start_time
df_clean["Year"] = incident_time.dt.year

# Number of cases per bucket, one frame per granularity
cases_by_date = df_clean.groupby("Date").size().reset_index(name="Cases")
cases_by_week = df_clean.groupby("Week").size().reset_index(name="Cases")
cases_by_month = df_clean.groupby("Month").size().reset_index(name="Cases")
cases_by_year = df_clean.groupby("Year").size().reset_index(name="Cases")

# (bucket column, aggregated frame) in dropdown order; the first is shown initially
granularities = [
    ("Date", cases_by_date),
    ("Week", cases_by_week),
    ("Month", cases_by_month),
    ("Year", cases_by_year),
]

# One line trace per granularity; only the first starts visible
fig = go.Figure()
for position, (bucket, counts) in enumerate(granularities):
    fig.add_trace(
        go.Scatter(
            x=counts[bucket],
            y=counts["Cases"],
            mode="lines",
            name=f"By {bucket}",
            visible=(position == 0),
        )
    )

# One dropdown entry per granularity: show exactly that trace and retitle
dropdown_buttons = [
    dict(
        label=bucket,
        method="update",
        args=[
            {"visible": [other == position for other in range(len(granularities))]},
            {"title": f"Cases by {bucket}"},
        ],
    )
    for position, (bucket, _) in enumerate(granularities)
]

fig.update_layout(
    # Title for the initially visible trace; without it the figure is untitled
    # until a dropdown entry is chosen.
    title="Cases by Date",
    updatemenus=[
        dict(
            type="dropdown",
            x=0.97, y=1.12,
            buttons=dropdown_buttons,
        )
    ],
)

fig.show()


In [None]:
# Export the current figure as Plotly JSON into the jsonvis folder.
fig.write_json("../jsonvis/time_series_seasonality.json")

### Observations

There are outliers in the number of crime incidents reported on specific dates: an unusually high count of 1,877 on 31 May 2020, and unusually low counts of 110 and 115 on 19 Dec 2023 and 14 May 2024 respectively.

Strictly fewer than 20k crime incidents were reported per month from Jan 2020 to May 2022. The lower crime count might be due to the effect of the Covid pandemic.

There appears to be an annual seasonal pattern in the crime rate.

## Seasonality Plot (over Months) by Year

Click on legend entries to hide/show specific years. Single click hides/shows one line. Double click isolates one line (hides all others).

In [None]:
pio.renderers.default = "notebook"

# Month is stored as the calendar month number (1-12) here so that all years
# share one x-axis. NOTE: this overwrites any earlier 'Month' column.
df_clean["Month"] = df_clean["Datetime"].dt.month
df_clean["Year"] = df_clean["Datetime"].dt.year

# Number of cases for every (Year, Month) pair
cases_by_month_year = (
    df_clean.groupby(["Year", "Month"])
    .size()
    .reset_index(name="Cases")
)

# One line per year, plotted against the month number
fig = px.line(cases_by_month_year, x="Month", y="Cases", color="Year", markers=True)

# Label the twelve month ticks by name and show all years in one hover box
month_names = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
               "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
fig.update_layout(
    title="Seasonality Plot (over Months) by Year",  # fixed typo "Seaonality"
    xaxis=dict(
        tickmode="array",
        tickvals=list(range(1, 13)),
        ticktext=month_names,
    ),
    legend_title="Year",
    hovermode="x unified",
)

fig.show()

### Observation

Crime tends to be lowest in Feb, gradually increases thereafter and peaks in Jul or Aug, before declining again towards Dec.

While the broad pattern is similar for 2020, the dip came later, reaching its lowest point in Apr instead of Feb.

In 2019 and 2020, the crime rate dropped drastically, most likely because of the Covid pandemic. Crime numbers continued to increase in the following years while keeping the same seasonal pattern as the other years.

## Weekly Patterns (by Top 10 Crime Type)

In [None]:
# Day-of-week features derived from the incident timestamp
df_clean['DayOfWeek'] = df_clean['Datetime'].dt.day_name()
df_clean['DayNum'] = df_clean['Datetime'].dt.dayofweek  # 0=Monday, 6=Sunday
df_clean['IsWeekend'] = df_clean['DayNum'].isin([5, 6]).map({True: 'Weekend', False: 'Weekday'})

weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# One palette for every view, so the weekend bars look the same in the overall
# chart and in the per-crime-type charts.
day_type_colors = {'Weekday': '#636EFA', 'Weekend': '#EF553B'}

# Pin both the x-axis order and the trace order. The dropdown visibility masks
# below assume each bar chart contributes a (Weekday, Weekend) pair of traces.
bar_category_orders = {
    'DayOfWeek': weekday_names,
    'IsWeekend': ['Weekday', 'Weekend'],
}

# Crime count by day of week overall
crime_by_day = (
    df_clean.groupby(['DayOfWeek', 'DayNum', 'IsWeekend'])
    .size()
    .reset_index(name='Crime Count')
    .sort_values('DayNum')
)

# Top 10 crime types by day of week.
# 'Primary Type' is a categorical column: observed=True keeps only the
# combinations that occur in the data instead of emitting zero-count rows for
# every category combination, and does not rely on the deprecated default.
top_10_types = df_clean['Primary Type'].value_counts().head(10).index
crime_by_day_type = (
    df_clean[df_clean['Primary Type'].isin(top_10_types)]
    .groupby(['DayOfWeek', 'DayNum', 'Primary Type', 'IsWeekend'], observed=True)
    .size()
    .reset_index(name='Crime Count')
    .sort_values('DayNum')
)

# Normalize to percent of total (overall and per crime type)
overall_total = crime_by_day['Crime Count'].sum()
crime_by_day['PercentOfTotal'] = (crime_by_day['Crime Count'] / overall_total) * 100

type_totals = crime_by_day_type.groupby('Primary Type', observed=True)['Crime Count'].transform('sum')
crime_by_day_type['PercentOfTotal'] = (crime_by_day_type['Crime Count'] / type_totals) * 100

# Main figure: overall share of incidents per weekday
fig = px.bar(
    crime_by_day,
    x='DayOfWeek',
    y='PercentOfTotal',
    color='IsWeekend',
    color_discrete_map=day_type_colors,
    title='Crime Incidents by Day of Week (% of Total)',
    labels={'PercentOfTotal': 'Percent of Total (%)'},
    category_orders=bar_category_orders,
)

# Add a hidden (Weekday, Weekend) trace pair for each top crime type;
# percentages are relative to that crime type's own total.
for crime_type in top_10_types:
    crime_data = crime_by_day_type[crime_by_day_type['Primary Type'] == crime_type]
    crime_fig = px.bar(
        crime_data,
        x='DayOfWeek',
        y='PercentOfTotal',
        color='IsWeekend',
        color_discrete_map=day_type_colors,
        title=f'Crime Incidents by Day of Week - {crime_type} (% of Type Total)',
        labels={'PercentOfTotal': 'Percent of Type Total (%)'},
        category_orders=bar_category_orders,
    )
    for trace in crime_fig.data:
        trace.visible = False
        fig.add_trace(trace)

# Set initial visibility (only first two traces visible - weekday and weekend)
for i, trace in enumerate(fig.data):
    trace.visible = (i < 2)

# Dropdown: "Overall" first, then one entry per crime type
buttons = [
    dict(
        label='Overall',
        method='update',
        args=[{'visible': [True, True] + [False] * (len(top_10_types) * 2)},
              {'title': 'Crime Incidents by Day of Week (% of Total)'}]
    )
]

for i, crime_type in enumerate(top_10_types):
    visibility = [False] * (2 + len(top_10_types) * 2)
    visibility[2 + i * 2] = True      # Weekday trace
    visibility[2 + i * 2 + 1] = True  # Weekend trace
    buttons.append(
        dict(
            label=crime_type,
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Crime Incidents by Day of Week - {crime_type} (% of Type Total)'}]
        )
    )

fig.update_layout(
    updatemenus=[
        dict(
            type='dropdown',
            x=1,
            y=1.11,
            buttons=buttons
        )
    ],
    height=600,
    xaxis=dict(categoryorder='array', categoryarray=weekday_names),
    showlegend=True
)

fig.show()

## Diurnal Cycle (by Day of Week and Top 10 Crime Type)

In [None]:
# Diurnal cycle subplots: hourly heatmap + 4-hour time block heatmap
from plotly.subplots import make_subplots

# Weekday name and hour-of-day used by the heatmaps in this cell
incident_timestamps = df_clean['Datetime']
df_clean['DayOfWeek'] = incident_timestamps.dt.day_name()
df_clean['Hour'] = incident_timestamps.dt.hour

day_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
top_10_types = df_clean['Primary Type'].value_counts().index[:10]

# Six left-closed 4-hour bins: [0, 4), [4, 8), ..., [20, 24)
time_bins = [0, 4, 8, 12, 16, 20, 24]
time_labels = [f'{start}-{start + 3}' for start in time_bins[:-1]]
df_clean['TimeBlock'] = pd.cut(
    df_clean['Hour'],
    bins=time_bins,
    labels=time_labels,
    right=False,
    include_lowest=True,
)

def build_hourly_heatmap(dataframe, title):
    """Build a hidden Hour x DayOfWeek heatmap trace of incident counts.

    Args:
        dataframe: Frame with 'DayOfWeek' and 'Hour' columns.
        title: Label used as the prefix of the trace name.

    Returns:
        A go.Heatmap with visible=False; columns follow ``day_order`` and
        missing day/hour combinations are shown as 0.
    """
    counts_long = dataframe.groupby(['DayOfWeek', 'Hour']).size()
    counts_long = counts_long.reset_index(name='Crime Count')

    # Hours down the rows, weekdays (Monday first) across the columns
    counts_wide = counts_long.pivot(index='Hour', columns='DayOfWeek', values='Crime Count')
    counts_wide = counts_wide.reindex(columns=day_order).fillna(0)

    # Colour bar placement for the left-hand subplot
    colorbar_spec = dict(title='Crime Count', x=0.46, y=0.5, len=0.8)
    hover_text = 'Day of Week: %{x}<br>Hour: %{y}<br>Count: %{z}<extra></extra>'

    return go.Heatmap(
        z=counts_wide.values,
        x=counts_wide.columns,
        y=counts_wide.index,
        colorscale='Reds',
        colorbar=colorbar_spec,
        name=f'{title} (Hourly)',
        visible=False,
        hovertemplate=hover_text,
    )
def build_timeblock_heatmap(dataframe, title):
    """Build a hidden TimeBlock x DayOfWeek heatmap trace of incident counts.

    Args:
        dataframe: Frame with 'DayOfWeek' and 'TimeBlock' columns.
        title: Label used as the prefix of the trace name.

    Returns:
        A go.Heatmap with visible=False; rows follow ``time_labels``, columns
        follow ``day_order`` and missing combinations are shown as 0.
    """
    # 'TimeBlock' is categorical (it comes from pd.cut). observed is passed
    # explicitly so the result does not depend on pandas' changing default,
    # and no deprecation warning is raised; empty blocks are kept as 0 counts.
    heat = (
        dataframe.groupby(['DayOfWeek', 'TimeBlock'], observed=False)
        .size()
        .reset_index(name='Crime Count')
    )
    # Time blocks down the rows, weekdays (Monday first) across the columns
    pivot = (
        heat.pivot(index='TimeBlock', columns='DayOfWeek', values='Crime Count')
        .reindex(columns=day_order, index=time_labels)
        .fillna(0)
    )
    return go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='Reds',
        # Colour bar placement for the right-hand subplot
        colorbar=dict(
            title='Crime Count',
            x=1.02,
            y=0.5,
            len=0.8
        ),
        name=f'{title} (Time Block)',
        visible=False,
        hovertemplate=(
            'Day of Week: %{x}<br>'
            'Time Block: %{y}<br>'
            'Count: %{z}<extra></extra>'
        )
    )

# Two side-by-side panels: hourly resolution on the left, 4-hour blocks on the right
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=['Hourly (0-23)', 'Time Blocks (4-hour)'],
    horizontal_spacing=0.14,
)

# Traces are added in pairs (hourly, time block): first the overall pair,
# then one pair per top crime type. Only the overall pair starts visible.
for position, label in enumerate(['Overall', *top_10_types]):
    if position == 0:
        subset = df_clean
    else:
        subset = df_clean[df_clean['Primary Type'] == label]
    hourly_trace = build_hourly_heatmap(subset, label)
    timeblock_trace = build_timeblock_heatmap(subset, label)
    hourly_trace.visible = (position == 0)
    timeblock_trace.visible = (position == 0)
    fig.add_trace(hourly_trace, row=1, col=1)
    fig.add_trace(timeblock_trace, row=1, col=2)

# Dropdown entries toggle both panels together: entry k shows trace pair k
pair_count = 1 + len(top_10_types)
dropdown_titles = ['Diurnal Cycle by Day of Week (Overall)'] + [
    f'Diurnal Cycle by Day of Week - {crime_type}' for crime_type in top_10_types
]
dropdown_labels = ['Overall', *top_10_types]
buttons = [
    dict(
        label=dropdown_labels[slot],
        method='update',
        args=[{'visible': [trace_idx // 2 == slot for trace_idx in range(2 * pair_count)]},
              {'title': dropdown_titles[slot]}],
    )
    for slot in range(pair_count)
]

fig.update_layout(
    title='Diurnal Cycle by Day of Week (Overall)',
    updatemenus=[dict(type='dropdown', x=1, y=1.15, buttons=buttons)],
    height=600,
)

# Same weekday ordering on both x-axes
for column in (1, 2):
    fig.update_xaxes(title_text='Day of Week', categoryorder='array',
                     categoryarray=day_order, row=1, col=column)
fig.update_yaxes(title_text='Hour of Day', tickmode='linear', dtick=1, row=1, col=1)
fig.update_yaxes(title_text='Time Block (Hour)', categoryorder='array',
                 categoryarray=time_labels, row=1, col=2)

fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Weekday name and hour-of-day columns for the single-panel heatmap
incident_timestamps = df_clean['Datetime']
df_clean['DayOfWeek'] = incident_timestamps.dt.day_name()
df_clean['Hour'] = incident_timestamps.dt.hour

# Monday-first column order and the ten most frequent crime types
DAY_ORDER = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
TOP_10_TYPES = df_clean['Primary Type'].value_counts().index[:10]


def prepare_heatmap_data(dataframe):
    """Return an Hour x DayOfWeek table of incident counts.

    Args:
        dataframe: Frame with 'DayOfWeek' and 'Hour' columns.

    Returns:
        DataFrame indexed by Hour with one column per entry of ``DAY_ORDER``;
        day/hour combinations without incidents are 0.
    """
    counts_long = dataframe.groupby(['DayOfWeek', 'Hour']).size()
    counts_long = counts_long.reset_index(name='Crime Count')

    counts_wide = counts_long.pivot(index='Hour', columns='DayOfWeek', values='Crime Count')
    # Monday-first column order; absent combinations become 0
    return counts_wide.reindex(columns=DAY_ORDER).fillna(0)


def build_hourly_heatmap(pivot, title):
    """Wrap a pivoted count table in a hidden go.Heatmap trace.

    Args:
        pivot: Table whose index supplies the y values, whose columns supply
            the x values and whose cells supply the colour values.
        title: Trace name.

    Returns:
        A go.Heatmap with visible=False.
    """
    colorbar_spec = dict(title='Crime Count', thickness=14, len=0.85, y=0.5)
    hover_text = 'Day: %{x}<br>Hour: %{y}:00<br>Crime Count: %{z}<extra></extra>'
    return go.Heatmap(
        z=pivot.values,
        x=pivot.columns,
        y=pivot.index,
        colorscale='Reds',
        name=title,
        visible=False,
        colorbar=colorbar_spec,
        hovertemplate=hover_text,
    )


fig = make_subplots(rows=1, cols=1)

# Trace 0: all crime types combined
pivot_overall = prepare_heatmap_data(df_clean)
fig.add_trace(build_hourly_heatmap(pivot_overall, 'Overall'), row=1, col=1)

# Traces 1..n: one heatmap per top crime type (all created hidden)
for crime_type in TOP_10_TYPES:
    df_type = df_clean[df_clean['Primary Type'] == crime_type]
    pivot_type = prepare_heatmap_data(df_type)
    fig.add_trace(build_hourly_heatmap(pivot_type, crime_type), row=1, col=1)

overall_title = 'Diurnal Crime Pattern by Day of Week (Overall)'

# Dropdown: each entry shows exactly one trace and retitles the figure
buttons = [
    dict(
        label='Overall',
        method='update',
        args=[
            {'visible': [True] + [False]*len(TOP_10_TYPES)},
            {'title': overall_title}
        ]
    )
]

for i, crime_type in enumerate(TOP_10_TYPES):
    visibility = [False]*(1 + len(TOP_10_TYPES))
    visibility[1+i] = True  # offset by 1 for the overall trace

    buttons.append(
        dict(
            label=crime_type,
            method='update',
            args=[
                {'visible': visibility},
                {'title': f'Diurnal Crime Pattern — {crime_type}'}
            ]
        )
    )


fig.update_layout(
    # Title for the initially visible (overall) trace. Without it the figure,
    # including the exported JSON, is untitled until a dropdown entry is chosen.
    title=overall_title,
    updatemenus=[
        dict(
            type='dropdown',
            x=1,
            y=1.12,
            xanchor='right',
            buttons=buttons
        )
    ],
    height=620,
    margin=dict(l=60, r=60, t=80, b=60),
    font=dict(
        family='Arial',
        size=12
    ),
)

# Axis styling
fig.update_xaxes(
    title='Day of Week',
    categoryorder='array',
    categoryarray=DAY_ORDER,
    showgrid=False
)

fig.update_yaxes(
    title='Hour of Day',
    showgrid=False
)

# Show the overall heatmap first
fig.data[0].visible = True
fig.show()

In [None]:
# Export the current figure as Plotly JSON into the jsonvis folder.
fig.write_json("../jsonvis/diurnal_heatmap.json")

## Crime Incidents Located Outside of Chicago

In [None]:
# Sanity check: print the min and max of both coordinates. Valid latitudes lie
# between -90 and 90 and valid longitudes between -180 and 180.
print(df_clean["Latitude"].min(), df_clean["Latitude"].max())
print(df_clean["Longitude"].min(), df_clean["Longitude"].max())


The min and max of Latitude and Longitude lie beyond the Chicago city boundaries, which indicates the presence of outliers. The code below identifies and visualises the outlier records.

In [None]:
# Scatter every incident by coordinate so that points far away from the
# dashed reference lines stand out.
plt.figure(figsize=(12, 8))
plt.scatter(
    df_clean['Longitude'],
    df_clean['Latitude'],
    alpha=0.1,
    s=1,
    color='blue',
    label='Data Points',
)

# Dashed reference lines for the "Real Chicago" bounding box
# (Lat: 41.64 to 42.02 | Long: -87.94 to -87.52)
for latitude_edge in (41.64, 42.02):
    plt.axhline(y=latitude_edge, color='red', linestyle='--', linewidth=1)
for longitude_edge in (-87.94, -87.52):
    plt.axvline(x=longitude_edge, color='red', linestyle='--', linewidth=1)

# Titles, axis labels, grid and legend
plt.title('Identifying Lat/Long Outliers in Chicago Crime Data')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.grid(True, linestyle=':', alpha=0.6)
plt.legend(['Data Points', 'Actual Chicago Boundaries'], loc='upper left')

# Annotation explaining the far-away cluster
plt.text(
    -91, 38,
    "Outliers found here\n(Likely errors or state-wide data)",
    color='darkred',
    fontsize=10,
    bbox=dict(facecolor='white', alpha=0.5),
)

plt.tight_layout()
# plt.savefig('outlier_detection_plot.png', dpi=300)
plt.show()

# Count the records that fall outside the accepted window.
# NOTE(review): these thresholds (41.6-42.1, -88.0 to -87.5) are slightly
# wider than the box drawn above (41.64-42.02, -87.94 to -87.52).
latitude_values = df_clean['Latitude']
longitude_values = df_clean['Longitude']
out_of_bounds = (
    (latitude_values < 41.6) | (latitude_values > 42.1) |
    (longitude_values < -88.0) | (longitude_values > -87.5)
)
outliers = df_clean[out_of_bounds]

total_records = len(df_clean)
outlier_records = len(outliers)
print(f"Total records: {total_records}")
print(f"Number of outlier records: {outlier_records}")
print(f"Percentage of outliers: {(outlier_records/total_records)*100:.2f}%")

In [None]:
# Keep only records strictly inside the accepted latitude/longitude window
latitude_ok = df_clean['Latitude'].gt(41.6) & df_clean['Latitude'].lt(42.1)
longitude_ok = df_clean['Longitude'].gt(-88.0) & df_clean['Longitude'].lt(-87.5)
df_map = df_clean[latitude_ok & longitude_ok]
df_map.shape

# Chicago Choropleth Crime Density Map Over The Years

In [None]:
# Load the community-area polygon table, dropping its first column
# (iloc[:, 1:]), and give the remaining six columns fixed names.
df_polygon = pd.read_csv('../ProjectData/ChicagoCommunityArea.csv').iloc[:,1:]
df_polygon.columns = ['GEOMETRY','AREA_NUMBER','COMMUNITY','AREA_NUM_1','SHAPE_AREA','SHAPE_LEN']

In [None]:
def _parse_shape_area(raw_value):
    """Remove '.' characters, turn ',' into '.', then cast the text to float."""
    return float(raw_value.replace('.', '').replace(',', '.'))


# Parse the WKT text into shapely geometry objects
df_polygon["GEOMETRY_OBJ"] = df_polygon["GEOMETRY"].map(wkt.loads)

# Numeric area, then area in km2 rounded to two decimals.
# NOTE(review): the 9.2903e-8 factor matches square feet -> km2; confirm the
# unit of SHAPE_AREA against the data source.
df_polygon['SHAPE_AREA_FLT'] = df_polygon['SHAPE_AREA'].map(_parse_shape_area)
df_polygon['AREA_KM2'] = (df_polygon['SHAPE_AREA_FLT'] * 9.2903e-8).round(2)

# Wrap the table in a GeoDataFrame using the parsed geometry (EPSG:4326)
gdf = gpd.GeoDataFrame(df_polygon, geometry="GEOMETRY_OBJ", crs="EPSG:4326")

In [None]:
# Number of incidents for every (community area, year) pair
area_year_sizes = df_clean.groupby(['Community Area', 'Year']).size()
df_crime_yearly = area_year_sizes.reset_index(name='Total Crime')

# Attach the counts to the polygons; the left join keeps every polygon
gdf_plot_yearly = gdf.merge(
    df_crime_yearly,
    how="left",
    left_on="AREA_NUMBER",
    right_on="Community Area",
)

# Crime density, rounded to a whole number of incidents per km2
crime_density = gdf_plot_yearly['Total Crime'] / gdf_plot_yearly['AREA_KM2']
gdf_plot_yearly['Crime/km2'] = crime_density.round(0)

In [None]:
# GeoJSON of the community-area polygons; features are matched to data rows
# through properties.AREA_NUMBER.
geojson = json.loads(gdf.copy().to_json())

# Upper bound of the colour scale, shared by all animation frames.
# Series.max() skips NaN, whereas the builtin max() over a Series containing
# NaN (possible after the left merge) is order-dependent and can return NaN.
density_upper_bound = gdf_plot_yearly['Crime/km2'].max()

fig_choropleth_overall = px.choropleth_mapbox(
    gdf_plot_yearly,
    geojson=geojson,
    locations="AREA_NUMBER",
    featureidkey="properties.AREA_NUMBER",
    color="Crime/km2",
    animation_frame="Year",  # one frame per year
    color_continuous_scale="Reds",
    mapbox_style="open-street-map",
    center={"lat": 41.828, "lon": -87.62},
    zoom=9,
    range_color=(0, density_upper_bound),
    opacity=0.85,
    hover_name="COMMUNITY",
    hover_data={"Total Crime": True, "AREA_KM2": True},
    height=650,
    width=1000,
    title="Crime Density by Community Area Over The Years",
)

fig_choropleth_overall.show()


In [None]:
# Export the choropleth figure as Plotly JSON into the jsonvis folder.
fig_choropleth_overall.write_json("../jsonvis/crime_choropleth_map.json")

### Top 10 Community Areas by Crime/km2 per Year - Table

In [None]:
# Top 10 Community Areas by Crime/km2 per Year - Interactive Table
years = sorted(gdf_plot_yearly['Year'].unique())

# Create figure with table
fig = go.Figure()

# Create a table for each year
for year in years:
    top_10_year = (
        gdf_plot_yearly[gdf_plot_yearly['Year'] == year]
        .dropna(subset=['Crime/km2'])
        .nlargest(10, 'Crime/km2')[['COMMUNITY', 'Total Crime', 'AREA_KM2', 'Crime/km2']]
        .reset_index(drop=True)
    )
    top_10_year.index = top_10_year.index + 1
    
    fig.add_trace(
        go.Table(
            header=dict(
                values=['<b>Rank</b>', '<b>Community</b>', '<b>Total Crime</b>', '<b>Area (km²)</b>', '<b>Crime/km²</b>'],
                fill_color='#2c3e50',
                font=dict(color='white', size=12),
                align='left',
                height=30
            ),
            cells=dict(
                values=[
                    top_10_year.index,
                    top_10_year['COMMUNITY'],
                    top_10_year['Total Crime'].apply(lambda x: f"{x:,.0f}"),
                    top_10_year['AREA_KM2'].apply(lambda x: f"{x:.2f}"),
                    top_10_year['Crime/km2'].apply(lambda x: f"{x:,.0f}")
                ],
                fill_color=[['#ecf0f1', 'white'] * 5],
                align='left',
                height=25,
                font=dict(size=11)
            ),
            visible=(year == years[0])  # Only first year visible initially
        )
    )

# Create dropdown buttons
buttons = []
for i, year in enumerate(years):
    visibility = [False] * len(years)
    visibility[i] = True
    buttons.append(
        dict(
            label=str(year),
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Top 10 Community Areas by Crime/km² - {year}'}]
        )
    )

fig.update_layout(
    updatemenus=[
        dict(
            type='dropdown',
            x=1,
            y=1.12,
            buttons=buttons,
            showactive=True
        )
    ],
    title=f'Top 10 Community Areas by Crime/km² - {years[0]}',
    height=500,
    margin=dict(t=80, b=20, l=20, r=20)
)

fig.show()

### Concentration of Crime in Top Community Areas - Heatmap

In [None]:
# Using a subset of your data for the top 5 consistent areas
data = {
    'Year': [2015, 2017, 2019, 2021, 2023, 2025],
    'LOOP': [1734, 2427, 2418, 1238, 2028, 1970],
    'WEST GARFIELD PARK': [1725, 1590, 1853, 1214, 1184, 983],
    'NEAR NORTH SIDE': [1246, 1691, 1741, 1130, 1539, 1539],
    'SOUTH SHORE': [1172, 1127, 1134, 958, 1146, 1057]
}

df_trend = pd.DataFrame(data).set_index('Year').T

plt.figure(figsize=(12, 6))
sns.heatmap(df_trend, annot=True, fmt=".0f", cmap="YlOrRd", cbar_kws={'label': 'Crime/km²'})
plt.title('Concentration of Crime in Top Community Areas (2015-2025)')
plt.show()

In [None]:
# Create a mapping dictionary from gdf
community_mapping = dict(zip(gdf['AREA_NUMBER'], gdf['COMMUNITY']))

# Add community name column to df_clean
df_clean['Community Name'] = df_clean['Community Area'].map(community_mapping)



### Top Crime Type Heatmap for Top Community Areas (by Year)

In [None]:
# Interactive Crime Type Heatmap by Year - Top 10 Community Areas
years = sorted(df_clean['Year'].unique())

# Get overall top 10 communities and top 10 crime types (consistent across years)
top_10_communities = df_clean.groupby('Community Name').size().nlargest(10).index
top_10_crimes = df_clean['Primary Type'].value_counts().head(10).index

# Find global max for consistent color scale
all_counts = []
for year in years:
    df_year = df_clean[df_clean['Year'] == year]
    year_counts = (
        df_year[
            (df_year['Community Name'].isin(top_10_communities)) & 
            (df_year['Primary Type'].isin(top_10_crimes))
        ]
        .groupby(['Community Name', 'Primary Type'])
        .size()
        .reset_index(name='Count')
    )
    all_counts.append(year_counts['Count'].max())

global_max = max(all_counts)

# Create initial heatmap for first year
df_year = df_clean[df_clean['Year'] == years[0]]
heatmap_data = (
    df_year[
        (df_year['Community Name'].isin(top_10_communities)) & 
        (df_year['Primary Type'].isin(top_10_crimes))
    ]
    .groupby(['Community Name', 'Primary Type'])
    .size()
    .reset_index(name='Count')
    .pivot(index='Community Name', columns='Primary Type', values='Count')
    .fillna(0)
)

heatmap_data = heatmap_data.reindex(
    index=top_10_communities,
    columns=top_10_crimes,
    fill_value=0
)

fig = go.Figure(data=[
    go.Heatmap(
        z=heatmap_data.values,
        x=heatmap_data.columns,
        y=heatmap_data.index,
        colorscale='YlOrRd',
        text=heatmap_data.values,
        texttemplate='%{text:.0f}',
        textfont={"size": 10},
        colorbar=dict(title='Count'),
        hovertemplate='<b>Community Area:</b> %{y}<br><b>Crime Type:</b> %{x}<br><b>Count:</b> %{z}<extra></extra>',
        zmin=0,
        zmax=global_max
    )
])

# Create frames for animation
frames = []
for year in years:
    df_year = df_clean[df_clean['Year'] == year]
    
    heatmap_data = (
        df_year[
            (df_year['Community Name'].isin(top_10_communities)) & 
            (df_year['Primary Type'].isin(top_10_crimes))
        ]
        .groupby(['Community Name', 'Primary Type'])
        .size()
        .reset_index(name='Count')
        .pivot(index='Community Name', columns='Primary Type', values='Count')
        .fillna(0)
    )
    
    heatmap_data = heatmap_data.reindex(
        index=top_10_communities,
        columns=top_10_crimes,
        fill_value=0
    )
    
    frame = go.Frame(
        data=[
            go.Heatmap(
                z=heatmap_data.values,
                x=heatmap_data.columns,
                y=heatmap_data.index,
                colorscale='YlOrRd',
                text=heatmap_data.values,
                texttemplate='%{text:.0f}',
                textfont={"size": 10},
                colorbar=dict(title='Count'),
                hovertemplate='<b>Community Area:</b> %{y}<br><b>Crime Type:</b> %{x}<br><b>Count:</b> %{z}<extra></extra>',
                zmin=0,
                zmax=global_max
            )
        ],
        name=str(year)
    )
    frames.append(frame)

fig.frames = frames

# Create slider steps with animate method
steps = []
for year in years:
    step = dict(
        method='animate',
        args=[
            [str(year)],
            {
                'frame': {'duration': 0, 'redraw': True},
                'mode': 'immediate',
                'transition': {'duration': 0, 'easing': 'linear'}
            }
        ],
        label=str(year)
    )
    steps.append(step)

# Add play/pause buttons
updatemenus = [dict(
    buttons=[
        dict(
            args=[None, {
                'frame': {'duration': 500, 'redraw': True},
                'mode': 'immediate',
                'fromcurrent': True,
                'transition': {'duration': 500, 'easing': 'linear'}
            }],
            label='▶',
            method='animate'
        ),
        dict(
            args=[[None], {
                'frame': {'duration': 0, 'redraw': True},
                'mode': 'immediate',
                'fromcurrent': True,
                'transition': {'duration': 0, 'easing': 'linear'}
            }],
            label='⏹',
            method='animate'
        )
    ],
    direction='right',
    pad=dict(r=10, t=10),
    showactive=False,
    type='buttons',
    x=0.01,
    xanchor='left',
    y=-0.6,
    yanchor='bottom'
)]

# Add slider control
sliders = [dict(
    active=0,
    currentvalue=dict(prefix='Year: '),
    len=0.85,
    pad=dict(b=10, t=10),
    x=0.2,
    xanchor='left',
    y=-0.7,
    yanchor='bottom',
    steps=steps
)]

fig.update_layout(
    updatemenus=updatemenus,
    sliders=sliders,
    title=f'Crime Type Heatmap - Top 10 Community Areas ({years[0]} - {years[-1]})',
    # xaxis_title='Crime Type',
    yaxis_title='Community Area',
    yaxis=dict(categoryorder='array', categoryarray=top_10_communities, autorange='reversed'),
    height=650,
    width=1100,
    margin=dict(t=100, b=150, l=200, r=100)
)

fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Export the current figure as Plotly JSON into the jsonvis folder.
fig.write_json("../jsonvis/area_crimetype_heatmap.json")

### Community Area and Time of Day of Crimes

In [None]:
# Interactive visualization: Community Area vs Time of Day by Top 10 Crime Type
def time_day_func(hour):
    """Return the labelled time-of-day bucket that contains ``hour``.

    Buckets are half-open intervals on the 24-hour clock: morning [5, 12),
    afternoon [12, 17) and evening [17, 21). Every other value, including
    the hours 21-23 and 0-4, falls into the night bucket.
    """
    daytime_bands = (
        (5, 12, 'Morning (5-12)'),
        (12, 17, 'Afternoon (12-17)'),
        (17, 21, 'Evening (17-21)'),
    )
    for start, stop, label in daytime_bands:
        if start <= hour < stop:
            return label
    # Nothing matched: late evening or early morning.
    return 'Night (21-5)'

# Working copy with the hour and the time-of-day bucket of every incident.
df_comm_crime = df_clean[['Datetime', 'Community Name', 'Primary Type']].copy()
df_comm_crime['Hour'] = df_comm_crime['Datetime'].dt.hour
df_comm_crime['Time of Day'] = df_comm_crime['Hour'].apply(time_day_func)

# Ten most frequent crime types; each one gets its own dropdown entry.
top_10_types = df_clean['Primary Type'].value_counts().head(10).index

# Column order shared by every heatmap (chronological through the day).
time_order = ['Morning (5-12)', 'Afternoon (12-17)', 'Evening (17-21)', 'Night (21-5)']


def _top10_time_pivot(frame, row_key, periods):
    """Count incidents per (row_key, time of day) for the 10 busiest row_key values.

    Parameters
    ----------
    frame : DataFrame with a ``row_key`` column and a 'Time of Day' column.
    row_key : name of the column whose values become the heatmap rows.
    periods : time-of-day labels, in the column order wanted in the result.

    Returns
    -------
    DataFrame indexed by ``row_key`` with one column per entry of ``periods``,
    rows ordered by ascending total count.
    """
    counts = (
        frame.groupby([row_key, 'Time of Day'])
        .size()
        .reset_index(name='Crime Count')
    )
    busiest = counts.groupby(row_key)['Crime Count'].sum().nlargest(10).index
    counts = counts[counts[row_key].isin(busiest)]

    table = (
        counts.pivot(index=row_key, columns='Time of Day', values='Crime Count')
        .fillna(0)
        # reindex instead of plain column selection: a period without any
        # incident becomes a zero column rather than raising KeyError.
        .reindex(columns=periods, fill_value=0)
    )

    # Ascending totals: the heatmap draws the first row at the bottom of the
    # y axis, so the busiest row ends up at the top of the chart.
    return (
        table.assign(Total=table.sum(axis=1))
        .sort_values('Total', ascending=True)
        .drop(columns='Total')
    )


def _community_heatmap(table, name, visible):
    """Build one heatmap trace from a pivot table produced by _top10_time_pivot."""
    return go.Heatmap(
        z=table.values,
        x=table.columns,
        y=table.index,
        colorscale='Reds',
        name=name,
        visible=visible,
        colorbar=dict(title='Crime Count'),
        hovertemplate='<b>Time of Day:</b> %{x}<br><b>Community Area:</b> %{y}<br><b>Count:</b> %{z}<extra></extra>'
    )


# Figure with one trace per dropdown entry; only the overall trace starts visible.
fig = go.Figure()

# Trace 0: all crime types together, top 10 community areas overall.
heatmap_pivot_overall = _top10_time_pivot(df_comm_crime, 'Community Name', time_order)
fig.add_trace(_community_heatmap(heatmap_pivot_overall, 'Overall', True))

# Traces 1..n: one per crime type, each with its own top 10 community areas.
for crime_type in top_10_types:
    df_crime = df_comm_crime[df_comm_crime['Primary Type'] == crime_type]
    heatmap_pivot_crime = _top10_time_pivot(df_crime, 'Community Name', time_order)
    fig.add_trace(_community_heatmap(heatmap_pivot_crime, crime_type, False))

# Dropdown buttons: each button shows exactly one trace and updates the title.
buttons = [
    dict(
        label='Overall',
        method='update',
        args=[{'visible': [True] + [False] * len(top_10_types)},
              {'title': 'Top 10 Community Areas vs Time of Day (Overall)'}]
    )
]

for i, crime_type in enumerate(top_10_types):
    # Trace i + 1 belongs to this crime type (trace 0 is the overall view).
    visibility = [False] * (len(top_10_types) + 1)
    visibility[i + 1] = True
    buttons.append(
        dict(
            label=crime_type,
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Top 10 Community Areas vs Time of Day - {crime_type}'}]
        )
    )

fig.update_layout(
    title='Top 10 Community Areas vs Time of Day (Overall)',
    xaxis_title='Time of Day',
    yaxis_title='Community Area',
    updatemenus=[dict(type='dropdown', x=1, y=1.12, buttons=buttons)],
    height=700,
    width=1000
)

fig.show()

In [None]:
# Persist the figure, then reload it from disk and display it to check that
# the exported JSON round-trips into a working figure.
heatmap_json_path = "../jsonvis/crime_type_area_heatmap.json"
fig.write_json(heatmap_json_path)
# read_json takes a file path (or file-like object), not the Figure itself.
fig = pio.read_json(heatmap_json_path)
fig.show()

## Top 10 Crime Type Distribution Over The Years

In [None]:
# Yearly incident totals per crime type (no monthly breakdown needed here).
df_crime_type_annual = (
    df_clean.groupby(['Primary Type', 'Year'])
    .size()
    .reset_index(name='Crime Count')
)

# NOTE: 'OTHER OFFENSE' is intentionally kept in this chart.

# Ten crime types with the largest totals over the whole period.
top_crimes = (
    df_crime_type_annual.groupby('Primary Type')['Crime Count']
    .sum()
    .nlargest(10)
    .index
)
in_top_crimes = df_crime_type_annual['Primary Type'].isin(top_crimes)
df_top10_annual = df_crime_type_annual[in_top_crimes]

# Horizontal bar chart animated over the years, one colour per crime type.
fig_top_crime_annual = px.bar(
    data_frame=df_top10_annual,
    x="Crime Count",
    y="Primary Type",
    animation_frame="Year",
    orientation="h",
    title="Top 10 Crime Types (by Year)",
    color="Primary Type",
)

# A fixed x range (largest annual count plus 10% padding) keeps the axis
# stable while the animation moves from year to year.
x_upper_bound = df_top10_annual['Crime Count'].max() * 1.1
fig_top_crime_annual.update_layout(
    xaxis_title="Crime Count (Annual)",
    yaxis_title="Crime Type",
    height=800,
    width=1000,
    showlegend=True,
    xaxis={'range': [0, x_upper_bound]},
)

# Integer tick labels with thousands separators.
fig_top_crime_annual.update_xaxes(tickformat=",d")

# Order the categories by their total, smallest first; on a horizontal bar
# chart this puts the largest bar at the top.
fig_top_crime_annual.update_yaxes(categoryorder="total ascending")

fig_top_crime_annual.show()

In [None]:
# Export the animated annual bar chart as Plotly JSON into ../jsonvis/.
fig_top_crime_annual.write_json("../jsonvis/top_crime_annual.json")

## Composition of Top 10 Crime Types

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Ten most frequent primary crime types, most common first.
top_10_primary = df_clean['Primary Type'].value_counts().nlargest(10).index.tolist()

# One word cloud panel per crime type on a 5 x 2 grid.
fig, axes = plt.subplots(5, 2, figsize=(20, 25))
axes = axes.flatten()

# Rendering options shared by every word cloud.
cloud_options = dict(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    relative_scaling=0.5,
    min_font_size=10,
)

for panel, crime_type in zip(axes, top_10_primary):
    # Incidents of this crime type only.
    df_crime = df_clean[df_clean['Primary Type'] == crime_type]

    # Each description is sized by how often it occurs.
    description_counts = df_crime['Description'].value_counts().to_dict()
    wordcloud = WordCloud(**cloud_options).generate_from_frequencies(description_counts)

    panel.imshow(wordcloud, interpolation='bilinear')
    panel.set_title(
        f'{crime_type}\n(Total: {len(df_crime):,} incidents)',
        fontsize=14,
        fontweight='bold',
    )
    panel.axis('off')

plt.tight_layout()
plt.show()

## Arrest Rate by Crime Type

In [None]:
# Numeric 0/1 copy of the arrest flag so that its mean is the arrest share.
df_clean['Arrest_Num'] = df_clean['Arrest'].astype(int)

# Arrest share per crime type as a percentage, highest rate first.
arrest_share_pct = df_clean.groupby('Primary Type')['Arrest_Num'].mean() * 100
arrest_rate = arrest_share_pct.reset_index(name='Arrest Rate (%)')
arrest_rate = arrest_rate.sort_values('Arrest Rate (%)', ascending=False)

# Horizontal bars in the sorted order, so the highest rate is drawn on top.
arrest_fig, arrest_ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=arrest_rate,
    x='Arrest Rate (%)',
    y='Primary Type',
    order=arrest_rate['Primary Type'],
    palette='viridis',
    ax=arrest_ax,
)
arrest_ax.set_title('Arrest Rate by Crime Type')
arrest_ax.set_xlabel('Arrest Rate (%)')
arrest_ax.set_ylabel('Crime Type')
plt.tight_layout()
plt.show()

## Domesticity and Location Type of Crimes

In [None]:
import plotly.express as px

# Incident counts for every (crime type, domestic flag, location type) combination.
hierarchy_levels = ['Primary Type', 'Domestic', 'Location Description']
loc_breakdown = (
    df_clean.groupby(hierarchy_levels)
    .size()
    .reset_index(name='Count')
)

# Swap the boolean flag for readable labels used in the middle treemap layer.
domestic_names = {True: 'Domestic', False: 'Non-Domestic'}
loc_breakdown['Domestic'] = loc_breakdown['Domestic'].map(domestic_names)

# Three-level treemap: crime type -> domestic / non-domestic -> location type,
# with box area proportional to the incident count.
fig = px.treemap(
    data_frame=loc_breakdown,
    path=list(hierarchy_levels),
    values='Count',
    color='Primary Type',
    title="Crime Hierarchy: Type -> Domesticity -> Location",
)

fig.update_layout(margin={'t': 50, 'l': 25, 'r': 25, 'b': 25})
fig.show()

### Added Arrest Rate (with min. 50 Incidents)

In [None]:
import plotly.express as px

# 1. Aggregate: incident count and mean arrest flag for every
#    (crime type, domestic flag, location type) combination.
#    Requires the 'Arrest_Num' column added in the arrest-rate cell.
treemap_data = (
    df_clean.groupby(['Primary Type', 'Domestic', 'Location Description'])
    .agg(
        Total_Incidents=('ID', 'count'),
        Arrest_Rate=('Arrest_Num', 'mean')
    )
    .reset_index()
)

# 2. Filter out small groups, whose arrest rate is too noisy to read.
#    The threshold is a minimum, so groups with exactly `threshold` incidents
#    are kept; this matches the "Min. {threshold} incidents" chart title.
threshold = 50
treemap_data = treemap_data[treemap_data['Total_Incidents'] >= threshold]

# 3. Derived columns: arrest rate as a percentage and a readable domestic label.
treemap_data['Arrest Rate (%)'] = treemap_data['Arrest_Rate'] * 100
treemap_data['Domestic_Label'] = treemap_data['Domestic'].map({True: 'Domestic', False: 'Non-Domestic'})

# 4. Treemap: box area is the incident count, colour is the arrest rate on a
#    red-to-green scale clipped to the 0-60 % range.
fig = px.treemap(
    treemap_data,
    path=['Primary Type', 'Domestic_Label', 'Location Description'],
    values='Total_Incidents',
    color='Arrest Rate (%)',
    color_continuous_scale='RdYlGn',
    range_color=[0, 60],
    template='plotly_white',
    title=f"Chicago Crime: Arrest Success Rate (Min. {threshold} incidents)"
)

# 5. Styling: show label, count and share of the parent box; thin white
#    borders keep neighbouring boxes distinguishable.
fig.update_traces(
    textinfo="label+value+percent parent",
    marker=dict(line=dict(width=0.5, color='white'))
)

fig.update_layout(
    margin=dict(t=50, l=10, r=10, b=10),
    # Explicit white backgrounds for the paper and the plot area.
    paper_bgcolor="white",
    plot_bgcolor="white"
)

fig.show()

### Observation

Green Zones (High Success): E.g. Narcotics. Because arrests usually happen at the moment the crime is discovered, the arrest rate is often near 100%.

Red Zones (The Challenge): E.g. Theft or Burglary, with arrest rates below 5%. Big boxes (high volume) in deep red mean that most of these crimes go unsolved.

Domestic Shift: E.g. Battery (Domestic) vs. Battery (Non-Domestic). Domestic battery has a higher arrest rate because the offender is known and often still present when police arrive.

## Location Type and Time of Day of Crimes 

In [None]:
# Interactive visualization: Location Description vs Time of Day by Top 10 Crime Type
def time_day_func(hour):
    """Classify an hour of the day into a labelled time-of-day bucket.

    Morning covers [5, 12), afternoon [12, 17) and evening [17, 21).
    Anything outside [5, 21) is reported as night.
    """
    # Guard first: everything outside the daytime span is night.
    if not (5 <= hour < 21):
        return 'Night (21-5)'
    if hour < 12:
        return 'Morning (5-12)'
    if hour < 17:
        return 'Afternoon (12-17)'
    return 'Evening (17-21)'

# Working copy with the hour and the time-of-day bucket of every incident.
df_loc_crime = df_clean[['Datetime', 'Location Description', 'Primary Type']].copy()
df_loc_crime['Hour'] = df_loc_crime['Datetime'].dt.hour
df_loc_crime['Time of Day'] = df_loc_crime['Hour'].apply(time_day_func)

# Ten most frequent crime types; each one gets its own dropdown entry.
top_10_types = df_clean['Primary Type'].value_counts().head(10).index

# Column order shared by every heatmap (chronological through the day).
time_order = ['Morning (5-12)', 'Afternoon (12-17)', 'Evening (17-21)', 'Night (21-5)']


def _top10_time_pivot(frame, row_key, periods):
    """Count incidents per (row_key, time of day) for the 10 busiest row_key values.

    Parameters
    ----------
    frame : DataFrame with a ``row_key`` column and a 'Time of Day' column.
    row_key : name of the column whose values become the heatmap rows.
    periods : time-of-day labels, in the column order wanted in the result.

    Returns
    -------
    DataFrame indexed by ``row_key`` with one column per entry of ``periods``,
    rows ordered by ascending total count.
    """
    counts = (
        frame.groupby([row_key, 'Time of Day'])
        .size()
        .reset_index(name='Crime Count')
    )
    busiest = counts.groupby(row_key)['Crime Count'].sum().nlargest(10).index
    counts = counts[counts[row_key].isin(busiest)]

    table = (
        counts.pivot(index=row_key, columns='Time of Day', values='Crime Count')
        .fillna(0)
        # reindex instead of plain column selection: a period without any
        # incident becomes a zero column rather than raising KeyError.
        .reindex(columns=periods, fill_value=0)
    )

    # Ascending totals: the heatmap draws the first row at the bottom of the
    # y axis, so the busiest row ends up at the top of the chart.
    return (
        table.assign(Total=table.sum(axis=1))
        .sort_values('Total', ascending=True)
        .drop(columns='Total')
    )


def _location_heatmap(table, name, visible):
    """Build one heatmap trace from a pivot table produced by _top10_time_pivot."""
    return go.Heatmap(
        z=table.values,
        x=table.columns,
        y=table.index,
        colorscale='Reds',
        name=name,
        visible=visible,
        colorbar=dict(title='Crime Count'),
        hovertemplate='<b>Time of Day:</b> %{x}<br><b>Location Type:</b> %{y}<br><b>Count:</b> %{z}<extra></extra>'
    )


# Figure with one trace per dropdown entry; only the overall trace starts visible.
fig = go.Figure()

# Trace 0: all crime types together, top 10 location types overall.
heatmap_pivot_overall = _top10_time_pivot(df_loc_crime, 'Location Description', time_order)
fig.add_trace(_location_heatmap(heatmap_pivot_overall, 'Overall', True))

# Traces 1..n: one per crime type, each with its own top 10 location types.
for crime_type in top_10_types:
    df_crime = df_loc_crime[df_loc_crime['Primary Type'] == crime_type]
    heatmap_pivot_crime = _top10_time_pivot(df_crime, 'Location Description', time_order)
    fig.add_trace(_location_heatmap(heatmap_pivot_crime, crime_type, False))

# Dropdown buttons: each button shows exactly one trace and updates the title.
buttons = [
    dict(
        label='Overall',
        method='update',
        args=[{'visible': [True] + [False] * len(top_10_types)},
              {'title': 'Top 10 Locations vs Time of Day (Overall)'}]
    )
]

for i, crime_type in enumerate(top_10_types):
    # Trace i + 1 belongs to this crime type (trace 0 is the overall view).
    visibility = [False] * (len(top_10_types) + 1)
    visibility[i + 1] = True
    buttons.append(
        dict(
            label=crime_type,
            method='update',
            args=[{'visible': visibility},
                  {'title': f'Top 10 Locations vs. Time of Day - {crime_type}'}]
        )
    )

fig.update_layout(
    title='Top 10 Locations vs Time of Day (Overall)',
    xaxis_title='Time of Day',
    yaxis_title='Location Description',
    updatemenus=[dict(type='dropdown', x=1, y=1.12, buttons=buttons)],
    height=700,
    width=1000
)

fig.show()

# Top Crime Occurrence Distribution by Time of Day

In [None]:
def time_day_func(hour):
    """Return 'Morning', 'Afternoon', 'Evening' or 'Night' for an hour of the day.

    Morning is [5, 12), afternoon [12, 17), evening [17, 21); every other
    value is night. This definition replaces the earlier ones in the
    notebook, whose labels also carry the hour range.
    """
    # Everything outside the daytime span counts as night.
    if not (5 <= hour < 21):
        return 'Night'
    if hour >= 17:
        return 'Evening'
    return 'Afternoon' if hour >= 12 else 'Morning'

In [None]:
# Monthly incident counts per crime type, in chronological order within each type.
df_crime_type = df_clean.groupby(['Primary Type','Year','Month']).size().reset_index(name='Crime Count')
df_crime_type = df_crime_type.sort_values(by=['Primary Type', 'Year', 'Month']).reset_index(drop=True)
# 'OTHER OFFENSE' is a catch-all category and is excluded from this ranking.
df_crime_type = df_crime_type.loc[df_crime_type['Primary Type'] != 'OTHER OFFENSE']
# Running total per crime type over time.
df_crime_type['Cummulative Count'] = df_crime_type.groupby(['Primary Type'])['Crime Count'].cumsum()
# First day of each month as a timestamp, built from the "YYYY-MM" string.
df_crime_type["YearMonth"] = pd.to_datetime(
    df_crime_type["Year"].astype(str) + "-" +
    df_crime_type["Month"].astype(str).str.zfill(2)
)

df_crime_type["Year-Month"] = df_crime_type["YearMonth"].dt.strftime("%Y-%m")

# For every month, the 11 crime types with the largest running total so far.
df_top11 = (
    df_crime_type
    .sort_values(["YearMonth", "Cummulative Count"], ascending=[True, False])
    .groupby("YearMonth", group_keys=False)
    .head(11)
)

# Ranking as of the most recent month in the data (December 2025 for the full
# 2015-2025 extract). Using the latest month present instead of a hard-coded
# date keeps the list non-empty when the extract ends in a different month.
latest_month = df_top11["YearMonth"].max()
top_crime_list = list(df_top11.loc[df_top11["YearMonth"] == latest_month, 'Primary Type'])

In [None]:

# Columns needed for the time-of-day analysis, on a fresh 0..n-1 index.
time_columns = ['Primary Type', 'Date', 'Time', 'Year', 'Month']
df_time = df_clean[time_columns].reset_index(drop=True)

# Hour of the incident, its time-of-day bucket, and whether the crime type
# belongs to the top crime list.
df_time['Hour'] = df_time['Time'].map(lambda clock: clock.hour)
df_time['Time of Day'] = df_time['Hour'].map(time_day_func)
df_time['Top Crime'] = df_time['Primary Type'].map(lambda crime: crime in top_crime_list)

# Keep top crime types only, then count incidents per (time of day, crime type).
top_crime_rows = df_time['Top Crime'] == True
df_time = df_time.loc[top_crime_rows].reset_index(drop=True)
df_time_Occurrence = (
    df_time.groupby(['Time of Day', 'Primary Type'])
    .size()
    .reset_index(name='Occurrence')
)

In [None]:
# Total number of top-crime incidents in each time-of-day bucket,
# in the order given by time_of_day_list.
time_of_day_list = ['Morning', 'Afternoon', 'Evening', 'Night']
total_crime_day = [
    sum(df_time_Occurrence.loc[df_time_Occurrence['Time of Day'] == period, 'Occurrence'])
    for period in time_of_day_list
]


In [None]:
# Total incidents per top crime type, summed over all time-of-day buckets.
crime_Occurrence = [
    sum(df_time_Occurrence.loc[df_time_Occurrence['Primary Type'] == crime_name, 'Occurrence'])
    for crime_name in top_crime_list
]

# Lookup table (crime type -> total) used as the percentage denominator.
sum_crime_time = pd.DataFrame({
    'Primary Type': top_crime_list,
    'Sum Crime': crime_Occurrence,
})

In [None]:
# Attach each crime type's total to its per-bucket counts, then express every
# bucket as a percentage of that total (share rounded to 3 decimals first).
df_time_crime_percentage = df_time_Occurrence.merge(sum_crime_time, on='Primary Type', how='inner')
bucket_share = df_time_crime_percentage['Occurrence'] / df_time_crime_percentage['Sum Crime']
df_time_crime_percentage['CrimePercentage'] = round(bucket_share, 3) * 100

In [None]:
# Stacked horizontal bars: for every top crime type, the share of its
# incidents that falls into each time-of-day bucket.
fig = px.bar(
    df_time_crime_percentage,
    x="CrimePercentage",
    y="Primary Type",
    color="Time of Day",
    orientation="h",
    title="Crime Distribution by Time of Day",
    category_orders={
        # Stack the buckets chronologically and list the crime types in
        # the order of top_crime_list.
        "Time of Day": [
            "Morning",
            "Afternoon",
            "Evening",
            "Night",
        ],
        "Primary Type": top_crime_list
    },
    color_discrete_map={
        "Morning": "#2ECC71",
        "Afternoon": "#F1C40f",
        "Evening": "#e67e22",
        "Night": "#213D97"
    }
)

fig.update_layout(
    xaxis_title="Percentage of Crime (%)",
    yaxis_title="Crime Type",
    barmode="stack",
    height=600
)

# Legend names with the hour range of each bucket. The ranges must match
# time_day_func: morning is 5 <= hour < 12, i.e. 5AM-12PM.
legend_labels = {
    "Morning": "Morning (5AM–12PM)",
    "Afternoon": "Afternoon (12PM–5PM)",
    "Evening": "Evening (5PM–9PM)",
    "Night": "Night (9PM–5AM)",
}
fig.for_each_trace(lambda t: t.update(name=legend_labels[t.name]))

fig.show()


In [None]:
# Total incidents per time-of-day bucket, summed over all top crime types.
df_time_crime_summary = (
    df_time_crime_percentage
    .groupby(['Time of Day'])['Occurrence']
    .sum()
    .reset_index(name='Crime Count')
)

# Same bucket order and colours as the stacked bar chart.
bucket_order = ["Morning", "Afternoon", "Evening", "Night"]
bucket_colors = {
    "Morning": "#2ECC71",
    "Afternoon": "#F1C40f",
    "Evening": "#e67e22",
    "Night": "#213D97",
}

fig = px.pie(
    data_frame=df_time_crime_summary,
    names="Time of Day",
    values="Crime Count",
    title="Crime Occurrences by Time of Day in Chicago 2015-2025",
    color="Time of Day",
    category_orders={"Time of Day": bucket_order},
    color_discrete_map=bucket_colors,
)

# Label every slice with its name and share; the fourth slice (Night) is
# pulled out slightly for emphasis.
fig.update_traces(textinfo="label+percent", pull=[0, 0, 0, 0.05])
fig.update_layout(height=600, width=600)

fig.show()
