Add EDA code here; we'll convert it for the Streamlit app later.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import geopandas as gpd
from shapely import wkt
from shapely.geometry import Point
import json


In [None]:
# Load the crimes CSV into a DataFrame; the path is relative, so it depends on
# the directory the notebook is run from.
file_path = "../ProjectData/ChicagoCrimes(20152025).csv"
df = pd.read_csv(file_path)
# Show the first rows as a quick sanity check of the load.
df.head()

## Prepare Date and Time

Convert the 'Date' column to datetime objects with the specified format and `errors='coerce'` to handle potential parsing issues, and display the first few rows of the DataFrame to confirm the changes.




In [None]:
# Keep the full timestamp under 'Datetime', then derive separate 'Date' and
# 'Time' columns from it.
df = df.rename(columns={"Date": "Datetime"})

# The raw strings use a 12-hour clock with an AM/PM marker; values that do not
# match the format become NaT instead of raising.
df["Datetime"] = pd.to_datetime(
    df["Datetime"],
    format='%m/%d/%Y %I:%M:%S %p',
    errors='coerce',
)

df = df.assign(
    Date=df["Datetime"].dt.date,
    Time=df["Datetime"].dt.time,  # time-of-day objects (24-hour representation)
)
df.head()

In [None]:
# Print a structural summary of the DataFrame (columns and dtypes).
df.info()

## Analyze Missing Data Over Time

Calculate the number of missing values for each column, grouped by year, to understand the trend of missing data over different years. This will provide insights into when certain data points started to be missing or were more prevalent.


In [None]:
# Count number of cases per year (rows grouped by the existing 'Year' column)
cases_per_year = df.groupby("Year").size()
print(cases_per_year)

In [None]:
# Count missing values by year and column.
# Grouping the boolean null-mask by the 'Year' values gives the same counts as
# groupby(...).apply(lambda x: x.isnull().sum()) without relying on apply
# operating on the grouping column (deprecated in recent pandas) and without a
# Python-level call per group.
missing_by_year = df.isnull().groupby(df["Year"]).sum()

# Plot heatmap: one row per column, one column per year.
# The annotated cells are absolute counts (fmt="d"), so the colorbar is
# labelled as a count rather than a fraction.
plt.figure(figsize=(12, 6))
sns.heatmap(
    missing_by_year.T,
    cmap="Reds",
    annot=True,
    fmt="d",
    cbar_kws={'label': 'Missing Count'},
)
plt.title("Heatmap of Missing Values by Column and Year")
plt.xlabel("Year")
plt.ylabel("Columns")
plt.tight_layout()
plt.show()

## Remove Missing Data and 2026 Data

The total number of entries is 2,755,021. The fraction of missing data is small, so removing it will not distort the EDA. Dropping these rows also simplifies the workflow: it avoids the complexity of imputing values, which can introduce bias if not done carefully, and keeps plots and summaries from being cluttered by NaN values.

The 2026 data is also removed so that the EDA focuses on the years 2015–2025.

In [None]:
# Drop rows with any missing values and remove 2026 data.
df_clean = df.dropna()
# .copy() makes df_clean an independent DataFrame: later cells assign new
# columns to it, which would otherwise raise SettingWithCopyWarning because the
# boolean filter returns a slice of the previous frame.
df_clean = df_clean[df_clean["Year"] != 2026].copy()
print("Original entries:", len(df))
# This count reflects both the NaN removal and the 2026 filter above.
print("After dropping missing:", len(df_clean))

## Time Series Plot by Date, Week, Month and Year

In [None]:
# Ensure Colab renders Plotly charts
pio.renderers.default = "colab"

# Derive the calendar buckets from the already-parsed 'Datetime' column.
# 'Week' and 'Month' hold the start timestamp of the containing period.
df_clean["Date"] = df_clean["Datetime"].dt.date
df_clean["Week"] = df_clean["Datetime"].dt.to_period("W").dt.start_time
df_clean["Month"] = df_clean["Datetime"].dt.to_period("M").dt.start_time
df_clean["Year"] = df_clean["Datetime"].dt.year

# Row counts per bucket at each granularity.
cases_by_date = df_clean.groupby("Date").size().rename("Cases").reset_index()
cases_by_week = df_clean.groupby("Week").size().rename("Cases").reset_index()
cases_by_month = df_clean.groupby("Month").size().rename("Cases").reset_index()
cases_by_year = df_clean.groupby("Year").size().rename("Cases").reset_index()

# One trace per granularity; only the daily trace is visible initially.
fig = go.Figure(
    data=[
        go.Scatter(
            x=cases_by_date["Date"],
            y=cases_by_date["Cases"],
            mode="lines",
            name="By Date",
            visible=True,
        ),
        go.Scatter(
            x=cases_by_week["Week"],
            y=cases_by_week["Cases"],
            mode="lines",
            name="By Week",
            visible=False,
        ),
        go.Scatter(
            x=cases_by_month["Month"],
            y=cases_by_month["Cases"],
            mode="lines",
            name="By Month",
            visible=False,
        ),
        go.Bar(
            x=cases_by_year["Year"],
            y=cases_by_year["Cases"],
            name="By Year",
            visible=False,
        ),
    ]
)

# Dropdown buttons, listed in the same order as the traces above: each button
# shows exactly one trace and updates the title to match.
view_labels = ["Date", "Week", "Month", "Year"]
view_buttons = [
    dict(
        label=label,
        method="update",
        args=[
            {"visible": [other == label for other in view_labels]},
            {"title": f"Cases by {label}"},
        ],
    )
    for label in view_labels
]

fig.update_layout(
    title="Cases Over Time",
    updatemenus=[
        dict(type="dropdown", x=0.1, y=1.15, buttons=view_buttons),
    ],
)

fig.show()


### Observations

Some dates are clear outliers in the number of reported crime incidents: an unusually high count of 1,877 on 31 May 2020, and unusually low counts of 110 and 115 on 19 Dec 2023 and 14 May 2024 respectively.

Fewer than 20k crime incidents were reported in every month from Jan 2020 to May 2022. The lower volume might be an effect of the COVID-19 pandemic.

There appears to be an annual seasonal pattern in the crime rate.

## Seasonality Plot (over Months) by Year

Click on legend entries to hide/show specific years. Single click hides/shows one line. Double click isolates one line (hides all others).

In [None]:
pio.renderers.default = "colab"

# NOTE: 'Month' is (re)assigned here as the integer month number, and 'Year'
# is refreshed from the parsed 'Datetime' column.
df_clean["Month"] = df_clean["Datetime"].dt.month
df_clean["Year"] = df_clean["Datetime"].dt.year

# Row counts for every (Year, Month) pair.
cases_by_month_year = (
    df_clean.groupby(["Year", "Month"])
    .size()
    .rename("Cases")
    .reset_index()
)

# One line per year so the years can be compared month by month.
fig = go.Figure()

years = sorted(cases_by_month_year["Year"].unique())
for yr in years:
    data = cases_by_month_year.loc[cases_by_month_year["Year"] == yr]
    fig.add_trace(
        go.Scatter(
            x=data["Month"],
            y=data["Cases"],
            mode="lines+markers",
            name=str(yr),  # legend entry
        )
    )

# Label the twelve month positions with abbreviated month names.
fig.update_layout(
    title="Monthly Cases by Year (Seasonality)",
    xaxis_tickmode="array",
    xaxis_tickvals=list(range(1, 13)),
    xaxis_ticktext=["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
    legend_title="Year",
    hovermode="x unified",
)

fig.show()

### Observation

Crime tends to be lowest in Feb, gradually increases thereafter and peaks in Jul or Aug, before declining again towards Dec.

While the broad pattern is similar for 2020, the dip came later, reaching its lowest point in Apr instead of Feb.

In 2019 and 2020, the crime rate dropped drastically, most likely because of the COVID-19 pandemic. The crime numbers continued to increase in the following years, with the same seasonal pattern as the other years.

## Density Map of Crime Incidents

In [None]:
# Validate that latitude is between -90 and 90 and longitude is between -180 and 180
# by printing the observed minimum and maximum of each coordinate column.
print(df_clean["Latitude"].min(), df_clean["Latitude"].max())
print(df_clean["Longitude"].min(), df_clean["Longitude"].max())


In [None]:
pio.renderers.default = "colab"

# Refresh 'Year' from the parsed 'Datetime' column so the two stay in sync.
df_clean["Year"] = df_clean["Datetime"].dt.year

# Plotly Express creates animation frames in the order the frame values first
# appear in the data. Sorting by year (stably, so the within-year row order is
# kept) guarantees that the slider plays chronologically regardless of how the
# CSV rows happen to be ordered.
fig = px.density_mapbox(
    df_clean.sort_values("Year", kind="stable"),
    lat="Latitude",
    lon="Longitude",
    radius=10,
    hover_data=["Year", "Date"],
    color_continuous_scale="Viridis",
    mapbox_style="open-street-map",
    zoom=9,   # higher zoom = closer view
    center={"lat": 41.8781, "lon": -87.6298},  # Chicago coordinates
    height=600,
    title="Incident Density Map - Chicago Focus",
    animation_frame="Year"   # one frame per year
)

fig.show()

# Chicago Choropleth Crime Density Map Over The Years

In [None]:
# Load the community-area table (relative path, resolved from the working directory).
df_polygon = pd.read_csv('../ProjectData/ChicagoCommunityArea.csv')
# Rename the columns positionally; this requires the CSV to have exactly seven
# columns, and the names below are assigned in file order.
df_polygon.columns = ['0','GEOMETRY','AREA_NUMBER','COMMUNITY','AREA_NUM_1','SHAPE_AREA','SHAPE_LEN']

In [None]:
# Parse each WKT string into a shapely geometry object.
df_polygon["GEOMETRY_OBJ"] = df_polygon["GEOMETRY"].map(wkt.loads)

# SHAPE_AREA is handled as text: '.' characters are stripped and ',' is turned
# into the decimal point before casting to float.
df_polygon['SHAPE_AREA_FLT'] = df_polygon['SHAPE_AREA'].map(
    lambda raw: float(raw.replace('.', '').replace(',', '.'))
)

# Convert to square kilometres (1 sq ft = 9.2903e-8 km²), rounded to 2 decimals.
df_polygon['AREA_KM2'] = (df_polygon['SHAPE_AREA_FLT'] * 9.2903e-8).round(2)

# Wrap the table in a GeoDataFrame using the parsed geometries.
gdf = gpd.GeoDataFrame(df_polygon, geometry="GEOMETRY_OBJ", crs="EPSG:4326")


In [None]:
# Incident count for every (community area, year) pair.
df_crime_yearly = (
    df_clean.groupby(['Community Area', 'Year'])
    .size()
    .rename('Total Crime')
    .reset_index()
)

# GeoJSON of the community-area polygons, used as the map geometry source.
gdf_base = gdf.copy()
geojson = json.loads(gdf_base.to_json())

# Left join keeps every community area, including any with no matching counts.
gdf_plot_yearly = gdf_base.merge(
    df_crime_yearly,
    how="left",
    left_on="AREA_NUMBER",
    right_on="Community Area",
)

In [None]:
# Crime density: incidents per square kilometre, rounded to a whole number.
gdf_plot_yearly['Crime/km2'] = (
    gdf_plot_yearly['Total Crime'] / gdf_plot_yearly['AREA_KM2']
).round(0)

# Upper bound of the shared colour scale. Series.max() skips NaN, whereas the
# builtin max() gives an order-dependent result when NaN is present (possible
# here because the upstream merge is a left join).
max_density = gdf_plot_yearly['Crime/km2'].max()

fig = px.choropleth_mapbox(
    gdf_plot_yearly,
    geojson=geojson,
    locations="AREA_NUMBER",
    featureidkey="properties.AREA_NUMBER",  # match rows to polygons by area number
    color="Crime/km2",
    animation_frame="Year",
    color_continuous_scale="Reds",
    mapbox_style="open-street-map",
    center={"lat": 41.8781, "lon": -87.6298},
    zoom=9,
    range_color=(0, max_density),  # fixed range so colours are comparable across years
    opacity=0.85,
    hover_name="COMMUNITY",
    hover_data={"Total Crime": True,
                "AREA_KM2": True},
    height=650,
    width=1000,
    title="Crime Density by Community Area Over The Years"
)

fig.show()


# Chicago Crime Type Count Over The Years

In [None]:
# Monthly incident count per crime type, in chronological order within each type.
df_crime_type = df_clean.groupby(['Primary Type','Year','Month']).size().reset_index(name='Crime Count')
df_crime_type = df_crime_type.sort_values(by=['Primary Type', 'Year', 'Month']).reset_index(drop=True)
# Exclude the catch-all category. .copy() detaches the filtered result so the
# column assignments that follow do not raise SettingWithCopyWarning.
df_crime_type = df_crime_type.loc[df_crime_type['Primary Type'] != 'OTHER OFFENSE'].copy()
# Calculate cumulative count by crime type (running total over the sorted months)
df_crime_type['Cummulative Count'] = df_crime_type.groupby(['Primary Type'])['Crime Count'].cumsum()

In [None]:
# Build a real datetime for sorting plus a "YYYY-MM" label for the animation slider.
df_crime_type["YearMonth"] = pd.to_datetime(
    df_crime_type["Year"].astype(str) + "-" +
    df_crime_type["Month"].astype(str).str.zfill(2)
)

df_crime_type["Year-Month"] = df_crime_type["YearMonth"].dt.strftime("%Y-%m")

# For every month keep the 11 crime types with the largest cumulative count.
df_top11 = (
    df_crime_type
    .sort_values(["YearMonth", "Cummulative Count"], ascending=[True, False])
    .groupby("YearMonth", group_keys=False)
    .head(11)
)

# Animated figures do not re-fit their axes per frame, so without a fixed range
# the growing cumulative bars run off the axis sized for the first frame.
# Fix the x-range to the overall maximum plus a little headroom.
x_max = df_top11["Cummulative Count"].max()

fig = px.bar(
    df_top11,
    x="Cummulative Count",
    y="Primary Type",
    animation_frame="Year-Month",
    orientation="h",
    range_x=[0, x_max * 1.05],
    title="Cummulative Count for Chicago Crime Based on Type (Monthly, Top 11)",
)

fig.update_layout(
    xaxis_title="Crime Count",
    yaxis_title="Crime Type",
    height=800,
    width=1000
)

fig.update_xaxes(tickformat=",d")

# Order the categories by total so the largest bar is drawn at the top.
# NOTE(review): confirm that this ordering is re-evaluated for every animation
# frame and not only for the initial one.
fig.update_yaxes(categoryorder="total ascending")

fig.show()


# Top Crime Occurrence Distribution by Time of Day

In [None]:
def TimeOfDay(hour):
    """Return the part-of-day label for an hour on the 24-hour clock.

    Half-open ranges: [5, 12) -> 'Morning', [12, 17) -> 'Afternoon',
    [17, 21) -> 'Evening'. Any other value -> 'Night'.
    """
    day_parts = (
        (5, 12, 'Morning'),
        (12, 17, 'Afternoon'),
        (17, 21, 'Evening'),
    )
    for start, end, label in day_parts:
        if start <= hour < end:
            return label
    # Everything outside the daytime ranges counts as night.
    return 'Night'
    

In [None]:
# Crime types present in df_top11 for December 2025, in df_top11's row order.
is_dec_2025 = (df_top11['Year'] == 2025) & (df_top11['Month'] == 12)
top_crime_list = df_top11.loc[is_dec_2025, 'Primary Type'].tolist()

In [None]:

# Work on a narrow copy that holds only the columns needed for the time-of-day analysis.
df_time = df_clean[['Primary Type', 'Date', 'Time', 'Year', 'Month']].reset_index(drop=True)

# Hour of the day -> part-of-day label.
df_time['Hour'] = df_time['Time'].map(lambda t: t.hour)
df_time['Time of Day'] = df_time['Hour'].map(TimeOfDay)

# Flag, then keep, only the rows whose crime type is in the top list.
df_time['Top Crime'] = df_time['Primary Type'].isin(top_crime_list)
df_time = df_time[df_time['Top Crime']].reset_index(drop=True)

# Number of incidents for every (time of day, crime type) pair.
df_time_Occurrence = (
    df_time.groupby(['Time of Day', 'Primary Type'])
    .size()
    .rename('Occurrence')
    .reset_index()
)

In [None]:
time_of_day_list = ['Morning', 'Afternoon', 'Evening', 'Night']

# Total occurrences for each part of the day, in the order listed above.
total_crime_day = [
    df_time_Occurrence.loc[df_time_Occurrence['Time of Day'] == part, 'Occurrence'].sum()
    for part in time_of_day_list
]

sum_crime_day = pd.DataFrame({'Time of Day': time_of_day_list, 'Sum Crime': total_crime_day})

In [None]:
# Total occurrences for each top crime type, summed over all parts of the day.
crime_Occurrence = [
    df_time_Occurrence.loc[df_time_Occurrence['Primary Type'] == crime_type, 'Occurrence'].sum()
    for crime_type in top_crime_list
]

sum_crime_time = pd.DataFrame({'Primary Type': top_crime_list, 'Sum Crime': crime_Occurrence})

In [None]:
# Attach each crime type's overall total to its per-time-of-day rows.
df_time_crime_percentage = df_time_Occurrence.merge(sum_crime_time, on='Primary Type', how='inner')

# Share of each crime type's incidents that falls in the given part of the day,
# rounded to three decimals as a fraction and then scaled to percent.
df_time_crime_percentage['CrimePercentage'] = (
    df_time_crime_percentage['Occurrence']
    .div(df_time_crime_percentage['Sum Crime'])
    .round(3)
    .mul(100)
)

In [None]:
# Horizontal stacked bar: for each crime type, the percentage of its incidents
# that falls in each part of the day.
fig = px.bar(
    df_time_crime_percentage,
    x="CrimePercentage",
    y="Primary Type",
    color="Time of Day",
    orientation="h",
    title="Crime Distribution by Time of Day",
    category_orders={
        "Time of Day": [
            "Morning",
            "Afternoon",
            "Evening",
            "Night",
        ],
        "Primary Type": top_crime_list
    },
    color_discrete_map={
        "Morning": "#2ECC71",
        "Afternoon": "#F1C40f",
        "Evening": "#e67e22",
        "Night": "#213D97"
    }
)

fig.update_layout(
    xaxis_title="Percentage of Crime (%)",
    yaxis_title="Crime Type",
    barmode="stack",
    height=600
)

# Replace each legend entry with a label that also states the hour range.
# The ranges must mirror the TimeOfDay buckets; Morning covers hours 5-11,
# i.e. 5AM up to 12PM (the previous "5AM–9AM" label did not match that).
fig.for_each_trace(
    lambda t: t.update(
        name={
            "Morning": "Morning (5AM–12PM)",
            "Afternoon": "Afternoon (12PM–5PM)",
            "Evening": "Evening (5PM–9PM)",
            "Night": "Night (9PM–5AM)",
        }[t.name]
    )
)

fig.show()


In [None]:
# Total occurrences per part of the day, summed over the crime types present
# in df_time_crime_percentage.
df_time_crime_summary = (
    df_time_crime_percentage
    .groupby(['Time of Day'])['Occurrence']
    .sum()
    .reset_index(name='Crime Count')
)

fig = px.pie(
    df_time_crime_summary,
    names="Time of Day",
    values="Crime Count",
    color="Time of Day",
    title="Crime Occurrences by Time of Day in Chicago 2015-2025",
    category_orders={
        "Time of Day": ["Morning", "Afternoon", "Evening", "Night"],
    },
    color_discrete_map={
        "Morning": "#2ECC71",
        "Afternoon": "#F1C40f",
        "Evening": "#e67e22",
        "Night": "#213D97",
    },
)

# Show label and percentage on each slice; pull the fourth slice out slightly
# (intended for the Night slice).
fig.update_traces(textinfo="label+percent", pull=[0, 0, 0, 0.05])
fig.update_layout(height=600, width=600)

fig.show()
