In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from data_preprocessing import load_and_preprocess_data

# Relative path of the crimes CSV export; it is handed to
# load_and_preprocess_data when the dataset is loaded.
DATASET_PATH = "../milestone1/data/Crimes_-_2001_to_Present.csv"

In [2]:
# Read the CSV through the project's preprocessing helper (imported from
# data_preprocessing). Every later cell works on this frame and reads columns
# such as "Primary Type", "Description", "Year", "Category", "Subcategory"
# and "Date" from it.
dataset = load_and_preprocess_data(DATASET_PATH)

In [15]:
# Number of records per (Primary Type, Description) pair, most frequent first.
# reset_index() leaves the count in a column named 0.
grouped = (
    dataset
    .groupby(["Primary Type", "Description"])
    .size()
    .sort_values(ascending=False)
    .reset_index()
)
# Show the description breakdown for THEFT only.
grouped.loc[grouped["Primary Type"].eq("THEFT")]

Unnamed: 0,Primary Type,Description,0
0,THEFT,$500 AND UNDER,660668
3,THEFT,OVER $500,444873
10,THEFT,FROM BUILDING,261958
11,THEFT,RETAIL THEFT,225548
29,THEFT,POCKET-PICKING,41868
30,THEFT,FINANCIAL ID THEFT: OVER $300,40261
57,THEFT,PURSE-SNATCHING,18686
71,THEFT,FINANCIAL ID THEFT:$300 &UNDER,14280
76,THEFT,ATTEMPT THEFT,13292
148,THEFT,THEFT FROM MOTOR VEHICLE,3098


In [20]:
# Show the description breakdown for DECEPTIVE PRACTICE only.
grouped.loc[grouped["Primary Type"].eq("DECEPTIVE PRACTICE")]

Unnamed: 0,Primary Type,Description,0
21,DECEPTIVE PRACTICE,CREDIT CARD FRAUD,68590
25,DECEPTIVE PRACTICE,FRAUD OR CONFIDENCE GAME,46002
28,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,41914
31,DECEPTIVE PRACTICE,ILLEGAL USE CASH CARD,38878
36,DECEPTIVE PRACTICE,THEFT OF LABOR/SERVICES,32176
49,DECEPTIVE PRACTICE,FORGERY,22882
51,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,22260
61,DECEPTIVE PRACTICE,BOGUS CHECK,16479
73,DECEPTIVE PRACTICE,COUNTERFEITING DOCUMENT,13695
94,DECEPTIVE PRACTICE,COUNTERFEIT CHECK,9205


In [16]:
# List every distinct primary type, in order of first appearance in `grouped`
# (i.e. roughly by descending frequency of their most common description).
# NOTE(review): the recorded output contains near-duplicate labels
# ('NON-CRIMINAL' / 'NON - CRIMINAL', 'CRIM SEXUAL ASSAULT' /
# 'CRIMINAL SEXUAL ASSAULT') - confirm these are merged when categories are built.
grouped["Primary Type"].unique()

array(['THEFT', 'BATTERY', 'CRIMINAL DAMAGE', 'ASSAULT',
       'MOTOR VEHICLE THEFT', 'BURGLARY', 'NARCOTICS', 'OTHER OFFENSE',
       'CRIMINAL TRESPASS', 'ROBBERY', 'DECEPTIVE PRACTICE',
       'WEAPONS VIOLATION', 'PROSTITUTION', 'PUBLIC PEACE VIOLATION',
       'OFFENSE INVOLVING CHILDREN', 'CRIM SEXUAL ASSAULT', 'HOMICIDE',
       'GAMBLING', 'SEX OFFENSE', 'ARSON',
       'INTERFERENCE WITH PUBLIC OFFICER', 'CRIMINAL SEXUAL ASSAULT',
       'LIQUOR LAW VIOLATION', 'STALKING', 'INTIMIDATION', 'KIDNAPPING',
       'CONCEALED CARRY LICENSE VIOLATION', 'OBSCENITY',
       'PUBLIC INDECENCY', 'OTHER NARCOTIC VIOLATION', 'NON-CRIMINAL',
       'HUMAN TRAFFICKING', 'NON - CRIMINAL',
       'NON-CRIMINAL (SUBJECT SPECIFIED)', 'RITUALISM',
       'DOMESTIC VIOLENCE'], dtype=object)

In [3]:
def prepare_data_for_stacked_crime_plot(
    dataset: pd.DataFrame,
    save=False,
    output_path="../frontend/src/data/crime_category_over_years.json",
):
    """Build the per-year crime counts used by the stacked crime chart.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame providing the ``Year``, ``Category`` and ``Subcategory`` columns.
    save : bool, default False
        When true, the result is additionally written as JSON to ``output_path``.
    output_path : str
        Destination of the JSON file; only used when ``save`` is true.

    Returns
    -------
    dict
        ``{"main": ..., "subcategories": ...}`` where

        * ``main`` maps ``"Year"`` to the sorted years present in the data and
          every category name to its list of counts, one entry per year;
        * ``subcategories`` maps ``category -> subcategory -> list of counts``
          aligned with ``main["Year"]``.

        Combinations that never occur are reported as ``0``.
    """
    # Rows are years, columns are categories. fill_value=0 keeps the counts
    # integral and avoids NaN, which json.dump would emit as invalid JSON.
    crime_trends = dataset.groupby(["Year", "Category"]).size().unstack(fill_value=0)

    # The years come from the data itself instead of a hard-coded range, so the
    # function keeps working (and stays aligned) when years are added or missing.
    years = crime_trends.index.tolist()

    crime_trends_json = {
        column: values.tolist()
        for column, values in crime_trends.reset_index().items()
    }

    # One row per (Category, Subcategory), one column per year, re-aligned to
    # exactly the years reported in the "main" section.
    crime_by_year_subcat = (
        dataset.groupby(["Category", "Subcategory", "Year"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=years, fill_value=0)
    )

    # Nest the rows as category -> subcategory -> [count per year].
    subcat_dict = {}
    row_labels = crime_by_year_subcat.index
    row_counts = crime_by_year_subcat.to_numpy().tolist()
    for (category, subcategory), yearly_counts in zip(row_labels, row_counts):
        subcat_dict.setdefault(category, {})[subcategory] = yearly_counts

    full_dict = {"main": crime_trends_json, "subcategories": subcat_dict}

    if save:
        with open(output_path, "w") as file:
            json.dump(full_dict, file, indent=4)

    return full_dict

In [4]:
# Build the yearly payload; save=True also (over)writes the JSON file under
# ../frontend/src/data/. The returned dict is echoed as the cell output.
prepare_data_for_stacked_crime_plot(dataset, save=True)

{'main': {'Year': [2001,
   2002,
   2003,
   2004,
   2005,
   2006,
   2007,
   2008,
   2009,
   2010,
   2011,
   2012,
   2013,
   2014,
   2015,
   2016,
   2017,
   2018,
   2019,
   2020,
   2021,
   2022,
   2023,
   2024,
   2025],
  'ASSAULT': [147765,
   145306,
   139503,
   136781,
   131364,
   127008,
   125557,
   121867,
   110960,
   105519,
   99298,
   97056,
   87581,
   79742,
   79389,
   85278,
   85754,
   85930,
   85064,
   76676,
   78318,
   80097,
   87111,
   87191,
   11496],
  'MINOR': [86033,
   84443,
   88326,
   92157,
   90892,
   90402,
   89373,
   79790,
   76291,
   76388,
   71268,
   64024,
   57176,
   48431,
   38953,
   30512,
   27184,
   27027,
   27271,
   18071,
   11963,
   13058,
   14299,
   16212,
   3138],
  'NON STREET CRIME': [59736,
   61017,
   61752,
   61287,
   60231,
   57419,
   57166,
   55054,
   51676,
   46262,
   43544,
   41213,
   41736,
   41750,
   39920,
   44061,
   43835,
   44088,
   44354,
   35325,
   3461

In [5]:
# Required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from ipywidgets import interact, widgets
import matplotlib.dates as mdates

# Derive the hour of day from the "Date" column (through the .dt accessor) and
# store it as a new "hour" column. This mutates the shared `dataset` frame in
# place; prepare_data_for_stacked_time_plot groups by this column.
dataset['hour'] = dataset['Date'].dt.hour

def prepare_data_for_stacked_time_plot(
    dataset: pd.DataFrame,
    save=False,
    output_path="../frontend/src/data/crime_category_over_hours.json",
):
    """Build the per-hour crime counts used by the stacked time-of-day chart.

    Parameters
    ----------
    dataset : pd.DataFrame
        Frame providing the ``hour``, ``Category`` and ``Subcategory`` columns,
        with ``hour`` holding the hour of day (0-23).
    save : bool, default False
        When true, the result is additionally written as JSON to ``output_path``.
    output_path : str
        Destination of the JSON file; only used when ``save`` is true.

    Returns
    -------
    dict
        ``{"main": ..., "subcategories": ...}`` where

        * ``main`` maps ``"hour"`` to ``[0, 1, ..., 23]`` and every category
          name to its 24 hourly counts;
        * ``subcategories`` maps ``category -> subcategory -> 24 hourly counts``.

        Hours or combinations that never occur are reported as ``0``.
    """
    # Always report all 24 hours, even when some never occur in the data, so
    # that every list in the payload has the same length and ordering.
    hours = pd.Index(range(24), name="hour")

    # Rows are hours, columns are categories; zero-filled to avoid NaN, which
    # json.dump would emit as invalid JSON.
    crime_by_hour = (
        dataset.groupby(["hour", "Category"])
        .size()
        .unstack(fill_value=0)
        .reindex(hours, fill_value=0)
    )
    crime_by_hour = {
        column: values.tolist()
        for column, values in crime_by_hour.reset_index().items()
    }

    # One row per (Category, Subcategory), one column per hour of day.
    crime_by_hour_subcat = (
        dataset.groupby(["Category", "Subcategory", "hour"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=hours, fill_value=0)
    )

    # Nest the rows as category -> subcategory -> [count per hour].
    subcat_dict = {}
    row_labels = crime_by_hour_subcat.index
    row_counts = crime_by_hour_subcat.to_numpy().tolist()
    for (category, subcategory), hourly_counts in zip(row_labels, row_counts):
        subcat_dict.setdefault(category, {})[subcategory] = hourly_counts

    full_dict = {"main": crime_by_hour, "subcategories": subcat_dict}

    if save:
        with open(output_path, "w") as file:
            json.dump(full_dict, file, indent=4)

    return full_dict


In [6]:
# Build the hourly payload; save=True also (over)writes the JSON file under
# ../frontend/src/data/. The returned dict is echoed as the cell output.
prepare_data_for_stacked_time_plot(dataset, save=True)

{'main': {'hour': [0,
   1,
   2,
   3,
   4,
   5,
   6,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23],
  'ASSAULT': [121511,
   107149,
   93827,
   75090,
   54896,
   39934,
   36266,
   46122,
   69852,
   82580,
   93183,
   104851,
   114204,
   112163,
   126197,
   138850,
   129602,
   128145,
   133648,
   137890,
   140872,
   142015,
   138833,
   129931],
  'MINOR': [52406,
   30791,
   24187,
   19930,
   16265,
   15169,
   22696,
   33194,
   44327,
   46774,
   59613,
   74860,
   79487,
   72811,
   69696,
   64142,
   57517,
   55233,
   67417,
   88837,
   91821,
   88809,
   82923,
   63777],
  'NON STREET CRIME': [86647,
   24245,
   19809,
   15616,
   12010,
   11028,
   14686,
   24924,
   47254,
   81044,
   64919,
   57592,
   94479,
   58331,
   59869,
   63084,
   59431,
   58785,
   58063,
   57220,
   53635,
   47854,
   41765,
   34495],
  'SEX OFFENSE': [8110,
   3197,
   3140,
   29