In [11]:
import copy
import math
from datetime import date
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

<h3>Read in CSV and separate clusters into individual dataframes. Store cluster dataframes in a list for easy iteration.</h3>

In [12]:
# Load the labelled SPIR export; every later cell reads from this frame.
spir_all = pd.read_csv(
    filepath_or_buffer="Processed SPIR Data/SPIR_with_labels_2021-12-07.csv",
)

In [42]:
# Display the column labels of the loaded frame.
spir_all.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'Event Clearance Code',
       'Event Clearance Description', 'Event Clearance SubGroup',
       'Event Clearance Group', 'Census Tract', 'Initial Type Description',
       'Initial Type Subgroup', 'Initial Type Group', 'at_scene_time',
       'event_clear_time', 'time_at_scene', 'white', 'total_pop', 'male',
       'notUScitizen', 'asian', 'black', 'native', 'other_race', 'two_races',
       'some_college', 'bachelors', 'grad_deg', 'under18MC', 'under18MS',
       'under18FS', 'under18T', 'income', 'initial_type_desc_cat',
       'at_scene_time_pm', 'event_clear_time_pm', 'at_scene_time_weekday',
       'event_clear_time_weekday', 'at_scene_time_month',
       'event_clear_time_month', 'at_scene_time_year', 'event_clear_time_year',
       'time_at_scene_seconds', 'cluster_label'],
      dtype='object')

In [13]:
# One dataframe per cluster label (0 through 6), stored in label order so
# position i in the list holds the rows whose cluster_label equals i.
spir_clust = [
    spir_all[spir_all["cluster_label"] == label]
    for label in range(7)
]

<h3>Organize parameters into separate lists for generating different types of data summaries. Numerical data, such as total population and income, will use median, mean, min, and max functions to summarize data. Categorical data will use value_counts.</h3>

In [14]:
# Columns summarised numerically. Each statistic keeps its own independent
# copy of the list, so one statistic's columns can be edited without
# affecting the others.
cluster_summary_median = [
    'white', 'total_pop', 'male', 'notUScitizen', 'asian', 'black',
    'native', 'other_race', 'two_races', 'some_college', 'bachelors',
    'grad_deg', 'under18MC', 'under18MS', 'under18FS', 'under18T',
    'income', 'time_at_scene_seconds',
]
cluster_summary_mean = list(cluster_summary_median)
cluster_summary_min = list(cluster_summary_median)
cluster_summary_max = list(cluster_summary_median)

# Empty template table: one row per statistic, one column per numeric field.
cluster_summary = pd.DataFrame(
    index=["median", "mean", "min", "max"],
    columns=list(cluster_summary_median),
)

# Columns summarised with value_counts.
cluster_summary_value_counts = [
    'Event Clearance Code',
    'initial_type_desc_cat',
    'at_scene_time_weekday',
    'event_clear_time_weekday',
    'at_scene_time_month',
    'event_clear_time_month',
    'at_scene_time_year',
    'event_clear_time_year',
    'at_scene_time_pm',
    'event_clear_time_pm',
]

list_of_cluster_summaries = [
    cluster_summary_median,
    cluster_summary_mean,
    cluster_summary_min,
    cluster_summary_max,
    cluster_summary_value_counts,
]

<h4>Numerical data summary</h4>

In [None]:
# Build one median/mean/min/max table per cluster and write each to its own CSV.
# The date-stamped output folder is created up front because DataFrame.to_csv
# does not create missing parent directories.
summary_dir = Path(f"Cluster Summaries {date.today()}")
summary_dir.mkdir(parents=True, exist_ok=True)

# Row label in the summary table -> columns that statistic is computed for.
columns_by_stat = {
    "median": cluster_summary_median,
    "mean": cluster_summary_mean,
    "min": cluster_summary_min,
    "max": cluster_summary_max,
}

for clust_num, cluster in enumerate(spir_clust, start=1):
    # Start every cluster from a fresh copy of the empty template.
    temp_df = cluster_summary.copy()
    for stat, column_labels in columns_by_stat.items():
        for column_label in column_labels:
            # Calls Series.median / .mean / .min / .max, looked up by name.
            temp_df.loc[stat, column_label] = getattr(cluster[column_label], stat)()
    temp_df.to_csv(summary_dir / f"Cluster_{clust_num}_Summary.csv")


<h4>Categorical Data Summary</h4>

In [10]:
# For every categorical column, count its values within each cluster and
# write one CSV per column (rows: observed values, columns: Cluster_1..N).
# The date-stamped output folder is created up front because DataFrame.to_csv
# does not create missing parent directories.
summary_dir = Path(f"Cluster Summaries {date.today()}")
summary_dir.mkdir(parents=True, exist_ok=True)

for column in cluster_summary_value_counts:
    counts_by_cluster = {
        f"Cluster_{clust_num}": cluster[column].value_counts()
        for clust_num, cluster in enumerate(spir_clust, start=1)
    }
    # A single concat joins the per-cluster counts side by side; the dict
    # keys become the column names. rename_axis(None) leaves the index
    # column header blank in the CSV.
    temp_df = pd.concat(counts_by_cluster, axis=1).rename_axis(None)
    temp_df.to_csv(summary_dir / f"{column}_summary_by_cluster.csv")
    


<h4>Census Tract Summary</h4>

In [None]:
# Count rows per census tract (tract values floored with math.floor) within
# each cluster, then join the per-cluster counts side by side.
tract_counts_by_cluster = {
    f"Cluster_{clust_num}": cluster["Census Tract"].apply(math.floor).value_counts()
    for clust_num, cluster in enumerate(spir_clust, start=1)
}
# A single concat replaces growing the frame one column at a time; the dict
# keys become the column names. rename_axis(None) leaves the index column
# header blank in the CSV.
temp_df = pd.concat(tract_counts_by_cluster, axis=1).rename_axis(None)
temp_df.to_csv("Cluster Summaries 2021-12-07/Census Tract_summary_by_cluster.csv")

<h4>Time of Day (AM or PM) Summary</h4>

In [17]:
# Median and mean of time_at_scene_seconds for AM (at_scene_time_pm == 0)
# versus PM (at_scene_time_pm == 1) rows, per cluster and for the full data.
cluster_labels = [f'Cluster {clust_num}' for clust_num in range(1, len(spir_clust) + 1)]
temp_df = pd.DataFrame(
    columns=cluster_labels + ['All'],
    index=['am_median', 'pm_median', 'am_mean', 'pm_mean'],
)

# Pair each output column with the rows it summarises; 'All' uses the
# unsplit data.
for label, frame in list(zip(cluster_labels, spir_clust)) + [('All', spir_all)]:
    temp_series_am = frame.loc[frame.at_scene_time_pm == 0, 'time_at_scene_seconds']
    temp_series_pm = frame.loc[frame.at_scene_time_pm == 1, 'time_at_scene_seconds']
    # Write with a single .loc call: chained `temp_df[col][row] = value`
    # assigns into a temporary object when pandas Copy-on-Write is active,
    # leaving temp_df untouched.
    temp_df.loc['am_median', label] = temp_series_am.median()
    temp_df.loc['pm_median', label] = temp_series_pm.median()
    temp_df.loc['am_mean', label] = temp_series_am.mean()
    temp_df.loc['pm_mean', label] = temp_series_pm.mean()

temp_df.to_csv("Cluster Summaries 2021-12-07/Occurence_vs_clearance_time.csv")


<h4>Day of the Week vs Time at Scene Summary</h4>

In [23]:
# time_at_scene_seconds summarised by at_scene_time_weekday (0-6), per
# cluster and for the full data set.
# Statistic to compute; set to "mean" for the mean table. The value is also
# used as the output file suffix.
summary_stat = "median"

weekdays = list(range(7))
cluster_labels = [f'Cluster {clust_num}' for clust_num in range(1, len(spir_clust) + 1)]
temp_df = pd.DataFrame(columns=cluster_labels + ['All'], index=weekdays)

# Pair each output column with the rows it summarises; 'All' uses the
# unsplit data.
for label, frame in list(zip(cluster_labels, spir_clust)) + [('All', spir_all)]:
    for day in weekdays:
        temp_series = frame.loc[frame.at_scene_time_weekday == day, 'time_at_scene_seconds']
        # Single .loc write: chained `temp_df[col][row] = value` assigns into
        # a temporary object when pandas Copy-on-Write is active.
        temp_df.loc[day, label] = getattr(temp_series, summary_stat)()

temp_df.to_csv(f"Cluster Summaries 2021-12-07/Weekday vs clearance time_{summary_stat}.csv")


<h4>Month vs Time at Scene Summary</h4>

In [30]:
# time_at_scene_seconds summarised by at_scene_time_month, per cluster and
# for the full data set.
# Statistic to compute; set to "median" for the median table. The value is
# also used as the output file suffix.
summary_stat = "mean"

# Months run 1-12 to match the table index; the previous range(13) also
# visited a month 0 that has no row in the table.
months = list(range(1, 13))
cluster_labels = [f'Cluster {clust_num}' for clust_num in range(1, len(spir_clust) + 1)]
temp_df = pd.DataFrame(columns=cluster_labels + ['All'], index=months)

# Pair each output column with the rows it summarises; 'All' uses the
# unsplit data.
for label, frame in list(zip(cluster_labels, spir_clust)) + [('All', spir_all)]:
    for month in months:
        temp_series = frame.loc[frame.at_scene_time_month == month, 'time_at_scene_seconds']
        # Single .loc write: chained `temp_df[col][row] = value` assigns into
        # a temporary object when pandas Copy-on-Write is active.
        temp_df.loc[month, label] = getattr(temp_series, summary_stat)()

temp_df.to_csv(f"Cluster Summaries 2021-12-07/Month vs clearance time_{summary_stat}.csv")

<h4>Year vs Time at Scene Summary</h4>

In [37]:
# time_at_scene_seconds summarised by at_scene_time_year (2010-2017), per
# cluster and for the full data set.
# Statistic to compute; set to "mean" for the mean table. The value is also
# used as the output file suffix.
summary_stat = "median"

years = list(range(2010, 2018))
cluster_labels = [f'Cluster {clust_num}' for clust_num in range(1, len(spir_clust) + 1)]
temp_df = pd.DataFrame(columns=cluster_labels + ['All'], index=years)

# Pair each output column with the rows it summarises; 'All' uses the
# unsplit data.
for label, frame in list(zip(cluster_labels, spir_clust)) + [('All', spir_all)]:
    for year in years:
        temp_series = frame.loc[frame.at_scene_time_year == year, 'time_at_scene_seconds']
        # Single .loc write: chained `temp_df[col][row] = value` assigns into
        # a temporary object when pandas Copy-on-Write is active.
        temp_df.loc[year, label] = getattr(temp_series, summary_stat)()

temp_df.to_csv(f"Cluster Summaries 2021-12-07/Year vs clearance time_{summary_stat}.csv")

<h4>Event Type vs Time at Scene Summary</h4>

In [43]:
# time_at_scene_seconds summarised by selected 'Event Clearance Code'
# values, per cluster and for the full data set.
# Statistic to compute; set to "mean" for the mean table. The value is also
# used as the output file suffix.
summary_stat = "median"

event_codes = [245, 430, 280, 460, 161, 65, 63, 64, 71, 40]
cluster_labels = [f'Cluster {clust_num}' for clust_num in range(1, len(spir_clust) + 1)]
temp_df = pd.DataFrame(columns=cluster_labels + ['All'], index=event_codes)

# Pair each output column with the rows it summarises; 'All' uses the
# unsplit data.
for label, frame in list(zip(cluster_labels, spir_clust)) + [('All', spir_all)]:
    for code in event_codes:
        temp_series = frame.loc[frame['Event Clearance Code'] == code, 'time_at_scene_seconds']
        # Single .loc write: chained `temp_df[col][row] = value` assigns into
        # a temporary object when pandas Copy-on-Write is active.
        temp_df.loc[code, label] = getattr(temp_series, summary_stat)()

temp_df.to_csv(f"Cluster Summaries 2021-12-07/Event vs clearance time_{summary_stat}.csv")

<h4>Output individual clusters to CSV files</h4>

In [None]:
# Write each cluster's rows to its own CSV file, numbering files from 1.
for clust_num, cluster_df in enumerate(spir_clust, start=1):
    cluster_df.to_csv(f"SPIR_Cluster_{clust_num}_20211207.csv")