Cleaning Raw CSV Files:

In [1]:
import numpy as np
import pandas as pd

In [2]:
def categorize_time(time_str):
    #Split Violation Time column into Morning, Afternoon, and Night
    #we define morning as 8am-12pm, afternoon as 12-4pm, evening as 4-8pm and night as 8pm-8am. 

    #Exception handling
    if (not isinstance(time_str, str)) or (len(time_str)!=5): 
        return None
    if (time_str[:4].isdigit() and time_str[4] in ['P', 'A']):
        hour = int(time_str[:2])
        period = time_str[-1]
    
        #PM
        if period == 'P':
            if hour >= 8 and hour < 12:
                return 'Night'
            elif hour >= 4 and hour <8:
                return 'Evening'            
            else:
                return 'Afternoon'
        #AM
        else:  
            if hour >= 12 or hour < 8:
                return 'Night'
            elif hour >= 8:
                return 'Morning'
            else:
                return 'Night'
     
    else: return None
   

In [3]:
# #Change csv as need be
# df=pd.read_csv('../../data/main_data/time_of_day/Parking_Violations_Issued_2014.csv')

# 2014
df_2014 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2014.csv")
# 2015
df_2015 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2015.csv")
# 2016
df_2016 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2016.csv")
# 2017
df_2017 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2017.csv")
# 2018
df_2018 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2018.csv")
# 2019
df_2019 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2019.csv")
# 2020
df_2020 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2020.csv")
# 2021
df_2021 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2021.csv")
# 2022
df_2022 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2022.csv")
# 2023
df_2023 = pd.read_csv("../../data/main_data/time_of_day/Parking_Violations_Issued_2023.csv")

df = pd.concat([df_2014, df_2015, df_2016, df_2017, df_2018, df_2019, df_2020, df_2021, df_2022, df_2023], axis = 0)
df = df.reset_index(drop = True)

In [4]:
df['Time of Day'] = df['Violation Time'].apply(categorize_time)

In [5]:
# Convert 'Issue Date' column to DateTime
df['Issue Date'] = pd.to_datetime(df['Issue Date'])
# Extract month from 'Issue Date'
df['Month'] = df['Issue Date'].dt.strftime('%m/%y')

In [6]:
violations_summary = df.groupby(['Month', 'Violation Precinct', 'Time of Day']).size().reset_index(name='Violations')
print(violations_summary)

       Month  Violation Precinct Time of Day  Violations
0      01/14                   1   Afternoon          88
1      01/14                   1     Evening          28
2      01/14                   1     Morning          91
3      01/14                   1       Night          27
4      01/14                   5   Afternoon          52
...      ...                 ...         ...         ...
35903  12/22                 122       Night           4
35904  12/22                 123   Afternoon           1
35905  12/22                 123     Evening           1
35906  12/22                 123     Morning           2
35907  12/22                 123       Night           3

[35908 rows x 4 columns]


In [7]:
# Group violations summary by Month, Precinct, and find the index of the max violations for each group
idx = violations_summary.groupby(['Month', 'Violation Precinct'])['Violations'].idxmax()
# Filter violations summary 
result = violations_summary.loc[idx, ['Month', 'Violation Precinct', 'Time of Day', 'Violations']]
# Rename the columns 
result.columns = ['Month', 'Precinct', 'Time Most Tickets Are Issued', 'No of Tickets']
# Splitting 'Month' into 'Month' and 'Year' columns
result['Year'] = '20' + result['Month'].str[-2:]  # Extracting the last two digits as year and converting to YYYY format
result['Month'] = result['Month'].str[:2].str.zfill(2)  # Extracting the first two characters as month and ensuring format MM
result = result[['Month', 'Year', 'Precinct', 'Time Most Tickets Are Issued', 'No of Tickets']]
# Reset index
result = result.reset_index(drop=True)
print(result)

     Month  Year  Precinct Time Most Tickets Are Issued  No of Tickets
0       01  2014         1                      Morning             91
1       01  2014         5                    Afternoon             52
2       01  2014         6                      Morning             35
3       01  2014         7                    Afternoon             25
4       01  2014         9                      Morning             39
...    ...   ...       ...                          ...            ...
9151    12  2022       115                      Morning             82
9152    12  2022       120                      Morning             24
9153    12  2022       121                      Morning             14
9154    12  2022       122                      Morning             11
9155    12  2022       123                        Night              3

[9156 rows x 5 columns]


In [8]:
df2 = result

# # Append the data to existing CSV file for summaries
# result.to_csv('All.csv', mode='a', header=False, index=True)

Clean Up Generated Summary CSV ('All.csv'):

In [9]:
# df2=pd.read_csv('All.csv')

In [10]:
df2.head()

Unnamed: 0,Month,Year,Precinct,Time Most Tickets Are Issued,No of Tickets
0,1,2014,1,Morning,91
1,1,2014,5,Afternoon,52
2,1,2014,6,Morning,35
3,1,2014,7,Afternoon,25
4,1,2014,9,Morning,39


In [11]:
max_tickets_idx = df2.groupby(['Month', 'Year', 'Precinct'])['No of Tickets'].idxmax()
filtered_df = df2.loc[max_tickets_idx, ['Month', 'Year', 'Precinct', 'Time Most Tickets Are Issued']]

# Drop duplicates
filtered_df = filtered_df.drop_duplicates(subset=['Month', 'Year', 'Precinct'])

filtered_df['Date'] = pd.to_datetime(filtered_df['Month'].astype(str) + '/' + filtered_df['Year'].astype(str), format='%m/%Y')
# Sort 
filtered_df = filtered_df.sort_values('Date')

# Reset index 
filtered_df = filtered_df.reset_index(drop=True)
print(filtered_df)


     Month  Year  Precinct Time Most Tickets Are Issued       Date
0       07  2013        63                    Afternoon 2013-07-01
1       07  2013        32                        Night 2013-07-01
2       07  2013        33                        Night 2013-07-01
3       07  2013        34                      Morning 2013-07-01
4       07  2013        40                        Night 2013-07-01
...    ...   ...       ...                          ...        ...
9151    06  2023        40                      Morning 2023-06-01
9152    06  2023        34                        Night 2023-06-01
9153    06  2023        33                    Afternoon 2023-06-01
9154    06  2023        45                    Afternoon 2023-06-01
9155    06  2023       122                    Afternoon 2023-06-01

[9156 rows x 5 columns]


In [12]:
filtered_df.to_csv('../../data/final_data_to_join/time_of_day_summary.csv')