In [1]:
import pandas as pd

In [2]:
def clean_plate_type(pt):
    """Collapse a raw plate-type code into one of three reporting categories.

    Parameters
    ----------
    pt : object
        Raw plate-type value. Non-string input (for example ``None``,
        ``float("nan")`` or ``pd.NA`` for a missing cell) is accepted.

    Returns
    -------
    str
        ``"PASSENGER"`` for ``"PAS"``, ``"COMMERCIAL"`` for ``"COM"`` and
        ``"OTHER"`` for every other value, including any non-string input.
    """
    # Screen out non-strings explicitly rather than wrapping the comparison
    # in a bare ``except:``. Truth-testing ``pd.NA == "PAS"`` raises
    # TypeError, which is what the old handler existed for, but a bare
    # except also swallows KeyboardInterrupt/SystemExit, so interrupting a
    # long ``.map`` could be silently converted into an "OTHER" label.
    if not isinstance(pt, str):
        return "OTHER"
    if pt == "PAS":
        return "PASSENGER"
    if pt == "COM":
        return "COMMERCIAL"
    return "OTHER"

In [3]:
# Years of parking-violation files to process (2014 through 2023 inclusive).
year_list = list(range(2014, 2024))

# Keys that identify one output row.
GROUP_KEYS = ["Year", "Month", "Violation Precinct"]

# Cleaned plate-type label -> word used in that category's output column names.
# The dict order fixes the column order of the output.
PLATE_TYPE_NAMES = {
    "PASSENGER": "passenger",
    "COMMERCIAL": "commercial",
    "OTHER": "other",
}


def summarize_plate_types(violations):
    """Count tickets per (Year, Month, Violation Precinct) split by plate type.

    Parameters
    ----------
    violations : pandas.DataFrame
        Raw violations with string columns ``"Issue Date"``,
        ``"Violation Precinct"`` and ``"Plate Type"``. The month is taken
        from the first two characters of ``"Issue Date"`` and the year from
        the last four (so presumably ``MM/DD/YYYY`` -- confirm against the
        source files). ``"Violation Precinct"`` must parse with ``int``.

    Returns
    -------
    pandas.DataFrame
        One row per (Year, Month, Violation Precinct), sorted ascending by
        those keys, with columns ``total_<kind>_vehicles`` for passenger,
        commercial and other, ``total_vehicles``, and
        ``percent_<kind>_vehicles`` (0-100).

    Raises
    ------
    TypeError, ValueError
        If an ``"Issue Date"`` or ``"Violation Precinct"`` cell is missing
        or does not parse as described above.
    """
    issue_date = violations["Issue Date"]
    tickets = pd.DataFrame({
        "Year": issue_date.map(lambda d: int(d[-4:])),
        "Month": issue_date.map(lambda d: int(d[:2])),
        "Violation Precinct": violations["Violation Precinct"].map(int),
        "Plate Type": violations["Plate Type"].map(clean_plate_type),
    })

    # Pivot plate types into columns, filling key/plate-type combinations
    # that have no tickets with 0. Background on the zero-fill:
    # https://stackoverflow.com/questions/64348300/add-default-value-while-grouping-elements-in-pandas
    # The reindex guarantees all three category columns exist (as zeros)
    # even if a category never occurs in this input, instead of producing a
    # NaN column.
    counts = (
        tickets.groupby(GROUP_KEYS + ["Plate Type"])
        .size()
        .unstack("Plate Type", fill_value=0)
        .reindex(columns=list(PLATE_TYPE_NAMES), fill_value=0)
        .rename(columns={label: f"total_{word}_vehicles"
                         for label, word in PLATE_TYPE_NAMES.items()})
    )
    counts.columns.name = None  # drop the leftover "Plate Type" axis label

    # Every ticket falls in exactly one category, so the row sum is the
    # total number of tickets for that (Year, Month, Violation Precinct).
    counts["total_vehicles"] = counts.sum(axis=1)

    for word in PLATE_TYPE_NAMES.values():
        counts[f"percent_{word}_vehicles"] = (
            100 * counts[f"total_{word}_vehicles"] / counts["total_vehicles"]
        )

    return counts.reset_index()


# Summarize each year's file independently, then combine once at the end
# (avoids re-copying the accumulated frame on every iteration).
yearly_summaries = []

for year in year_list:

    print(f"Started {year}")

    # Only the three columns the summary needs are read; everything is kept
    # as strings and converted explicitly inside summarize_plate_types.
    violations = pd.read_csv(
        f"../../data/main_data/color_and_plate_type/Parking_Violations_Issued_{year}.csv",
        dtype="string",
        usecols=["Issue Date", "Violation Precinct", "Plate Type"],
    )

    yearly_summaries.append(summarize_plate_types(violations))

    print(f"Finished {year}")

final_data = pd.concat(yearly_summaries, ignore_index=True)
final_data.to_csv("../../data/final_data_to_join/plate_type_data.csv", index=False)

Started 2014
Finished 2014
Started 2015
Finished 2015
Started 2016
Finished 2016
Started 2017
Finished 2017
Started 2018
Finished 2018
Started 2019
Finished 2019
Started 2020
Finished 2020
Started 2021
Finished 2021
Started 2022
Finished 2022
Started 2023
Finished 2023


In [4]:
# data2 = data.groupby(by = ["Year","Month","Violation Precinct","Plate Type"], 
#                         as_index=False).size()
# data2 = data2.groupby(by = ["Year","Month","Violation Precinct","Plate Type"],
#                       )["size"].sum().unstack(fill_value=0).stack()
# # why the above? https://stackoverflow.com/questions/64348300/add-default-value-while-grouping-elements-in-pandas 
# data2 = data2.reset_index()
# data2 = data2.rename(columns={0:"size"})
# data2 = data2.sort_values(by = ["Year","Month","Violation Precinct","size"],
#                             ascending = [True, True, True, False])

# # separate out rows
# data2_pas = data2[data2["Plate Type"] == "PASSENGER"].reset_index(drop=True)
# data2_com = data2[data2["Plate Type"] == "COMMERCIAL"].reset_index(drop=True)
# data2_oth = data2[data2["Plate Type"] == "OTHER"].reset_index(drop=True)
# data_max = data.groupby(by = ["Year","Month","Violation Precinct",], 
#                      as_index=False).size()
# data_max = data_max.sort_values(by = ["Year","Month","Violation Precinct",],)

# # assign columns
# data3 = data2.drop(columns=["Plate Type", "size"]).drop_duplicates()
# data3.reset_index(drop=True, inplace=True)
# data3["total_passenger_vehicles"] = data2_pas["size"]
# data3["total_commercial_vehicles"] = data2_com["size"]
# data3["total_other_vehicles"] = data2_oth["size"]
# data3["total_vehicles"] = data_max["size"]

# # percentages
# data3["percent_passenger_vehicles"] = 100*data3["total_passenger_vehicles"] / data3["total_vehicles"]
# data3["percent_commercial_vehicles"] = 100*data3["total_commercial_vehicles"] / data3["total_vehicles"]
# data3["percent_other_vehicles"] = 100*data3["total_other_vehicles"] / data3["total_vehicles"]

# data3

In [5]:
# data3.to_csv(f"../Data/final_data_to_join/temp.csv", index=False)

In [6]:
# data2

In [7]:
# data3

In [8]:
# data2.columns.tolist()

In [9]:
# vc_pt = data["Plate Type"].value_counts()
# l1 = vc_pt.index.tolist()
# l2 = vc_pt.tolist()
# l = [[l1[i], l2[i]] for i in range(len(vc_pt))]
# l

In [10]:
# plate_types = data["Plate Type"].map(clean_plate_type)
# vc_pt = plate_types.value_counts()
# l1 = vc_pt.index.tolist()
# l2 = vc_pt.tolist()
# l = [[l1[i], l2[i]] for i in range(len(vc_pt))]
# l