In [1]:
from zipfile import ZipFile
import copy
import json
import pandas as pd
from tqdm import tqdm

In [2]:
# One single-snapshot DataFrame is appended to these lists per archive
# member; later cells concatenate each list into one frame per data set.
gardiner_dfs = []
front_yonge_dfs = []
incidents_dfs = []
weather_dfs = []


def _snapshot_frame(records, collected_at):
    """Flatten ``records`` with ``pd.json_normalize`` and stamp the snapshot time.

    Parameters
    ----------
    records : dict or list of dict
        JSON payload taken from one snapshot file.
    collected_at
        Value of the snapshot's ``collected_at`` field; stored unchanged in a
        trailing ``collectedAt`` column.

    Returns
    -------
    pandas.DataFrame
        The flattened payload with the extra ``collectedAt`` column.
    """
    frame = pd.json_normalize(records)
    frame["collectedAt"] = collected_at
    return frame


with ZipFile("../../raw_data/traffic_flow/202305.zip") as traffic_flow:
    for flow_file in tqdm(traffic_flow.infolist(), ncols=75):
        if flow_file.is_dir():
            continue

        # ZipFile.read() opens, reads and closes the member in one call, so
        # no per-member file handle is left open while the loop runs.
        json_data_obj = json.loads(traffic_flow.read(flow_file).decode("UTF-8"))
        collected_at = json_data_obj["collected_at"]

        gardiner_dfs.append(_snapshot_frame(
            json_data_obj["gardiner_expy_traffic"]["flowSegmentData"],
            collected_at))
        front_yonge_dfs.append(_snapshot_frame(
            json_data_obj["front_yonge_traffic"]["flowSegmentData"],
            collected_at))
        incidents_dfs.append(_snapshot_frame(
            json_data_obj["incidents"]["incidents"], collected_at))

        # Weather: replace the list of condition objects by the sorted list
        # of their ids. collectedAt is written into the dict *before*
        # normalising so the resulting column order stays as it was.
        weather_data = json_data_obj["weather_info"]
        weather_data["weather"] = sorted(
            condition["id"] for condition in weather_data["weather"])
        weather_data["collectedAt"] = collected_at
        weather_dfs.append(pd.json_normalize(weather_data))

100%|█████████████████████████████████| 8959/8959 [00:26<00:00, 335.77it/s]


In [3]:
# Stack the per-snapshot Gardiner frames, order the rows by the still
# unparsed collectedAt values, then parse that column into datetimes.
gardiner_merged_df = (
    pd.concat(gardiner_dfs)
    .sort_values(by=["collectedAt"])
    .assign(collectedAt=lambda frame: pd.to_datetime(frame["collectedAt"]))
)
# The list of per-snapshot frames is not needed once merged.
del gardiner_dfs

In [4]:
# Stack the per-snapshot Front/Yonge frames, order the rows by the still
# unparsed collectedAt values, then parse that column into datetimes.
front_yonge_merged_df = (
    pd.concat(front_yonge_dfs)
    .sort_values(by=["collectedAt"])
    .assign(collectedAt=lambda frame: pd.to_datetime(frame["collectedAt"]))
)
# The list of per-snapshot frames is not needed once merged.
del front_yonge_dfs

In [5]:
# Stack the per-snapshot incident frames and keep a single row per
# "properties.id" (drop_duplicates keeps the first one in concatenation
# order). Only afterwards are the rows ordered by the still unparsed
# collectedAt values and that column parsed into datetimes.
incidents_merged_df = (
    pd.concat(incidents_dfs)
    .drop_duplicates(subset="properties.id")
    .sort_values(by=["collectedAt"])
    .assign(collectedAt=lambda frame: pd.to_datetime(frame["collectedAt"]))
)
# The list of per-snapshot frames is not needed once merged.
del incidents_dfs

In [6]:
# Stack the per-snapshot weather frames, order the rows by the still
# unparsed collectedAt values, then parse that column into datetimes.
weather_merged_df = (
    pd.concat(weather_dfs)
    .sort_values(by=["collectedAt"])
    .assign(collectedAt=lambda frame: pd.to_datetime(frame["collectedAt"]))
)
# The list of per-snapshot frames is not needed once merged.
del weather_dfs

In [7]:
# Replace the dotted column names produced by pd.json_normalize with flat
# names. DataFrame.rename ignores mapping keys that are not present, so a
# column missing from the data is simply skipped.

# Both flow data sets share the same single rename.
flow_column_names = {"coordinates.coordinate": "coordinates"}

incident_column_names = {
    # plain "properties.*" fields keep their own name
    "properties.id": "id",
    "properties.iconCategory": "iconCategory",
    "properties.magnitudeOfDelay": "magnitudeOfDelay",
    "properties.startTime": "startTime",
    "properties.endTime": "endTime",
    "properties.from": "from",
    "properties.to": "to",
    "properties.length": "length",
    "properties.delay": "delay",
    "properties.roadNumbers": "roadNumbers",
    "properties.timeValidity": "timeValidity",
    "properties.probabilityOfOccurrence": "probabilityOfOccurrence",
    "properties.numberOfReports": "numberOfReports",
    "properties.lastReportTime": "lastReportTime",
    "properties.events": "events",
    "properties.tmc": "tmc",
    # nested "properties.tmc.*" fields get a "tmc" prefix
    "properties.tmc.countryCode": "tmcCountryCode",
    "properties.tmc.tableNumber": "tmcTableNumber",
    "properties.tmc.tableVersion": "tmcTableVersion",
    "properties.tmc.direction": "tmcDirection",
    "properties.tmc.points": "tmcPoints",
    # "geometry.*" fields get a "geometry" prefix
    "geometry.type": "geometryType",
    "geometry.coordinates": "geometryCoordinates",
}

weather_column_names = {
    "dt": "datetime",
    "cod": "Code",
    "coord.lon": "longitude",
    "coord.lat": "latitude",
    "main.temp": "temperature",
    "main.feels_like": "FeelsLike",
    "main.temp_min": "tempMin",
    "main.temp_max": "tempMax",
    "main.pressure": "pressure",
    "main.humidity": "humidity",
    "main.sea_level": "seaLevel",
    "main.grnd_level": "groundLevel",
    "wind.speed": "windSpeed",
    "wind.deg": "windDegree",
    "wind.gust": "windGust",
    "rain.1h": "rain1h",
    "snow.1h": "snow1h",
    "clouds.all": "cloudsAll",
    "sys.type": "systemType",
    "sys.id": "systemId",
    "sys.country": "systemCountry",
    "sys.sunrise": "systemSunrise",
    "sys.sunset": "systemSunset",
}

gardiner_merged_df = gardiner_merged_df.rename(columns=flow_column_names)
front_yonge_merged_df = front_yonge_merged_df.rename(columns=flow_column_names)
incidents_merged_df = incidents_merged_df.rename(columns=incident_column_names)
weather_merged_df = weather_merged_df.rename(columns=weather_column_names)

In [8]:
# Write every merged frame as a zip-compressed CSV without the index column.
# The dict preserves insertion order, so the files are written in the order
# listed here.
csv_targets = {
    "../../csv/v2/202305_gardiner_flow_data.csv.zip": gardiner_merged_df,
    "../../csv/v2/202305_yonge_flow_data.csv.zip": front_yonge_merged_df,
    "../../csv/v2/202305_incidents_data.csv.zip": incidents_merged_df,
    "../../csv/v2/202305_weather_data.csv.zip": weather_merged_df,
}
for csv_path, merged_frame in csv_targets.items():
    merged_frame.to_csv(csv_path, index=False, compression="zip")