Reads all of the JSON files inside the zip archives of the transport data.
Saves them to a single CSV file (consolidated_traffic_flow_data.csv, the consolidated traffic flow data).

In [1]:
import os
import glob
import zipfile
import pandas as pd
import tempfile
from tqdm import tqdm

In [2]:
# Step 1: Collect every zip archive stored under <data_path>/<year>/<sub-folder>/
years = ['2022', '2023']
data_path = "../data/raw/Hackacity 23/Datasets/Traffic Detectors/Data/"

# One glob per year, flattened into a single list of archive paths.
zip_files = [
    archive
    for year in years
    for archive in glob.glob(os.path.join(data_path, f'{year}/*/*.zip'))
]

In [5]:
# Create a CSV file to store the data
output_csv = 'consolidated_traffic_flow_data.csv'
# NOTE: the name is misspelled ("iterim"), but it is kept as-is because other
# cells may reference this module-level variable.
iterim_data_path = "../data/interim/"
full_output_path = os.path.join(iterim_data_path, output_csv)

# open() does not create missing parent directories, so make sure the output
# directory exists first; exist_ok=True makes this a no-op on re-runs.
os.makedirs(iterim_data_path, exist_ok=True)

# Check if the CSV file exists and create it (empty) if it doesn't
if not os.path.isfile(full_output_path):
    with open(full_output_path, 'w'):
        pass

In [9]:
# Step 2: Extract zip files and Step 3: Read JSON files
#
# Every JSON file found at the top level of every zip archive is loaded with
# pandas and written to the single CSV at `full_output_path`.
all_dataframes = []  # not used by this cell; kept in case other cells reference it

# Tracks whether the column header has been emitted, instead of re-stat'ing
# the output file for every JSON file.
header_written = False

# Open the output once in write mode so that re-running this cell rebuilds the
# CSV from scratch instead of appending a duplicate copy of every row.
# newline='' is what pandas expects for a caller-supplied text handle.
with open(full_output_path, 'w', newline='', encoding='utf-8') as csv_file:
    # Sort the archives: glob order depends on the filesystem, and a stable
    # order keeps the consolidated CSV reproducible between runs.
    for zip_file in tqdm(sorted(zip_files)):
        # Each archive is unpacked into its own temporary directory, which is
        # deleted automatically once its JSON files have been processed.
        with tempfile.TemporaryDirectory() as tmpdirname:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(tmpdirname)

            json_files = sorted(glob.glob(os.path.join(tmpdirname, '*.json')))
            for json_file in tqdm(json_files):
                df = pd.read_json(json_file)
                # An empty frame contributes no rows, and writing it first
                # would emit a bogus header line and suppress the real one.
                if df.empty:
                    continue
                df.to_csv(csv_file, header=not header_written, index=False)
                header_written = True

100%|██████████| 127/127 [00:11<00:00, 11.12it/s]
100%|██████████| 127/127 [00:21<00:00,  5.95it/s]
100%|██████████| 127/127 [00:02<00:00, 51.13it/s]
100%|██████████| 127/127 [00:42<00:00,  2.96it/s]
100%|██████████| 127/127 [00:14<00:00,  9.00it/s]
100%|██████████| 127/127 [00:17<00:00,  7.36it/s]
100%|██████████| 127/127 [00:16<00:00,  7.58it/s]
100%|██████████| 127/127 [00:05<00:00, 22.59it/s]
100%|██████████| 127/127 [00:36<00:00,  3.48it/s]
100%|██████████| 127/127 [00:12<00:00, 10.49it/s]
100%|██████████| 127/127 [00:08<00:00, 15.75it/s]
100%|██████████| 127/127 [00:09<00:00, 14.09it/s]
100%|██████████| 127/127 [01:00<00:00,  2.11it/s]
100%|██████████| 127/127 [00:38<00:00,  3.32it/s]
100%|██████████| 127/127 [01:01<00:00,  2.07it/s]
100%|██████████| 127/127 [00:47<00:00,  2.67it/s]
100%|██████████| 127/127 [00:50<00:00,  2.51it/s]
100%|██████████| 17/17 [07:51<00:00, 27.73s/it]
