In [1]:
import os
import gzip
from datetime import datetime
from tqdm import tqdm
import xmltodict
import json
import pandas as pd
import threading
import time
import urllib
from concurrent.futures import ThreadPoolExecutor
import logging

# Global directory variables
decompressed_dir = None
compressed_dir = None
subfolder_path = None

# Set up logging
logging.basicConfig(filename='error_log.txt', level=logging.ERROR,
                    format='%(asctime)s %(levelname)s: %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Custom function to log errors
def log_error(message):
    print(message)
    logging.error(message)

# Custom function to log general messages
def log_message(message):
    print(message)

download_delay = 0.05
last_downloaded_time = time.time()

def download_file(url, file_path):
    global last_downloaded_time
    if os.path.exists(file_path):
        log_message(f"File {file_path} already exists. Skipping download.")
        return file_path

    try:
        urllib.request.urlretrieve(url, file_path)
        time.sleep(download_delay)
        file_size = os.path.getsize(file_path)

        if file_size < 150*1024:
            os.remove(file_path)
            log_error(f"Deleted {file_path} as its size was less than 1KB.")
            return None

        last_downloaded_time = time.time()
        return file_path

    except urllib.error.HTTPError as e:
        log_error(f"Failed to download {url}. Error: {e}")
    return None

def monitor_downloads(timeout=3):
    global last_downloaded_time
    while True:
        time.sleep(1)
        if time.time() - last_downloaded_time > timeout:
            log_message("Monitoring timeout. Stopping the download process.")
            return False

def get_xml(date, path, max_concurrent_downloads):
    global last_downloaded_time

    try:
        input_date = datetime.strptime(date, '%Y%m%d')
        if input_date.date() >= datetime.now().date():
            log_error(f"{date} is not in the past.")
            return
    except ValueError:
        log_error(f"{date} is not a valid date in YYYYMMDD format.")
        return

    subfolder_path = os.path.join(path, date)
    if not os.path.exists(subfolder_path):
        os.mkdir(subfolder_path)
        log_message(f"Created subfolder at {subfolder_path}")
    else:
        log_message(f"Subfolder {subfolder_path} already exists.")

    download_attempts = 0
    downloaded_files = []
    while download_attempts < 2:
        log_message(f"Download attempt {download_attempts + 1}")

        last_downloaded_time = time.time()
        downloaded_files = []
        with ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor:
            futures = []
            for current_time in [f"{hour:02d}{minute:02d}" for hour in range(24) for minute in range(60)]:
                filename = f'VDLive_{current_time}.xml.gz'
                file_path = os.path.join(subfolder_path, filename)

                if os.path.isfile(file_path) and os.path.getsize(file_path) > 1024:
                    downloaded_files.append(filename)
                    continue

                url = f'https://tisvcloud.freeway.gov.tw/history/motc20/VD/{date}/VDLive_{current_time}.xml.gz'
                futures.append(executor.submit(download_file, url, file_path))

            monitor_thread = threading.Thread(target=monitor_downloads)
            monitor_thread.start()

            for future in futures:
                result = future.result()
                if result:
                    downloaded_files.append(os.path.basename(result))

            monitor_thread.join()

            download_attempts += 1
            if len(downloaded_files) == 1440:
                break
            elif len(downloaded_files) > 1400:
                max_concurrent_downloads = 1

    log_message(f"Downloaded {len(downloaded_files)} out of 1440 files.")

    if len(downloaded_files) != 1440:
        all_files = [f'VDLive_{hour:02d}{minute:02d}.xml.gz' for hour in range(24) for minute in range(60)]
        missing_files = [f for f in all_files if not os.path.isfile(os.path.join(subfolder_path, f))]
    
        for missing_file in missing_files:
            log_error(f"Failed to download or found corrupted: {missing_file}")

    return downloaded_files, subfolder_path

def decompress_xml(downloaded_files, subfolder_path, date):
    decompressed_dir = os.path.join(subfolder_path, f"decompressed_{date}")
    compressed_dir = os.path.join(subfolder_path, f"compressed_{date}")

    if not os.path.exists(decompressed_dir):
        os.mkdir(decompressed_dir)
        log_message(f"Created decompressed directory at {decompressed_dir}")
    else:
        log_message(f"Decompressed directory {decompressed_dir} already exists.")

    if not os.path.exists(compressed_dir):
        os.mkdir(compressed_dir)
        log_message(f"Created compressed directory at {compressed_dir}")
    else:
        log_message(f"Compressed directory {compressed_dir} already exists.")

    log_message("Starting decompression...")

    for file in tqdm(downloaded_files):
        new_xml_filename = file.replace('.gz', '')
        decompressed_file_path = os.path.join(decompressed_dir, new_xml_filename)

        if os.path.exists(decompressed_file_path):
            log_message(f"File {decompressed_file_path} already decompressed. Skipping.")
            continue

        with gzip.open(os.path.join(subfolder_path, file), 'rb') as f_in:
            xml_data = f_in.read()
            with open(decompressed_file_path, 'wb') as f_out:
                f_out.write(xml_data)

        os.rename(os.path.join(subfolder_path, file), os.path.join(compressed_dir, file))

    log_message("Decompression complete. Generating error report...")

    all_files = [f'VDLive_{hour:02d}{minute:02d}.xml.gz' for hour in range(24) for minute in range(60)]
    missing_compressed = [f for f in all_files if not os.path.exists(os.path.join(compressed_dir, f))]
    missing_decompressed = [f.replace('.gz', '') for f in missing_compressed]

    report_file_path = os.path.join(subfolder_path, f"{date}_error_report.txt")
    with open(report_file_path, 'w') as f:
        f.write("Missing Compressed Files:\n")
        for file in missing_compressed:
            f.write(f"{file}\n")
        f.write("\nMissing Decompressed Files:\n")
        for file in missing_decompressed:
            f.write(f"{file}\n")

    log_message(f"Error report saved to {report_file_path}")

##################################################################################################################

   

def get_vd(date, time, desired_vdids, dataset):
    results = {v: None for v in desired_vdids}
    
    directory = os.path.join(dataset, date, f"decompressed_{date}", f"VDLive_{time}.xml")
    
    if not os.path.isfile(directory):
        for vdid in desired_vdids:
            results[vdid] = {"error": f"File {directory} not found."}
        return results
    elif os.path.getsize(directory) == 0:
        for vdid in desired_vdids:
            results[vdid] = {"error": f"File {directory} is empty."}
        return results

    with open(directory) as xml_file:
        data_dict = xmltodict.parse(xml_file.read())
    json_data = json.dumps(data_dict)
    data = json.loads(json_data)

    for vdlive in data["VDLiveList"]["VDLives"]["VDLive"]:
        if vdlive["VDID"] in desired_vdids:
            desired_vdids.remove(vdlive["VDID"])  # This VDID is found; no need to search for it again
            
            link_flows = vdlive["LinkFlows"]["LinkFlow"]
            date_time = f'{date}_{time}'

            vdid_data = {
                "VDID": vdlive["VDID"],
                "LinkFlows": link_flows,
                "Status": vdlive["Status"],
                "DateTime": date_time
            }
            results[vdlive["VDID"]] = process_data(vdid_data)

            # If we've found all the desired VDIDs, we can exit early
            if not desired_vdids:
                break
                
    for vdid in desired_vdids:  # VDIDs not found
        results[vdid] = {"absent": str(time)}
                
    return results

# [Rest of the code remains unchanged]
def process_data(data):
    rows = []
    vdid = data['VDID']
    date_time = data['DateTime']
    link_id = data['LinkFlows']['LinkID']

    # Situation 1: 'Lane' key is present in the 'Lanes' dictionary
    if 'Lane' in data['LinkFlows']['Lanes']:
        lanes = data['LinkFlows']['Lanes']['Lane']
        if isinstance(lanes, dict):
            lanes = [lanes]
        for lane in lanes:
            rows.extend(process_lane(lane, date_time, vdid, link_id))

    # Situation 2: 'Lane' key is not present in the 'Lanes' dictionary
    elif 'Lane' in data['LinkFlows']['Lanes']:
        lane = data['LinkFlows']['Lanes']['Lane']
        rows.extend(process_lane(lane, date_time, vdid, link_id))

    return rows

def process_lane(lane, date_time, vdid, link_id):
    rows = []
    lane_id = lane['LaneID']
    lane_type = lane['LaneType']
    lane_speed = lane['Speed']
    occupancy = lane['Occupancy']
    for vehicle in lane['Vehicles']['Vehicle']:
        vehicle_type = vehicle['VehicleType']
        volume = vehicle['Volume']
        speed = vehicle['Speed']
        rows.append([date_time, vdid, link_id, lane_id,
                     lane_type, lane_speed, occupancy,
                     vehicle_type, volume, speed])
    return rows

def get_vds(date, desired_vdids, dataset):
    all_results = {vdid: [] for vdid in desired_vdids}
    error_logs = {vdid: [] for vdid in desired_vdids}

    for current_time in tqdm([f"{hour:02d}{minute:02d}" for hour in range(1) for minute in range(60)]):
        results = get_vd(date, current_time, desired_vdids.copy(), dataset)
        for vdid, result in results.items():
            if "error" in result:
                error_logs[vdid].append(result["error"])
            elif "absent" in result:
                error_logs[vdid].append(f"VDID absent at time: {result['absent']}")
            else:
                all_results[vdid].extend(result)

    for vdid in desired_vdids:
        if all_results[vdid]:
            # Process and save the results
            df = pd.DataFrame(all_results[vdid], columns=['DateTime', 'VDID', 'LinkID', 'LaneID', 'LaneType', 'LaneSpeed', 'Occupancy', 'VehicleType', 'Volume', 'Speed'])
            
            #pivot dataframe according to vehicle
            df = df.pivot(index=['DateTime', 'VDID', 'LinkID', 'LaneID', 'LaneType', 'LaneSpeed', 'Occupancy'], columns='VehicleType', values=['Volume', 'Speed'])
            df.columns = ['_'.join(col).strip() for col in df.columns.values]
            df = df.reset_index()
            df = df.reindex(columns=['DateTime', 'VDID', 'LinkID', 'LaneID','LaneType', 'LaneSpeed', 'Occupancy', 'Volume_S', 'Speed_S', 'Volume_L', 'Speed_L', 'Volume_T', 'Speed_T'])
            
            #pivot the dataframe according to lane
            df = df.pivot_table(index=['DateTime', 'VDID'], columns='LaneID', values=['LaneSpeed', 'Occupancy', 'Volume_S', 'Speed_S', 'Volume_L', 'Speed_L', 'Volume_T', 'Speed_T']).reset_index()
            #flatten the column names
            df.columns = ['_'.join(str(col).strip() for col in tup) for tup in df.columns.values]

            filename = os.path.join(dataset, date, f"{vdid}_{date}")
            df.to_csv(f'{filename}.csv', index=False)
            print(f"{filename}.csv saved successfully!")
        
        # Save the error logs
        if error_logs[vdid]:
            filename = os.path.join(dataset, date, f"{vdid}_{date}_errorlog.txt")
            with open(filename, 'w') as log_file:
                log_file.write("\n".join(error_logs[vdid]))
            print(f"{filename} saved successfully!")

In [4]:
if __name__ == "__main__":
    
    date = "20230825"
    path = r"D:\VD_data"
    max_concurrent_downloads = 20

    downloaded_files, subfolder_path = get_xml(date, path, max_concurrent_downloads)
    decompress_xml(downloaded_files, subfolder_path, date)


Created subfolder at D:\VD_data\20230825
Download attempt 1
Monitoring timeout. Stopping the download process.
Downloaded 1440 out of 1440 files.
Created decompressed directory at D:\VD_data\20230825\decompressed_20230825
Created compressed directory at D:\VD_data\20230825\compressed_20230825
Starting decompression...


100%|██████████| 1440/1440 [00:48<00:00, 29.52it/s]


Decompression complete. Generating error report...
Error report saved to D:\VD_data\20230825\20230825_error_report.txt


In [4]:
path = r"D:\VD_data"
date = "20230825"
desired_vdids = ["VD-N1-N-198-O-SE-21-北上出口-彰化", "VD-N1-S-369.007-M-Loop", "VD-N1-S-369.400-M-Loop", "VD-N1-S-370.000-M-Loop", "VD-N1-S-371.010-M-Loop", "VD-T82-W-14-O-EN-1-Loop", "VD-N1-N-33-I-EN-31-五股"]
dataset = path

get_vds(date, desired_vdids, dataset)

100%|██████████| 60/60 [00:55<00:00,  1.07it/s]


D:\VD_data\20230825\VD-N1-N-198-O-SE-21-北上出口-彰化_20230825.csv saved successfully!
D:\VD_data\20230825\VD-N1-S-369.007-M-Loop_20230825.csv saved successfully!
D:\VD_data\20230825\VD-N1-S-369.400-M-Loop_20230825.csv saved successfully!
D:\VD_data\20230825\VD-N1-S-370.000-M-Loop_20230825.csv saved successfully!
D:\VD_data\20230825\VD-N1-S-371.010-M-Loop_20230825.csv saved successfully!
D:\VD_data\20230825\VD-T82-W-14-O-EN-1-Loop_20230825.csv saved successfully!
D:\VD_data\20230825\VD-N1-N-33-I-EN-31-五股_20230825.csv saved successfully!
