## Taiwan Highway VD data fetching

#### Run the first cell to define the helper functions, then run the second cell to start the xml.gz downloader.

In [1]:
import os
import requests
import concurrent.futures
from tqdm.notebook import tqdm
import gzip
import shutil
import pandas as pd
import xml.etree.ElementTree as ET
from IPython.display import display, HTML

# Root folder that holds one sub-folder per downloaded date
directory_path = 'D:\\VD_data'

# Report an existing root folder, otherwise create it (including parents)
if os.path.exists(directory_path):
    print(f"Directory '{directory_path}' already exists.")
else:
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created successfully.")

#########################################################################################

def download_file(url, file_path, log_file_path, timeout=60):
    """Download a single file, check its size, and return the status.

    The payload is size-checked in memory and then written to a temporary
    ``<file_path>.part`` file that is renamed into place only after a
    complete write. An interrupted run therefore never leaves a truncated
    file at ``file_path`` that later runs would treat as already downloaded.

    Args:
        url: Address of the file to fetch.
        file_path: Destination path for the downloaded file.
        log_file_path: Text file that problems are appended to.
        timeout: Seconds to wait for the server. Without a timeout
            ``requests.get`` may block forever and stall its worker thread.

    Returns:
        A ``(url, status)`` tuple where ``status`` is ``True`` when the file
        was stored, ``'small'`` when the payload was under 1 KB and was
        discarded, and ``False`` when the request or the write failed.
    """
    def log_issue(message):
        # Append one line to the shared issue log
        with open(log_file_path, 'a') as log_file:
            log_file.write(f'{message}\n')

    temp_path = f'{file_path}.part'
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        content = response.content

        # Reject payloads under 1 KB before anything is written to disk
        if len(content) < 1024:
            log_issue(f'Deleted: File too small (<1KB): {url}')
            print(f'Deleted: {url} (File too small)')
            return url, 'small'

        with open(temp_path, 'wb') as file:
            file.write(content)
        # Publish the file only once it has been written completely
        os.replace(temp_path, file_path)
        return url, True
    except (requests.RequestException, OSError) as e:
        # Do not leave a half-written temporary file behind
        if os.path.exists(temp_path):
            os.remove(temp_path)
        log_issue(f'Failed to download {url}: {e}')
        print(f'Failed to download: {url}')
        return url, False

def download_files_for_day(date, max_concurrent_downloads=10):
    """Download every per-minute VDLive archive of one day.

    One URL is built for each minute of the day (24 x 60 files). Files that
    already exist in the ``compressed`` folder are skipped, the rest are
    fetched concurrently, and every download that did not succeed is retried
    once sequentially. Problems are appended to ``download_issues.log`` in
    the date folder.

    Args:
        date: Date string used both in the URL and as the folder name
            (the notebook calls it with e.g. ``"20240121"``).
        max_concurrent_downloads: Size of the download thread pool.
    """
    print(f"Starting download for date: {date}")
    base_folder_path = f'D:\\VD_data\\{date}'
    compressed_folder_path = os.path.join(base_folder_path, 'compressed')
    os.makedirs(compressed_folder_path, exist_ok=True)
    log_file_path = os.path.join(base_folder_path, 'download_issues.log')

    def log_issue(message):
        # Append one line to the shared issue log
        with open(log_file_path, 'a') as log_file:
            log_file.write(f'{message}\n')

    # Prepare the download tasks
    download_tasks = []
    skipped_files = 0
    for hour in range(24):
        for minute in range(60):
            current_time = f'{hour:02d}{minute:02d}'
            url = f'https://tisvcloud.freeway.gov.tw/history/motc20/VD/{date}/VDLive_{current_time}.xml.gz'
            file_path = os.path.join(compressed_folder_path, f'VDLive_{current_time}.xml.gz')
            if os.path.exists(file_path):
                skipped_files += 1
            else:
                download_tasks.append((url, file_path))
    if skipped_files > 0:
        print(f'Skipped {skipped_files} files. (File already exists)')

    # Download files concurrently with a progress bar
    failed_downloads = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor:
        with tqdm(total=len(download_tasks) + skipped_files) as progress:
            progress.update(skipped_files)
            # Keep the whole task with its future so the retry list does not
            # have to rebuild the destination path from the URL
            future_to_task = {
                executor.submit(download_file, url, file_path, log_file_path): (url, file_path)
                for url, file_path in download_tasks
            }
            for future in concurrent.futures.as_completed(future_to_task):
                url, file_path = future_to_task[future]
                try:
                    _, result = future.result()
                except Exception as e:
                    # Record the cause; every other failure is logged as well
                    log_issue(f'Error during download {url}: {e}')
                    print(f'Error during download: {url} ({e})')
                    result = False
                # Both False and 'small' count as "needs a retry"
                if result is not True:
                    failed_downloads.append((url, file_path))
                progress.update(1)

    # Retry failed downloads
    if failed_downloads:
        print("Retrying failed downloads...")
        with tqdm(total=len(failed_downloads)) as progress:
            for url, file_path in failed_downloads:
                try:
                    _, result = download_file(url, file_path, log_file_path)
                except Exception as e:
                    # One bad file must not abort the rest of the retry pass
                    log_issue(f'Error during download {url}: {e}')
                    result = False
                if result is not True:
                    log_issue(f'Failed to download on retry: {url}')
                    print(f'Failed to download on retry: {url}')
                progress.update(1)

    print("Download process completed.")

#########################################################################################

def decompress_files(date):
    """Decompress every ``.xml.gz`` file of one day into ``decompressed``.

    Each archive is unpacked to a temporary ``.part`` file and renamed into
    place only when decompression finished. A corrupt archive therefore
    never leaves a partial ``.xml`` behind that the next run would skip as
    "already decompressed". Failures are appended to
    ``download_issues.log`` in the date folder.

    Args:
        date: Name of the date folder below ``D:\\VD_data``.
    """
    base_folder_path = f'D:\\VD_data\\{date}'
    compressed_folder_path = os.path.join(base_folder_path, 'compressed')
    decompressed_folder_path = os.path.join(base_folder_path, 'decompressed')
    log_file_path = os.path.join(base_folder_path, 'download_issues.log')

    os.makedirs(decompressed_folder_path, exist_ok=True)

    # List all .xml.gz files in the compressed folder
    compressed_files = [f for f in os.listdir(compressed_folder_path) if f.endswith('.xml.gz')]
    print("Decompressing xml.gz files...")

    skipped_files = 0
    with tqdm(total=len(compressed_files)) as progress:
        for file in compressed_files:
            compressed_file_path = os.path.join(compressed_folder_path, file)
            decompressed_file_path = os.path.join(decompressed_folder_path, file[:-3])  # Remove .gz from filename

            # Skip if decompressed file already exists
            if os.path.exists(decompressed_file_path):
                skipped_files += 1
                progress.update(1)
                continue

            # The '.part' suffix keeps unfinished output out of the '.xml' listing
            temp_file_path = f'{decompressed_file_path}.part'
            try:
                with gzip.open(compressed_file_path, 'rb') as f_in, open(temp_file_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.replace(temp_file_path, decompressed_file_path)
            except Exception as e:
                # Remove the partial output so the file is retried next run
                if os.path.exists(temp_file_path):
                    os.remove(temp_file_path)
                with open(log_file_path, 'a') as log_file:
                    log_file.write(f'Failed to decompress {file}: {e}\n')
                print(f'Failed to decompress: {file}')
            progress.update(1)

    # One summary line instead of one line per skipped file
    if skipped_files > 0:
        print(f'Skipped {skipped_files} files. (Already decompressed)')

    print("Decompression process completed.")

#########################################################################################

def _collect_vd_rows(root, namespace):
    """Flatten the VDLive elements below *root* into one dict per VDID.

    Every lane contributes ``L<lane>_Speed``, ``L<lane>_Occupancy`` and, for
    the vehicle types S, L and T, ``L<lane>_<type>_Volume`` and
    ``L<lane>_<type>_Vehicle_Speed``. The per-type values default to 0 when
    the lane has no matching Vehicle element.
    """
    vehicle_types = ('S', 'L', 'T')

    def text_of(parent, path):
        # Text of the first matching child, or '' when the child is missing
        element = parent.find(path, namespace)
        return element.text if element is not None else ''

    rows = {}
    for vdlive in root.findall('.//ns1:VDLive', namespace):
        vdid = text_of(vdlive, 'ns1:VDID')
        # A VDID that appears more than once keeps writing into the same row
        row = rows.setdefault(vdid, {'VDID': vdid})

        for lane in vdlive.findall('.//ns1:Lane', namespace):
            prefix = f"L{text_of(lane, 'ns1:LaneID')}"
            row[f'{prefix}_Speed'] = text_of(lane, 'ns1:Speed')
            row[f'{prefix}_Occupancy'] = text_of(lane, 'ns1:Occupancy')

            # Defaults first: insertion order fixes the CSV column order
            for vehicle_type in vehicle_types:
                row[f'{prefix}_{vehicle_type}_Volume'] = 0
            for vehicle_type in vehicle_types:
                row[f'{prefix}_{vehicle_type}_Vehicle_Speed'] = 0

            for vehicle in lane.findall('.//ns1:Vehicle', namespace):
                vehicle_type = text_of(vehicle, 'ns1:VehicleType')
                if vehicle_type in vehicle_types:
                    row[f'{prefix}_{vehicle_type}_Volume'] = text_of(vehicle, 'ns1:Volume')
                    row[f'{prefix}_{vehicle_type}_Vehicle_Speed'] = text_of(vehicle, 'ns1:Speed')

    return list(rows.values())


def convert_xml_to_csv(date):
    """Convert every decompressed XML file of one day into a CSV file.

    Each ``.xml`` file in the ``decompressed`` folder becomes a CSV with the
    same base name in the ``csv`` folder, holding one row per VDID. A file
    that cannot be converted is reported and skipped; the progress bar is
    advanced for it as well.

    Args:
        date: Name of the date folder below ``D:\\VD_data``.
    """
    input_dir = f"D:\\VD_data\\{date}\\decompressed"
    output_dir = f"D:\\VD_data\\{date}\\csv"

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Namespace used by the elements that are looked up in the XML files
    namespace = {'ns1': 'http://traffic.transportdata.tw/standard/traffic/schema/'}

    xml_files = [f for f in os.listdir(input_dir) if f.endswith('.xml')]

    with tqdm(total=len(xml_files)) as progress:
        for file_name in xml_files:
            try:
                root = ET.parse(os.path.join(input_dir, file_name)).getroot()
                df = pd.DataFrame(_collect_vd_rows(root, namespace))

                # Swap only the extension, not every '.xml' inside the name
                csv_name = os.path.splitext(file_name)[0] + '.csv'
                df.to_csv(os.path.join(output_dir, csv_name), index=False)
            except Exception as e:
                print(f"Error converting file {file_name}: {e}")
            finally:
                # Count failed files too so the bar reaches its total
                progress.update(1)

#########################################################################################

def process_csv_files(date):
    """Regroup the per-minute CSV files of one day into one CSV per VDID.

    All CSV files in the ``csv`` folder are read in file-name order, tagged
    with a leading ``file_name`` column, concatenated, grouped by ``VDID``
    and written to ``<VDID>.csv`` in the ``VDID`` folder. Files that cannot
    be read or written are reported and skipped.

    Args:
        date: Name of the date folder below ``D:/VD_data``.
    """
    # Define input and output directories based on the provided date
    input_directory = f'D:/VD_data/{date}/csv'
    output_directory = f'D:/VD_data/{date}/VDID'
    os.makedirs(output_directory, exist_ok=True)

    # Sort the names: os.listdir makes no ordering promise, and the order
    # of the files becomes the row order inside every VDID file
    csv_files = sorted(f for f in os.listdir(input_directory) if f.endswith('.csv'))

    print(f"Processing {len(csv_files)} CSV files:")

    dfs = []
    with tqdm(total=len(csv_files), unit='file') as pbar_files:
        for filename in csv_files:
            try:
                # Read the CSV file and record its origin in a 'file_name' column
                df = pd.read_csv(os.path.join(input_directory, filename))
                df.insert(0, 'file_name', filename)
                dfs.append(df)
            except Exception as e:
                # Print an error message and continue processing other files
                display(HTML(f'<span style="color:red">Error processing file {filename}: {str(e)}</span>'))
            finally:
                pbar_files.update(1)

    # pd.concat raises ValueError on an empty list, so stop early instead
    if not dfs:
        print("\nNo CSV data could be read; no VDID-specific files were written.")
        return

    # Concatenate all DataFrames and group the result by 'VDID'
    combined_df = pd.concat(dfs, ignore_index=True)
    groups = combined_df.groupby('VDID')

    print(f"\nSaving {len(groups)} VDID-specific CSV files:")

    saved_files = 0
    with tqdm(total=len(groups), unit='VDID') as pbar_vdids:
        for vdid, group_df in groups:
            try:
                # Save the group-specific data to a CSV file in the output directory
                group_df.to_csv(os.path.join(output_directory, f'{vdid}.csv'), index=False)
                saved_files += 1
            except Exception as e:
                # Print an error message if saving fails
                display(HTML(f'<span style="color:red">Error saving VDID {vdid}: {str(e)}</span>'))
            finally:
                pbar_vdids.update(1)

    # Report what was actually written, not just the number of groups
    print(f"\n{saved_files} VDID-specific CSV files saved.")

#########################################################################################

def _count_rows_per_file(directory, desc):
    """Return ``{row_count: number_of_files}`` for the CSV files in *directory*.

    Only names ending in ``.csv`` are read, so a stray non-CSV file in the
    folder does not abort the check. *desc* labels the progress bar.
    """
    csv_names = [name for name in os.listdir(directory) if name.endswith(".csv")]
    distribution = {}
    # The context manager closes the bar even when a file cannot be read
    with tqdm(total=len(csv_names), desc=desc) as progress_bar:
        for name in csv_names:
            num_rows = len(pd.read_csv(os.path.join(directory, name)))
            distribution[num_rows] = distribution.get(num_rows, 0) + 1
            progress_bar.update(1)
    return distribution


def check_files(date):
    """Compare the row totals of the ``csv`` and ``VDID`` folders of one day.

    For both folders the number of files per row count is collected. The
    distributions and the total row counts are written to ``log.txt`` in the
    date folder (the file is overwritten) and printed, followed by a line
    stating whether the two totals match.

    Args:
        date: Name of the date folder below ``D:\\VD_data``.
    """
    output_directory = os.path.join(r'D:\VD_data', date)
    csv_directory = os.path.join(output_directory, 'csv')
    vdid_directory = os.path.join(output_directory, 'VDID')
    log_file_path = os.path.join(output_directory, 'log.txt')

    csv_row_counts = _count_rows_per_file(csv_directory, "Processing CSV files")

    # The VDID folder is optional; without it the distribution stays empty
    vdid_row_counts = {}
    if os.path.exists(vdid_directory):
        vdid_row_counts = _count_rows_per_file(vdid_directory, "Processing VDID files")

    total_csv_rows = sum(num_rows * count for num_rows, count in csv_row_counts.items())
    total_vdid_rows = sum(num_rows * count for num_rows, count in vdid_row_counts.items())

    # Build every report line once; the log and the console show the same text
    csv_section = ["Distribution of row counts for CSV files:"]
    csv_section += [
        f"CSV files with {num_rows} rows: {count} files"
        for num_rows, count in sorted(csv_row_counts.items())
    ]
    vdid_section = ["Distribution of row counts for VDID files:"]
    vdid_section += [
        f"VDID files with {num_rows} rows: {count} files"
        for num_rows, count in sorted(vdid_row_counts.items())
    ]
    totals_section = [
        f"Total rows in CSV files: {total_csv_rows}",
        f"Total rows in VDID files: {total_vdid_rows}",
    ]
    sections = (csv_section, vdid_section, totals_section)

    with open(log_file_path, 'w') as log_file:
        for section in sections:
            for line in section:
                log_file.write(f"{line}\n")

    # On the console each section is preceded by a blank line
    for section in sections:
        print()
        for line in section:
            print(line)

    # Compare and report any differences in total rows
    if total_csv_rows == total_vdid_rows:
        print("Total rows in CSV and VDID files are the same.")
    else:
        print("Total rows in CSV and VDID files are different.")

#########################################################################################

def delete_files(date, delete_compressed, delete_decompressed, delete_csv):
    """Delete the intermediate files of one day, folder by folder.

    Only regular files directly inside a selected folder are removed;
    sub-folders and the folder itself are kept. For every selected folder
    the number of files actually removed is printed, or a note when the
    folder does not exist.

    Args:
        date: Name of the date folder below ``D:\\VD_data``.
        delete_compressed: Pass ``1`` to empty the ``compressed`` folder.
        delete_decompressed: Pass ``1`` to empty the ``decompressed`` folder.
        delete_csv: Pass ``1`` to empty the ``csv`` folder.
    """
    date_directory = os.path.join(r'D:\VD_data', date)

    # Helper function to delete files in a directory
    def delete_files_in_directory(directory):
        if not os.path.isdir(directory):
            # Nothing was deleted, so do not claim that something was
            print(f"Directory not found, nothing to delete: {directory}")
            return

        deleted_count = 0
        for entry in os.listdir(directory):
            file_path = os.path.join(directory, entry)
            if not os.path.isfile(file_path):
                continue
            try:
                os.remove(file_path)
                deleted_count += 1
            except OSError as e:
                print(f"Error deleting file: {file_path} ({e})")
        print(f"Deleted {deleted_count} file(s) in: {directory}")

    # A folder is emptied only when its flag equals 1 (True also qualifies)
    selections = (
        (delete_compressed, 'compressed'),
        (delete_decompressed, 'decompressed'),
        (delete_csv, 'csv'),
    )
    for flag, folder_name in selections:
        if flag == 1:
            delete_files_in_directory(os.path.join(date_directory, folder_name))

Directory 'D:\VD_data' already exists.


In [2]:
def fetch_vd(date, delete_compressed, delete_decompressed, delete_csv):
    """Run the whole pipeline for one day, then optionally clean up.

    The steps run in this order: download, decompress, XML-to-CSV
    conversion, regrouping by VDID, row-count check, and finally deletion of
    the intermediate folders whose flag is ``1``.
    """
    # Fetch the archives with a pool of five download threads
    download_files_for_day(date, max_concurrent_downloads=5)

    # Transform the downloaded data step by step
    decompress_files(date)
    convert_xml_to_csv(date)
    process_csv_files(date)

    # Verify the result before anything is removed
    check_files(date)

    delete_files(
        date,
        delete_compressed=delete_compressed,
        delete_decompressed=delete_decompressed,
        delete_csv=delete_csv,
    )


# Process 2024-01-21; all flags are 0, so no intermediate folder is emptied
fetch_vd("20240121", delete_compressed=0, delete_decompressed=0, delete_csv=0)

Starting download for date: 20240121
Skipped 1440 files. (File already exists)


  0%|          | 0/1440 [00:00<?, ?it/s]

Download process completed.
Decompressing xml.gz files...


  0%|          | 0/1440 [00:00<?, ?it/s]

Skipped: VDLive_0000.xml.gz (Already decompressed)
Skipped: VDLive_0001.xml.gz (Already decompressed)
Skipped: VDLive_0002.xml.gz (Already decompressed)
Skipped: VDLive_0003.xml.gz (Already decompressed)
Skipped: VDLive_0004.xml.gz (Already decompressed)
Skipped: VDLive_0005.xml.gz (Already decompressed)
Skipped: VDLive_0006.xml.gz (Already decompressed)
Skipped: VDLive_0007.xml.gz (Already decompressed)
Skipped: VDLive_0008.xml.gz (Already decompressed)
Skipped: VDLive_0009.xml.gz (Already decompressed)
Skipped: VDLive_0010.xml.gz (Already decompressed)
Skipped: VDLive_0011.xml.gz (Already decompressed)
Skipped: VDLive_0012.xml.gz (Already decompressed)
Skipped: VDLive_0013.xml.gz (Already decompressed)
Skipped: VDLive_0014.xml.gz (Already decompressed)
Skipped: VDLive_0015.xml.gz (Already decompressed)
Skipped: VDLive_0016.xml.gz (Already decompressed)
Skipped: VDLive_0017.xml.gz (Already decompressed)
Skipped: VDLive_0018.xml.gz (Already decompressed)
Skipped: VDLive_0019.xml.gz (Al

  0%|          | 0/1440 [00:00<?, ?it/s]

Processing 1440 CSV files:


  0%|          | 0/1440 [00:00<?, ?file/s]


Saving 3762 VDID-specific CSV files:


  0%|          | 0/3762 [00:00<?, ?VDID/s]


3762 VDID-specific CSV files saved.


Processing CSV files:   0%|          | 0/1440 [00:00<?, ?it/s]

Processing VDID files:   0%|          | 0/3762 [00:00<?, ?it/s]


Distribution of row counts for CSV files:
CSV files with 3762 rows: 1440 files

Distribution of row counts for VDID files:
VDID files with 1440 rows: 3762 files

Total rows in CSV files: 5417280
Total rows in VDID files: 5417280
Total rows in CSV and VDID files are the same.
