In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import os
import requests
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor
from tqdm.notebook import tqdm
import gzip
import shutil
import pandas as pd
import xml.etree.ElementTree as ET
from IPython.display import display, HTML
import zipfile
from datetime import datetime, timedelta
import pytz

# Root folder that is handed to the fetch functions as `directory_path`.
directory_path = r"D:\VD_data"

# Report whether the root folder was already there; create it when it is missing.
if os.path.exists(directory_path):
    print(f"Directory '{directory_path}' already exists.")
else:
    os.makedirs(directory_path)
    print(f"Directory '{directory_path}' created successfully.")

Directory 'D:\VD_data' already exists.


In [2]:
def download_file(url, file_path, log_file_path, timeout=30):
    """Download a single file and report whether a usable copy was saved.

    The payload is first written to ``<file_path>.part`` and only renamed to
    ``file_path`` once it has been written completely, so an interrupted
    transfer never leaves a truncated file at ``file_path``.

    Args:
        url: Address of the file to fetch.
        file_path: Destination path of the downloaded file.
        log_file_path: Text file to which problem notices are appended.
        timeout: Seconds passed to ``requests.get`` as ``timeout`` so that a
            stalled connection cannot block the caller indefinitely.

    Returns:
        A ``(url, status)`` tuple where ``status`` is ``True`` on success,
        ``'small'`` when the payload is smaller than 1 KB (nothing is kept),
        or ``False`` when the request or the write to disk failed.
    """
    min_size_bytes = 1024  # payloads below this size are treated as unusable
    partial_path = f'{file_path}.part'

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an error for bad status codes
        payload = response.content

        if len(payload) >= min_size_bytes:
            with open(partial_path, 'wb') as file:
                file.write(payload)
            # Publish the file under its final name only when it is complete.
            os.replace(partial_path, file_path)
            return url, True
    except (requests.RequestException, OSError) as e:
        # Best-effort cleanup of a half-written temporary file.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass  # the leftover '.part' file is harmless; the failure is logged below
        with open(log_file_path, 'a') as log_file:
            log_file.write(f'Failed to download {url}: {e}\n')
        print(f'Failed to download: {url}')
        return url, False

    # Reached only when the request succeeded but the payload is too small.
    with open(log_file_path, 'a') as log_file:
        log_file.write(f'Deleted: File too small (<1KB): {url}\n')
    print(f'Deleted: {url} (File too small)')
    return url, 'small'

def download_files_for_day(directory_path, date, max_concurrent_downloads=10):
    """Download every per-minute VDLive archive of one day.

    One URL is built for each minute of the day (24 x 60) and saved under
    ``<directory_path>/<date>/compressed``. Files that already exist locally
    are skipped. The remaining ones are fetched concurrently through
    ``download_file``; every task that did not come back as a success
    (failed, too small, or raised) is retried once, sequentially. Problems
    are appended to ``<directory_path>/<date>/download_issues.log``.

    Args:
        directory_path: Root data folder.
        date: Day to download; used verbatim in the URL and as the folder
            name (the callers in this notebook pass ``YYYYMMDD``).
        max_concurrent_downloads: Number of worker threads.
    """
    print(f"Starting download for date: {date}")
    base_folder_path = os.path.join(directory_path, date)
    compressed_folder_path = os.path.join(base_folder_path, 'compressed')
    os.makedirs(compressed_folder_path, exist_ok=True)
    log_file_path = os.path.join(base_folder_path, 'download_issues.log')

    # Prepare the download tasks
    download_tasks = []
    skipped_files = 0
    for hour in range(24):
        for minute in range(60):
            current_time = f'{hour:02d}{minute:02d}'
            url = f'https://tisvcloud.freeway.gov.tw/history/motc20/VD/{date}/VDLive_{current_time}.xml.gz'
            file_path = os.path.join(compressed_folder_path, f'VDLive_{current_time}.xml.gz')
            if os.path.exists(file_path):
                skipped_files += 1
            else:
                download_tasks.append((url, file_path))
    if skipped_files > 0:
        print(f'Skipped {skipped_files} files. (File already exists)')

    # Download files concurrently with a progress bar
    failed_downloads = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_concurrent_downloads) as executor, \
            tqdm(total=len(download_tasks) + skipped_files) as progress:
        progress.update(skipped_files)
        # Keep the whole (url, file_path) task so a retry reuses the exact same target path.
        future_to_task = {
            executor.submit(download_file, url, file_path, log_file_path): (url, file_path)
            for url, file_path in download_tasks
        }
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            url = task[0]
            try:
                _, result = future.result()
                # 'small' and False both count as "needs a retry".
                if result is not True:
                    failed_downloads.append(task)
            except Exception as e:
                # Keep the batch going, but record why this task blew up.
                failed_downloads.append(task)
                with open(log_file_path, 'a') as log_file:
                    log_file.write(f'Error during download {url}: {e}\n')
                print(f'Error during download: {url}')
            finally:
                progress.update(1)

    # Retry failed downloads
    if failed_downloads:
        print("Retrying failed downloads...")
        with tqdm(total=len(failed_downloads)) as progress:
            for url, file_path in failed_downloads:
                try:
                    _, result = download_file(url, file_path, log_file_path)
                except Exception as e:
                    # An unexpected error on retry must not abort the remaining retries.
                    result = False
                    with open(log_file_path, 'a') as log_file:
                        log_file.write(f'Error during download {url}: {e}\n')
                if result is not True:
                    with open(log_file_path, 'a') as log_file:
                        log_file.write(f'Failed to download on retry: {url}\n')
                    print(f'Failed to download on retry: {url}')
                progress.update(1)

    print("Download process completed.")

def decompress_files(directory_path, date):
    """Decompress every ``.xml.gz`` of one day into the ``decompressed`` folder.

    Reads from ``<directory_path>/<date>/compressed`` and writes the matching
    ``.xml`` files to ``<directory_path>/<date>/decompressed``. Files that are
    already decompressed are skipped. Each file is decompressed into a
    temporary ``.part`` file and renamed only when complete, so a corrupt
    archive never leaves a partial ``.xml`` behind that a later run would
    mistake for a finished one. Failures are appended to
    ``<directory_path>/<date>/download_issues.log``.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
    """
    base_folder_path = os.path.join(directory_path, date)
    compressed_folder_path = os.path.join(base_folder_path, 'compressed')
    decompressed_folder_path = os.path.join(base_folder_path, 'decompressed')
    log_file_path = os.path.join(base_folder_path, 'download_issues.log')

    os.makedirs(decompressed_folder_path, exist_ok=True)

    # List all .xml.gz files in the compressed folder (sorted for a stable order)
    compressed_files = sorted(f for f in os.listdir(compressed_folder_path) if f.endswith('.xml.gz'))
    print("Decompressing xml.gz files...")

    with tqdm(total=len(compressed_files)) as progress:
        for file in compressed_files:
            compressed_file_path = os.path.join(compressed_folder_path, file)
            decompressed_file_path = os.path.join(decompressed_folder_path, file[:-3])  # Remove .gz from filename

            # Skip if decompressed file already exists
            if os.path.exists(decompressed_file_path):
                print(f'Skipped: {file} (Already decompressed)')
                progress.update(1)
                continue

            # The '.part' suffix keeps unfinished output out of any '*.xml' listing.
            partial_path = f'{decompressed_file_path}.part'
            try:
                with gzip.open(compressed_file_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
                os.replace(partial_path, decompressed_file_path)
            except Exception as e:
                # Remove whatever was written before the failure.
                if os.path.exists(partial_path):
                    try:
                        os.remove(partial_path)
                    except OSError:
                        pass  # leftover '.part' files are ignored by the later steps
                with open(log_file_path, 'a') as log_file:
                    log_file.write(f'Failed to decompress {file}: {e}\n')
                print(f'Failed to decompress: {file}')
            progress.update(1)

    print("Decompression process completed.")

def parse_xml_file(file_path, namespace):
    """Flatten one VDLive XML file into a dictionary of per-detector records.

    Args:
        file_path: Path of the XML file to read.
        namespace: Prefix-to-URI mapping; it must define the ``ns1`` prefix
            used by the element paths below.

    Returns:
        A dict keyed by VDID. Each value is a flat dict that starts with
        ``'VDID'`` and, for every lane, holds ``L<lane>_Speed``,
        ``L<lane>_Occupancy`` and the volume / vehicle-speed entries of the
        ``S``, ``L`` and ``T`` vehicle types (``0`` unless the file provides
        a value), plus entries for any other vehicle type found in the file.
        Missing elements are represented by ``''``.
    """
    # Columns that every lane gets even when the file lists no such vehicle type.
    zero_filled_suffixes = (
        'S_Volume', 'L_Volume', 'T_Volume',
        'S_Vehicle_Speed', 'L_Vehicle_Speed', 'T_Vehicle_Speed',
    )

    def text_of(node, tag):
        found = node.find(tag, namespace)
        return '' if found is None else found.text

    records = {}
    document_root = ET.parse(file_path).getroot()

    for detector in document_root.findall('.//ns1:VDLive', namespace):
        detector_id = text_of(detector, 'ns1:VDID')
        record = records.setdefault(detector_id, {'VDID': detector_id})

        for lane_node in detector.findall('.//ns1:Lane', namespace):
            lane_key = 'L' + str(text_of(lane_node, 'ns1:LaneID'))

            record[f'{lane_key}_Speed'] = text_of(lane_node, 'ns1:Speed')
            record[f'{lane_key}_Occupancy'] = text_of(lane_node, 'ns1:Occupancy')
            for suffix in zero_filled_suffixes:
                record[f'{lane_key}_{suffix}'] = 0

            for vehicle_node in lane_node.findall('.//ns1:Vehicle', namespace):
                column_stem = f"{lane_key}_{text_of(vehicle_node, 'ns1:VehicleType')}_"
                record[column_stem + 'Volume'] = text_of(vehicle_node, 'ns1:Volume')
                record[column_stem + 'Vehicle_Speed'] = text_of(vehicle_node, 'ns1:Speed')

    return records


def convert_xml_to_csv(directory_path, date):
    """Convert each decompressed XML file of one day into a CSV file.

    Reads ``*.xml`` from ``<directory_path>/<date>/decompressed``, parses each
    file with ``parse_xml_file`` and writes one row per parsed record to
    ``<directory_path>/<date>/csv/<same base name>.csv``. Files whose CSV
    already exists are skipped. A file that cannot be converted is reported
    and does not stop the remaining files; the progress bar advances for
    every file regardless of the outcome.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
    """
    input_dir = os.path.join(directory_path, date, "decompressed")
    output_dir = os.path.join(directory_path, date, "csv")

    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # XML namespace used by the element paths in parse_xml_file
    namespace = {'ns1': 'http://traffic.transportdata.tw/standard/traffic/schema/'}

    # List all XML files in the input directory (sorted for a stable order)
    xml_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.xml'))

    with tqdm(total=len(xml_files)) as progress:
        for file_name in xml_files:
            try:
                # Swap only the extension; str.replace would touch every '.xml' in the name.
                output_file = os.path.join(output_dir, os.path.splitext(file_name)[0] + '.csv')
                if os.path.exists(output_file):
                    print(f"Skipping {file_name} as CSV already exists.")
                    continue

                data_dict = parse_xml_file(os.path.join(input_dir, file_name), namespace)
                df = pd.DataFrame(list(data_dict.values()))

                # Save to CSV, skipping index
                df.to_csv(output_file, index=False)
            except Exception as e:
                # Handle exceptions, such as parse errors or file IO errors
                print(f"Error converting file {file_name}: {e}")
            finally:
                # Runs for converted, skipped and failed files alike.
                progress.update(1)

def process_csv_files(directory_path, date):
    """Regroup one day's per-minute CSV files into one CSV file per VDID.

    Every ``*.csv`` in ``<directory_path>/<date>/csv`` is read (in sorted file
    name order), tagged with a leading ``file_name`` column, and all rows are
    concatenated. The combined table is then grouped by its ``VDID`` column
    and each group is written to ``<directory_path>/<date>/VDID/<vdid>.csv``.
    Files that cannot be read are reported and skipped. When nothing could be
    read at all, the function reports that and returns without writing.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
    """
    # Define input and output directories based on the provided date
    input_directory = os.path.join(directory_path, date, "csv")
    output_directory = os.path.join(directory_path, date, "VDID")
    os.makedirs(output_directory, exist_ok=True)

    # Sorted so rows inside each VDID file follow file name order, not directory listing order.
    csv_files = sorted(f for f in os.listdir(input_directory) if f.endswith('.csv'))

    print(f"Processing {len(csv_files)} CSV files:")

    dfs = []
    with tqdm(total=len(csv_files), unit='file') as pbar_files:
        for filename in csv_files:
            try:
                # Read the CSV file and record its origin in a leading 'file_name' column
                df = pd.read_csv(os.path.join(input_directory, filename))
                df.insert(0, 'file_name', filename)
                dfs.append(df)
            except Exception as e:
                # Print an error message and continue processing other files
                display(HTML(f'<span style="color:red">Error processing file {filename}: {str(e)}</span>'))
            finally:
                pbar_files.update(1)

    # pd.concat raises on an empty list, so stop here when no file was readable.
    if not dfs:
        print("No CSV data could be read; no VDID files were written.")
        return

    # Concatenate all DataFrames in the list to create the combined DataFrame
    combined_df = pd.concat(dfs, ignore_index=True)

    # Group the combined DataFrame by 'VDID'
    groups = combined_df.groupby('VDID')

    print(f"\nSaving {len(groups)} VDID-specific CSV files:")

    with tqdm(total=len(groups), unit='VDID') as pbar_vdids:
        for vdid, group_df in groups:
            try:
                # Save the group-specific data to a CSV file in the output directory
                group_df.to_csv(os.path.join(output_directory, f'{vdid}.csv'), index=False)
            except Exception as e:
                # Print an error message if saving fails
                display(HTML(f'<span style="color:red">Error saving VDID {vdid}: {str(e)}</span>'))
            finally:
                pbar_vdids.update(1)

    print(f"\n{len(groups)} VDID-specific CSV files saved.")

def check_files(directory_path, date):
    """Tally row counts of the per-minute CSV files and the per-VDID CSV files.

    For ``<directory_path>/<date>/csv`` and, when it exists,
    ``<directory_path>/<date>/VDID``, a distribution
    ``{row_count: number_of_files}`` is built from the ``.csv`` files in the
    folder. Both distributions are passed to ``log_and_print_results``.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
    """
    csv_directory = os.path.join(directory_path, date, 'csv')
    vdid_directory = os.path.join(directory_path, date, 'VDID')

    def count_rows_per_file(directory, description):
        """Return ``{row_count: number_of_files}`` for the ``.csv`` files in ``directory``."""
        row_counts = {}
        # Only '.csv' files are counted so stray files cannot distort or break the check.
        file_names = [f for f in os.listdir(directory) if f.endswith(".csv")]
        with tqdm(total=len(file_names), desc=description) as progress_bar:
            for filename in file_names:
                try:
                    # Use 'usecols' to read only the first column; enough to count rows.
                    num_rows = len(pd.read_csv(os.path.join(directory, filename), usecols=[0]))
                except pd.errors.EmptyDataError:
                    # A file without any columns is counted as having zero rows.
                    num_rows = 0
                row_counts[num_rows] = row_counts.get(num_rows, 0) + 1
                progress_bar.update(1)
        return row_counts

    csv_row_counts = count_rows_per_file(csv_directory, "Processing CSV files")

    # The VDID folder is optional; an absent folder yields an empty distribution.
    vdid_row_counts = {}
    if os.path.exists(vdid_directory):
        vdid_row_counts = count_rows_per_file(vdid_directory, "Processing VDID files")

    # Write the results to a log file and print the distributions
    log_and_print_results(directory_path, date, csv_row_counts, vdid_row_counts)

def log_and_print_results(directory_path, date, csv_row_counts, vdid_row_counts):
    """Write the row-count report to ``log.txt`` and echo it to the console.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder; the report goes to
            ``<directory_path>/<date>/log.txt`` (overwritten on each call).
        csv_row_counts: ``{row_count: number_of_files}`` for the CSV files.
        vdid_row_counts: ``{row_count: number_of_files}`` for the VDID files.
    """
    def total_rows(distribution):
        # rows-per-file times number of such files, summed over the distribution
        return sum(rows * files for rows, files in distribution.items())

    total_csv_rows = total_rows(csv_row_counts)
    total_vdid_rows = total_rows(vdid_row_counts)

    # Assemble the whole report first, then write it in one go.
    report = ["Distribution of row counts for CSV files:\n"]
    report += [
        f"CSV files with {num_rows} rows: {count} files\n"
        for num_rows, count in sorted(csv_row_counts.items())
    ]
    report.append("Distribution of row counts for VDID files:\n")
    report += [
        f"VDID files with {num_rows} rows: {count} files\n"
        for num_rows, count in sorted(vdid_row_counts.items())
    ]
    report.append(f"Total rows in CSV files: {total_csv_rows}\n")
    report.append(f"Total rows in VDID files: {total_vdid_rows}\n")

    report_path = os.path.join(os.path.join(directory_path, date), 'log.txt')
    with open(report_path, 'w') as log_file:
        log_file.writelines(report)

    # Console copy of the same information
    for label, distribution in (("CSV files", csv_row_counts), ("VDID files", vdid_row_counts)):
        print_distribution(label, distribution)
    print(f"Total rows in CSV files: {total_csv_rows}")
    print(f"Total rows in VDID files: {total_vdid_rows}")

def print_distribution(file_type, row_counts):
    """Print a ``{row_count: number_of_files}`` distribution, ordered by row count.

    Args:
        file_type: Label used in the heading and at the start of each line.
        row_counts: Mapping of row count to the number of files with that count.
    """
    print(f"\nDistribution of row counts for {file_type}:")
    for num_rows in sorted(row_counts):
        print(f"{file_type} with {num_rows} rows: {row_counts[num_rows]} files")


def delete_files(directory_path, date, delete_compressed, delete_decompressed, delete_csv):
    """Delete the intermediate files of one day, folder by folder.

    Only regular files directly inside the selected folders are removed; the
    folders themselves and any sub-folders are left in place.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
        delete_compressed: Pass ``1`` (or ``True``) to empty ``<date>/compressed``.
        delete_decompressed: Pass ``1`` (or ``True``) to empty ``<date>/decompressed``.
        delete_csv: Pass ``1`` (or ``True``) to empty ``<date>/csv``.
    """
    # Define the directory paths based on the input date
    compressed_directory = os.path.join(directory_path, date, 'compressed')
    decompressed_directory = os.path.join(directory_path, date, 'decompressed')
    csv_directory = os.path.join(directory_path, date, 'csv')

    def delete_files_in_directory(directory):
        """Remove every regular file directly inside ``directory``."""
        if not os.path.exists(directory):
            # Do not claim a deletion that never happened.
            print(f"Nothing to delete, directory not found: {directory}")
            return
        for file in os.listdir(directory):
            file_path = os.path.join(directory, file)
            try:
                if os.path.isfile(file_path):
                    os.remove(file_path)
            except Exception as e:
                print(f"Error deleting file: {file_path} ({e})")
        print(f"Deleted file: {directory}")

    # Delete files in the specified directories based on the parameter values
    if delete_compressed == 1:
        delete_files_in_directory(compressed_directory)

    if delete_decompressed == 1:
        delete_files_in_directory(decompressed_directory)

    if delete_csv == 1:
        delete_files_in_directory(csv_directory)

def zip_output(directory_path, date, delete_files_sp_zip=0):
    """Pack one day's folder into ``<directory_path>/<date>.zip``.

    Every file below ``<directory_path>/<date>`` is stored in the archive
    under a top-level folder named after the day folder. Any error is caught
    and printed rather than raised.

    Args:
        directory_path: Root data folder.
        date: Name of the day folder below ``directory_path``.
        delete_files_sp_zip: When equal to ``1`` the day folder is removed
            after the archive has been written.
    """
    try:
        source_dir = os.path.join(directory_path, date)

        # Nothing to do when the day folder is missing.
        if not os.path.exists(source_dir):
            print(f"Directory {source_dir} does not exist.")
            return

        archive_path = f"{source_dir}.zip"
        top_level_name = os.path.basename(source_dir)

        # Collect the files once; the list provides both the count and the work items.
        members = [
            os.path.join(folder, file_name)
            for folder, _subfolders, file_names in os.walk(source_dir)
            for file_name in file_names
        ]

        print(f"Zipping {len(members)} files in {source_dir}, please wait...")

        with zipfile.ZipFile(archive_path, 'w', zipfile.ZIP_DEFLATED) as archive:
            for member in members:
                # Path inside the archive: <day folder>/<path relative to the day folder>
                inner_path = os.path.join(top_level_name, os.path.relpath(member, source_dir))
                archive.write(member, inner_path)

        print(f"Completed zipping directory {source_dir} into {archive_path}")

        # Remove the original folder only when explicitly requested.
        if delete_files_sp_zip == 1:
            shutil.rmtree(source_dir)
            print(f"Deleted directory {source_dir}")

    except Exception as e:
        print(f"An error occurred: {e}")


def get_yesterday_date(timezone):
    """Return yesterday's date in the given timezone as a ``YYYYMMDD`` string.

    Args:
        timezone: Timezone name understood by ``pytz.timezone``
            (for example ``"Asia/Taipei"``).
    """
    # "Now" as seen in the requested timezone, moved back by one day.
    local_now = datetime.now(pytz.timezone(timezone))
    return (local_now - timedelta(days=1)).strftime("%Y%m%d")

# "Yesterday" is evaluated on the Asia/Taipei clock rather than the machine's local clock.
timezone = "Asia/Taipei"
yesterday_date = get_yesterday_date(timezone)
print("yesterday date: {}".format(yesterday_date))


# Main Program

def fetch_vd(directory_path, date, delete_compressed, delete_decompressed, delete_csv, delete_files_sp_zip):
    """Run the complete pipeline for a single day.

    Steps, in order: download, decompress, XML-to-CSV conversion, regrouping
    by VDID, row-count check, cleanup of intermediate files, zipping.

    Args:
        directory_path: Root data folder.
        date: Day to process; also the name of the day folder.
        delete_compressed: Forwarded to ``delete_files``.
        delete_decompressed: Forwarded to ``delete_files``.
        delete_csv: Forwarded to ``delete_files``.
        delete_files_sp_zip: Forwarded to ``zip_output``.
    """
    day = (directory_path, date)

    download_files_for_day(*day, max_concurrent_downloads=5)

    # These stages only need to know which day folder to work on.
    for stage in (decompress_files, convert_xml_to_csv, process_csv_files, check_files):
        stage(*day)

    delete_files(*day, delete_compressed, delete_decompressed, delete_csv)
    zip_output(*day, delete_files_sp_zip)
    
    
def batch_fetch_vd(start_date, num_days_backwards, directory_path, delete_compressed, delete_decompressed, delete_csv, delete_files_sp_zip):
    """Run ``fetch_vd`` for a run of consecutive days, walking backwards in time.

    Args:
        start_date: First day to process, as a ``YYYYMMDD`` string.
        num_days_backwards: Number of days to process, counting ``start_date``
            itself and then each preceding day.
        directory_path: Root data folder, forwarded to ``fetch_vd``.
        delete_compressed: Forwarded to ``fetch_vd``.
        delete_decompressed: Forwarded to ``fetch_vd``.
        delete_csv: Forwarded to ``fetch_vd``.
        delete_files_sp_zip: Forwarded to ``fetch_vd``.
    """
    date_format = "%Y%m%d"
    first_day = datetime.strptime(start_date, date_format)

    for days_back in range(num_days_backwards):
        day = first_day - timedelta(days=days_back)
        fetch_vd(
            directory_path,
            day.strftime(date_format),
            delete_compressed,
            delete_decompressed,
            delete_csv,
            delete_files_sp_zip,
        )

yesterday date: 20240302


In [3]:
# Single-day alternatives (uncomment to use). The positional flags after the date are:
# delete_compressed, delete_decompressed, delete_csv, delete_files_sp_zip
#fetch_vd(directory_path, "20240228", 1, 1, 0, 1)

#fetch_vd(directory_path, yesterday_date, 1, 1, 0, 0)

# Batch run: process 10 days, starting at 20240210 and stepping back one day at a time.
# `directory_path` is the root folder defined in the first cell.
# With these flags the compressed and decompressed files are deleted, the csv folder is
# kept (so it ends up in the zip), and each day folder is removed after it has been zipped.
batch_fetch_vd("20240210", 10, directory_path, delete_compressed=True, delete_decompressed=True, delete_csv=False, delete_files_sp_zip=True)

Starting download for date: 20240210


  0%|          | 0/1440 [00:00<?, ?it/s]

Download process completed.
Decompressing xml.gz files...


  0%|          | 0/1440 [00:00<?, ?it/s]

Decompression process completed.


  0%|          | 0/1440 [00:00<?, ?it/s]

Processing 1440 CSV files:


  0%|          | 0/1440 [00:00<?, ?file/s]


Saving 3629 VDID-specific CSV files:


  0%|          | 0/3629 [00:00<?, ?VDID/s]


3629 VDID-specific CSV files saved.


Processing CSV files:   0%|          | 0/1440 [00:00<?, ?it/s]

Processing VDID files:   0%|          | 0/3629 [00:00<?, ?it/s]


Distribution of row counts for CSV files:
CSV files with 3629 rows: 1440 files

Distribution of row counts for VDID files:
VDID files with 1440 rows: 3629 files

Total rows in CSV files: 5225760
Total rows in VDID files: 5225760
Total rows in CSV and VDID files are the same.
Deleted file: D:\VD_data\20240210\compressed
Deleted file: D:\VD_data\20240210\decompressed
Zipping 5070 files in D:\VD_data\20240210, please wait...
Completed zipping directory D:\VD_data\20240210 into D:\VD_data\20240210.zip
Deleted directory D:\VD_data\20240210
Starting download for date: 20240209


  0%|          | 0/1440 [00:00<?, ?it/s]