In [None]:
import os
import shutil
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import glob

def sort_vdid(date, base_dir="D:\\VD_data\\Taiwan VD database 2023"):
    """Move one day's per-VDID CSV files into per-VDID folders.

    Every ``*.csv`` file found in
    ``<base_dir>/<YYYYMM>/<date>/<date>/VDID`` is moved to
    ``<base_dir>/vdid_specific/<file name without extension>/<date>.csv``.

    Args:
        date: Day to process, as a ``YYYYMMDD`` string.
        base_dir: Root folder of the data set. Defaults to the location
            used throughout this notebook.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are caught and printed rather than propagated.
    """
    try:
        # Extract year and month from the date
        year_month = date[:6]

        # Define the source directory
        src_dir = os.path.join(base_dir, year_month, date, date, "VDID")

        # List all CSV files in the source directory. The directory part is
        # escaped so glob metacharacters in the path are taken literally.
        csv_files = glob.glob(os.path.join(glob.escape(src_dir), "*.csv"))

        if not csv_files:
            # Covers both a missing source directory and an empty one, so the
            # success message below is not printed when nothing was moved.
            print(f"No CSV files found for {date} in {src_dir}, skipping.")
            return

        for src_path in csv_files:
            # The identifier is the file name minus its final extension only;
            # a ".csv" appearing elsewhere in the name is kept.
            vdid_identifier = os.path.splitext(os.path.basename(src_path))[0]
            dest_folder = os.path.join(base_dir, "vdid_specific", vdid_identifier)
            dest_path = os.path.join(dest_folder, f"{date}.csv")

            # Ensure the destination folder exists
            os.makedirs(dest_folder, exist_ok=True)

            # Move and rename the file.
            # NOTE(review): an existing <date>.csv at the destination may be
            # overwritten by shutil.move - confirm that is acceptable.
            shutil.move(src_path, dest_path)

        print(f"All files for {date} processed successfully.")
    except Exception as e:
        print(f"An error occurred while processing files for {date}: {e}")

def batch_sort_vdid(start, end):
    """Run ``sort_vdid`` once for every calendar day from start to end.

    Args:
        start: First day to process, as a ``YYYYMMDD`` string.
        end: Last day to process (inclusive), as a ``YYYYMMDD`` string.

    Raises:
        ValueError: If either argument cannot be parsed with ``%Y%m%d``.
    """
    day_format = "%Y%m%d"
    first_day = datetime.strptime(start, day_format)
    last_day = datetime.strptime(end, day_format)

    # Both bounds are included; when end precedes start the range is empty.
    day_count = (last_day - first_day).days + 1
    dates_to_process = [
        (first_day + timedelta(days=offset)).strftime(day_format)
        for offset in range(day_count)
    ]

    # One sort_vdid call per day, shown on a progress bar
    for day in tqdm(dates_to_process, desc="Processing files"):
        sort_vdid(day)


In [None]:
# Example usage: calls sort_vdid for every day from 2023-02-01 through
# 2023-12-31 (both inclusive). Running this cell moves files on disk.
batch_sort_vdid('20230201', '20231231')

In [None]:
import os
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import shutil

def move_folder(date, base_dir="D:\\VD_data\\Taiwan VD database 2023"):
    """Move one day's ``csv`` folder to ``<base_dir>/xml_to_csv/<date>``.

    The source is ``<base_dir>/<YYYYMM>/<date>/<date>/csv``. Nothing is moved
    when the source folder is missing or when the destination already exists.

    Args:
        date: Day to process, as a ``YYYYMMDD`` string.
        base_dir: Root folder of the data set. Defaults to the location
            used throughout this notebook.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are caught and printed rather than propagated.
    """
    try:
        # Extract year and month from the date
        year_month = date[:6]

        # Define source and destination folder paths
        src_folder = os.path.join(base_dir, year_month, date, date, "csv")
        dest_folder = os.path.join(base_dir, "xml_to_csv", date)

        # Check if the source folder exists
        if not os.path.isdir(src_folder):
            print(f"Source folder for {date} does not exist, skipping.")
            return

        # shutil.move places the source *inside* an existing destination
        # directory (giving xml_to_csv/<date>/csv), so an existing destination
        # is skipped instead of being silently nested into.
        if os.path.exists(dest_folder):
            print(f"Destination folder for {date} already exists, skipping.")
            return

        # Ensure the destination parent folder exists
        os.makedirs(os.path.dirname(dest_folder), exist_ok=True)

        # Move and rename the folder
        shutil.move(src_folder, dest_folder)
        print(f"Folder for {date} moved successfully.")
    except Exception as e:
        print(f"An error occurred while moving folder for {date}: {e}")

def batch_move_folders(start, end):
    """Call ``move_folder`` for each day in the inclusive range start..end.

    Args:
        start: First day to process, as a ``YYYYMMDD`` string.
        end: Last day to process (inclusive), as a ``YYYYMMDD`` string.

    Raises:
        ValueError: If either argument cannot be parsed with ``%Y%m%d``.
    """
    stamp = "%Y%m%d"
    range_start = datetime.strptime(start, stamp)
    range_end = datetime.strptime(end, stamp)

    # Number of days covered, counting both endpoints (<= 0 means no days).
    total_days = (range_end - range_start).days + 1
    dates_to_process = [
        (range_start + timedelta(days=step)).strftime(stamp)
        for step in range(total_days)
    ]

    # Move each day's folder while displaying progress
    for day in tqdm(dates_to_process, desc="Moving folders"):
        move_folder(day)

In [None]:
# Example usage: calls move_folder for every day from 2023-02-01 through
# 2023-12-31 (both inclusive). Running this cell moves folders on disk.
batch_move_folders('20230201', '20231231')

In [None]:
import os
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import shutil

def move_log_file(date, base_dir="D:\\VD_data\\Taiwan VD database 2023"):
    """Move one day's ``download_issues.log`` into a shared log folder.

    The file ``<base_dir>/<YYYYMM>/<date>/<date>/download_issues.log`` is
    moved to ``<base_dir>/download_issues/<date>.log``. A missing source file
    is reported and skipped.

    Args:
        date: Day to process, as a ``YYYYMMDD`` string.
        base_dir: Root folder of the data set. Defaults to the location
            used throughout this notebook.

    Returns:
        None. Progress and errors are reported with ``print``; exceptions
        are caught and printed rather than propagated.
    """
    try:
        # Extract year and month from the date
        year_month = date[:6]

        # Define source and destination log file paths
        src_file = os.path.join(base_dir, year_month, date, date, "download_issues.log")
        dest_folder = os.path.join(base_dir, "download_issues")
        dest_file = os.path.join(dest_folder, f"{date}.log")

        # Check if the source log file exists
        if os.path.isfile(src_file):
            # Ensure the destination folder exists
            os.makedirs(dest_folder, exist_ok=True)

            # Move and rename the log file.
            # NOTE(review): an existing <date>.log at the destination may be
            # overwritten by shutil.move - confirm that is acceptable.
            shutil.move(src_file, dest_file)
            print(f"Log file for {date} moved successfully.")
        else:
            print(f"Source log file for {date} does not exist, skipping.")
    except Exception as e:
        print(f"An error occurred while moving log file for {date}: {e}")

def batch_move_log_files(start, end):
    """Call ``move_log_file`` for each day in the inclusive range start..end.

    Args:
        start: First day to process, as a ``YYYYMMDD`` string.
        end: Last day to process (inclusive), as a ``YYYYMMDD`` string.

    Raises:
        ValueError: If either argument cannot be parsed with ``%Y%m%d``.
    """
    pattern = "%Y%m%d"
    opening_day = datetime.strptime(start, pattern)
    closing_day = datetime.strptime(end, pattern)

    # Inclusive day span; an end date before the start date gives no days.
    span = (closing_day - opening_day).days + 1
    dates_to_process = [
        (opening_day + timedelta(days=index)).strftime(pattern)
        for index in range(span)
    ]

    # Move each day's log file while displaying progress
    for day in tqdm(dates_to_process, desc="Moving log files"):
        move_log_file(day)


In [None]:
# Example usage: calls move_log_file for every day from 2023-02-01 through
# 2023-12-31 (both inclusive). Running this cell moves files on disk.
batch_move_log_files('20230201', '20231231')

In [None]:
# The following function will parse the logs considering the requirements specified:
# 1. Removing the colon at the end of URLs.
# 2. Ensuring one file only has one status.
# 3. Correcting the result when there is a retry after an initial failure.

import csv
import os
import re
from collections import defaultdict

def log_to_csv(log_directory, output_csv_file):
    """Summarize the ``.log`` files of a directory into a single CSV file.

    Each non-blank log line is parsed for a date, a ``.xml.gz`` file name, a
    URL, an error reason and a retry marker. Only one row is kept per
    (date, file name): the first line seen supplies the URL and error reason,
    and any later line marked as a retry switches the row's status to
    "Retried".

    Args:
        log_directory: Directory containing the ``.log`` files to read.
        output_csv_file: Path of the CSV file to write (overwritten).

    Returns:
        None. The result is written to ``output_csv_file`` with the columns
        Date, File Name, URL, Error Reason, Retry Status.
    """
    def list_log_files(directory):
        """Returns the .log files in the given directory, sorted by name."""
        # Sorted so the row order of the CSV, and which line counts as the
        # "first" for a file, do not depend on os.listdir's arbitrary order.
        return sorted(
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.endswith('.log') and os.path.isfile(os.path.join(directory, name))
        )

    def parse_log_line(line):
        """Parses a single line of log to extract relevant information."""
        # Drop the trailing newline / surrounding whitespace
        line = line.strip()

        date_match = re.search(r'/(\d{8})/', line)
        date = date_match.group(1) if date_match else "Unknown Date"

        file_name_match = re.search(r'([^/]+\.xml\.gz)', line)
        file_name = file_name_match.group(0) if file_name_match else "Unknown File"

        # Identify the error reason based on the line content
        if "404 Client Error: Not Found" in line:
            error_reason = "404 Not Found"
        elif "File too small" in line:
            error_reason = "File too small and deleted"
        else:
            error_reason = "Unknown Reason"

        # Identify if the download was retried
        retry_status = "Retried" if "Failed to download on retry" in line else "Not Retried"

        url_match = re.search(r'https?://[^\s]+', line)
        # The URL pattern runs up to the next whitespace, so a ":" written
        # directly after the URL is captured with it; strip it from the URL
        # itself (stripping the whole line cannot reach a mid-line colon).
        url = url_match.group(0).rstrip(':') if url_match else "Unknown URL"

        return date, file_name, url, error_reason, retry_status

    def process_logs_to_csv(log_files, csv_file_path):
        """Parses every log file and writes one CSV row per (date, file)."""
        log_entries = defaultdict(dict)

        for log_file_path in log_files:
            with open(log_file_path, 'r') as log_file:
                for line in log_file:
                    # Blank lines carry no information and would otherwise
                    # produce an "Unknown Date"/"Unknown File" row.
                    if not line.strip():
                        continue

                    date, file_name, url, error_reason, retry_status = parse_log_line(line)
                    entries_for_date = log_entries[date]

                    # If the file entry already exists, update the retry status if necessary
                    if file_name in entries_for_date:
                        if retry_status == "Retried":
                            entries_for_date[file_name]['Retry Status'] = "Retried"
                    else:
                        entries_for_date[file_name] = {
                            'URL': url,
                            'Error Reason': error_reason,
                            'Retry Status': retry_status
                        }

        with open(csv_file_path, 'w', newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Date", "File Name", "URL", "Error Reason", "Retry Status"])

            for date, files in log_entries.items():
                for file_name, data in files.items():
                    writer.writerow([date, file_name, data['URL'], data['Error Reason'], data['Retry Status']])

    log_files = list_log_files(log_directory)
    process_logs_to_csv(log_files, output_csv_file)

# Example usage:
# Replace 'path/to/log_directory' with the path to the directory containing your log files
# Replace 'path/to/output_combined_errors.csv' with the desired output path for the CSV file
# log_to_csv('path/to/log_directory', 'path/to/output_combined_errors.csv')


In [None]:
# Example usage:

# Raw strings keep the Windows backslashes literal; without the r prefix,
# sequences such as "\V", "\T", "\d" and "\o" are invalid escape sequences.
log_directory = r"D:\VD_data\Taiwan VD database 2023\download_issues"
output_csv_file = r"D:\VD_data\Taiwan VD database 2023\output_combined_errors.csv"

#log_to_csv(log_directory, output_csv_file)

In [None]:
import os
import requests
import re
import logging
from tqdm import tqdm
from urllib.parse import urlparse

# Initialize logging: ERROR-level records are written to this file
log_filename = "C:\\Users\\galen\\Downloads\\error_files_download.log"
logging.basicConfig(filename=log_filename, level=logging.ERROR, format='%(asctime)s %(levelname)s:%(message)s')

# Directory to save downloaded files (one sub-folder per date is created by download_file)
base_dir = "C:\\Users\\galen\\Downloads\\error files"

# Ensure the base directory exists; exist_ok avoids the separate
# check-then-create step.
os.makedirs(base_dir, exist_ok=True)

# Function to download and save a file from a URL
def download_file(url, timeout=30):
    """Download one ``.xml.gz`` file into ``<base_dir>/<date>/``.

    The URL must end in ``/<8 digits>/<name>.xml.gz``; the 8-digit segment is
    used as the sub-folder name under the module-level ``base_dir``. Payloads
    smaller than 1 KB are not kept. The date folder is created only once
    there is a payload worth saving, so failed or undersized downloads do not
    leave empty folders behind.

    Args:
        url: Address of the file to download.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the whole run.

    Returns:
        None. Failures are written to the log via ``logging.error``;
        exceptions are caught and logged rather than propagated.
    """
    try:
        # Extract the date and filename from the URL
        match = re.search(r'/(\d{8})/(\w+\.xml\.gz)$', url)
        if not match:
            logging.error(f"URL format not recognized: {url}")
            return

        date, filename = match.groups()

        # Download the file
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            logging.error(f"Failed to download {url} - Status code {response.status_code}")
            return

        # Reject payloads under 1 KB before touching the disk. The message
        # wording is kept unchanged so existing log consumers still match it.
        content = response.content
        file_size = len(content)
        if file_size < 1024:
            logging.error(f"Deleting {filename}: File size {file_size} bytes is less than 1KB")
            print(f"Deleted {filename} due to size < 1KB")
            return

        # Ensure the folder for this date exists
        folder_path = os.path.join(base_dir, date)
        os.makedirs(folder_path, exist_ok=True)

        # Save the file
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'wb') as file:
            file.write(content)
        print(f"Downloaded {filename} to {date} folder")
    except Exception as e:
        logging.error(f"Error downloading file from {url}: {str(e)}")

# Read URLs from the text file, one per line. Blank lines are dropped so they
# are not handed to download_file and logged as unrecognized URLs.
with open("C:\\Users\\galen\\Downloads\\error_list.txt", 'r') as file:
    urls = [line.strip() for line in file if line.strip()]

# Download each file with a progress bar
for url in tqdm(urls, desc="Downloading files"):
    download_file(url)

In [2]:
import os

def remove_empty_folders(path):
    """Delete every empty directory found below ``path``.

    The tree is walked bottom-up, so a directory that becomes empty once its
    own empty sub-directories are removed is deleted as well. ``path`` itself
    is never removed. If ``path`` is not a directory, a message is printed
    and nothing else happens.

    Args:
        path: Root directory to clean up.
    """
    if os.path.isdir(path):
        # Children are visited before their parents (topdown=False)
        for parent, subfolders, _files in os.walk(path, topdown=False):
            candidates = (os.path.join(parent, sub) for sub in subfolders)
            for folder in candidates:
                # Re-list at removal time: the folder may have just been emptied
                if len(os.listdir(folder)) == 0:
                    os.rmdir(folder)
                    print(f"Removed empty folder: {folder}")
    else:
        print(f"The path {path} is not a directory.")

# Remove the empty sub-folders under the download directory.
# NOTE: this call is active (not commented out) and modifies the filesystem
# when the cell runs; comment it out if you only want to define the function.
remove_empty_folders("C:\\Users\\galen\\Downloads\\error files")

# The call above deletes only empty directories; the top-level directory itself is kept.


Removed empty folder: C:\Users\galen\Downloads\error files\20230101
Removed empty folder: C:\Users\galen\Downloads\error files\20230102
Removed empty folder: C:\Users\galen\Downloads\error files\20230103
Removed empty folder: C:\Users\galen\Downloads\error files\20230104
Removed empty folder: C:\Users\galen\Downloads\error files\20230105
Removed empty folder: C:\Users\galen\Downloads\error files\20230106
Removed empty folder: C:\Users\galen\Downloads\error files\20230107
Removed empty folder: C:\Users\galen\Downloads\error files\20230108
Removed empty folder: C:\Users\galen\Downloads\error files\20230109
Removed empty folder: C:\Users\galen\Downloads\error files\20230110
Removed empty folder: C:\Users\galen\Downloads\error files\20230111
Removed empty folder: C:\Users\galen\Downloads\error files\20230112
Removed empty folder: C:\Users\galen\Downloads\error files\20230113
Removed empty folder: C:\Users\galen\Downloads\error files\20230114
Removed empty folder: C:\Users\galen\Downloads\e