In [1]:
import pandas as pd
import os

In [None]:
# Reduce every daily CSV in `input_folder` to a fixed set of columns and write
# the result, under the same file name, to `output_folder`.
input_folder = 'data'
output_folder = 'data_reduced'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Columns to keep, in the order they are written to the reduced files
columns_to_extract = [
    "location_key", "date", "wikidata_id", "country_name", "new_confirmed",
    "new_deceased", "population", "latitude", "longitude", "school_closing",
    "workplace_closing", "cancel_public_events", "restrictions_on_gatherings",
    "public_transport_closing", "stay_at_home_requirements",
    "restrictions_on_internal_movement", "international_travel_controls",
    "contact_tracing", "stringency_index"
]

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the printed log) the same on every run.
for filename in sorted(os.listdir(input_folder)):
    # Only CSV files are processed; everything else in the folder is ignored
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)

    # Parse only the wanted columns instead of loading every column of the
    # file and discarding most of them afterwards.
    # NOTE(review): a file containing none of the wanted columns is expected to
    # parse to an empty frame here (header-only output) — confirm such files
    # do not occur in the input folder.
    df = pd.read_csv(file_path, usecols=lambda name: name in columns_to_extract)

    # Wanted columns that this particular file does not provide
    missing_columns = [col for col in columns_to_extract if col not in df.columns]

    # Fill absent columns with 0 on a copy (the loaded frame is not mutated),
    # then select the columns in their canonical order.
    # Note that the 0 placeholder applies to ANY absent column, including
    # non-numeric ones such as "date" or "country_name".
    df_reduced = df.assign(**{col: 0 for col in missing_columns})[columns_to_extract]

    # Save the reduced DataFrame to the output folder
    output_path = os.path.join(output_folder, filename)
    df_reduced.to_csv(output_path, index=False)

    print(f'Saved reduced data for {filename} to {output_path}')

In [None]:
# Aggregate every reduced daily CSV in `input_folder` to one row per week and
# write the result, under the same file name, to `output_folder`.
input_folder = 'data_reduced'          # Load from reduced daily data
output_folder = 'data_reduced_weekly'  # Save to weekly data folder

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Weekly aggregation rule per column:
#   "first" - take the first value seen in the week
#   "max"   - take the largest value seen in the week
#   "sum"   - add up the daily values of the week
aggregation_functions = {
    "location_key": "first",
    "wikidata_id": "first",
    "country_name": "first",
    "population": "first",
    "latitude": "first",
    "longitude": "first",
    "school_closing": "max",
    "workplace_closing": "max",
    "cancel_public_events": "max",
    "restrictions_on_gatherings": "max",
    "public_transport_closing": "max",
    "stay_at_home_requirements": "max",
    "restrictions_on_internal_movement": "max",
    "international_travel_controls": "max",
    "contact_tracing": "max",
    "stringency_index": "max",
    "new_confirmed": "sum",
    "new_deceased": "sum"
}

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order (and the printed log) the same on every run.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)

    # Load the reduced daily data file
    df = pd.read_csv(file_path)

    # Parse the dates; values that cannot be parsed become NaT
    parsed_dates = pd.to_datetime(df['date'], errors='coerce')

    # Rows without a usable date cannot be assigned to any week. Report them
    # instead of letting them disappear silently during resampling.
    unparsed_count = int(parsed_dates.isna().sum())
    if unparsed_count:
        print(f'Warning: dropping {unparsed_count} row(s) with missing or '
              f'unparseable dates from {filename}')

    # Replace the date column with the parsed values, drop undated rows and
    # index by date for resampling (built by chaining, no in-place mutation)
    df = df.assign(date=parsed_dates).dropna(subset=['date']).set_index('date')

    # 'W-THU' is a weekly frequency anchored on Thursday. With the pandas
    # defaults for weekly frequencies (closed='right', label='right') each bin
    # covers Friday through Thursday and is labelled with that ENDING Thursday.
    # NOTE(review): an earlier comment described these weeks as "starting on
    # Thursday"; if Thursday-to-Wednesday weeks labelled by their start date
    # are wanted, pass closed='left', label='left' — confirm the intent.
    df_weekly = df.resample('W-THU').agg(aggregation_functions).reset_index()

    # ISO calendar week number of the bin label, i.e. of the week-ending
    # Thursday. No year is stored, so week numbers repeat across years.
    df_weekly['calendar_week'] = df_weekly['date'].dt.isocalendar().week

    # Save the weekly data to the output folder
    output_path = os.path.join(output_folder, filename)
    df_weekly.to_csv(output_path, index=False)

    print(f'Saved weekly reduced data for {filename} to {output_path}')
