In [2]:
import pandas as pd
import os

In [3]:
# Reduce every per-location CSV in `input_folder` to a fixed set of columns
# and write the result, under the same file name, to `output_folder`.
input_folder = 'data'
output_folder = 'data_reduced'

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Columns to keep, in the order in which they are written to the reduced files
columns_to_extract = [
    "location_key", "date", "wikidata_id", "country_name", "new_confirmed",
    "new_deceased", "population", "latitude", "longitude", "school_closing",
    "workplace_closing", "cancel_public_events", "restrictions_on_gatherings",
    "public_transport_closing", "stay_at_home_requirements",
    "restrictions_on_internal_movement", "international_travel_controls",
    "contact_tracing", "stringency_index", "average_temperature_celsius",
    "minimum_temperature_celsius", "maximum_temperature_celsius",
    "rainfall_mm", "snowfall_mm", "dew_point", "relative_humidity",
]

# Set form of the list above, used for fast membership tests while parsing
wanted_columns = frozenset(columns_to_extract)

# Sorted so that the processing order (and the log printed below) does not
# depend on the directory order reported by the filesystem.
for filename in sorted(os.listdir(input_folder)):
    # Only CSV files are processed; anything else in the folder is ignored
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)

    # Parse only the columns that are kept. A callable is used instead of a
    # plain list because a list would raise for files that lack some of the
    # columns, whereas the callable simply selects whichever ones are present.
    df = pd.read_csv(file_path, usecols=lambda name: name in wanted_columns)

    # Identify the requested columns this file does not provide
    missing_columns = [col for col in columns_to_extract if col not in df.columns]

    # Add the missing columns filled with zeros.
    # NOTE(review): a placeholder 0 cannot be told apart from a genuine 0
    # (e.g. 0 degrees, 0 population) in the reduced files - confirm that the
    # downstream steps really want 0 rather than NaN here.
    df = df.assign(**{col: 0 for col in missing_columns})

    # Extract the columns in the canonical order
    df_reduced = df[columns_to_extract]

    # Save the reduced DataFrame to the output folder
    output_path = os.path.join(output_folder, filename)
    df_reduced.to_csv(output_path, index=False)

    print(f'Saved reduced data for {filename} to {output_path}')

Saved reduced data for MN.csv to data_reduced/MN.csv
Saved reduced data for NU.csv to data_reduced/NU.csv
Saved reduced data for MY.csv to data_reduced/MY.csv
Saved reduced data for CR.csv to data_reduced/CR.csv
Saved reduced data for BV.csv to data_reduced/BV.csv
Saved reduced data for AM.csv to data_reduced/AM.csv
Saved reduced data for AZ.csv to data_reduced/AZ.csv
Saved reduced data for BA.csv to data_reduced/BA.csv
Saved reduced data for TL.csv to data_reduced/TL.csv
Saved reduced data for VE.csv to data_reduced/VE.csv
Saved reduced data for TZ.csv to data_reduced/TZ.csv
Saved reduced data for TM.csv to data_reduced/TM.csv
Saved reduced data for CD.csv to data_reduced/CD.csv
Saved reduced data for AL.csv to data_reduced/AL.csv
Saved reduced data for BW.csv to data_reduced/BW.csv
Saved reduced data for MX.csv to data_reduced/MX.csv
Saved reduced data for NC.csv to data_reduced/NC.csv
Saved reduced data for MO.csv to data_reduced/MO.csv
Saved reduced data for LK.csv to data_reduced/

In [6]:
# Aggregate every reduced daily CSV in `input_folder` into weekly rows and
# write the result, under the same file name, to `output_folder`.
input_folder = 'data_reduced'          # Load from reduced daily data
output_folder = 'data_reduced_weekly'  # Save to weekly data folder

# Ensure the output folder exists
os.makedirs(output_folder, exist_ok=True)

# Weekly aggregation rule per column. The keys also determine which columns
# appear in the weekly files and in which order.
aggregation_functions = {
    # Identifiers / static attributes: take the first value seen in the week
    "location_key": "first",
    "wikidata_id": "first",
    "country_name": "first",
    "population": "first",
    "latitude": "first",
    "longitude": "first",
    # Policy indicators: keep the highest level reached during the week
    "school_closing": "max",
    "workplace_closing": "max",
    "cancel_public_events": "max",
    "restrictions_on_gatherings": "max",
    "public_transport_closing": "max",
    "stay_at_home_requirements": "max",
    "restrictions_on_internal_movement": "max",
    "international_travel_controls": "max",
    "contact_tracing": "max",
    "stringency_index": "max",
    # Daily counts: weekly totals
    "new_confirmed": "sum",
    "new_deceased": "sum",
    # Weather: weekly mean / extremes / totals
    "average_temperature_celsius": "mean",
    "minimum_temperature_celsius": "min",
    "maximum_temperature_celsius": "max",
    "rainfall_mm": "sum",
    "snowfall_mm": "sum",
    "dew_point": "mean",
    "relative_humidity": "mean",
}

# Sorted so that the processing order (and the log printed below) does not
# depend on the directory order reported by the filesystem.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.csv'):
        continue

    file_path = os.path.join(input_folder, filename)

    # Load the reduced daily data file
    df = pd.read_csv(file_path)

    # Parse the dates; values that cannot be parsed become NaT
    df['date'] = pd.to_datetime(df['date'], errors='coerce')

    # Rows without a usable date cannot be assigned to any week, so they are
    # dropped explicitly before the date becomes the index for resampling.
    df = df.dropna(subset=['date']).set_index('date')

    # 'W-THU' is a weekly frequency anchored on Thursday: each bin covers
    # Friday through Thursday and is labelled with the Thursday that CLOSES
    # it (not the day the week starts on).
    # NOTE(review): the first and last bin of a file may cover fewer than
    # seven days, and "sum" reports 0 for a week without any values.
    df_weekly = df.resample('W-THU').agg(aggregation_functions).reset_index()

    # ISO calendar week of the bin label, i.e. of the closing Thursday.
    # The number repeats every year; combine it with `date` when the data
    # spans more than one year.
    df_weekly['calendar_week'] = df_weekly['date'].dt.isocalendar().week

    # Save the weekly data to the output folder
    output_path = os.path.join(output_folder, filename)
    df_weekly.to_csv(output_path, index=False)

    print(f'Saved weekly reduced data for {filename} to {output_path}')


Saved weekly reduced data for MN.csv to data_reduced_weekly/MN.csv
Saved weekly reduced data for NU.csv to data_reduced_weekly/NU.csv
Saved weekly reduced data for MY.csv to data_reduced_weekly/MY.csv
Saved weekly reduced data for CR.csv to data_reduced_weekly/CR.csv
Saved weekly reduced data for BV.csv to data_reduced_weekly/BV.csv
Saved weekly reduced data for AM.csv to data_reduced_weekly/AM.csv
Saved weekly reduced data for AZ.csv to data_reduced_weekly/AZ.csv
Saved weekly reduced data for BA.csv to data_reduced_weekly/BA.csv
Saved weekly reduced data for TL.csv to data_reduced_weekly/TL.csv
Saved weekly reduced data for VE.csv to data_reduced_weekly/VE.csv
Saved weekly reduced data for TZ.csv to data_reduced_weekly/TZ.csv
Saved weekly reduced data for TM.csv to data_reduced_weekly/TM.csv
Saved weekly reduced data for CD.csv to data_reduced_weekly/CD.csv
Saved weekly reduced data for AL.csv to data_reduced_weekly/AL.csv
Saved weekly reduced data for BW.csv to data_reduced_weekly/BW