# Purpose
I need to find out all the possible date formats of the "REF_DATE" field so that, when I write the parquet file, people will be able to filter on it.

These are all the date formats I have encountered so far:
- Just the year. Example found on productId 36100608: 2024
- Year and month. Example found on productId 14100443: 2024-07
- Year, month, day. Example found on productId 33100036: 2025-06-17
- Range. Example found on productId 17100022: 2013/2014

In [107]:
import polars as pl

# Root of the table data; the working folders below all live under it.
data_folder = "/data/tables"
input_folder, scratch_folder, output_folder = (
    f"{data_folder}/{leaf}" for leaf in ("input", "scratch", "output")
)

def normalize_ref_date(series):
    """Convert REF_DATE values to ``pl.Date``.

    Accepted input formats:

    * ``YYYY-MM-DD`` -> that exact date
    * ``YYYY-MM``    -> the first day of that month
    * ``YYYY``       -> January 1st of that year

    Values in any other format (for example a range such as ``2013/2014``)
    come back as null, because parsing is done with ``strict=False``.

    Args:
        series: A polars Series or expression holding the REF_DATE values.
            It is cast to string first, so non-string inputs (e.g. integer
            years) are accepted as well.

    Returns:
        The same kind of object (Series or expression) with dtype ``pl.Date``.
    """
    # Always cast to string first
    text = series.cast(pl.Utf8)

    # The date parser needs a year, a month AND a day to build a date, so
    # parsing with a partial format such as "%Y-%m" or "%Y" does not produce
    # a usable value. Pad the partial forms up to a full YYYY-MM-DD string
    # instead. The patterns are anchored, so full dates and unsupported
    # values pass through unchanged.
    padded = text.str.replace(r"^(\d{4})$", "${1}-01-01")
    padded = padded.str.replace(r"^(\d{4}-\d{1,2})$", "${1}-01")

    # Single parse; anything that is still not YYYY-MM-DD becomes null.
    return padded.str.strptime(pl.Date, "%Y-%m-%d", strict=False)

# Load one previously written table so its REF_DATE column can be inspected.
filepath = "/".join([output_folder, "en/other/en/33100496.parquet"])
df = pl.read_parquet(filepath)
# Keep this expression last in the cell so the notebook renders the preview.
df.head()

REF_DATE,GEO,DGUID,Restriction level,Vaccination status,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
str,str,str,str,str,str,i16,str,i8,str,str,f64,i8,i8,i8,i8
"""2020-01-01""","""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Total population""","""Index""",160,"""units""",0,"""v1331468081""","""1.1.1""",1.67,,,,2
"""2020-01-01""","""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Vaccinated persons""","""Index""",160,"""units""",0,"""v1331468097""","""1.1.2""",1.67,,,,2
"""2020-01-01""","""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Unvaccinated persons""","""Index""",160,"""units""",0,"""v1331468113""","""1.1.3""",1.67,,,,2
"""2020-01-01""","""Newfoundland and Labrador""","""2016A000210""","""School closing""","""Total population""","""Index""",160,"""units""",0,"""v1331468082""","""1.2.1""",0.0,,,,2
"""2020-01-01""","""Newfoundland and Labrador""","""2016A000210""","""School closing""","""Vaccinated persons""","""Index""",160,"""units""",0,"""v1331468098""","""1.2.2""",0.0,,,,2


In [108]:
# REF_DATE values containing "/" are ranges (e.g. 2013/2014) and cannot be
# turned into a single date, so the conversion is skipped for such tables.
# The column is only scanned when it is actually stored as a string.
ref_date_is_text = df.schema["REF_DATE"] == pl.String
skip_calculating_ref_date = bool(
    ref_date_is_text and df["REF_DATE"].str.contains("/").any()
)

In [109]:
# Replace REF_DATE with its parsed date form unless the table was flagged
# as containing range values.
if not skip_calculating_ref_date:
    df = df.with_columns(REF_DATE=normalize_ref_date(pl.col("REF_DATE")))

In [110]:
# Preview the first rows again to check the dtype of REF_DATE after the step above.
df.head()

REF_DATE,GEO,DGUID,Restriction level,Vaccination status,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
date,str,str,str,str,str,i16,str,i8,str,str,f64,i8,i8,i8,i8
2020-01-01,"""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Total population""","""Index""",160,"""units""",0,"""v1331468081""","""1.1.1""",1.67,,,,2
2020-01-01,"""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Vaccinated persons""","""Index""",160,"""units""",0,"""v1331468097""","""1.1.2""",1.67,,,,2
2020-01-01,"""Newfoundland and Labrador""","""2016A000210""","""Restriction index""","""Unvaccinated persons""","""Index""",160,"""units""",0,"""v1331468113""","""1.1.3""",1.67,,,,2
2020-01-01,"""Newfoundland and Labrador""","""2016A000210""","""School closing""","""Total population""","""Index""",160,"""units""",0,"""v1331468082""","""1.2.1""",0.0,,,,2
2020-01-01,"""Newfoundland and Labrador""","""2016A000210""","""School closing""","""Vaccinated persons""","""Index""",160,"""units""",0,"""v1331468098""","""1.2.2""",0.0,,,,2
