In [1]:
import pandas as pd
from datetime import datetime 

# Inclusive processing window, written as YYYY-MM-DD strings.
START_DATE = '2024-01-01'
END_DATE = '2025-02-28'

# Parse both bounds with the same format and wrap them as pandas Timestamps.
start_date, end_date = (
    pd.Timestamp(datetime.strptime(bound, '%Y-%m-%d'))
    for bound in (START_DATE, END_DATE)
)

# Dataset name templates; `{}` is the slot later filled with a date or date pattern.
inputs = ["01_raw.{}.foo_raw_df"]
outputs = "02_intermediate.{}.foo_intermediate_df"

# A single template may be given as a bare string; always work with lists downstream.
inputs = [inputs] if isinstance(inputs, str) else inputs
outputs = [outputs] if isinstance(outputs, str) else outputs

print(inputs)

['01_raw.{}.foo_raw_df']


In [120]:
period = 'daily'

# pandas frequency alias for every period that expands into a range of dates.
# NOTE(review): recent pandas releases deprecate 'M', 'Q' and 'Y' in favour of
# 'ME', 'QE' and 'YE' -- confirm the target pandas version before switching.
freq_by_period = {
    'daily': 'D',
    'monthly': 'M',
    'quarterly': 'Q',
    'yearly': 'Y',
}

if period == 'latest':
    # Only the end of the window is processed (a plain list, not a DatetimeIndex).
    date_list = [end_date]
elif period in freq_by_period:
    date_list = pd.date_range(start=start_date, end=end_date, freq=freq_by_period[period])
else:
    raise ValueError(f"Invalid period: {period}. Must be one of 'latest', 'daily', 'monthly', 'quarterly', 'yearly'")

In [121]:
# Display the dates generated for the selected period.
date_list

DatetimeIndex(['2024-01-01', '2024-01-02', '2024-01-03', '2024-01-04',
               '2024-01-05', '2024-01-06', '2024-01-07', '2024-01-08',
               '2024-01-09', '2024-01-10',
               ...
               '2025-02-19', '2025-02-20', '2025-02-21', '2025-02-22',
               '2025-02-23', '2025-02-24', '2025-02-25', '2025-02-26',
               '2025-02-27', '2025-02-28'],
              dtype='datetime64[ns]', length=425, freq='D')

In [122]:
from dateutil.relativedelta import relativedelta
import datetime as dt

# How many periods (counting the reference one) are folded into each input pattern.
LOOKBACK_PERIODS = 1


def _lookback_date_str(reference, period, steps_back):
    """Format the date token that lies `steps_back` periods before `reference`.

    Args:
        reference: datetime the lookback is measured from.
        period: one of 'latest', 'daily', 'monthly', 'quarterly', 'yearly'.
        steps_back: number of periods to go back; 0 means the reference period.

    Returns:
        'YYYY-MM-DD' for 'latest' and 'daily', 'YYYY-MM-??' for 'monthly',
        'YYYY-Qn' for 'quarterly' (no wildcard characters), and 'YYYY-??-??'
        for 'yearly'. The '?' characters are emitted literally; presumably they
        act as single-character wildcards for whatever resolves the dataset
        name -- confirm against the consumer.

    Raises:
        ValueError: if `period` is not one of the supported values. This mirrors
            the validation done when `date_list` is built, so a stale
            `date_list` combined with a mistyped `period` fails loudly instead
            of silently producing monthly patterns.
    """
    if period in ('latest', 'daily'):
        return (reference - relativedelta(days=steps_back)).strftime("%Y-%m-%d")
    if period == 'monthly':
        return (reference - relativedelta(months=steps_back)).strftime("%Y-%m-??")
    if period == 'quarterly':
        quarter_date = reference - relativedelta(months=steps_back * 3)
        quarter = ((quarter_date.month - 1) // 3) + 1
        return f"{quarter_date.year}-Q{quarter}"
    if period == 'yearly':
        return (reference - relativedelta(years=steps_back)).strftime("%Y-??-??")
    raise ValueError(f"Invalid period: {period}. Must be one of 'latest', 'daily', 'monthly', 'quarterly', 'yearly'")


for date in date_list:
    formatted_date = date.strftime('%Y-%m-%d')

    # Re-parse into a plain `datetime` at midnight so the relativedelta
    # arithmetic below always operates on the same type.
    d = dt.datetime.strptime(formatted_date, "%Y-%m-%d")

    date_range = [_lookback_date_str(d, period, i) for i in range(LOOKBACK_PERIODS)]

    # Brace-delimited, comma-separated alternatives, e.g. "{2024-01-02,2024-01-01}".
    pattern = "{" + ",".join(date_range) + "}"
    print(pattern)

    # Inputs receive the lookback pattern; templates without a placeholder pass through.
    formatted_inputs = [x.format(pattern) if "{" in x else x for x in inputs]
    formatted_inputs.append(f"params:{formatted_date}")
    print(formatted_inputs)

    # Outputs are always keyed by the single reference date.
    formatted_outputs = [x.format(formatted_date) if "{" in x else x for x in outputs]
    print(formatted_outputs)

{2024-01-01}
['01_raw.{2024-01-01}.foo_raw_df', 'params:2024-01-01']
['02_intermediate.2024-01-01.foo_intermediate_df']
{2024-01-02}
['01_raw.{2024-01-02}.foo_raw_df', 'params:2024-01-02']
['02_intermediate.2024-01-02.foo_intermediate_df']
{2024-01-03}
['01_raw.{2024-01-03}.foo_raw_df', 'params:2024-01-03']
['02_intermediate.2024-01-03.foo_intermediate_df']
{2024-01-04}
['01_raw.{2024-01-04}.foo_raw_df', 'params:2024-01-04']
['02_intermediate.2024-01-04.foo_intermediate_df']
{2024-01-05}
['01_raw.{2024-01-05}.foo_raw_df', 'params:2024-01-05']
['02_intermediate.2024-01-05.foo_intermediate_df']
{2024-01-06}
['01_raw.{2024-01-06}.foo_raw_df', 'params:2024-01-06']
['02_intermediate.2024-01-06.foo_intermediate_df']
{2024-01-07}
['01_raw.{2024-01-07}.foo_raw_df', 'params:2024-01-07']
['02_intermediate.2024-01-07.foo_intermediate_df']
{2024-01-08}
['01_raw.{2024-01-08}.foo_raw_df', 'params:2024-01-08']
['02_intermediate.2024-01-08.foo_intermediate_df']
{2024-01-09}
['01_raw.{2024-01-09}.foo_r

In [123]:
# Sample templates for the formatting check in the next cell: two carry a
# positional `{0}` placeholder, one has none.
# NOTE(review): this rebinds `inputs` and `pattern`, which the pattern-building
# loop cell also uses.
inputs = ["data_{0}.csv", "summary.txt", "logs/{0}/log.txt"]
pattern = "2024-05"

In [124]:
# Fill the placeholder where a template has one; leave other entries untouched.
[template.format(pattern) if "{" in template else template for template in inputs]

['data_2024-05.csv', 'summary.txt', 'logs/2024-05/log.txt']

In [1]:
from omegaconf import OmegaConf

# Load the global settings file (path is relative to the current working directory).
params = OmegaConf.load("./conf/base/globals.yml")

In [2]:
# Show the loaded settings to check which keys are available.
print(params)

{'START_DATE': '2025-01-01', 'END_DATE': '2025-01-02', 'PROCESSING_DATE': '2025-01-01'}


In [1]:
from datetime import datetime, timedelta

def get_date_range(processing_date: str, range_date: int) -> list[str]:
    """List the consecutive calendar days that end on `processing_date`.

    Args:
        processing_date: last day of the window, formatted as 'YYYY-MM-DD'.
        range_date: number of days in the window, counting `processing_date`.

    Returns:
        `range_date` date strings ('YYYY-MM-DD') in ascending order, the last
        one being `processing_date`. Empty when `range_date` is zero or negative.

    Raises:
        ValueError: if `processing_date` does not match 'YYYY-MM-DD'.
    """
    day_format = '%Y-%m-%d'
    one_day = timedelta(days=1)

    window_end = datetime.strptime(processing_date, day_format)
    # The window is inclusive of its last day, hence the `- 1`.
    window_start = window_end - timedelta(days=range_date - 1)

    return [
        (window_start + one_day * offset).strftime(day_format)
        for offset in range(range_date)
    ]

In [3]:
# 40-day window ending on 2025-04-30.
# NOTE(review): rebinds `date_list`, which an earlier cell bound to the
# period-based date collection; here it becomes a list of strings.
date_list = get_date_range('2025-04-30', 40)

In [5]:
# Check the element type of the list returned by get_date_range.
print(type(date_list[0]))

<class 'str'>


In [2]:
from urllib.parse import urlparse
import os


# Split the URI into its components, then take the last path segment
# (ignoring trailing slashes) as the table name.
parsed = urlparse('s3a://curated/reviews')
trimmed_path = parsed.path.rstrip('/')
table_name = os.path.basename(trimmed_path)

In [5]:
# URI scheme, i.e. the part before '://'.
parsed.scheme

's3a'

In [3]:
# Path component without its leading slash; the 'curated' segment is not part
# of `path` because urlparse places it in `netloc`.
parsed.path.lstrip('/')

'reviews'

In [4]:
# Last path segment extracted from the URI above.
table_name

'reviews'