In [1]:
import sys
import os
from pathlib import Path
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append(str(Path.cwd().parent))
from config import PATH_KIOSK_USER_PATTERNS_FOLDER, PATH_KIOSK_USER_PATTERNS_REPO, PATH_DATA_SHARING_FOLDER

# Root of the shared DEWEY data folder.
kupdat03_path = PATH_DATA_SHARING_FOLDER / 'DEWEY'

# Run the repo's helper script inside this notebook's namespace. A later cell
# calls `extract_file_list`, which is presumably defined by this script
# (TODO confirm).
# `execfile` is not a Python 3 builtin: it only resolves when the environment
# happens to inject it (e.g. a debugger hook), so use the portable
# exec/compile form instead. Reading bytes lets `compile` honour the script's
# own encoding declaration.
_extract_script = Path(PATH_KIOSK_USER_PATTERNS_REPO) / 'functions/extract_file_list.py'
exec(compile(_extract_script.read_bytes(), str(_extract_script), 'exec'), globals())


In [2]:
import numpy as np
import deweydatapy as ddp

# Daily calendar from Jan 1, 2019 through June 30, 2025 (both ends inclusive)
dates = pd.date_range(start='2019-01-01', end='2025-06-30', freq='D')

# One row per day, flagged Mon-Fri vs. Sat/Sun, plus the year/month keys
# used for grouping.
df_dates = pd.DataFrame({'date': dates}).assign(
    is_weekday=lambda frame: frame['date'].dt.weekday < 5,
    year=lambda frame: frame['date'].dt.year,
    month=lambda frame: frame['date'].dt.month,
)

def sample_days(group):
    """Pick one weekday row and one weekend row from ``group``.

    ``group`` is a DataFrame with a boolean ``is_weekday`` column. Each pick
    uses a fixed ``random_state=42`` so the result is reproducible. When a
    category has no rows, an empty DataFrame takes its place, so the result
    holds 0, 1 or 2 rows (weekday pick first, then weekend pick).
    """
    weekday_mask = group['is_weekday']
    picks = []
    for mask in (weekday_mask, ~weekday_mask):
        if mask.any():
            picks.append(group[mask].sample(1, random_state=42))
        else:
            picks.append(pd.DataFrame())
    return pd.concat(picks)

# Draw the weekday/weekend sample separately within every (year, month) group,
# then renumber the rows 0..n-1.
monthly_groups = df_dates.groupby(['year', 'month'], group_keys=False)
sampled = monthly_groups.apply(sample_days).reset_index(drop=True)

sampled.head()

  sampled = df_dates.groupby(['year', 'month'], group_keys=False).apply(sample_days).reset_index(drop=True)


Unnamed: 0,date,is_weekday,year,month
0,2019-01-22,True,2019,1
1,2019-01-06,False,2019,1
2,2019-02-01,True,2019,2
3,2019-02-03,False,2019,2
4,2019-03-01,True,2019,3


In [3]:
# Alternative endpoint, left commented out (not used by this notebook as written):
# data_endpoint = "https://app.deweydata.io/external-api/v3/products/c117bbd0-ed3a-4c05-8f3c-01ac4b12ef60/files"
# Endpoint URL passed to extract_file_list() below when listing the Veraset visit files.
data_endpoint = "https://app.deweydata.io/api/v1/external/data/fldr_8zme9bwbekydvezq"


In [4]:
# Per-month bookkeeping, keyed by "YYYY-MM":
#   files_dict - file listing for the month (fresh from the API or read from the cached CSV)
#   meta_dict  - second value returned by extract_file_list (only for months listed via the API)
#   error_dict - index labels of the rows whose download raised an exception
files_dict = {}
meta_dict = {}
error_dict = {}

# `execfile` is not a Python 3 builtin (it only resolves when the environment
# injects it), so the constants script is run with the portable exec/compile form.
constants_script = Path(PATH_KIOSK_USER_PATTERNS_REPO) / 'constants.py'
data_dir = PATH_KIOSK_USER_PATTERNS_REPO / 'data'

# Walk backwards in time: 2020 first, then 2019; December down to January.
for y in range(2020, 2018, -1):
    dest_dir = fr'R:/Global Diabetes Research Center/Patel/Veraset/Sampled Visit Dates/{y}/'
    os.makedirs(dest_dir, exist_ok=True)

    for m in range(12, 0, -1):
        # Presumably defines `bulk_api_veraset_visits` used below - TODO confirm.
        # NOTE(review): this looks loop-invariant; kept at its original position
        # because the script's contents are not visible from this notebook.
        exec(compile(constants_script.read_bytes(), str(constants_script), 'exec'), globals())

        key = f"{y}-{m:02d}"
        start = f"{key}-01"
        # Use the real last day of the month. The previous hard-coded table fixed
        # February at 28 days, which silently dropped 2020-02-29 (2020 is a leap
        # year and is covered by this loop).
        last_day = pd.Timestamp(year=y, month=m, day=1).days_in_month
        end = f"{key}-{last_day:02d}"
        # Never request anything past 2025-05-31.
        if pd.to_datetime(end) > pd.to_datetime('2025-05-31'):
            end = '2025-05-31'

        files_csv = data_dir / f'kupdat03b_veraset visits_{y}_{m:02d}.csv'
        merged_csv = data_dir / f'kupdat03b_veraset visits after sampling_{y}_{m:02d}.csv'

        # NOTE(review): these months are hard-coded to reuse the CSVs saved by an
        # earlier run instead of querying the API again; no `meta` is recorded for them.
        if y == 2020 and m in [12, 11, 10, 9, 8, 7]:
            files = pd.read_csv(files_csv)
            files_dict[key] = files
            merged = pd.read_csv(merged_csv)
        else:
            print(start + " to " + end)
            files, meta = extract_file_list(bulk_api_veraset_visits, data_endpoint, sd=start, ed=end)
            # Save the raw listing before any helper columns are added.
            files.to_csv(files_csv, index=False)

            files_dict[key] = files
            meta_dict[key] = meta

            # Ensure 'partition_key' in files and 'date' in sampled are both datetime type
            files['partition_key_dt'] = pd.to_datetime(files['partition_key'])
            sampled['date_dt'] = pd.to_datetime(sampled['date'])

            # Keep only the files whose partition date is one of the sampled dates
            merged = pd.merge(files, sampled, left_on='partition_key_dt', right_on='date_dt', how='inner')
            merged.to_csv(merged_csv, index=False)

        # Download one row at a time so a single failure is recorded (by index
        # label) and reported without aborting the rest of the month.
        error_list = []
        for idx, row in merged.iterrows():
            try:
                ddp.download_files(pd.DataFrame([row]), dest_folder=dest_dir, skip_exists=True)
            except Exception as e:
                error_list.append(idx)
                print(f"Download failed for index {idx}: {e}")
        error_dict[key] = pd.Series(error_list, dtype='object')



Downloading 1/1 (file index = 50)
File already exists: R:/Global Diabetes Research Center/Patel/Veraset/Sampled Visit Dates/2020/part-00000-tid-1029250406933512839-3e3962f9-3cc0-4e4a-90c0-62a07806007c-93461-1-c000.snappy.parquet
Skipping...
Downloading 1/1 (file index = 51)
File already exists: R:/Global Diabetes Research Center/Patel/Veraset/Sampled Visit Dates/2020/part-00001-tid-1029250406933512839-3e3962f9-3cc0-4e4a-90c0-62a07806007c-93452-1-c000.snappy.parquet
Skipping...
Downloading 1/1 (file index = 52)
File already exists: R:/Global Diabetes Research Center/Patel/Veraset/Sampled Visit Dates/2020/part-00002-tid-1029250406933512839-3e3962f9-3cc0-4e4a-90c0-62a07806007c-93460-1-c000.snappy.parquet
Skipping...
Downloading 1/1 (file index = 53)
File already exists: R:/Global Diabetes Research Center/Patel/Veraset/Sampled Visit Dates/2020/part-00003-tid-1029250406933512839-3e3962f9-3cc0-4e4a-90c0-62a07806007c-93454-1-c000.snappy.parquet
Skipping...
Downloading 1/1 (file index = 54)
Fi