In [None]:
import os
import pickle
import time

import numpy as np
import pandas as pd
import pyfredapi as pf

In [None]:
# SECURITY: a key committed in a notebook is readable by anyone who can see the file.
# Prefer supplying it through the FRED_API_KEY environment variable. The literal below is
# kept only as a backward-compatible fallback and should be rotated and then removed.
API_KEY = os.environ.get('FRED_API_KEY') or '10e0969f13a4b82bc47d736e1047d303'

In [None]:
# Seed list of FRED's top-level categories. Each node starts with an empty 'children'
# list, which build_category_tree fills in by querying the API.
category_tree = [
    # id 32991, not 1: id 1 belongs to 'Production & Business Activity' below. Sharing it
    # crawled that subtree twice and never visited Money, Banking, & Finance.
    {'name': 'Money, Banking, & Finance', 'id': 32991, 'children': []},
    {'name': 'Population, Employment, & Labor Markets', 'id': 10, 'children': []},
    {'name': 'National Accounts', 'id': 32992, 'children': []},
    {'name': 'Production & Business Activity', 'id': 1, 'children': []},
    {'name': 'Prices', 'id': 32455, 'children': []},
    {'name': 'International Data', 'id': 32263, 'children': []},
    {'name': 'U.S. Regional Data', 'id': 3008, 'children': []},
    {'name': 'Academic Data', 'id': 33060, 'children': []},
]

In [None]:

def build_category_tree(category_tree):
    '''
    Fill in the 'children' list of every node by walking the category
    hierarchy breadth-first, issuing one pf.get_category_children request
    per node.

    Node dicts are updated in place, and the list object that was passed in
    is what gets returned. A one-second pause precedes every request.
    '''
    frontier = list(category_tree)
    while frontier:
        upcoming = []
        for node in frontier:
            time.sleep(1)
            response = pf.get_category_children(category_id=node['id'], api_key=API_KEY)
            node['children'] = response['categories']
            # Newly discovered children form the next level to visit.
            upcoming.extend(node['children'])
        frontier = upcoming

    return category_tree
            

In [None]:
# Crawl the API to populate the seed tree. build_category_tree mutates and returns the
# same list, and sleeps one second per request, so this cell is slow.
filled_category_tree = build_category_tree(category_tree)

In [None]:
# pickle.dump(filled_category_tree, open('/Users/jonathanl/fred-categories.pkl', 'wb'))

In [None]:
def extract_terminal_nodes(category_tree):
    '''
    Collect the leaf nodes of a category tree.

    The tree is scanned breadth-first; a node counts as a leaf when its
    'children' list is empty. Leaves are returned in visiting order, and
    the input list itself is left unmodified.
    '''
    leaves = []
    current_level = list(category_tree)
    while current_level:
        next_level = []
        for node in current_level:
            kids = node['children']
            if len(kids) >= 1:
                next_level.extend(kids)
            else:
                leaves.append(node)
        current_level = next_level
    return leaves

In [None]:
# Leaf categories (nodes with no children) of the populated tree.
terminal_nodes = extract_terminal_nodes(filled_category_tree)

In [None]:
# Number of leaf categories found.
len(terminal_nodes)

In [None]:
def build_series_dictionary(terminal_categories):
    '''
    Fetch the series listing of each category and tabulate it.

    One pf.get_category_series request is made per category. The attributes
    of every returned series-info object become one row, and the id of the
    originating category is attached as a 'category_id' column.

    Returns a list holding one DataFrame per input category, in input order
    (a list of frames, despite the function's name). Only what the listing
    call returns is stored; no observation data is downloaded here.
    A one-second pause follows every request.
    '''
    frames = []
    for cat in terminal_categories:
        print(f'processing category: {cat["name"]}')
        cat_id = cat['id']
        listing = pf.get_category_series(category_id=cat_id, api_key=API_KEY)
        rows = [vars(info) for info in listing.values()]
        frame = pd.DataFrame.from_records(rows)
        frame['category_id'] = cat_id
        frames.append(frame)
        time.sleep(1)

    return frames

In [None]:
# One metadata frame per leaf category (one API request and a one-second sleep each).
series_frames = build_series_dictionary(terminal_nodes)

In [None]:
# Stack the per-category frames into a single metadata table.
# The index is not reset, so index labels repeat across the source frames.
series_df = pd.concat(series_frames)

In [None]:
# Persist the series metadata table.
# NOTE(review): absolute, machine-specific path; this cell fails on any other machine.
series_df.to_parquet('/Users/jonathanl/Workspace/quant-workspace/experimental-fred/data/series-meta/series-meta.pq')

In [None]:
# Distinct values of the 'id' column in the metadata table.
series_df.id.unique()

In [None]:
# Columns available in the metadata table.
series_df.columns

In [None]:
# Category ids whose series are filtered out of the metadata further down.
# The with-block guarantees the file handle is closed once the pickle is read.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open('/Users/jonathanl/Workspace/quant-workspace/experimental-fred/data/regional-categories.pkl', 'rb') as fh:
    excluded_child_categories = pickle.load(fh)

In [None]:
# Number of excluded category entries loaded from the pickle.
len(excluded_child_categories)

In [None]:
# Keep only the rows whose category_id is NOT in the exclusion collection.
series_filtered_df = series_df[~series_df.category_id.isin(excluded_child_categories)]

In [None]:
# Number of distinct series ids left after filtering.
len(series_filtered_df.id.unique())

In [None]:
def get_series_full(series_id: str) -> 'pd.DataFrame | None':
    '''
    Download the full history (all releases / revisions) of one series.

    series_id: identifier handed to pf.get_series_all_releases.

    Returns the object produced by pf.get_series_all_releases (a DataFrame,
    per the original annotation) with an added 'series' column holding
    series_id. Returns None when the download raises an ordinary error; the
    failure and its cause are printed so a long batch run can carry on.

    KeyboardInterrupt and SystemExit are deliberately not caught, so the
    batch can still be stopped by hand.
    '''
    try:
        data = pf.get_series_all_releases(series_id, api_key=API_KEY)
        data['series'] = series_id
        return data
    except Exception as exc:
        # A bare `except:` would also swallow Ctrl-C; Exception leaves that path open.
        print(f'!! failed to get: {series_id} !! ({type(exc).__name__}: {exc})')
        return None
    
# TODO: retrieve incremental data / information
# Figure out release schedule / release logistics


In [None]:
# Split the distinct series ids into 1000 near-equal batches; the download loop below
# walks these batches and can be restarted from a given batch index.
series_grouped = np.array_split(series_filtered_df.id.unique(), 1000)

In [None]:
# all series dataframes
# NOTE(review): this rebinds `series_df`, which earlier held the concatenated metadata
# DataFrame, to an empty list. Re-running this cell also discards anything collected so far.
series_df = []

In [None]:

# Download batch by batch so an interrupted run can be resumed instead of restarted.
# START_GROUP is the index of the first batch to process: raise it to resume after an
# interruption, set it to 0 for a full run. Using one constant for both the slice and the
# printed counter keeps the progress labels in sync with the batches actually processed.
START_GROUP = 53
for idx, group in enumerate(series_grouped[START_GROUP:], start=START_GROUP):
    print(f"processing group: {idx}")
    for series_id in group:
        print(f"{series_id}, ", end="")
        series_data = get_series_full(series_id)
        # get_series_full returns None when a download fails; those series are skipped.
        if series_data is not None:
            series_df.append(series_data)
        time.sleep(1)
    print()
    

In [None]:
# Number of per-series frames collected so far.
len(series_df)

In [None]:
# Same count as the previous cell (duplicate check).
len(series_df)

In [None]:
# Inspect the first collected frame; raises IndexError if nothing has been collected yet.
series_df[0]

In [None]:
# Write every collected frame as one parquet dataset, partitioned by the 'series' column.
# NOTE(review): absolute, machine-specific output directory.
pd.concat(series_df).to_parquet('/Users/jonathanl/Workspace/quant-workspace/experimental-fred/data/series-data/', partition_cols=['series'])

In [None]:
# Checkpoint everything collected so far. Passing the path lets pandas open and close the
# file itself, rather than leaving an unclosed handle from a bare open() call.
pd.to_pickle(pd.concat(series_df), '/Users/jonathanl/Workspace/quant-workspace/experimental-fred/data/series-checkpoint-full.pkl')

In [None]:
# Intermediate checkpoint of the frames collected so far. Passing the path lets pandas
# open and close the file itself, rather than leaving an unclosed handle from open().
pd.to_pickle(pd.concat(series_df), '/Users/jonathanl/Workspace/quant-workspace/experimental-fred/data/series-checkpoint-1.pkl')

In [None]:
# Storage: 
# (a) Category tree
# (b) Series metadata
# (c) Series data (i.e. parquet load)

In [None]:
# Retrieval from Storage
# (a) for series: initial numbers
# (b) for series: latest revisions

In [None]:
# Scratch arithmetic, evaluates to about 69.4.
# NOTE(review): presumably hours needed for 500,000 requests at 120 per minute - TODO confirm.
500000 / 120 / 60

In [None]:
# Scratch arithmetic, evaluates to about 2.9.
# NOTE(review): presumably converts roughly 69 hours into days - TODO confirm.
69/ 24