# Data Gathering and EDA - Low Frequency Quarterly and Yearly Timeframes

This notebook deals exclusively with the series that have low frequency time scales. These must be resampled into monthly series.

Resampling is done with a naive copy method: every month inside a low-frequency period is assumed to have the same value as the period itself. For example, each of the three months in a quarter is assigned that quarter's value, so the average of the monthly values equals the quarterly value.

In [22]:
# Standard Library Modules
import json
import os
from pathlib import Path

# Pip Modules
import pandas as pd
from requests import HTTPError

# Custom Modules
from src.utilities import new_logger, fetch_with_cache, save_atomic


# Pandas display options: lift the column and row limits so that displayed
# DataFrames are never truncated in cell output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [23]:
# Create the notebook-wide logger.
# NOTE(review): the second argument looks like the log output directory;
# confirm against `new_logger` in src.utilities.
logger_name = "eda.lf_series"
log_directory = 'logs/eda'
logger = new_logger(logger_name, log_directory)

In [24]:
# Load the FRED API settings for this EDA notebook from a JSON file at the
# project root (the actual ML pipeline will use config.yaml with Hydra).
# NOTE: this is the same logic as the monthly notebook and is a candidate for
# a shared helper function.
config_path = Path('fred_api.conf')
abs_config_path = config_path.resolve()

# Start from an empty mapping so `config` is always (re)defined by this cell,
# even when the file is missing.
config = {}

# API Configuration
if abs_config_path.is_file():
    logger.debug(f"Discovered {abs_config_path}, attempting to read...")
    with open(abs_config_path, 'r') as json_fp:
        logger.debug(f"Opened {abs_config_path}, attempting to load.")
        config = json.load(json_fp)
        logger.debug(f"Loaded {abs_config_path}, checking attributes...")

    # Look the keys up with .get() so that an absent key is reported through
    # the logger instead of raising KeyError before the message is emitted.
    settings = config if isinstance(config, dict) else {}
    missing_attributes = [
        attribute
        for attribute in ('api_uri', 'api_key')
        if not (isinstance(settings.get(attribute), str) and len(settings.get(attribute)) > 0)
    ]
    for attribute in missing_attributes:
        logger.error(f"The JSON config is missing the attribute '{attribute}', please make sure it exists.")
    if not missing_attributes:
        logger.info("All attributes found, you may continue.")
else:
    logger.error(f"Could not find {abs_config_path}, make sure it exists before continuing.")

In [25]:
# FRED series IDs reported at quarterly frequency.
quarterly_ids = [
    "MSPUS",
    "ASPUS",
    "RCMFLOACT",
    "RCMFLOLTVPCT50",
    "RCMFLOLTVPCT75",
    "RCMFLOLTVPCT90",
    "DRSFRMACBN",
    "RRVRUSQ156N",
    "RHORUSQ156N",
]

# FRED series IDs treated as annual further down in this notebook, where
# their 2024 value is copied across the remaining months of the year.
annual_ids = [
    "MEHOINUSA646N",
    "MEPAINUSA646N",
    "SPPOPGROWUSA",
    "POPTOTUSA647NWDB",
]

# Every low-frequency series handled by this notebook. The name is kept as
# `quarterly_series` because later cells refer to it.
quarterly_series = quarterly_ids + annual_ids

The idea is to use `fetch_with_cache` as normal and then add a processing step in which each series undergoes temporal disaggregation. Here that disaggregation is the naive copy method described above: each low-frequency value is repeated for the months that follow it.

In [26]:
# ONE OFF EXAMPLE: fetch a single series and naively disaggregate it to
# month-start frequency.
series_id = 'MSPUS'
request_uri = f"{config['api_uri']}?series_id={series_id}&api_key={config['api_key']}&file_type=json"

series_df = fetch_with_cache(series_id=series_id, request_uri=request_uri, dest="data/orig")

# Index by date and sort chronologically, the same way the bulk loop below
# prepares each frame before resampling.
series_df = series_df.set_index('date').sort_index()

# Upsample to month-start ('MS') frequency, repeating each low-frequency value
# until the next observation. A single forward-filled resample replaces the
# earlier asfreq() + column overwrite, and nothing is tied to the literal
# 'MSPUS' any more, so changing `series_id` is enough to try another series.
# NOTE: forward-filling stops at the last observation, so the months after
# the final observation are not produced here.
monthly_df = series_df.resample('MS').ffill()

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 80, in emit
    self.doRollover()
    ~~~~~~~~~~~~~~~^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 185, in doRollover
    self.rotate(self.baseFilename, dfn)
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 121, in rotate
    os.rename(source, dest)
    ~~~~~~~~~^^^^^^^^^^^^^^
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log' -> 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log.1'
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\site-packages\ipykern

In [27]:
# Fetch every low-frequency series (cached locally by `fetch_with_cache`) and
# naively disaggregate each one to month-start frequency.
lf_data_frames = []
for series in quarterly_series:
    request_uri = f"{config['api_uri']}?series_id={series}&api_key={config['api_key']}&file_type=json"
    logger.info(f"Starting fetch process for {series}...")
    try:
        tmp_df = fetch_with_cache(series_id=series, request_uri=request_uri, dest="data/orig")
    except HTTPError as err:
        # Skip this series. The completion message below must not run for a
        # failed fetch: it would report the previous series' shape, or raise
        # IndexError when no frame has been collected yet.
        logger.error(err)
        continue

    logger.debug(f"Setting the {series} DataFrame's index to `date`...")
    tmp_df = tmp_df.set_index('date', drop=True).sort_index()

    # Upsample to month-start ('MS') frequency, naively assuming every month
    # up to the next observation has the same value as that observation.
    # NOTE: forward-filling stops at the last observation, so the months after
    # an annual series' final January are filled in manually further down.
    tmp_monthly = tmp_df.resample('MS').ffill()

    # add monthly df to dataframes list
    lf_data_frames.append(tmp_monthly)
    logger.info(f"Fetch process for {series} ({tmp_monthly.shape}) is complete.")

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 80, in emit
    self.doRollover()
    ~~~~~~~~~~~~~~~^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 185, in doRollover
    self.rotate(self.baseFilename, dfn)
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 121, in rotate
    os.rename(source, dest)
    ~~~~~~~~~^^^^^^^^^^^^^^
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log' -> 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log.1'
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\site-packages\ipykern

In [28]:
# Place the monthly frames side by side, aligned on the union of their date
# indexes; verify_integrity makes pandas raise if the combined axis contains
# duplicate labels.
monthly_lf_df = pd.concat(
    lf_data_frames,
    axis="columns",
    join="outer",
    verify_integrity=True,
)

In [29]:
# Number of missing months per series over the full combined range,
# largest count first.
(
    monthly_lf_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

RCMFLOACT           741
RCMFLOLTVPCT75      741
RCMFLOLTVPCT50      741
RCMFLOLTVPCT90      741
DRSFRMACBN          480
MEHOINUSA646N       414
MEPAINUSA646N       294
RHORUSQ156N         171
MSPUS               147
ASPUS               147
SPPOPGROWUSA        138
POPTOTUSA647NWDB    126
RRVRUSQ156N          63
dtype: int64

In [30]:
# Restrict the combined frame to the window 2017-01-01 .. 2024-12-01
# (both ends inclusive).
window_start = "2017-01-01"
window_end = "2024-12-01"
in_window = (monthly_lf_df.index >= window_start) & (monthly_lf_df.index <= window_end)

# Take an explicit copy: later cells assign into this frame with `.loc`, and
# writing into a filtered selection of `monthly_lf_df` is ambiguous
# (SettingWithCopyWarning) without one.
restricted_lf_df = monthly_lf_df.loc[in_window].copy()

In [31]:
# Missing months per series inside the restricted window, largest count first.
(
    restricted_lf_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

POPTOTUSA647NWDB    11
MEHOINUSA646N       11
MEPAINUSA646N       11
SPPOPGROWUSA        11
MSPUS                0
RCMFLOLTVPCT75       0
RCMFLOLTVPCT50       0
RCMFLOACT            0
ASPUS                0
RHORUSQ156N          0
RRVRUSQ156N          0
DRSFRMACBN           0
RCMFLOLTVPCT90       0
dtype: int64

In [32]:
# Expand the annual values for 2024 to the rest of 2024: the monthly
# forward-fill stops at the last annual observation (2024-01-01), so
# February through December are still empty for these series.
annual_columns = ['MEHOINUSA646N', 'MEPAINUSA646N', 'SPPOPGROWUSA', 'POPTOTUSA647NWDB']

# Work on an explicit copy so the assignments below never write into a
# filtered selection of another frame (avoids SettingWithCopyWarning).
restricted_lf_df = restricted_lf_df.copy()

# Same assignment for every annual column: copy its January 2024 value into
# the remaining months of the year.
for annual_column in annual_columns:
    restricted_lf_df.loc['2024-02-01':'2024-12-01', [annual_column]] = restricted_lf_df.loc['2024-01-01', annual_column]

In [33]:
# Show the Feb-Dec 2024 rows of the four annual series to confirm the fill.
filled_columns = ['MEHOINUSA646N', 'MEPAINUSA646N', 'SPPOPGROWUSA', 'POPTOTUSA647NWDB']
restricted_lf_df.loc['2024-02-01':'2024-12-01', filled_columns]

Unnamed: 0_level_0,MEHOINUSA646N,MEPAINUSA646N,SPPOPGROWUSA,POPTOTUSA647NWDB
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2024-02-01,83730.0,45140.0,0.976422,340110988.0
2024-03-01,83730.0,45140.0,0.976422,340110988.0
2024-04-01,83730.0,45140.0,0.976422,340110988.0
2024-05-01,83730.0,45140.0,0.976422,340110988.0
2024-06-01,83730.0,45140.0,0.976422,340110988.0
2024-07-01,83730.0,45140.0,0.976422,340110988.0
2024-08-01,83730.0,45140.0,0.976422,340110988.0
2024-09-01,83730.0,45140.0,0.976422,340110988.0
2024-10-01,83730.0,45140.0,0.976422,340110988.0
2024-11-01,83730.0,45140.0,0.976422,340110988.0


In [34]:
# Re-count missing months per series after the 2024 fill, largest count first.
remaining_missing = restricted_lf_df.isna().sum()
remaining_missing.sort_values(ascending=False)

MSPUS               0
ASPUS               0
RCMFLOACT           0
RCMFLOLTVPCT50      0
RCMFLOLTVPCT75      0
RCMFLOLTVPCT90      0
DRSFRMACBN          0
RRVRUSQ156N         0
RHORUSQ156N         0
MEHOINUSA646N       0
MEPAINUSA646N       0
SPPOPGROWUSA        0
POPTOTUSA647NWDB    0
dtype: int64

In [35]:
# Make sure the work-in-progress directory exists before writing into it.
wip_dest = Path("./data/wip")
wip_dest.mkdir(parents=True, exist_ok=True)

# Persist the restricted monthly frame as a parquet file in that directory.
# NOTE(review): the third argument is passed as an empty dict; its meaning is
# defined by `save_atomic` in src.utilities, so confirm there.
wip_file = wip_dest / "qseries.wip.parquet"
save_atomic(restricted_lf_df, wip_file, {})

--- Logging error ---
Traceback (most recent call last):
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 80, in emit
    self.doRollover()
    ~~~~~~~~~~~~~~~^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 185, in doRollover
    self.rotate(self.baseFilename, dfn)
    ~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\logging\handlers.py", line 121, in rotate
    os.rename(source, dest)
    ~~~~~~~~~^^^^^^^^^^^^^^
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log' -> 'C:\\Users\\dalla\\OneDrive\\Education\\WGU\\Capstone\\logs\\utils\\src.utilities.log.1'
Call stack:
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\dalla\miniconda3\envs\capstone\Lib\site-packages\ipykern