This file covers the entire data pipeline for this project: acquiring all of the data, cleaning it, and processing it.

# Imports, configuration, and folder setup

In [None]:
import os # Note: for working with folders / paths
from datetime import datetime

import pandas as pd
import yfinance as yf             # Note: To download stock/market data
from fredapi import Fred          # Note: To download macroeconomic data from FRED
from pytrends.request import TrendReq  # Note: To pull Google Trends sentiment data
import duckdb                     # Note: This one for now is unsure-ish, can be used for storing/querying data with SQL

# Note: BASE_DIR is the current working directory, i.e. wherever this notebook is being run from
BASE_DIR = os.getcwd()

# Note: Everything the pipeline reads or writes is rooted at <BASE_DIR>/data
DATA_DIR = os.path.join(BASE_DIR, "data")

# Note: One top-level folder per pipeline stage: data/raw, data/cleaned, data/merged
# Note: Only the path strings are built here; nothing is created on disk by these lines
RAW_DIR, CLEANED_DIR, MERGED_DIR = (
    os.path.join(DATA_DIR, stage) for stage in ("raw", "cleaned", "merged")
)

# Note: Raw data is split into one subfolder per kind of source:
# Note: data/raw/prices/     -> raw price data (S&P 500, VIX, etc.)
# Note: data/raw/macro/      -> raw macro data (CPI, unemployment, etc.)
# Note: data/raw/sentiment/  -> raw sentiment data (Google Trends, etc.)
RAW_PRICES_DIR, RAW_MACRO_DIR, RAW_SENTIMENT_DIR = (
    os.path.join(RAW_DIR, source) for source in ("prices", "macro", "sentiment")
)

# Note: Cleaned data mirrors the raw layout:
# Note: data/cleaned/prices/, data/cleaned/macro/, data/cleaned/sentiment/
CLEAN_PRICES_DIR, CLEAN_MACRO_DIR, CLEAN_SENTIMENT_DIR = (
    os.path.join(CLEANED_DIR, source) for source in ("prices", "macro", "sentiment")
)

# Note: Global start of the historical window; we want data going back to 1980
START_DATE = "1980-01-01"

# Note: End of the window is today's local date as a YYYY-MM-DD string,
# Note: evaluated once when this cell runs, so each fresh run pulls up-to-date data
TODAY = datetime.today().date().isoformat()

# Note: Market tickers to request from yfinance, keyed by the short name we use for each
# Note: ^GSPC = S&P 500 index; ^VIX = Volatility Index
TICKERS = dict(sp500="^GSPC", vix="^VIX")

# Note: FRED macroeconomic series, keyed by the short name we use for each
# Note: The values are the series IDs as listed on the FRED website
FRED_SERIES = dict(
    cpi="CPIAUCSL",             # Note: Consumer Price Index
    unemployment="UNRATE",      # Note: Unemployment rate
    fed_funds_rate="FEDFUNDS",
    m2_money_stock="M2SL",
)

# Note: Google Trends sentiment settings
# Note: Search terms we track as a proxy for sentiment around crashes/recessions
TRENDS_KEYWORDS = [
    "recession",
    "stock market crash",
    "bear market",
    "financial crisis",
]

# Note: Google Trends has no data before 2004, so the sentiment series starts here
# Note: rather than at the 1980 start used for the other data
TRENDS_START_DATE = "2004-01-01"

# Note: Sanity check - echo the main folder paths so we can eyeball that the configuration looks right
for _label, _folder in (
    ("Base directory:", BASE_DIR),
    ("Data directory:", DATA_DIR),
    ("Raw data folder:", RAW_DIR),
    ("Cleaned data folder:", CLEANED_DIR),
    ("Merged data folder:", MERGED_DIR),
):
    print(_label, _folder)
