<!-- Import packages -->

## Install and import

Install packages

In [2]:
pip install -r Requirements.txt

Note: you may need to restart the kernel to use updated packages.


Import libraries


In [3]:
import pandas as pd
import numpy as np
import matplotlib as plt
import json
import datetime as dt
import requests
import os
from fredapi import Fred

## S&P500 data extract and clean

Create a date range. This date range has been set to 1950-present because volume data is missing before 1950.

In [4]:
end_date = dt.datetime.now()
start_date = dt.datetime(1950, 1, 1)

Pull S&P500 data using API from rapidapi.com

In [5]:
url = "https://s-p-500-history-api.p.rapidapi.com/historical-stock-data.php"

querystring = {"interval":"day","symbol":"^GSPC","start":start_date,"end":end_date}

headers = {
	"x-rapidapi-key": "1818711435msh7ff1e71cfe61ec4p1b64e1jsn32c289b7a487",
	"x-rapidapi-host": "s-p-500-history-api.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)


Extract data from json file. Daily and weekly timeframes. My API monthly data only went back to 1985.

In [6]:
response_json = json.loads(response.text)
sp = pd.DataFrame(response_json['data'])
sp.head()

querystring = {"interval":"week","symbol":"^GSPC","start":start_date,"end":end_date}
response = requests.get(url, headers=headers, params=querystring)
response_json = json.loads(response.text)
sp_weekly = pd.DataFrame(response_json['data'])
sp_weekly.head()

Unnamed: 0,date_time,timestamp,high,low,close,open,volume,adjusted_close
0,"January 1, 1950, 5:00 AM UTC",-631134000,17.09,16.66,17.09,16.66,9040000,17.09
1,"January 8, 1950, 5:00 AM UTC",-630529200,17.09,16.65,16.65,17.08,14790000,16.65
2,"January 15, 1950, 5:00 AM UTC",-629924400,16.940001,16.719999,16.940001,16.719999,7980000,16.940001
3,"January 22, 1950, 5:00 AM UTC",-629319600,16.92,16.73,16.9,16.92,7430000,16.9
4,"January 29, 1950, 5:00 AM UTC",-628714800,17.35,17.02,17.35,17.02,10120000,17.35


Edit 'date_time' field in sp and sp_weekly to datetime dtype

In [17]:
sp['date_time'].info

<bound method Series.info of 0        January 3, 1950, 2:30 PM UTC
1        January 4, 1950, 2:30 PM UTC
2        January 5, 1950, 2:30 PM UTC
3        January 6, 1950, 2:30 PM UTC
4        January 9, 1950, 2:30 PM UTC
                     ...             
18934      April 4, 2025, 1:30 PM UTC
18935      April 7, 2025, 1:30 PM UTC
18936      April 8, 2025, 1:30 PM UTC
18937      April 9, 2025, 1:30 PM UTC
18938     April 10, 2025, 8:50 PM UTC
Name: date_time, Length: 18939, dtype: object>

In [23]:
sp_weekly['date_time'] = pd.to_datetime(sp_weekly['date_time'])
sp['date_time'] = pd.to_datetime(sp['date_time'])

In [26]:
sp['date_time'].info

<bound method Series.info of 0       1950-01-03
1       1950-01-04
2       1950-01-05
3       1950-01-06
4       1950-01-09
           ...    
18934   2025-04-04
18935   2025-04-07
18936   2025-04-08
18937   2025-04-09
18938   2025-04-10
Name: date_time, Length: 18939, dtype: datetime64[ns]>

Drop 'timestamp' and 'adjusted close' columns

In [8]:
sp.drop(columns=['timestamp','adjusted_close'], inplace=True, axis=1)

sp_weekly.drop(columns=['timestamp','adjusted_close'], inplace=True, axis=1)

## Collect macroeconomic FRED data

Extract urls from 'FRED Data URLs.json' (exported file of FRED webpages I am going to pull from) using 'cntrl+shift+L' in VSCode

In [9]:

url_list = [
"https://fred.stlouisfed.org/series/PAYEMS",
"https://fred.stlouisfed.org/series/INDPRO",
"https://fred.stlouisfed.org/series/CE16OV",
"https://fred.stlouisfed.org/series/UNRATE",
"https://fred.stlouisfed.org/series/GDP",
"https://fred.stlouisfed.org/series/A191RP1Q027SBEA",
"https://fred.stlouisfed.org/series/CPIAUCSL",
"https://fred.stlouisfed.org/series/SAHMCURRENT",
"https://fred.stlouisfed.org/series/LORSGPORUSQ659S",
"https://fred.stlouisfed.org/series/BOGZ1FL135010005Q",
"https://fred.stlouisfed.org/series/UMCSENT",
"https://fred.stlouisfed.org/series/HOUST",
"https://fred.stlouisfed.org/series/W875RX1#0##0",
"https://fred.stlouisfed.org/series/M2SL",
"https://fred.stlouisfed.org/series/PRS85006091",
"https://fred.stlouisfed.org/series/DFF",
"https://fred.stlouisfed.org/series/SHTSAUS",
"https://fred.stlouisfed.org/series/M1V",
"https://fred.stlouisfed.org/series/M2V",
"https://fred.stlouisfed.org/series/INTDSRUSM193N"
]


Use BeautifulSoup to extract metadata from above urls. Create a dictionary and lists of those values.

In [10]:
from bs4 import BeautifulSoup

fred_dictionary = {}
title_list = []
notes_list = []
id_list = []
frequency_list = []

# Loop through the URLs and extract the title and paragraph under "Notes"
for url in url_list:
# Extract the title 
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string.strip()
    title_list.append(title)
# Extract the paragraph under "Notes"
    notes = soup.find("div", id="notes") 
    notes_text = notes.get_text(strip=True)
    notes_list.append(notes_text) 
# Extract the "Frequency"
    frequency = soup.find("span", class_="series-meta-value-frequency")
    frequency_text = frequency.get_text(strip=True)
    frequency_list.append(frequency_text) 
# Extract the ID values from () in the title
    pipe = title.find("|")
    left_of_pipe = title[:pipe]
    id = left_of_pipe[left_of_pipe.rfind("("):]
    id = id.replace("(", "")
    id = id.replace(")", "")
    id_text = id.strip() 
    id_list.append(id_text)

# Create a dictionary with the extracted data
    fred_dictionary[id_text] = {
        "url": url,
        "id": id_text,
        "title": title,
        "frequency": frequency_text,
        "notes": notes_text
    }

# Print the extracted data
    print(f"URL: {url}")
    print(f"ID: {id_text}")
    print(f"Title: {title}") 
    print(f"Frequency: {frequency_text}") 
    print(f"Notes: {notes_text}\n")


URL: https://fred.stlouisfed.org/series/PAYEMS
ID: PAYEMS
Title: All Employees, Total Nonfarm (PAYEMS) | FRED | St. Louis Fed
Frequency: Monthly
Notes: NotesSource:U.S. Bureau of Labor StatisticsRelease:Employment SituationUnits:Thousands of Persons, Seasonally AdjustedFrequency:MonthlyNotes:All Employees: Total Nonfarm, commonly known as Total Nonfarm Payroll, is a measure of the number of U.S. workers in the economy that excludes proprietors, private household employees, unpaid volunteers, farm employees, and the unincorporated self-employed. This measure accounts for approximately 80 percent of the workers who contribute to Gross Domestic Product (GDP).This measure provides useful insights into the current economic situation because it can represent the number of jobs added or lost in an economy. Increases in employment might indicate that businesses are hiring which might also suggest that businesses are growing. Additionally, those who are newly employed have increased their perso

Use the FRED API library and use a key to access series. Make a dictionary out of those series.

In [11]:
# FRED API
fred_api_key = Fred(api_key='9de274aca217f6c8787ddb88fbcf323c')

df_dict = {}

for df_name in id_list:
    fred_series = fred_api_key.get_series(df_name, start_date, end_date)
    df_dict[df_name] = pd.DataFrame(fred_series, columns=[df_name])
df_dict

{'PAYEMS':               PAYEMS
 1950-01-01   43526.0
 1950-02-01   43297.0
 1950-03-01   43954.0
 1950-04-01   44382.0
 1950-05-01   44718.0
 ...              ...
 2024-11-01  158619.0
 2024-12-01  158942.0
 2025-01-01  159053.0
 2025-02-01  159170.0
 2025-03-01  159398.0
 
 [903 rows x 1 columns],
 'INDPRO':               INDPRO
 1950-01-01   14.0049
 1950-02-01   14.0586
 1950-03-01   14.5156
 1950-04-01   14.9995
 1950-05-01   15.3489
 ...              ...
 2024-10-01  102.2669
 2024-11-01  102.0231
 2024-12-01  103.1654
 2025-01-01  103.4350
 2025-02-01  104.2062
 
 [902 rows x 1 columns],
 'CE16OV':               CE16OV
 1950-01-01   57635.0
 1950-02-01   57751.0
 1950-03-01   57728.0
 1950-04-01   58583.0
 1950-05-01   58649.0
 ...              ...
 2024-11-01  161183.0
 2024-12-01  161661.0
 2025-01-01  163895.0
 2025-02-01  163307.0
 2025-03-01  163508.0
 
 [903 rows x 1 columns],
 'UNRATE':             UNRATE
 1950-01-01     6.5
 1950-02-01     6.4
 1950-03-01     6.3
 1950-0

## Clean FRED DataFrame Dictionary

Make a copy of the df_dict

In [13]:
df_dict_copy = df_dict.copy()

Loop through to rename the first column to 'date' and set is as datetime

In [14]:
for key, value in df_dict_copy.items():
    df_dict_copy[key].rename(columns={"index": "date"}, inplace=True) 
    df_dict_copy[key]['date'] = pd.to_datetime(df_dict_copy[key]['date'])   

df_dict_copy['PAYEMS'].head()

KeyError: 'date'

Identify all monthly datasets and quarterly datasets

In [None]:
for key, value in fred_dictionary.items():
    monthtly_dfs = value['frequency'] == 'Monthly'
    quarterly_dfs = value['frequency'] == 'Quarterly'

monthtly_dfs
quarterly_dfs

False