<!-- Import packages -->

## Install and import

Install packages

In [11]:
pip install -r Requirements.txt

Note: you may need to restart the kernel to use updated packages.


Import libraries


In [12]:
import pandas as pd
import numpy as np
import matplotlib as plt
import json
import datetime as dt
import requests
import os
from fredapi import Fred

## S&P500 data

Create a date range

In [13]:
end_date = dt.datetime.now()
start_date = dt.datetime(1900, 1, 1)

Pull S&P500 data using API from rapidapi.com

In [14]:
url = "https://s-p-500-history-api.p.rapidapi.com/historical-stock-data.php"

querystring = {"interval":"day","symbol":"^GSPC","start":start_date,"end":end_date}

headers = {
	"x-rapidapi-key": "1818711435msh7ff1e71cfe61ec4p1b64e1jsn32c289b7a487",
	"x-rapidapi-host": "s-p-500-history-api.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)


Extract data from json file

In [15]:
response_json = json.loads(response.text)
sp = pd.DataFrame(response_json['data'])
sp.head()

Unnamed: 0,date_time,timestamp,high,low,close,open,volume,adjusted_close
0,"December 30, 1927, 2:30 PM UTC",-1325583000,17.66,17.66,17.66,17.66,0,17.66
1,"January 3, 1928, 2:30 PM UTC",-1325237400,17.76,17.76,17.76,17.76,0,17.76
2,"January 4, 1928, 2:30 PM UTC",-1325151000,17.719999,17.719999,17.719999,17.719999,0,17.719999
3,"January 5, 1928, 2:30 PM UTC",-1325064600,17.549999,17.549999,17.549999,17.549999,0,17.549999
4,"January 6, 1928, 2:30 PM UTC",-1324978200,17.66,17.66,17.66,17.66,0,17.66


Some of the data is missing the data from the 'volume' column. I will drop these rows.

In [16]:
sp = sp[sp['volume']!=0]
sp = sp.reset_index(drop=True)
sp

Unnamed: 0,date_time,timestamp,high,low,close,open,volume,adjusted_close
0,"January 3, 1950, 2:30 PM UTC",-630927000,16.660000,16.660000,16.660000,16.660000,1260000,16.660000
1,"January 4, 1950, 2:30 PM UTC",-630840600,16.850000,16.850000,16.850000,16.850000,1890000,16.850000
2,"January 5, 1950, 2:30 PM UTC",-630754200,16.930000,16.930000,16.930000,16.930000,2550000,16.930000
3,"January 6, 1950, 2:30 PM UTC",-630667800,16.980000,16.980000,16.980000,16.980000,2010000,16.980000
4,"January 9, 1950, 2:30 PM UTC",-630408600,17.080000,17.080000,17.080000,17.080000,2520000,17.080000
...,...,...,...,...,...,...,...,...
18930,"March 31, 2025, 1:30 PM UTC",1743427800,5627.560059,5488.729980,5611.850098,5527.910156,5257250000,5611.850098
18931,"April 1, 2025, 1:30 PM UTC",1743514200,5650.569824,5558.520020,5633.069824,5597.529785,4434500000,5633.069824
18932,"April 2, 2025, 1:30 PM UTC",1743600600,5695.310059,5571.479980,5670.970215,5580.759766,4243830000,5670.970215
18933,"April 3, 2025, 1:30 PM UTC",1743687000,5499.529785,5390.830078,5396.520020,5492.740234,7210470000,5396.520020


Drop 'timestamp' and 'adjusted close' columns

In [17]:
sp.drop(columns=['timestamp','adjusted_close'], inplace=True, axis=1)

## Collect macroeconomic FRED data

In [18]:
# Extracted urls from FRED Data URLs.json using 'cntrl+shift+l' in VSCode
url_list = [
"https://fred.stlouisfed.org/series/PAYEMS",
"https://fred.stlouisfed.org/series/INDPRO",
"https://fred.stlouisfed.org/series/CE16OV",
"https://fred.stlouisfed.org/series/UNRATE",
"https://fred.stlouisfed.org/series/GDP",
"https://fred.stlouisfed.org/series/A191RP1Q027SBEA",
"https://fred.stlouisfed.org/series/CPIAUCSL",
"https://fred.stlouisfed.org/series/SAHMCURRENT",
"https://fred.stlouisfed.org/series/LORSGPORUSQ659S",
"https://fred.stlouisfed.org/series/BOGZ1FL135010005Q",
"https://fred.stlouisfed.org/series/UMCSENT",
"https://fred.stlouisfed.org/series/HOUST",
"https://fred.stlouisfed.org/series/W875RX1#0##0",
"https://fred.stlouisfed.org/series/M2SL",
"https://fred.stlouisfed.org/series/PRS85006091",
"https://fred.stlouisfed.org/series/DFF",
"https://fred.stlouisfed.org/series/SHTSAUS",
"https://fred.stlouisfed.org/series/M1V",
"https://fred.stlouisfed.org/series/M2V",
"https://fred.stlouisfed.org/series/INTDSRUSM193N"
]


In [19]:
from bs4 import BeautifulSoup

title_list = []
notes_list = []
id_list = []
frequency_list = []

# Loop through the URLs and extract the title and paragraph under "Notes"
for url in url_list:
# Extract the title 
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    title = soup.title.string.strip()
    title_list.append(title)
# Extract the paragraph under "Notes"
    notes = soup.find("div", id="notes") 
    notes_text = notes.get_text(strip=True)
    notes_list.append(notes_text) 
# Extract the "Frequency"
    frequency = soup.find("span", class_="series-meta-value-frequency")
    frequency_text = frequency.get_text(strip=True)
    frequency_list.append(frequency_text) 
# Extract the ID values from () in the title
    pipe = title.find("|")
    left_of_pipe = title[:pipe]
    id = left_of_pipe[left_of_pipe.rfind("("):]
    id = id.replace("(", "")
    id = id.replace(")", "")
    id_text = id.strip() 
    id_list.append(id_text)

    print(f"URL: {url}")
    print(f"ID: {id_text}")
    print(f"Title: {title}") 
    print(f"Frequency: {frequency_text}") 
    print(f"Notes: {notes_text}\n")


URL: https://fred.stlouisfed.org/series/PAYEMS
ID: PAYEMS
Title: All Employees, Total Nonfarm (PAYEMS) | FRED | St. Louis Fed
Frequency: Monthly
Notes: NotesSource:U.S. Bureau of Labor StatisticsRelease:Employment SituationUnits:Thousands of Persons, Seasonally AdjustedFrequency:MonthlyNotes:All Employees: Total Nonfarm, commonly known as Total Nonfarm Payroll, is a measure of the number of U.S. workers in the economy that excludes proprietors, private household employees, unpaid volunteers, farm employees, and the unincorporated self-employed. This measure accounts for approximately 80 percent of the workers who contribute to Gross Domestic Product (GDP).This measure provides useful insights into the current economic situation because it can represent the number of jobs added or lost in an economy. Increases in employment might indicate that businesses are hiring which might also suggest that businesses are growing. Additionally, those who are newly employed have increased their perso

In [20]:
id_list 

['PAYEMS',
 'INDPRO',
 'CE16OV',
 'UNRATE',
 'GDP',
 'A191RP1Q027SBEA',
 'CPIAUCSL',
 'SAHMCURRENT',
 'LORSGPORUSQ659S',
 'BOGZ1FL135010005Q',
 'UMCSENT',
 'HOUST',
 'W875RX1',
 'M2SL',
 'PRS85006091',
 'DFF',
 'SHTSAUS',
 'M1V',
 'M2V',
 'INTDSRUSM193N']

In [22]:
# FRED API
fred_api_key = Fred(api_key='9de274aca217f6c8787ddb88fbcf323c')

df_dict = {}

for df_name in id_list:
    df_dict[df_name] = fred_api_key.get_series(df_name, start_date, end_date)

In [27]:
df_dict

{'PAYEMS': 1939-01-01     29923.0
 1939-02-01     30100.0
 1939-03-01     30280.0
 1939-04-01     30094.0
 1939-05-01     30299.0
                 ...   
 2024-11-01    158619.0
 2024-12-01    158942.0
 2025-01-01    159053.0
 2025-02-01    159170.0
 2025-03-01    159398.0
 Length: 1035, dtype: float64,
 'INDPRO': 1919-01-01      4.8654
 1919-02-01      4.6504
 1919-03-01      4.5160
 1919-04-01      4.5966
 1919-05-01      4.6235
                 ...   
 2024-10-01    102.2669
 2024-11-01    102.0231
 2024-12-01    103.1654
 2025-01-01    103.4350
 2025-02-01    104.2062
 Length: 1274, dtype: float64,
 'CE16OV': 1948-01-01     58061.0
 1948-02-01     58196.0
 1948-03-01     57671.0
 1948-04-01     58291.0
 1948-05-01     57854.0
                 ...   
 2024-11-01    161183.0
 2024-12-01    161661.0
 2025-01-01    163895.0
 2025-02-01    163307.0
 2025-03-01    163508.0
 Length: 927, dtype: float64,
 'UNRATE': 1948-01-01    3.4
 1948-02-01    3.8
 1948-03-01    4.0
 1948-04-01    3.9


## Clean FRED Data

Unnamed: 0,observation_date,HOUST
0,1959-01-01,1657
1,1959-02-01,1667
2,1959-03-01,1620
3,1959-04-01,1590
4,1959-05-01,1498
...,...,...
789,2024-10-01,1344
790,2024-11-01,1305
791,2024-12-01,1526
792,2025-01-01,1350
