In [1]:
%pip install --upgrade pip
%pip install matplotlib
import sys
!{sys.executable} -m pip install "nbformat>=4.2.0"
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import os
from pathlib import Path
# Lift pandas' display cap on column count so wide frames are shown in full
# instead of being truncated when a DataFrame is rendered.
pd.set_option('display.max_columns', None)

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Location of the cleaned ETF dataset, relative to the notebook's working directory.
path = '../data/cleaned_etf_data.csv'

# Read the whole CSV into a DataFrame; dtypes are whatever pandas infers at this point.
df = pd.read_csv(filepath_or_buffer=path)


In [3]:
# Parse every date-like column into pandas datetimes so it can be grouped on
# and used as a time axis in the plots further down.
for date_col in ('Date', 'Mid_Quarter', 'End_Quarter', 'Month_Year'):
    df[date_col] = pd.to_datetime(df[date_col])

# Give the remaining calendar / identifier columns explicit dtypes.
# 'category' stores each distinct ticker once, which saves memory.
column_dtypes = {
    'Year': int,
    'Quarter': str,
    'Week': "uint32",
    'Ticker': "category",
}
for column_name, target_dtype in column_dtypes.items():
    df[column_name] = df[column_name].astype(target_dtype)

In [4]:
# EDA setup
# Build two working frames, both restricted to 1998 onwards (Year > 1997)
# because most of these ETFs were not available before then:
#   sub_df  - the sector ETFs plus the benchmark
#   sub_df2 - the sector ETFs only, benchmark excluded
etfs_and_benchmark = ["SPY", "QQQ", "XLE", "XLV", "XLF", "VNQ", "XLI"]  # SPY serves as the S&P 500 benchmark
etfs = [ticker for ticker in etfs_and_benchmark if ticker != "SPY"]

# Row mask shared by both subsets.
from_1998_onwards = df["Year"].astype(int) > 1997

sub_df = df[df["Ticker"].isin(etfs_and_benchmark) & from_1998_onwards].copy()
sub_df2 = df[df["Ticker"].isin(etfs) & from_1998_onwards].copy()

In [5]:
# EDA 1: normalized ETF prices compared against the S&P 500 benchmark.
# Monthly mean of the normalized adjusted close, one column per industry.
pivot_table1 = pd.pivot_table(
    sub_df,
    values="Adj Close Normalized 0 - 100",
    index="Month_Year",
    columns="Industry",
    aggfunc="mean",
)

axis_labels = {'value': 'Normalized Avg Adj Close (0-100)', 'Month_Year': 'Month-Year'}
fig = px.line(
    pivot_table1,
    x=pivot_table1.index,
    y=pivot_table1.columns,
    title="Which Sectors Outperformed?",
    labels=axis_labels,
)

# Horizontal legend, centred, pushed below the x-axis (negative y is below the plot area).
legend_below_axis = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": -0.7,
    "xanchor": "center",
    "x": 0.5,
}
fig.update_layout(
    xaxis_title='Time (Month-Year)',
    yaxis_title='Relative Growth Scale (0-100)',
    legend_title='Sub-Sector',
    xaxis_tickangle=45,
    legend=legend_below_axis,
)
fig.show()

In [6]:
# EDA 2: distribution of 30-day volatility for each industry's ETF, shown as a box plot.

# Monthly mean volatility per industry (rows: Month_Year, columns: Industry).
volitility_pivot = sub_df2.pivot_table(
    values='Volatility_30d',
    index='Month_Year',
    columns="Industry",
    aggfunc='mean'
)

# Second name for the same table. It is an independent copy rather than a second
# run of the identical pivot, so the aggregation is only computed once while
# both names remain available to later cells.
volitility_pivot2 = volitility_pivot.copy()

# Back to long form (one row per Month_Year / Industry pair), which is the
# layout px.box expects when x and y are given as column names.
volatility_long = volitility_pivot2.reset_index().melt(
    id_vars="Month_Year",
    var_name="Industry",
    value_name="Volatility_30d"
)

fig = px.box(
    volatility_long,
    x="Industry",
    y="Volatility_30d",
    title="30-Day Volatility by Industry",
    labels={"Volatility_30d": "30-Day Volatility", "Industry": "Industry"},
    color="Industry"  # one colour per industry
)

# Horizontal legend, centred, pushed below the x-axis.
fig.update_layout(
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=-0.5,
        xanchor="center",
        x=0.5
    )
)

# Median of the monthly values for each industry.
medians = volatility_long.groupby("Industry")["Volatility_30d"].median()

# Label each box with its median, nudged 10px upwards so the text sits just
# above the median line instead of on top of it.
for industry, median_val in medians.items():
    fig.add_annotation(
        x=industry,
        y=median_val,
        text=f"Median: {median_val:.2f}",
        showarrow=False,
        yshift=10,
        font=dict(size=10, color="black"),
    )

fig.show()

In [7]:
# Drawdown distribution per industry as a box plot, with every median labelled.

fig = px.box(
    sub_df2,
    x="Industry",
    y="Drawdown",
    color="Industry",  # default Plotly colour sequence, one colour per industry
    title="Drawdown Distribution by Industry",
)

# Horizontal legend, centred, pushed below the x-axis.
legend_below_axis = {
    "orientation": "h",
    "yanchor": "bottom",
    "y": -0.5,
    "xanchor": "center",
    "x": 0.5,
}
fig.update_layout(legend=legend_below_axis)

# Per-industry median drawdown, taken straight from the daily rows of sub_df2.
medians = sub_df2.groupby("Industry")["Drawdown"].median()

# Write the median next to each box, shifted 10px up so it clears the median line.
for industry, median in medians.items():
    fig.add_annotation(
        x=industry,
        y=median,
        text=f"Median: {median:.2f}%",
        showarrow=False,
        yshift=10,
        font={"size": 10},
    )
fig.show()

In [8]:
# EDA 3: quarterly inflation rate versus the Federal Funds Rate over the
# period covered by sub_df2 (1998 onwards).

# Per-quarter means of CPI and of the normalized adjusted close.
quarterly_cpi = (
    sub_df2
    .groupby("End_Quarter")[["Inflation (CPI)", "Adj Close Normalized 0 - 100"]]
    .mean()
    .reset_index()
)

# Quarter-over-quarter percentage change of mean CPI. The first quarter has
# no predecessor, so its value is NaN.
quarterly_cpi["Inflation Rate (%)"] = quarterly_cpi["Inflation (CPI)"].pct_change() * 100

# Mean Federal Funds Rate for each quarter.
mean_rate_by_quarter = sub_df2.groupby('End_Quarter')["Federal Funds Rate"].mean()
fed_fund_rate = mean_rate_by_quarter.reset_index()

# One x-axis with two independent y-axes: inflation on the left, policy rate on the right.
fig = make_subplots(specs=[[{"secondary_y": True}]])

inflation_trace = go.Scatter(
    x=quarterly_cpi["End_Quarter"],
    y=quarterly_cpi["Inflation Rate (%)"],
    name="Inflation Rate",
)
fed_funds_trace = go.Scatter(
    x=fed_fund_rate["End_Quarter"],
    y=fed_fund_rate["Federal Funds Rate"],
    name="Federal Funds Rate",
)
fig.add_trace(inflation_trace, secondary_y=False)
fig.add_trace(fed_funds_trace, secondary_y=True)

# Horizontal legend, slightly left of centre, pushed below the x-axis.
fig.update_layout(
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "y": -0.5,
        "xanchor": "center",
        "x": 0.45,
    }
)

# Axis titles.
fig.update_xaxes(title_text="Yearly Quarters")
fig.update_yaxes(title_text="Inflation Rate % Change", secondary_y=False)
fig.update_yaxes(title_text="Federal Funds Rate", secondary_y=True)
fig.show()