# Introduction to Portfolio Theory: Benchmark #

Our dear friend Nancy is now happy because, fortunately, we were in the universe where she chose Nvidia for her first investment in 2018. But now she is questioning herself: “Did I choose the best investment from all the possible investments?” She goes to Thomas again, and he recommends that she look for a benchmark to compare her Nvidia investment against the market.

Now she is confused again: What is a benchmark, then? And how can she select the proper one?

In [1]:
# Import Libraries

# Data Management
import pandas as pd
import numpy as np

# Statistics
from scipy.stats import skew, kurtosis
from scipy.stats import jarque_bera
from scipy.stats import probplot
from scipy.stats import norm

# Visualization
import matplotlib.pyplot as plt

# Handle Files
import sys
import os

# Import Local Functions
sys.path.append(os.path.abspath("../source"))
from data_downloader import get_market_data

In [2]:
# Fetch market data for every candidate benchmark over one shared date window
start_date, end_date = '2015-01-01', '2025-01-01'

# S&P 500 index
data_sp500 = get_market_data('^GSPC', start_date, end_date)

# Dow Jones index
data_dj = get_market_data('^DJI', start_date, end_date)

# Nasdaq-100 index
data_ndx = get_market_data('^NDX', start_date, end_date)

# iShares ETF requested under the ticker 'IWY'
# NOTE(review): this was previously described as the "iShares U.S. Technology ETF",
# whose ticker is presumably 'IYW' rather than 'IWY' -- confirm which fund is intended.
# The ticker is left untouched here.
data_iwy = get_market_data('IWY', start_date, end_date)


In [4]:
# Gather the closing prices of every benchmark in a single table.
# The S&P 500 index defines the rows; the other series are aligned to it on assignment.
close_sources = {
    'SP500': data_sp500,
    'DJ': data_dj,
    'NDX': data_ndx,
    'IWY': data_iwy,
}

benchmark_prices = pd.DataFrame(index=data_sp500.index)
for label, market_data in close_sources.items():
    benchmark_prices[label] = market_data['close']

benchmark_prices

In [5]:
# Calculate Logarithmic Returns
def log_returns(
        price_series: "pd.Series | pd.DataFrame"
) -> "pd.Series | pd.DataFrame":
    """Compute period-over-period logarithmic returns.

    Each observation is divided by the previous one (``shift(1)``) and the
    natural logarithm of that ratio is taken, i.e. ``r_t = ln(P_t / P_{t-1})``.

    Parameters
    ----------
    price_series : pd.Series or pd.DataFrame
        Prices ordered in time. A DataFrame is processed column by column,
        which is how this notebook calls the function.

    Returns
    -------
    pd.Series or pd.DataFrame
        Object of the same type and shape as the input. The first row is NaN
        because it has no previous observation to compare against.
    """
    return np.log(price_series / price_series.shift(1))

In [6]:
# Log returns of every benchmark.
# Rows holding a NaN in any column are discarded (the first row always is,
# since it has no previous price).
benchmark_returns = log_returns(benchmark_prices).dropna()

benchmark_returns

In [7]:
# Line chart of the cumulative log returns, one line per benchmark
cumulative_returns = benchmark_returns.cumsum()

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(cumulative_returns, label=benchmark_returns.columns, alpha=1)

# Title, axis labels and legend
ax.set_title('Benchmarks Cumulative Returns Time Series')
ax.set_xlabel('Time Index')
ax.set_ylabel('$r_t$')
ax.legend()

# Grid on, then render
ax.grid(True)
plt.show()

In [8]:
# Average daily log return of each benchmark, expressed in percent
(benchmark_returns * 100).mean()

In [9]:
# Annualized mean return in percent: the daily mean scaled by 252 periods
# (presumably the usual number of trading days in a year)
periods_per_year = 252
(benchmark_returns * 100).mean() * periods_per_year

In [10]:
# Daily volatility (standard deviation of the log returns), in percent
(benchmark_returns * 100).std()

In [11]:
# Annualized volatility in percent: the daily standard deviation scaled by sqrt(252)
annualization_factor = np.sqrt(252)
(benchmark_returns * 100).std() * annualization_factor

In [12]:
# Grid layout: two panels per row, with as many rows as the benchmarks require.
# n_cols and n_rows are reused by the plotting cells further down.
n_cols = 2
n_rows = int(np.ceil(len(benchmark_returns.columns) / n_cols))

# Build the figure and flatten the axes grid into a 1D array
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4 * n_rows))
axes = axes.flatten()

return_columns = list(benchmark_returns.columns)

# One panel per benchmark: the return series plus its mean and a +/- 3 sigma band
for ax, column in zip(axes, return_columns):
    series = benchmark_returns[column].dropna()
    center = series.mean()
    spread = series.std()

    ax.plot(series, label=f'{column} Returns', alpha=1)
    ax.axhline(y=center, color='black', linestyle='dashed', label='Mean')
    ax.axhline(y=center + 3 * spread, color='red', linestyle='dashed', label='+3 Std Dev')
    ax.axhline(y=center - 3 * spread, color='red', linestyle='dashed', label='-3 Std Dev')

    ax.set_title(f'{column} Returns Time Series')
    ax.set_xlabel('Time Index')
    ax.set_ylabel('Returns')
    ax.legend()
    ax.grid(True)

# Drop the panels that were not assigned a benchmark
for spare_ax in axes[len(return_columns):]:
    fig.delaxes(spare_ax)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

In [13]:
# Price histograms, one panel per benchmark.
# The grid dimensions (n_rows, n_cols) come from the returns time-series cell.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows))
axes = axes.flatten()

price_columns = list(benchmark_prices.columns)

for ax, column in zip(axes, price_columns):
    prices = benchmark_prices[column].dropna()
    center = prices.mean()
    spread = prices.std()

    ax.hist(prices, bins=30, alpha=0.5, edgecolor='black')

    # Mean and median markers
    ax.axvline(x=center, color='black', linestyle='dashed', label='Mean')
    ax.axvline(x=prices.median(), color='red', linestyle='dashed', label='Median')

    # Unlabelled grey lines at one standard deviation on each side of the mean
    ax.axvline(x=center + spread, color='grey', linestyle='dashed')
    ax.axvline(x=center - spread, color='grey', linestyle='dashed')

    ax.set_title(f'{column} Price Histogram')
    ax.set_xlabel('Price')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True)

# Remove the panels left over when the grid has more slots than columns
for spare_ax in axes[len(price_columns):]:
    fig.delaxes(spare_ax)

# Tidy the spacing and render
plt.tight_layout()
plt.show()

In [14]:
# Return histograms with a fitted normal curve, one panel per benchmark.
# The grid dimensions (n_rows, n_cols) come from the returns time-series cell.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 4 * n_rows))
axes = axes.flatten()

# Loop through each column
for i, column in enumerate(benchmark_returns.columns):
    data = benchmark_returns[column].dropna()
    mu = data.mean()
    sigma = data.std()

    # Histogram of the returns, normalised so it is comparable with the density curve.
    # The legend label names returns, since that is what is being plotted.
    axes[i].hist(data, bins=30, density=True, color='blue', alpha=0.5, edgecolor='black', label=f'{column} Returns')

    # Normal density with the sample mean and standard deviation, over the observed range
    x = np.linspace(data.min(), data.max(), 100)
    y = norm.pdf(x, mu, sigma)
    axes[i].plot(x, y, color='black', linestyle='solid', linewidth=2, label='Normal Distribution')

    # Reference lines: mean, median and one standard deviation on each side of the mean
    axes[i].axvline(x=mu, color='black', linestyle='dashed', label='Mean')
    axes[i].axvline(x=data.median(), color='red', linestyle='dashed', label='Median')
    axes[i].axvline(x=mu + sigma, color='grey', linestyle='dashed')
    axes[i].axvline(x=mu - sigma, color='grey', linestyle='dashed')

    # Titles and labels (the x-axis holds returns, not prices)
    axes[i].set_title(f'{column} Histogram with Normal Distribution')
    axes[i].set_xlabel('Returns')
    axes[i].set_ylabel('Density')
    axes[i].legend()
    axes[i].grid(True)

# Remove any unused axes
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

# Adjust layout
plt.tight_layout()
plt.show()

In [15]:
# Multiples of the standard deviation to inspect
std_ranges = [1, 2, 3]
benchmark_cols = benchmark_returns.columns

# For every benchmark, the share (in percent) of returns that fall inside
# mean +/- k * std, bounds included, for each k in std_ranges
results = {}
for name in benchmark_cols:
    series = benchmark_returns[name].dropna()
    center, spread = series.mean(), series.std()
    results[name] = [
        series.between(center - k * spread, center + k * spread).mean() * 100
        for k in std_ranges
    ]

# Grouped bar chart: one group per sigma range, one bar per benchmark
positions = np.arange(len(std_ranges))
bar_width = 0.2

fig, ax = plt.subplots(figsize=(10, 6))

for offset, name in enumerate(benchmark_cols):
    ax.bar(positions + offset * bar_width, results[name], width=bar_width, label=name)

# Centre each tick under its group of bars
ax.set_xticks(positions + bar_width * (len(benchmark_cols) - 1) / 2)
ax.set_xticklabels([f'±{k}σ' for k in std_ranges])
ax.set_ylim(0, 100)
ax.set_ylabel('Percentage of Observations (%)')
ax.set_title('Percentage of Returns Within ±1σ, ±2σ, ±3σ')
ax.legend()
ax.grid(True, axis='y')

plt.tight_layout()
plt.show()

In [16]:
# For some purposes of this course, we are selecting the SP500
sp500_returns = benchmark_returns['SP500'].dropna()

# Mean and variance of the returns (pandas)
mean = sp500_returns.mean()
variance = sp500_returns.var()

# Skewness and kurtosis of the returns (scipy.stats, default settings)
skewness = skew(sp500_returns)
kurt = kurtosis(sp500_returns)

print(f"Mean: {mean}")
print(f"Variance: {variance}")
print(f"Skewness: {skewness}")
print(f"Kurtosis: {kurt}")

In [17]:
# Jarque-Bera normality test on the SP500 returns
sp500_returns = benchmark_returns['SP500'].dropna()
jb_stat, p_value = jarque_bera(sp500_returns)

print(f"Jarque-Bera Stat: {jb_stat}")
print(f"p-value: {p_value}")

In [18]:
# Q-Q plot of the SP500 returns against a normal distribution
sp500_returns = benchmark_returns['SP500']
probplot(sp500_returns, dist="norm", plot=plt)
plt.title('Q-Q Plot')
plt.show()

In [21]:
# Store the Benchmark
# rename() hands back a separate Series carrying the new name, so the object
# returned by benchmark_returns['SP500'] is not modified and re-running earlier
# cells keeps seeing the column under its original name.
benchmark = benchmark_returns['SP500'].rename('sp_500')

benchmark

In [22]:
# Write the benchmark returns to ../additional_data/sp500.csv.
# The path is assembled with os.path.join so the separator matches the host OS;
# a hard-coded backslash path is treated as a single file name on POSIX systems.
benchmark.to_csv(os.path.join(os.pardir, "additional_data", "sp500.csv"))