In [None]:
import pandas as pd
import numpy as np
import yfinance as yf
import requests
from bs4 import BeautifulSoup
import sqlite3
from sqlalchemy import create_engine
from datetime import datetime, timedelta
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}
response = requests.get(url, headers=headers)
tables = pd.read_html(response.text)
sp500_table = tables[0]

print("Successfully extracted table from Wikipedia")
print(f"Table shape: {sp500_table.shape}")
print(f"Columns: {list(sp500_table.columns)}\n")

Successfully extracted table from Wikipedia
Table shape: (503, 8)
Columns: ['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry', 'Headquarters Location', 'Date added', 'CIK', 'Founded']



In [None]:
# Extract tickers and clean them
tickers = sp500_table['Symbol'].str.replace('.', '-').tolist()
print(f"Total number of tickers extracted: {len(tickers)}")
print(f"First 10 tickers: {tickers[:10]}")
print(f"Last 10 tickers: {tickers[-10:]}\n")

Total number of tickers extracted: 503
First 10 tickers: ['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']
Last 10 tickers: ['WMB', 'WTW', 'WDAY', 'WYNN', 'XEL', 'XYL', 'YUM', 'ZBRA', 'ZBH', 'ZTS']



In [None]:
end_date = datetime.now()
start_date = end_date - timedelta(days=3*365)

print(f"Date range: {start_date.date()} to {end_date.date()}")

Date range: 2022-10-13 to 2025-10-12


In [None]:
sp500_index = yf.download('^GSPC', start=start_date, end=end_date, progress=False)
print(f"✓ S&P 500 Index downloaded: {len(sp500_index)} days of data\n")

✓ S&P 500 Index downloaded: 751 days of data



In [None]:
stock_data = yf.download(tickers, start=start_date, end=end_date, group_by='ticker', progress=True)

print(f"Downloaded: {stock_data.shape}")

[*********************100%***********************]  503 of 503 completed


Downloaded: (751, 2515)


In [None]:
# Step 4: Extract Adjusted Close Prices and Calculate Returns
# Reference: Course materials on return calculation
print("="*60)
print("STEP 4: Extracting Prices and Calculating Returns")
print("="*60)

# Extract Close prices from stocks (we'll call this adj_close_prices for consistency)
print("Extracting 'Close' prices from MultiIndex columns...")
adj_close_prices = stock_data.xs('Close', level=1, axis=1)

print(f"✓ Close prices extracted: {adj_close_prices.shape}")
print(f"  Tickers: {adj_close_prices.shape[1]}")
print(f"  Days: {adj_close_prices.shape[0]}")
print(f"  First 5 tickers: {adj_close_prices.columns[:5].tolist()}")
print(f"  Date range: {adj_close_prices.index[0].date()} to {adj_close_prices.index[-1].date()}")

# Calculate returns
print("\nCalculating returns...")
returns = adj_close_prices.pct_change().dropna()

# For S&P 500, use 'Close' column
sp500_returns = sp500_index['Close'].pct_change().dropna()

print(f"✓ Stock returns calculated: {returns.shape}")
print(f"✓ S&P 500 returns calculated: {sp500_returns.shape}")
print(f"  Date range: {returns.index[0].date()} to {returns.index[-1].date()}")

# Show sample of returns
print(f"\nSample stock returns (first 3 tickers, first 3 days):")
print(returns.iloc[:3, :3])

print(f"\nSample S&P 500 returns (first 5 days):")
print(sp500_returns.head())

print("="*60 + "\n")

STEP 4: Extracting Prices and Calculating Returns
Extracting 'Close' prices from MultiIndex columns...
✓ Close prices extracted: (751, 503)
  Tickers: 503
  Days: 751
  First 5 tickers: ['GLW', 'CTRA', 'GILD', 'PFE', 'SJM']
  Date range: 2022-10-13 to 2025-10-10

Calculating returns...
✓ Stock returns calculated: (386, 503)
✓ S&P 500 returns calculated: (750, 1)
  Date range: 2024-03-28 to 2025-10-10

Sample stock returns (first 3 tickers, first 3 days):
Ticker           GLW      CTRA      GILD
Date                                    
2024-03-28 -0.001817  0.000718  0.003287
2024-04-01 -0.008799  0.004663 -0.005051
2024-04-02 -0.004897  0.010353 -0.010840

Sample S&P 500 returns (first 5 days):
Ticker         ^GSPC
Date                
2022-10-14 -0.023663
2022-10-17  0.026480
2022-10-18  0.011428
2022-10-19 -0.006672
2022-10-20 -0.007951



In [None]:
db_name = 'sp500_data.db'
engine = create_engine(f'sqlite:///{db_name}')

# Store data in database
adj_close_prices.to_sql('adj_close_prices', engine, if_exists='replace', index=True)
print(f"✓ Stored adj_close_prices: {adj_close_prices.shape}")

returns.to_sql('returns', engine, if_exists='replace', index=True)
print(f"✓ Stored returns: {returns.shape}")

sp500_index.to_sql('sp500_index', engine, if_exists='replace', index=True)
print(f"✓ Stored sp500_index: {sp500_index.shape}")

sp500_returns.to_sql('sp500_returns', engine, if_exists='replace', index=True)

✓ Stored adj_close_prices: (751, 503)
✓ Stored returns: (386, 503)
✓ Stored sp500_index: (751, 5)


750

In [None]:
# Standardize returns
scaler = StandardScaler()
returns_standardized = scaler.fit_transform(returns)
print(f"✓ Data standardized")

# Apply PCA with 5 components
pca = PCA(n_components=5)
principal_components = pca.fit_transform(returns_standardized)
print(f"✓ PCA completed: {principal_components.shape}\n")

# Variance explained
variance_explained = pca.explained_variance_ratio_
cumulative_variance = np.cumsum(variance_explained)

print("Variance Explained by Each Component:")
print("-" * 50)
for i in range(5):
    print(f"  PC{i+1}: {variance_explained[i]*100:.2f}% (Cumulative: {cumulative_variance[i]*100:.2f}%)")
print("-" * 50)
print(f"Total: {cumulative_variance[-1]*100:.2f}%\n")

# Create DataFrame with PC returns
pc_returns = pd.DataFrame(
    principal_components,
    index=returns.index,
    columns=[f'PC{i+1}' for i in range(5)]
)
print(f"✓ PC returns DataFrame created: {pc_returns.shape}")
print("="*60 + "\n")

✓ Data standardized
✓ PCA completed: (386, 5)

Variance Explained by Each Component:
--------------------------------------------------
  PC1: 28.84% (Cumulative: 28.84%)
  PC2: 8.77% (Cumulative: 37.62%)
  PC3: 3.58% (Cumulative: 41.20%)
  PC4: 2.53% (Cumulative: 43.73%)
  PC5: 1.92% (Cumulative: 45.64%)
--------------------------------------------------
Total: 45.64%

✓ PC returns DataFrame created: (386, 5)



In [None]:
common_dates = pc_returns.index.intersection(sp500_returns.index)
pc_returns_aligned = pc_returns.loc[common_dates]
sp500_returns_aligned = sp500_returns.loc[common_dates]

print(f"Aligned data: {len(common_dates)} dates\n")

# Run regressions for each PC
regression_results = {}

for i in range(1, 6):
    pc_name = f'PC{i}'
    print(f"{pc_name} Regression:")
    print("-" * 50)

    # Prepare regression data
    X = sm.add_constant(sp500_returns_aligned.values)  # Add intercept
    y = pc_returns_aligned[pc_name].values

    # Fit OLS model
    model = sm.OLS(y, X).fit()

    # Store results
    regression_results[pc_name] = {
        'alpha': model.params[0],
        'beta': model.params[1],
        'alpha_pvalue': model.pvalues[0],
        'beta_pvalue': model.pvalues[1],
        'r_squared': model.rsquared
    }

    # Display results
    print(f"Alpha: {model.params[0]:.6f} (p-value: {model.pvalues[0]:.4f})")
    print(f"Beta:  {model.params[1]:.6f} (p-value: {model.pvalues[1]:.4f})")
    print(f"R^2:    {model.rsquared:.4f}\n")

print("="*60)
print("Summary Table:")
print("="*60)
summary_df = pd.DataFrame({
    'PC': [f'PC{i}' for i in range(1, 6)],
    'Alpha': [regression_results[f'PC{i}']['alpha'] for i in range(1, 6)],
    'Beta': [regression_results[f'PC{i}']['beta'] for i in range(1, 6)],
    'Alpha p-val': [regression_results[f'PC{i}']['alpha_pvalue'] for i in range(1, 6)],
    'Beta p-val': [regression_results[f'PC{i}']['beta_pvalue'] for i in range(1, 6)],
    'R^2': [regression_results[f'PC{i}']['r_squared'] for i in range(1, 6)]
})
print(summary_df.to_string(index=False))

Aligned data: 386 dates

PC1 Regression:
--------------------------------------------------
Alpha: -0.629136 (p-value: 0.0269)
Beta:  994.551780 (p-value: 0.0000)
R^2:    0.7884

PC2 Regression:
--------------------------------------------------
Alpha: 0.113954 (p-value: 0.7259)
Beta:  -180.140603 (p-value: 0.0000)
R^2:    0.0850

PC3 Regression:
--------------------------------------------------
Alpha: -0.035841 (p-value: 0.8675)
Beta:  56.658086 (p-value: 0.0047)
R^2:    0.0206

PC4 Regression:
--------------------------------------------------
Alpha: 0.035498 (p-value: 0.8435)
Beta:  -56.115474 (p-value: 0.0008)
R^2:    0.0286

PC5 Regression:
--------------------------------------------------
Alpha: 0.001408 (p-value: 0.9929)
Beta:  -2.225102 (p-value: 0.8800)
R^2:    0.0001

Summary Table:
 PC     Alpha        Beta  Alpha p-val    Beta p-val      R^2
PC1 -0.629136  994.551780     0.026906 1.393827e-131 0.788440
PC2  0.113954 -180.140603     0.725934  5.285420e-09 0.085034
PC3 -0.0