## Imports

In [1]:
# standard imports
import pickle
import time
from datetime import datetime, timedelta
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# webscraping
import requests
from bs4 import BeautifulSoup

# stocks
import yfinance as yf
import cvxopt as opt
from cvxopt import blas, solvers
import cufflinks
import mplfinance as mpf

# import plotly.plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
solvers.options['show_progress'] = False

# ignore warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# import sys
# !{sys.executable} -m pip install mplfinance

## Basic Goal

Plan: 
1. Data gathering
      - Randomly select 30 stocks from the S&P 500 
      - Get data for each of the stocks from the past 3 years using yfinance.  
      - Get the daily Fama-French factors from the Kenneth French website.
2. Implement the PCA Markowitz portfolio optimization
      - PCA on normalized returns
      - Check if PC1 is significant using Tracy-Widom
      - Get the portfolio that corresponds to PC1
3. Implement the Fama-French three-factor model
      - For each stock, run the standard time series regression for the Fama-French model. 
      - Get the covariance matrix from the residuals 
4. Compare the 2 portfolios against the efficient frontier

### Data Gathering

In [3]:
# getting the stocks
# Scrape the S&P 500 constituents table from Wikipedia and draw a reproducible
# random sample of 30 ticker symbols from its 'Symbol' column.

# A browser-like User-Agent is sent with the request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    )
}

# The timeout keeps a stalled connection from hanging the notebook indefinitely.
response = requests.get(
    "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
    headers=headers,
    timeout=30
)
response.raise_for_status()

# Wrap the HTML in StringIO: passing a literal HTML string to read_html is
# deprecated (the FutureWarning is silenced by the filter in the imports cell).
tables = pd.read_html(StringIO(response.text))

# Fail loudly instead of leaving `stocks_df` undefined (or stale from an earlier
# kernel run) when the page layout changes or nothing usable was parsed.
if len(tables) == 0 or 'Symbol' not in tables[0].columns:
    raise ValueError(
        "Could not find the S&P 500 constituents table (no 'Symbol' column) "
        "in the downloaded page."
    )
stocks_df = tables[0]

# randomly selecting 30 stocks (fixed seed so the same tickers are drawn each run)
random_stocks = stocks_df['Symbol'].sample(n=30, random_state=42)

In [4]:
# getting closing prices for the 30 stocks with batching
# start_date / end_date are 'YYYY-MM-DD' strings bounding the sample window.
# They are the start/end arguments of the (currently commented-out) download
# helper below and are reused later to slice the Fama-French factor table.
start_date = '2022-08-31'
end_date = '2025-08-31'

# def download_stocks_in_batches(tickers, batch_size=5, delay=1):
#     """
#     Download stock data in batches to avoid rate limiting
#     """
#     all_data = {}
    
#     for i in range(0, len(tickers), batch_size):
#         batch = tickers[i:i + batch_size]
#         print(f"Downloading batch {i//batch_size + 1}: {batch}")
        
#         try:
#             # Download the batch
#             batch_data = yf.download(
#                 batch,
#                 start=start_date,
#                 end=end_date,
#                 progress=False
#             )
            
#             # Extract closing prices for this batch
#             if not batch_data.empty and 'Close' in batch_data.columns:
#                 closes = batch_data['Close']
#                 # Handle single ticker case (returns Series instead of DataFrame)
#                 if isinstance(closes, pd.Series):
#                     all_data[batch[0]] = closes
#                 else:
#                     for ticker in closes.columns:
#                         all_data[ticker] = closes[ticker]
#                 print(f"Successfully downloaded {len(batch)} stocks")
#             else:
#                 print(f"No data returned for batch: {batch}")
            
#         except Exception as e:
#             print(f"Error downloading batch {batch}: {e}")
        
#         # Add delay between batches to avoid rate limiting
#         if i + batch_size < len(tickers):
#             print(f"Waiting {delay} seconds before next batch...")
#             time.sleep(delay)
    
#     # Combine all data into a single DataFrame
#     if all_data:
#         return pd.DataFrame(all_data)
#     else:
#         return pd.DataFrame()

# # Download in batches of 5 stocks with a 15-second delay between batches
# closing_df = download_stocks_in_batches(
#     random_stocks.tolist(), 
#     batch_size=5, 
#     delay=15
# )

The code above was generated with ChatGPT.

In [5]:
# if not closing_df.empty:
#     closing_df.to_pickle('closing prices.pkl')

# closing_df.head(5)

In [9]:
# Load the cached closing prices from disk.
# NOTE: unpickling executes arbitrary code, so only load pickle files you created yourself.
filename = r'closing prices.pkl'
closing_df = pd.read_pickle(filename)

# Convert prices to log returns: log(P_t / P_{t-1}).
price_ratio = closing_df / closing_df.shift(1)

# Drop only the rows where every ticker is NaN; rows with some NaNs are kept.
closing_df = np.log(price_ratio).dropna(how='all')
closing_df.round(2)





Unnamed: 0_level_0,BR,CINF,DHI,K,LIN,BA,GLW,IDXX,LHX,O,...,BLDR,HCA,HSIC,WMB,WTW,CB,NWS,ROP,UNH,VTRS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-09-01,0.00,0.01,0.00,0.01,-0.01,-0.04,-0.01,0.01,0.00,0.00,...,0.00,0.01,-0.01,-0.01,0.00,0.02,0.00,0.01,0.01,0.00
2022-09-02,-0.03,-0.01,0.00,-0.01,-0.01,-0.01,-0.02,-0.02,-0.00,-0.01,...,-0.00,0.00,-0.02,0.00,-0.01,-0.01,0.00,-0.01,-0.01,-0.01
2022-09-06,0.01,0.00,-0.02,-0.01,-0.00,0.00,-0.01,-0.01,0.01,0.00,...,-0.02,0.00,-0.00,-0.02,0.00,-0.00,-0.03,-0.01,0.00,-0.00
2022-09-07,0.01,0.02,0.02,0.01,0.03,0.02,0.01,0.04,0.01,0.01,...,0.02,0.06,0.02,0.00,0.03,0.02,0.02,0.02,0.01,0.03
2022-09-08,0.01,0.01,0.01,-0.02,-0.01,0.01,0.00,0.03,0.00,-0.03,...,0.00,0.02,0.01,-0.03,0.00,0.01,-0.00,0.00,0.01,-0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-08-25,-0.02,-0.01,-0.01,-0.00,-0.01,-0.01,0.02,-0.01,0.00,-0.01,...,-0.02,-0.01,-0.02,-0.00,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01
2025-08-26,0.00,0.00,-0.00,-0.00,0.01,0.03,0.01,0.00,0.01,-0.00,...,0.01,-0.00,0.01,0.01,-0.01,-0.01,-0.01,-0.01,-0.01,-0.01
2025-08-27,0.00,0.00,-0.01,-0.00,0.00,0.00,-0.00,-0.00,-0.00,0.01,...,-0.01,0.01,-0.00,0.01,-0.01,0.00,-0.01,0.01,0.01,0.00
2025-08-28,-0.00,-0.00,0.01,-0.00,-0.00,0.00,0.02,0.01,0.00,-0.01,...,-0.01,0.01,0.00,0.01,-0.01,0.00,-0.00,-0.00,-0.01,-0.00


In [7]:
# getting fama french factors
# Reads the factor table from 'ff_factors.csv', turns its first column into a
# DatetimeIndex and keeps only the rows inside [start_date, end_date].
# NOTE(review): the YYYY + MM date parsing below suggests these are monthly
# factors, while the plan mentions daily factors -- confirm which is intended.

# Number of leading rows kept from the CSV.
# NOTE(review): presumably this trims trailing non-data rows -- confirm against
# the file, and update it whenever the CSV is refreshed.
N_FF_ROWS = 422

ff_data = (
    pd.read_csv('ff_factors.csv', header=2)
    .rename(columns={'Unnamed: 0': 'date'})
    .iloc[:N_FF_ROWS]
    .copy()  # own the data so the column assignment below does not write into a slice
)

# getting dates
# Build 'YYYY-MM' from the first four and last two characters of each date
# label, vectorised over the whole column instead of a row-by-row loop.
date_text = ff_data['date'].astype(str).str.strip()
ff_data['date'] = pd.to_datetime(date_text.str[:4] + '-' + date_text.str[-2:])

ff_data = ff_data.set_index('date').loc[start_date:end_date]
ff_data

Unnamed: 0_level_0,Mkt-RF,SMB,HML,RF
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-09-01,-9.46,-1.66,1.89,0.19
2022-10-01,6.79,-2.02,4.41,0.23
2022-11-01,7.25,-0.24,-0.28,0.29
2022-12-01,-4.33,2.14,2.51,0.33
2023-01-01,6.92,0.98,-1.89,0.35
2023-02-01,-2.67,-0.04,0.69,0.34
2023-03-01,2.27,-3.23,-6.92,0.36
2023-04-01,1.22,-1.75,1.0,0.35
2023-05-01,-1.61,-0.66,-4.76,0.36
2023-06-01,5.6,-1.58,1.04,0.4


### Implement the PCA Markowitz portfolio optimization

### Implement the Fama-French three-factor model

### Compare the 2 portfolios against the efficient frontier

## Intermediate Goal

Plan:
1. Implement the RIE Markowitz portfolio optimization
      - Estimate the covariance matrix by doing RIE on the log returns
2. Check which eigenvectors are significant using Marchenko-Pastur and pick one to use for the portfolio (this should be the eigenvector of the second-largest eigenvalue).
3. Compare its fit to the efficient frontier.

### Implement the RIE Markowitz portfolio optimization

### Check which eigenvectors are significant using Marchenko-Pastur and pick one to use for the portfolio

### Compare its fit to the efficient frontier.

## Advanced Goal

Plan:
As per the study by Molero-Gonzales et al. (2023), the steps we will follow are as follows:
1. Apply the Fama-French 3 Factor Model on the log returns matrix.
2. Turn the coefficients for each of the factors for each stock in the portfolio into a matrix.
3. Apply RMT.
      - Eigenvalue decomposition
      - Identify the significant eigenvectors using the Tracy-Widom distribution.
4. Create a portfolio out of the significant factors and the residuals.
5. Compare its performance against the matrix in the Basic Goal section to predict stock risk.

### Apply the Fama-French 3 Factor Model on the log returns matrix.

### Turn the coefficients for each of the factors for each stock in the portfolio into a matrix.

### Apply RMT

### Create a portfolio out of the significant factors and the residuals.

### Compare its performance against the matrix in the Basic Goal section to predict stock risk.