### Fama and French Factor Model: Understanding the Factors ### 

In [1]:
# Import Libraries

# Data Management
import pandas as pd
import numpy as np

# Plots
import matplotlib.pyplot as plt

# Handle Files
import sys
import os

# Import Local Functions
sys.path.append(os.path.abspath("../source"))
from config import get_tickers
from data_downloader import get_market_data
from portfolios_toolkit import calculate_analytics

### Size Factor ###

In [2]:
def build_returns_dataframe(
        tickers: list, 
        start_date='2015-01-01', 
        end_date='2025-01-01'
) -> pd.DataFrame:
    """Download the return series of several tickers into one DataFrame.

    Parameters
    ----------
    tickers : list
        Iterable of ticker symbols. Missing entries (None / NaN, e.g. the
        padding of a CSV column) are skipped.
    start_date, end_date : str
        Date bounds forwarded unchanged to ``get_market_data``.

    Returns
    -------
    pd.DataFrame
        One column per ticker that returned data, in input order. Rows are
        the union of all the tickers' dates, sorted; a ticker without an
        observation on a given date holds NaN there. Empty when no ticker
        returned data.
    """
    # Collect every series first and join once at the end: assigning columns
    # one by one would reindex each ticker onto the first ticker's dates and
    # silently drop every other date.
    series_by_ticker = {}

    for ticker in tickers:
        if pd.isna(ticker):
            continue

        df = get_market_data(ticker, start_date, end_date, returns=True)
        if df is None or df.empty:
            continue

        series_by_ticker[ticker] = df['returns']
        print(f'Data Ready for {ticker}')

    # pd.concat raises on an empty mapping
    if not series_by_ticker:
        return pd.DataFrame()

    return pd.concat(series_by_ticker, axis=1).sort_index()

In [3]:
# Load the tickers grouped by market capitalization and discard the
# 'Unnamed: 0' column (presumably an index saved with the CSV)
tickers_df = (
    pd.read_csv('../additional_data/mktcap_companies.csv')
    .drop(columns='Unnamed: 0')
)

tickers_df

In [4]:
# Map every size bucket to its '<size>_companies' column of tickers_df
company_sizes = {
    size: tickers_df[f'{size}_companies']
    for size in ('mega', 'large', 'mid', 'small', 'micro')
}

In [5]:
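# Get the data
# (disabled: presumably the portfolios built from this data are the ones loaded from CSV further down)
#returns_by_size = {}

#for size, tickers in company_sizes.items():
#    # Pass only the tickers: a second positional argument would be read as start_date
#    returns_by_size[size] = build_returns_dataframe(tickers)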

In [6]:
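# Create the Portfolios: equal-weighted mean of the available returns in each size bucket
# (disabled together with the download cell above)
#portfolios_df = pd.DataFrame({
#    f'{size}_portfolio': df.mean(axis=1, skipna=True)
#    for size, df in returns_by_size.items()
#})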

In [7]:
# Load the size portfolios from CSV. Forward slashes keep the relative path
# valid on every operating system (a backslash path only resolves on Windows).
portfolios_df = (
    pd.read_csv('../additional_data/mktcap_portfolios.csv')
    .rename(columns={'Unnamed: 0': 'Date'})
    .set_index('Date')
)

# Turn the date labels into a DatetimeIndex
portfolios_df.index = pd.to_datetime(portfolios_df.index)

portfolios_df

In [8]:
# One line per portfolio column, showing the running sum of its returns
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(portfolios_df.cumsum(), label=portfolios_df.columns, alpha=1)

# Title, axis labels, legend and grid
ax.set_title('Cumulative Returns (Size Adjusted) Time Series')
ax.set_xlabel('Time')
ax.set_ylabel('Returns')
ax.legend()
ax.grid()

# Render
plt.show()

In [9]:
# Analytics Table
# calculate_analytics is imported from the local portfolios_toolkit module; its
# output is not visible from this notebook — presumably summary statistics for
# each portfolio column (TODO confirm against the module).
size_analytics_table = calculate_analytics(portfolios_df)

size_analytics_table

In [10]:
# Primitive SMB (small-minus-big) premium: the average of the small and micro
# portfolios minus the average of the mega and large portfolios
SMB_categories = (
    0.5 * (portfolios_df['small_portfolio'] + portfolios_df['micro_portfolio'])
    - 0.5 * (portfolios_df['mega_portfolio'] + portfolios_df['large_portfolio'])
)

In [11]:
# Running sum of the category-based SMB premium
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(SMB_categories.cumsum(), label='SMB Premium', alpha=1)

# Title, axis labels, legend and grid
ax.set_title('Cumulative SMB Premium Time Series')
ax.set_xlabel('Time')
ax.set_ylabel('Returns')
ax.legend()
ax.grid()

# Render
plt.show()

In [12]:
# Mean return of each portfolio scaled by 252 (assumed trading days per year,
# i.e. daily data) and by 100 to express it in percent
annualized_returns = (portfolios_df.mean() * 252 * 100).rename('annualized_returns')

annualized_returns

In [13]:
# Annualized Volatility
# Standard deviation scaled by sqrt(252) and expressed in percent (x100), the
# same unit used for annualized_returns, so both can share one axis.
annualized_volatility = portfolios_df.std() * np.sqrt(252) * 100
annualized_volatility.name = 'annualized_volatility'

annualized_volatility

In [14]:
# Annualized return of each portfolio, drawn in reversed column order
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(annualized_returns.iloc[::-1], label='Annualized Returns', alpha=1, marker='o')

# Title, axis labels, legend and grid
ax.set_title('Annualized Returns by Size')
ax.set_xlabel('Size')
ax.set_ylabel('Annualized Returns')
ax.legend()
ax.grid()

# Render
plt.show()

In [15]:
# Put annualized returns and volatility side by side, in reversed portfolio order
data = pd.DataFrame({
    'returns': annualized_returns.iloc[::-1],
    'volatility': annualized_volatility.iloc[::-1],
}, index=annualized_returns.iloc[::-1].index)

# The horizontal bars show annualized volatility (a dispersion measure),
# not the standard error of the mean return
errors = data['volatility']

# Create the plot
fig, ax = plt.subplots(figsize=(8, 5))

ax.errorbar(
    x=data['returns'],                # x-values (annualized returns)
    y=range(len(data)),               # one row per portfolio
    xerr=errors,                      # +/- annualized volatility
    fmt='o',                          # circular markers for the returns
    ecolor='gray',                    # color of the error bars
    capsize=5,                        # small caps on error bars
    elinewidth=2,                     # thickness of the error bars
    markeredgewidth=2                 # thickness of the circle edge
)

# Customize the plot
ax.set_yticks(range(len(data)))
ax.set_yticklabels(data.index)
ax.axvline(0, color='red', linestyle='--')  # zero-return reference line
ax.set_xlabel('Annualized Returns')
ax.set_title('Annualized Returns with Volatility Bars')

plt.tight_layout()
plt.show()

### The Look-Ahead-Bias ###

The main problem with this categorization is that it relies on present-day classifications instead of those that were valid at the time. As a result, there’s a risk that today’s Large Cap companies weren’t considered Large in prior years. By not using historically accurate data when constructing our portfolio, we introduce a Look-Ahead Bias. Here, we are assuming in 2025 that today’s large companies were also large in 2020 or 2015. This means we are using current information to explain past events—akin to using future knowledge to predict the present. But of course, we don’t have access to future information today, do we?

In [27]:
# The premiums can be downloaded from the Fama and French website; here they
# are read from a local CSV. Forward slashes keep the relative path valid on
# every operating system (a backslash path only resolves on Windows).
premiums_df = pd.read_csv('../additional_data/famafrench_premiums.csv').set_index('Date')
premiums_df.index = pd.to_datetime(premiums_df.index)

# Columns are renamed by position, so the file must hold exactly these four
# series in this order
premiums_df.columns = ['mkt_premium', 'smb_premium', 'hml_premium', 'risk_free_rate']

# Divide by 100 (presumably the file stores percentages — TODO confirm) and
# keep observations up to December 2024
premiums_df = premiums_df.div(100)
premiums_df = premiums_df.loc[:'2024-12']

premiums_df

In [28]:
# Running sums of the category-based SMB premium and the Fama & French one
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(SMB_categories.cumsum(), label='SMB Premium (Using Categories)', alpha=1)
ax.plot(premiums_df['smb_premium'].cumsum(), label='SMB Premium (by Fama & French)', alpha=1)

# Title, axis labels, legend and grid
ax.set_title('Cumulative SMB Premium Time Series')
ax.set_xlabel('Time')
ax.set_ylabel('Returns')
ax.legend()
ax.grid()

# Render
plt.show()

In [32]:
# Scale the mean of each SMB series by 252 to annualize it, then print the gap
# between the category-based estimate and the Fama & French one
smb_cat_annual_rate, smb_ff_annual_rate = (
    series.mean() * 252
    for series in (SMB_categories, premiums_df['smb_premium'])
)

difference = smb_cat_annual_rate - smb_ff_annual_rate

print(difference)

Naturally, a selection or allocation effect could also explain the difference in returns, but look-ahead bias is another factor that must be considered.

### Calculate SMB using ETFs ###

In [33]:
# Vanguard offers nine ETFs categorized by size and by style (value, blend, growth)

# "VTV",  # LargeCap Value
# "VOE",  # MidCap Value
# "VBR",  # Small Cap Value
# "VV",   # LargeCap Blend
# "VO",   # MidCap Blend
# "VB",   # SmallCap Blend
# "VUG",  # LargeCap Growth
# "VOT",  # MidCap Growth
# "VBK"   # SmallCap Growth

# ":)"

In [34]:
# Tickers
# get_tickers is imported from the local config module; mod="5.1" presumably
# selects the ETF list for this section — the cells below read the columns
# VBR, VB, VBK, VTV, VV and VUG (TODO confirm against config).
tickers = get_tickers(mod="5.1")

tickers

In [35]:
# Import data
# Download the 'returns' series of every ticker, keyed by ticker, and join
# them with a single concat call instead of growing a DataFrame inside the loop
returns_by_ticker = {
    ticker: get_market_data(
        ticker=ticker,
        start_date='2015-01-01',
        end_date='2025-01-01',
        returns=True
    )['returns']
    for ticker in tickers
}

# One column per ticker, rows aligned on the union of dates.
# pd.concat raises on an empty mapping, so fall back to an empty frame.
df_returns = pd.concat(returns_by_ticker, axis=1) if returns_by_ticker else pd.DataFrame()

In [36]:
# Approximate the SMB premium with ETFs: the average of the three small-cap
# funds (VBR, VB, VBK) minus the average of the three large-cap funds (VTV, VV, VUG)
SMB_etfs = (
    1/3 * (df_returns['VBR'] + df_returns['VB'] + df_returns['VBK'])
    - 1/3 * (df_returns['VTV'] + df_returns['VV'] + df_returns['VUG'])
)

In [37]:
# Running sums of the ETF-based SMB premium and the Fama and French one
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(SMB_etfs.cumsum(), label='SMB Premium (Using ETFs)', alpha=1)
ax.plot(premiums_df['smb_premium'].cumsum(), label='SMB Premium (by Fama and French)', alpha=1)

# Title, axis labels, legend and grid
ax.set_title('Cumulative SMB Premium Time Series')
ax.set_xlabel('Time')
ax.set_ylabel('Returns')
ax.legend()
ax.grid()

# Render
plt.show()

In [38]:
# Calculate the Correlation:
# Series.corr aligns the two series on their index before computing the
# correlation, so only dates present in both series contribute.
smb_correlation = SMB_etfs.corr(premiums_df['smb_premium'])

smb_correlation