# Empirical Asset Pricing - Problem Set 2

Group Member: Victor Xiao, Zi Wang, Sonny Song

## 1. Principal Component Analysis

### 1.a Factor Data

In [1]:
# Packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import warnings
from pandas.tseries.offsets import MonthEnd
warnings.simplefilter('ignore') # 

# Setups
pd.set_option("display.max_rows", 100)

In [20]:
FF5_df = pd.read_csv('data/F-F_Research_Data_5_Factors_2x3_daily.csv', skiprows=3)
Mom_df = pd.read_csv('data/F-F_Momentum_Factor_daily.csv')

In [23]:
FF5_df.head()

Unnamed: 0,Date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,19630701,-0.67,0.02,-0.35,0.03,0.13,0.012
1,19630702,0.79,-0.28,0.28,-0.08,-0.21,0.012
2,19630703,0.63,-0.18,-0.1,0.13,-0.25,0.012
3,19630705,0.4,0.09,-0.28,0.07,-0.3,0.012
4,19630708,-0.63,0.07,-0.2,-0.27,0.06,0.012


In [24]:
Mom_df.head()

Unnamed: 0,Date,Mom
0,19261103,0.56
1,19261104,-0.5
2,19261105,1.17
3,19261106,-0.03
4,19261108,-0.01


In [25]:
# For the FF5_df DataFrame
columns_to_convert = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
FF5_df[columns_to_convert] = FF5_df[columns_to_convert] / 100

Mom_df.columns = Mom_df.columns.str.strip()

# For the Mom_df DataFrame
Mom_df['Mom'] = Mom_df['Mom'] / 100

In [27]:
# Filter the Mom_df DataFrame to include only data from July 1963 onwards
Mom_df = Mom_df[Mom_df['Date'] >= 19630701]

# Now, Mom_df will start from July 1963
Mom_df

Unnamed: 0,Date,Mom
10319,19630701,-0.0021
10320,19630702,0.0042
10321,19630703,0.0041
10322,19630705,0.0007
10323,19630708,-0.0045
...,...,...
25564,20240125,-0.0031
25565,20240126,-0.0011
25566,20240129,0.0035
25567,20240130,0.0048


### 1.b Monthly Realized Variance

$$
    \hat{\sigma}^2_t = \sum^D_{d=1} R^2_{d,t}
$$

In [39]:
# Convert the 'date' columns to datetime format
FF5_df['Date'] = pd.to_datetime(FF5_df['Date'], format='%Y%m%d')
Mom_df['Date'] = pd.to_datetime(Mom_df['Date'], format='%Y%m%d')

# Adjusted function to calculate monthly realized variance
def calculate_monthly_realized_variance(df, factor_name):
    # Calculate squared daily returns
    df[factor_name + '_squared'] = df[factor_name] ** 2
    
    # Group by year and month, and sum the squared returns with renamed indices
    monthly_variance = df.groupby([df['Date'].dt.year.rename('year'), df['Date'].dt.month.rename('month')])[factor_name + '_squared'].sum()
    
    return monthly_variance

# Apply the function to each factor in FF5_df
factors_FF5 = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
monthly_variances_FF5 = {factor: calculate_monthly_realized_variance(FF5_df, factor) for factor in factors_FF5}

# Apply the function to the Momentum factor in Mom_df
monthly_variance_Mom = calculate_monthly_realized_variance(Mom_df, 'Mom')

# Combine all monthly variances into a single DataFrame
monthly_variances_all = pd.DataFrame(monthly_variances_FF5)
monthly_variances_all['Mom'] = monthly_variance_Mom

# Reset index to make 'year' and 'month' as columns
monthly_variances_all.reset_index(inplace=True)

In [41]:
monthly_variances_all

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,0.000470,0.000045,0.000084,0.000048,0.000074,0.000170
1,1963,8,0.000295,0.000073,0.000096,0.000028,0.000049,0.000102
2,1963,9,0.000305,0.000107,0.000072,0.000058,0.000054,0.000131
3,1963,10,0.000418,0.000172,0.000251,0.000161,0.000193,0.000243
4,1963,11,0.003057,0.000364,0.000168,0.000140,0.000133,0.001297
...,...,...,...,...,...,...,...,...
722,2023,9,0.001068,0.000681,0.000441,0.000289,0.000153,0.000660
723,2023,10,0.001860,0.000946,0.000764,0.000927,0.000309,0.001192
724,2023,11,0.001488,0.002417,0.000871,0.000760,0.000194,0.002206
725,2023,12,0.000961,0.002219,0.001082,0.000394,0.000220,0.001707


### 1.c Scaled Factor Returns

$$
  R^{\text{Scaled}}_t = \frac{\hat{\sigma}^2}{\hat{\sigma}^2_{t-1}} R_t  
$$

where $\hat{\sigma}^2$ is the full-sample average variance in the factor. 

In [36]:
# Selected factors for calculation
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Caculate the full-sample variance for each factor
full_sample_average_variance = monthly_variances_all[factors].mean()

full_sample_average_variance

Mkt-RF    0.002192
SMB       0.000623
HML       0.000714
RMW       0.000336
CMA       0.000301
Mom       0.001260
dtype: float64

In [45]:
# Caculate the full-sample variance for each factor lagged by one month
monthly_variances_lagged = monthly_variances_all.copy()
monthly_variances_lagged[factors] = monthly_variances_all[factors].shift(1)

monthly_variances_lagged

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,,,,,,
1,1963,8,0.000470,0.000045,0.000084,0.000048,0.000074,0.000170
2,1963,9,0.000295,0.000073,0.000096,0.000028,0.000049,0.000102
3,1963,10,0.000305,0.000107,0.000072,0.000058,0.000054,0.000131
4,1963,11,0.000418,0.000172,0.000251,0.000161,0.000193,0.000243
...,...,...,...,...,...,...,...,...
722,2023,9,0.001409,0.000355,0.000668,0.000348,0.000302,0.000647
723,2023,10,0.001068,0.000681,0.000441,0.000289,0.000153,0.000660
724,2023,11,0.001860,0.000946,0.000764,0.000927,0.000309,0.001192
725,2023,12,0.001488,0.002417,0.000871,0.000760,0.000194,0.002206


In [85]:
# importing monthly factor returns

FF5_monthly_df = pd.read_csv('data/F-F_Research_Data_5_Factors_2x3.csv', skiprows=3)

Mom_monthly_df = pd.read_csv('data/F-F_Momentum_Factor.CSV')

Mom_monthly_df = Mom_monthly_df[Mom_monthly_df['Date'] >= 196307]

# For the FF5_df DataFrame
columns_to_convert = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
FF5_monthly_df[columns_to_convert] = FF5_monthly_df[columns_to_convert] / 100

Mom_monthly_df.columns = Mom_monthly_df.columns.str.strip()

# For the Mom_df DataFrame
Mom_monthly_df['Mom'] = Mom_monthly_df['Mom'] / 100

In [86]:
FF5_monthly_df.head()

Unnamed: 0,Date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,196307,-0.0039,-0.0041,-0.0097,0.0068,-0.0118,0.0027
1,196308,0.0507,-0.008,0.018,0.0036,-0.0035,0.0025
2,196309,-0.0157,-0.0052,0.0013,-0.0071,0.0029,0.0027
3,196310,0.0253,-0.0139,-0.001,0.028,-0.0201,0.0029
4,196311,-0.0085,-0.0088,0.0175,-0.0051,0.0224,0.0027


In [87]:
Mom_monthly_df.head()

Unnamed: 0,Date,Mom
438,196307,0.009
439,196308,0.0101
440,196309,0.0019
441,196310,0.0312
442,196311,-0.0074


In [88]:
# Preparing the 'date' columns in factor returns DataFrames
FF5_monthly_df['year'] = FF5_monthly_df['Date'] // 100
FF5_monthly_df['month'] = FF5_monthly_df['Date'] % 100

Mom_monthly_df['year'] = Mom_monthly_df['Date'] // 100
Mom_monthly_df['month'] = Mom_monthly_df['Date'] % 100

In [89]:
Mom_monthly_df.head()

Unnamed: 0,Date,Mom,year,month
438,196307,0.009,1963,7
439,196308,0.0101,1963,8
440,196309,0.0019,1963,9
441,196310,0.0312,1963,10
442,196311,-0.0074,1963,11


In [90]:
def scale_returns(FF5_df, Mom_df, full_sample_variance, monthly_variances_lagged):
    """
    Scales the factor returns using full sample variance and lagged monthly variance.
    """
    scaled_returns = pd.DataFrame(index=FF5_df.index)

    # Calculate scaled returns for each factor in Fama-French 5
    for factor in ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']:
        scaled_returns[factor + '_scaled'] = (full_sample_variance[factor] / monthly_variances_lagged[factor]) * FF5_df[factor]

    # Calculate scaled returns for the Momentum factor
    scaled_returns['Mom_scaled'] = (full_sample_variance['Mom'] / monthly_variances_lagged['Mom']) * Mom_df['Mom']

    # Combine the 'year' and 'month' columns and the scaled returns
    scaled_factor_return = pd.concat([FF5_df[['Date']], scaled_returns], axis=1)

    return scaled_factor_return

In [91]:
scaled_factor_returns = scale_returns(FF5_monthly_df, Mom_monthly_df, full_sample_average_variance, monthly_variances_lagged)
scaled_factor_returns

Unnamed: 0,Date,Mkt-RF_scaled,SMB_scaled,HML_scaled,RMW_scaled,CMA_scaled,Mom_scaled
0,196307,,,,,,
1,196308,0.236256,-0.111257,0.153076,0.025260,-0.014182,
2,196309,-0.116731,-0.044294,0.009709,-0.084487,0.017958,
3,196310,0.181639,-0.081290,-0.009914,0.163328,-0.111256,
4,196311,-0.044616,-0.031795,0.049752,-0.010656,0.034906,
...,...,...,...,...,...,...,...
722,202309,-0.081525,-0.031579,0.016248,0.017970,-0.008278,0.032720
723,202310,-0.065462,-0.036985,0.003075,0.028600,-0.012990,0.004203
724,202311,0.104209,-0.000790,0.015312,-0.014178,-0.009746,-0.007823
725,202312,0.071464,0.018873,0.040382,-0.013528,0.020488,-0.001428


### 1.d Factor returns and Sharpe

In [92]:
# Merge FF5_monthly_df and Mom_monthly_df on 'year' and 'month'
unscaled_factor_returns = pd.merge(FF5_monthly_df, Mom_monthly_df, on=['year', 'month'], how='left', suffixes=('', '_Mom'))

# Drop duplicate or unnecessary columns
unscaled_factor_returns.drop(columns=['Date', 'Date_Mom', 'RF'], inplace=True)

# Rename the 'Mom' column appropriately, if it has a suffix
unscaled_factor_returns.rename(columns={'Mom_Mom': 'Mom', 'Mom': 'Mom'}, inplace=True)

column_order = ['year', 'month', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Reorder the columns in unscaled_factor_returns
unscaled_factor_returns = unscaled_factor_returns[column_order]

unscaled_factor_returns

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,-0.0039,-0.0041,-0.0097,0.0068,-0.0118,0.0090
1,1963,8,0.0507,-0.0080,0.0180,0.0036,-0.0035,0.0101
2,1963,9,-0.0157,-0.0052,0.0013,-0.0071,0.0029,0.0019
3,1963,10,0.0253,-0.0139,-0.0010,0.0280,-0.0201,0.0312
4,1963,11,-0.0085,-0.0088,0.0175,-0.0051,0.0224,-0.0074
...,...,...,...,...,...,...,...,...
722,2023,9,-0.0524,-0.0180,0.0152,0.0186,-0.0083,0.0026
723,2023,10,-0.0319,-0.0404,0.0019,0.0246,-0.0066,0.0173
724,2023,11,0.0884,-0.0012,0.0164,-0.0391,-0.0100,0.0275
725,2023,12,0.0485,0.0732,0.0493,-0.0306,0.0132,-0.0553


In [93]:
# Calculate Average Returns for Raw and Scaled Factors
avg_returns_raw = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].mean()
avg_returns_scaled = scaled_factor_returns[['Mkt-RF_scaled', 'SMB_scaled', 'HML_scaled', 'RMW_scaled', 'CMA_scaled', 'Mom_scaled']].mean()

# Calculate Standard Deviations for Raw and Scaled Factors
std_dev_raw = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].std()
std_dev_scaled = scaled_factor_returns[['Mkt-RF_scaled', 'SMB_scaled', 'HML_scaled', 'RMW_scaled', 'CMA_scaled', 'Mom_scaled']].std()

# Calculate Sharpe Ratios for Raw and Scaled (since these are excess returns, we don't subtract risk-free rate)
sharpe_ratio_raw = avg_returns_raw / std_dev_raw
sharpe_ratio_scaled = avg_returns_scaled / std_dev_scaled

# Combine all calculated metrics into a single DataFrame for reporting
report_df = pd.DataFrame({
    'Factor': ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom'],
    'Avg. Return (Raw)': avg_returns_raw.values,
    'Avg. Return (Scaled)': avg_returns_scaled.values,
    'Sharpe Ratio (Raw)': sharpe_ratio_raw.values,
    'Sharpe Ratio (Scaled)': sharpe_ratio_scaled.values
})

# Display the report DataFrame
report_df

Unnamed: 0,Factor,Avg. Return (Raw),Avg. Return (Scaled),Sharpe Ratio (Raw),Sharpe Ratio (Scaled)
0,Mkt-RF,0.005683,0.014234,0.126441,0.130002
1,SMB,0.002065,0.003139,0.067949,0.043013
2,HML,0.00288,0.009544,0.09619,0.102302
3,RMW,0.002838,0.007887,0.127598,0.154814
4,CMA,0.00271,0.003931,0.130555,0.098542
5,Mom,0.006065,0.008923,0.14394,0.070135


### 1.e Regression: Alpha and t-statistics

### 1.f Principal Component Decomposition

### 1.g Anomaly Analysis

$$
\hat{\sigma}^2_{it} = \phi_i PC_{it} + u_{it}
$$

## 2. Risk Premium

- PCA (Lettau and Pelger, 2020)

### 2.a Fama-French Industry Portfolio

In [102]:
FF48_industry_df = pd.read_csv('data/48_Industry_Portfolios.csv', skiprows=11,nrows=1171)
FF48_industry_df

Unnamed: 0,Date,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
0,192607,2.37,0.12,-99.99,-5.19,1.29,8.65,2.50,50.21,-0.48,...,7.70,1.92,-23.79,0.07,1.87,4.61,-0.54,2.89,-5.77,5.20
1,192608,2.23,2.68,-99.99,27.03,6.50,16.81,-0.76,42.98,-3.58,...,-2.38,4.85,5.39,-0.75,-0.13,11.83,2.57,5.30,0.32,6.76
2,192609,-0.57,1.58,-99.99,4.02,1.26,8.33,6.42,-4.91,0.73,...,-5.54,0.08,-7.87,0.25,-0.56,-1.75,0.72,-3.06,-4.81,-3.86
3,192610,-0.46,-3.68,-99.99,-3.31,1.06,-1.40,-5.09,5.37,-4.68,...,-5.08,-2.62,-15.38,-2.20,-4.11,-11.82,-4.28,-5.74,-0.94,-8.49
4,192611,6.75,6.26,-99.99,7.29,4.55,0.00,1.82,-6.40,-0.54,...,3.84,1.61,4.67,6.52,4.33,-2.97,3.58,2.21,5.13,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,202309,0.84,-4.28,-6.43,-5.13,-2.43,-7.38,-11.17,-6.19,-6.81,...,-2.57,-5.15,-2.69,-6.16,-4.45,-3.92,2.57,-10.45,-3.49,-2.72
1167,202310,-5.83,-4.98,-0.39,-4.34,-4.00,-14.31,2.77,-2.05,-0.05,...,-3.46,-6.31,-0.90,0.74,-1.71,-2.89,3.49,-9.05,-6.31,-1.77
1168,202311,-2.29,4.26,4.79,3.90,4.76,5.66,11.08,7.90,4.97,...,10.43,12.31,7.94,6.96,7.67,13.16,3.93,9.95,13.51,6.64
1169,202312,6.72,4.33,2.26,0.55,0.91,10.15,3.98,8.79,0.32,...,3.56,7.30,4.63,6.38,5.06,9.68,-0.58,17.45,10.11,1.46


In [105]:
Rf_df = pd.read_csv('data/F-F_Rf.csv')
Rf_df['Rf'] = Rf_df['Rf']
Rf_df

Unnamed: 0,Date,Rf
0,1926-07-01,0.22
1,1926-08-01,0.25
2,1926-09-01,0.23
3,1926-10-01,0.32
4,1926-11-01,0.31
...,...,...
1166,2023-09-01,0.43
1167,2023-10-01,0.47
1168,2023-11-01,0.44
1169,2023-12-01,0.43


### 2.b Excess Return Decomposition

From APT, the $T \times N$ excess return matrix $R$ can be decomposed into: 

### 2.c RP-PCA Factors

### 2.d Test

### 2.e Maximal Sharpe