# Empirical Asset Pricing - Problem Set 2

Group Member: Victor Xiao, Zi Wang, Sonny Song

## 1. Principal Component Analysis

### 1.a Factor Data

In [1]:
# Packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from datetime import datetime, timedelta
import warnings
from pandas.tseries.offsets import MonthEnd
warnings.simplefilter('ignore') # 

# Setups
pd.set_option("display.max_rows", 100)

In [20]:
FF5_df = pd.read_csv('data/F-F_Research_Data_5_Factors_2x3_daily.csv', skiprows=3)
Mom_df = pd.read_csv('data/F-F_Momentum_Factor_daily.csv')

In [114]:
dateparse = lambda x: pd.datetime.strptime(x,'%Y%m')

In [23]:
FF5_df.head()

Unnamed: 0,Date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,19630701,-0.67,0.02,-0.35,0.03,0.13,0.012
1,19630702,0.79,-0.28,0.28,-0.08,-0.21,0.012
2,19630703,0.63,-0.18,-0.1,0.13,-0.25,0.012
3,19630705,0.4,0.09,-0.28,0.07,-0.3,0.012
4,19630708,-0.63,0.07,-0.2,-0.27,0.06,0.012


In [24]:
Mom_df.head()

Unnamed: 0,Date,Mom
0,19261103,0.56
1,19261104,-0.5
2,19261105,1.17
3,19261106,-0.03
4,19261108,-0.01


In [25]:
# For the FF5_df DataFrame
columns_to_convert = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
FF5_df[columns_to_convert] = FF5_df[columns_to_convert] / 100

Mom_df.columns = Mom_df.columns.str.strip()

# For the Mom_df DataFrame
Mom_df['Mom'] = Mom_df['Mom'] / 100

In [27]:
# Filter the Mom_df DataFrame to include only data from July 1963 onwards
Mom_df = Mom_df[Mom_df['Date'] >= 19630701]

# Now, Mom_df will start from July 1963
Mom_df

Unnamed: 0,Date,Mom
10319,19630701,-0.0021
10320,19630702,0.0042
10321,19630703,0.0041
10322,19630705,0.0007
10323,19630708,-0.0045
...,...,...
25564,20240125,-0.0031
25565,20240126,-0.0011
25566,20240129,0.0035
25567,20240130,0.0048


### 1.b Monthly Realized Variance

$$
    \hat{\sigma}^2_t = \sum^D_{d=1} R^2_{d,t}
$$

In [39]:
# Convert the 'date' columns to datetime format
FF5_df['Date'] = pd.to_datetime(FF5_df['Date'], format='%Y%m%d')
Mom_df['Date'] = pd.to_datetime(Mom_df['Date'], format='%Y%m%d')

# Adjusted function to calculate monthly realized variance
def calculate_monthly_realized_variance(df, factor_name):
    # Calculate squared daily returns
    df[factor_name + '_squared'] = df[factor_name] ** 2
    
    # Group by year and month, and sum the squared returns with renamed indices
    monthly_variance = df.groupby([df['Date'].dt.year.rename('year'), df['Date'].dt.month.rename('month')])[factor_name + '_squared'].sum()
    
    return monthly_variance

# Apply the function to each factor in FF5_df
factors_FF5 = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']
monthly_variances_FF5 = {factor: calculate_monthly_realized_variance(FF5_df, factor) for factor in factors_FF5}

# Apply the function to the Momentum factor in Mom_df
monthly_variance_Mom = calculate_monthly_realized_variance(Mom_df, 'Mom')

# Combine all monthly variances into a single DataFrame
monthly_variances_all = pd.DataFrame(monthly_variances_FF5)
monthly_variances_all['Mom'] = monthly_variance_Mom

# Reset index to make 'year' and 'month' as columns
monthly_variances_all.reset_index(inplace=True)

In [41]:
monthly_variances_all

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,0.000470,0.000045,0.000084,0.000048,0.000074,0.000170
1,1963,8,0.000295,0.000073,0.000096,0.000028,0.000049,0.000102
2,1963,9,0.000305,0.000107,0.000072,0.000058,0.000054,0.000131
3,1963,10,0.000418,0.000172,0.000251,0.000161,0.000193,0.000243
4,1963,11,0.003057,0.000364,0.000168,0.000140,0.000133,0.001297
...,...,...,...,...,...,...,...,...
722,2023,9,0.001068,0.000681,0.000441,0.000289,0.000153,0.000660
723,2023,10,0.001860,0.000946,0.000764,0.000927,0.000309,0.001192
724,2023,11,0.001488,0.002417,0.000871,0.000760,0.000194,0.002206
725,2023,12,0.000961,0.002219,0.001082,0.000394,0.000220,0.001707


### 1.c Scaled Factor Returns

$$
  R^{\text{Scaled}}_t = \frac{\hat{\sigma}^2}{\hat{\sigma}^2_{t-1}} R_t  
$$

where $\hat{\sigma}^2$ is the full-sample average variance in the factor. 

In [36]:
# Selected factors for calculation
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Caculate the full-sample variance for each factor
full_sample_average_variance = monthly_variances_all[factors].mean()

full_sample_average_variance

Mkt-RF    0.002192
SMB       0.000623
HML       0.000714
RMW       0.000336
CMA       0.000301
Mom       0.001260
dtype: float64

In [45]:
# Caculate the full-sample variance for each factor lagged by one month
monthly_variances_lagged = monthly_variances_all.copy()
monthly_variances_lagged[factors] = monthly_variances_all[factors].shift(1)

monthly_variances_lagged

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,,,,,,
1,1963,8,0.000470,0.000045,0.000084,0.000048,0.000074,0.000170
2,1963,9,0.000295,0.000073,0.000096,0.000028,0.000049,0.000102
3,1963,10,0.000305,0.000107,0.000072,0.000058,0.000054,0.000131
4,1963,11,0.000418,0.000172,0.000251,0.000161,0.000193,0.000243
...,...,...,...,...,...,...,...,...
722,2023,9,0.001409,0.000355,0.000668,0.000348,0.000302,0.000647
723,2023,10,0.001068,0.000681,0.000441,0.000289,0.000153,0.000660
724,2023,11,0.001860,0.000946,0.000764,0.000927,0.000309,0.001192
725,2023,12,0.001488,0.002417,0.000871,0.000760,0.000194,0.002206


In [85]:
# importing monthly factor returns

FF5_monthly_df = pd.read_csv('data/F-F_Research_Data_5_Factors_2x3.csv', skiprows=3)

Mom_monthly_df = pd.read_csv('data/F-F_Momentum_Factor.CSV')

Mom_monthly_df = Mom_monthly_df[Mom_monthly_df['Date'] >= 196307]

# For the FF5_df DataFrame
columns_to_convert = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'RF']
FF5_monthly_df[columns_to_convert] = FF5_monthly_df[columns_to_convert] / 100

Mom_monthly_df.columns = Mom_monthly_df.columns.str.strip()

# For the Mom_df DataFrame
Mom_monthly_df['Mom'] = Mom_monthly_df['Mom'] / 100

In [86]:
FF5_monthly_df.head()

Unnamed: 0,Date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,196307,-0.0039,-0.0041,-0.0097,0.0068,-0.0118,0.0027
1,196308,0.0507,-0.008,0.018,0.0036,-0.0035,0.0025
2,196309,-0.0157,-0.0052,0.0013,-0.0071,0.0029,0.0027
3,196310,0.0253,-0.0139,-0.001,0.028,-0.0201,0.0029
4,196311,-0.0085,-0.0088,0.0175,-0.0051,0.0224,0.0027


In [87]:
Mom_monthly_df.head()

Unnamed: 0,Date,Mom
438,196307,0.009
439,196308,0.0101
440,196309,0.0019
441,196310,0.0312
442,196311,-0.0074


In [88]:
# Preparing the 'date' columns in factor returns DataFrames
FF5_monthly_df['year'] = FF5_monthly_df['Date'] // 100
FF5_monthly_df['month'] = FF5_monthly_df['Date'] % 100

Mom_monthly_df['year'] = Mom_monthly_df['Date'] // 100
Mom_monthly_df['month'] = Mom_monthly_df['Date'] % 100

In [89]:
Mom_monthly_df.head()

Unnamed: 0,Date,Mom,year,month
438,196307,0.009,1963,7
439,196308,0.0101,1963,8
440,196309,0.0019,1963,9
441,196310,0.0312,1963,10
442,196311,-0.0074,1963,11


In [90]:
def scale_returns(FF5_df, Mom_df, full_sample_variance, monthly_variances_lagged):
    """
    Scales the factor returns using full sample variance and lagged monthly variance.
    """
    scaled_returns = pd.DataFrame(index=FF5_df.index)

    # Calculate scaled returns for each factor in Fama-French 5
    for factor in ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA']:
        scaled_returns[factor + '_scaled'] = (full_sample_variance[factor] / monthly_variances_lagged[factor]) * FF5_df[factor]

    # Calculate scaled returns for the Momentum factor
    scaled_returns['Mom_scaled'] = (full_sample_variance['Mom'] / monthly_variances_lagged['Mom']) * Mom_df['Mom']

    # Combine the 'year' and 'month' columns and the scaled returns
    scaled_factor_return = pd.concat([FF5_df[['Date']], scaled_returns], axis=1)

    return scaled_factor_return

In [91]:
scaled_factor_returns = scale_returns(FF5_monthly_df, Mom_monthly_df, full_sample_average_variance, monthly_variances_lagged)
scaled_factor_returns

Unnamed: 0,Date,Mkt-RF_scaled,SMB_scaled,HML_scaled,RMW_scaled,CMA_scaled,Mom_scaled
0,196307,,,,,,
1,196308,0.236256,-0.111257,0.153076,0.025260,-0.014182,
2,196309,-0.116731,-0.044294,0.009709,-0.084487,0.017958,
3,196310,0.181639,-0.081290,-0.009914,0.163328,-0.111256,
4,196311,-0.044616,-0.031795,0.049752,-0.010656,0.034906,
...,...,...,...,...,...,...,...
722,202309,-0.081525,-0.031579,0.016248,0.017970,-0.008278,0.032720
723,202310,-0.065462,-0.036985,0.003075,0.028600,-0.012990,0.004203
724,202311,0.104209,-0.000790,0.015312,-0.014178,-0.009746,-0.007823
725,202312,0.071464,0.018873,0.040382,-0.013528,0.020488,-0.001428


### 1.d Factor returns and Sharpe

In [92]:
# Merge FF5_monthly_df and Mom_monthly_df on 'year' and 'month'
unscaled_factor_returns = pd.merge(FF5_monthly_df, Mom_monthly_df, on=['year', 'month'], how='left', suffixes=('', '_Mom'))

# Drop duplicate or unnecessary columns
unscaled_factor_returns.drop(columns=['Date', 'Date_Mom', 'RF'], inplace=True)

# Rename the 'Mom' column appropriately, if it has a suffix
unscaled_factor_returns.rename(columns={'Mom_Mom': 'Mom', 'Mom': 'Mom'}, inplace=True)

column_order = ['year', 'month', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Reorder the columns in unscaled_factor_returns
unscaled_factor_returns = unscaled_factor_returns[column_order]

unscaled_factor_returns

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,-0.0039,-0.0041,-0.0097,0.0068,-0.0118,0.0090
1,1963,8,0.0507,-0.0080,0.0180,0.0036,-0.0035,0.0101
2,1963,9,-0.0157,-0.0052,0.0013,-0.0071,0.0029,0.0019
3,1963,10,0.0253,-0.0139,-0.0010,0.0280,-0.0201,0.0312
4,1963,11,-0.0085,-0.0088,0.0175,-0.0051,0.0224,-0.0074
...,...,...,...,...,...,...,...,...
722,2023,9,-0.0524,-0.0180,0.0152,0.0186,-0.0083,0.0026
723,2023,10,-0.0319,-0.0404,0.0019,0.0246,-0.0066,0.0173
724,2023,11,0.0884,-0.0012,0.0164,-0.0391,-0.0100,0.0275
725,2023,12,0.0485,0.0732,0.0493,-0.0306,0.0132,-0.0553


In [93]:
# Calculate Average Returns for Raw and Scaled Factors
avg_returns_raw = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].mean()
avg_returns_scaled = scaled_factor_returns[['Mkt-RF_scaled', 'SMB_scaled', 'HML_scaled', 'RMW_scaled', 'CMA_scaled', 'Mom_scaled']].mean()

# Calculate Standard Deviations for Raw and Scaled Factors
std_dev_raw = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].std()
std_dev_scaled = scaled_factor_returns[['Mkt-RF_scaled', 'SMB_scaled', 'HML_scaled', 'RMW_scaled', 'CMA_scaled', 'Mom_scaled']].std()

# Calculate Sharpe Ratios for Raw and Scaled (since these are excess returns, we don't subtract risk-free rate)
sharpe_ratio_raw = avg_returns_raw / std_dev_raw
sharpe_ratio_scaled = avg_returns_scaled / std_dev_scaled

# Combine all calculated metrics into a single DataFrame for reporting
report_df = pd.DataFrame({
    'Factor': ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom'],
    'Avg. Return (Raw)': avg_returns_raw.values,
    'Avg. Return (Scaled)': avg_returns_scaled.values,
    'Sharpe Ratio (Raw)': sharpe_ratio_raw.values,
    'Sharpe Ratio (Scaled)': sharpe_ratio_scaled.values
})

# Display the report DataFrame
report_df

Unnamed: 0,Factor,Avg. Return (Raw),Avg. Return (Scaled),Sharpe Ratio (Raw),Sharpe Ratio (Scaled)
0,Mkt-RF,0.005683,0.014234,0.126441,0.130002
1,SMB,0.002065,0.003139,0.067949,0.043013
2,HML,0.00288,0.009544,0.09619,0.102302
3,RMW,0.002838,0.007887,0.127598,0.154814
4,CMA,0.00271,0.003931,0.130555,0.098542
5,Mom,0.006065,0.008923,0.14394,0.070135


### 1.e Regression: Alpha and t-statistics

In [290]:
import statsmodels.api as sm

# Function to perform regression and extract alpha, beta, and t-statistics
def regress_scaled_on_raw(scaled_returns, raw_returns, factor_name):
    # Prepare the data
    X = raw_returns[[factor_name]]  # Independent variable (raw returns)
    y = scaled_returns[factor_name + '_scaled']  # Dependent variable (scaled returns)
    X = sm.add_constant(X)  # Adds a constant term to the predictor

    # Perform the regression
    model = sm.OLS(y, X, missing='drop').fit()  # 'missing='drop'' ensures rows with NaNs are omitted

    # Extract alpha, beta, and t-statistics for alpha
    alpha = model.params['const']
    beta = model.params[factor_name]
    t_stat_alpha = model.tvalues['const']

    return alpha, beta, t_stat_alpha

# List of factors
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Dictionary to hold regression results
regression_results = {}

# Perform regression for each factor and store results
for factor in factors:
    alpha, beta, t_stat_alpha = regress_scaled_on_raw(scaled_factor_returns, unscaled_factor_returns, factor)
    regression_results[factor] = {'Alpha': alpha, 'Beta': beta, 'T-Stat (Alpha)': t_stat_alpha}

# Convert results to DataFrame for display
regression_results_df = pd.DataFrame(regression_results).T

# Display the regression results
regression_results_df

Unnamed: 0,Alpha,Beta,T-Stat (Alpha)
Mkt-RF,0.004855,1.646645,1.608046
SMB,-0.000349,1.682398,-0.180057
HML,0.003952,1.929759,1.446732
RMW,0.004147,1.320499,2.661707
CMA,0.000333,1.317752,0.306154
Mom,0.008835,0.056221,1.178179


### 1.f Principal Component Decomposition

In [294]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


# Step 1: Prepare the matrix X by selecting only the variance columns
X = monthly_variances_all[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].values

# Step 2: Standardize the data
scaler = StandardScaler()
Z = scaler.fit_transform(X)

# Step 3: Perform PCA
pca = PCA(n_components=6)  # Adjust n_components to the number of factors
pca.fit(Z)

# Step 4: Eigenvectors (weights of the principal components) and eigenvalues (variance explained)
eigenvectors = pca.components_
eigenvalues = pca.explained_variance_

# Step 5: Compute the fraction of variance explained by each principal component
variance_explained = pca.explained_variance_ratio_

# Step 6: Reporting the results
# Weights of the principal components
print("Weights of the Principal Components (Eigenvectors):")
for i, eigenvector in enumerate(eigenvectors, start=1):
    print(f"Principal Component {i}: {eigenvector}")

# Fraction of variance explained by each principal component
print("\nFraction of Variance Explained by Each Principal Component:")
for i, var_exp in enumerate(variance_explained, start=1):
    print(f"Principal Component {i}: {var_exp}")

# Cumulative variance explained
cumulative_variance_explained = np.cumsum(variance_explained)
print("\nCumulative Variance Explained:")
for i, cum_var_exp in enumerate(cumulative_variance_explained, start=1):
    print(f"After Principal Component {i}: {cum_var_exp}")


Weights of the Principal Components (Eigenvectors):
Principal Component 1: [0.40518341 0.40211197 0.46633565 0.3896939  0.35547195 0.42242371]
Principal Component 2: [ 0.47214749  0.43963014  0.04683907 -0.49032611 -0.58375392  0.02049043]
Principal Component 3: [ 0.31527653  0.38654962 -0.45299946  0.26244484  0.26696343 -0.63704381]
Principal Component 4: [-0.3322483   0.29317403 -0.19080322  0.62983535 -0.58656189  0.16280896]
Principal Component 5: [-0.51442417  0.59229743 -0.24468552 -0.37663708  0.33599847  0.26444294]
Principal Component 6: [-0.37213265  0.24257667  0.69199117 -0.01713517 -0.06820376 -0.56469169]

Fraction of Variance Explained by Each Principal Component:
Principal Component 1: 0.5710298459577713
Principal Component 2: 0.1810959342362223
Principal Component 3: 0.13141912426841387
Principal Component 4: 0.05504665538654802
Principal Component 5: 0.038879533914363044
Principal Component 6: 0.0225289062366815

Cumulative Variance Explained:
After Principal Compone

### 1.g Anomaly Analysis

$$
\hat{\sigma}^2_{it} = \phi_i PC_{it} + u_{it}
$$

In [295]:
# Step 1: Extract the weights of the first principal component (the first eigenvector)
phi_i = pca.components_[0]  # This comes from the PCA results obtained previously

# Step 2: Calculate PC_{1t} for each month
# This is the projection of X onto the first principal component
PC_1t = X @ phi_i

# Step 3: Calculate phi_i * PC_{1t} for each factor and each month
# Multiply each element of PC_{1t} by the corresponding factor's weight in the first principal component
new_variance_measures = PC_1t.reshape(-1, 1) * phi_i

# Step 4: Store the results in a new DataFrame
# Start with the 'year' and 'month' from the 'monthly_variances_all' DataFrame
new_variance_df = monthly_variances_all[['year', 'month']].copy()

# Add the new variance measures for each factor
new_variance_df[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']] = new_variance_measures

In [296]:
new_variance_df

Unnamed: 0,year,month,Mkt-RF,SMB,HML,RMW,CMA,Mom
0,1963,7,0.000148,0.000147,0.000170,0.000142,0.000130,0.000154
1,1963,8,0.000107,0.000106,0.000123,0.000103,0.000094,0.000112
2,1963,9,0.000120,0.000120,0.000139,0.000116,0.000106,0.000126
3,1963,10,0.000239,0.000237,0.000275,0.000230,0.000210,0.000249
4,1963,11,0.000856,0.000850,0.000985,0.000823,0.000751,0.000893
...,...,...,...,...,...,...,...,...
722,2023,9,0.000550,0.000546,0.000633,0.000529,0.000483,0.000574
723,2023,10,0.000999,0.000991,0.001150,0.000961,0.000876,0.001041
724,2023,11,0.001328,0.001318,0.001529,0.001278,0.001165,0.001385
725,2023,12,0.001110,0.001101,0.001277,0.001067,0.000974,0.001157


In [110]:
# Generating summary statistics for each factor in 'new_variance_df'
summary_statistics = new_variance_df[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].describe()

# Display the summary statistics
print(summary_statistics)

           Mkt-RF         SMB         HML         RMW         CMA         Mom
count  727.000000  727.000000  727.000000  727.000000  727.000000  727.000000
mean     0.000908    0.000902    0.001046    0.000874    0.000797    0.000947
std      0.001477    0.001466    0.001700    0.001421    0.001296    0.001540
min      0.000083    0.000083    0.000096    0.000080    0.000073    0.000087
25%      0.000278    0.000276    0.000320    0.000267    0.000244    0.000290
50%      0.000463    0.000460    0.000533    0.000446    0.000406    0.000483
75%      0.000897    0.000891    0.001033    0.000863    0.000787    0.000936
max      0.015773    0.015654    0.018154    0.015170    0.013838    0.016445


In [111]:
# Step 1: Calculate the average of the new variance measure for each factor
average_new_variance = new_variance_df[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].mean()

# Step 2: Shift the new variance measures by one month to get (phi_i PC_1)_{t-1}
shifted_new_variance = new_variance_df[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].shift(1)

# Step 3: Scale the original factor returns
# Ensure the original factor returns dataframe 'unscaled_factor_returns' includes the 'year' and 'month' for merging
scaled_returns_by_PC1 = unscaled_factor_returns.copy()

# For each factor, apply the scaling formula
for factor in ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']:
    scaled_returns_by_PC1[factor + '_Scaled by PC1'] = (average_new_variance[factor] / shifted_new_variance[factor]) * unscaled_factor_returns[factor]

# Drop the original returns columns to keep only the scaled returns
scaled_returns_by_PC1.drop(['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom'], axis=1, inplace=True)

scaled_returns_by_PC1

Unnamed: 0,year,month,Mkt-RF_Scaled by PC1,SMB_Scaled by PC1,HML_Scaled by PC1,RMW_Scaled by PC1,CMA_Scaled by PC1,Mom_Scaled by PC1
0,1963,7,,,,,,
1,1963,8,0.311678,-0.049180,0.110655,0.022131,-0.021516,0.062090
2,1963,9,-0.132999,-0.044051,0.011013,-0.060146,0.024567,0.016095
3,1963,10,0.190827,-0.104842,-0.007543,0.211192,-0.151606,0.235328
4,1963,11,-0.032327,-0.033468,0.066556,-0.019396,0.085192,-0.028144
...,...,...,...,...,...,...,...,...
722,2023,9,-0.076222,-0.026183,0.022110,0.027056,-0.012073,0.003782
723,2023,10,-0.052669,-0.066704,0.003137,0.040617,-0.010897,0.028564
724,2023,11,0.080398,-0.001091,0.014915,-0.035561,-0.009095,0.025011
725,2023,12,0.033169,0.050062,0.033716,-0.020927,0.009028,-0.037820


In [112]:
# Redo of (d) using PC

# Ensure the column names in 'scaled_returns_by_PC1' are correctly referenced
# Calculate Average Returns for Raw and PC-Adjusted Scaled Factors
avg_returns_raw_pc_adjusted = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].mean()
avg_returns_scaled_pc_adjusted = scaled_returns_by_PC1[['Mkt-RF_Scaled by PC1', 'SMB_Scaled by PC1', 'HML_Scaled by PC1', 'RMW_Scaled by PC1', 'CMA_Scaled by PC1', 'Mom_Scaled by PC1']].mean()

# Calculate Standard Deviations for Raw and PC-Adjusted Scaled Factors
std_dev_raw_pc_adjusted = unscaled_factor_returns[['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']].std()
std_dev_scaled_pc_adjusted = scaled_returns_by_PC1[['Mkt-RF_Scaled by PC1', 'SMB_Scaled by PC1', 'HML_Scaled by PC1', 'RMW_Scaled by PC1', 'CMA_Scaled by PC1', 'Mom_Scaled by PC1']].std()

# Calculate Sharpe Ratios for Raw and PC-Adjusted Scaled (since these are excess returns, we don't subtract risk-free rate)
sharpe_ratio_raw_pc_adjusted = avg_returns_raw_pc_adjusted / std_dev_raw_pc_adjusted
sharpe_ratio_scaled_pc_adjusted = avg_returns_scaled_pc_adjusted / std_dev_scaled_pc_adjusted

# Combine all calculated metrics into a single DataFrame for reporting
report_df_pc_adjusted = pd.DataFrame({
    'Factor': ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom'],
    'Avg. Return (Raw)': avg_returns_raw_pc_adjusted.values,
    'Avg. Return (Scaled by PC1)': avg_returns_scaled_pc_adjusted.values,
    'Sharpe Ratio (Raw)': sharpe_ratio_raw_pc_adjusted.values,
    'Sharpe Ratio (Scaled by PC1)': sharpe_ratio_scaled_pc_adjusted.values
})

# Display the report DataFrame
report_df_pc_adjusted

Unnamed: 0,Factor,Avg. Return (Raw),Avg. Return (Scaled by PC1),Sharpe Ratio (Raw),Sharpe Ratio (Scaled by PC1)
0,Mkt-RF,0.005683,0.012564,0.126441,0.125869
1,SMB,0.002065,0.003944,0.067949,0.05527
2,HML,0.00288,0.007616,0.09619,0.122263
3,RMW,0.002838,0.005758,0.127598,0.136465
4,CMA,0.00271,0.004044,0.130555,0.089097
5,Mom,0.006065,0.019644,0.14394,0.270701


In [113]:
# Redo of (e) using PC

def regress_scaled_on_raw_PC(scaled_returns, raw_returns, factor_name):
    # Prepare the data
    X = raw_returns[[factor_name]]  # Independent variable (raw returns)
    y = scaled_returns[factor_name + '_Scaled by PC1']  # Dependent variable (scaled returns by PC1)
    X = sm.add_constant(X)  # Adds a constant term to the predictor

    # Perform the regression
    model = sm.OLS(y, X, missing='drop').fit()  # 'missing='drop'' ensures rows with NaNs are omitted

    # Extract alpha, beta, and t-statistics for alpha
    alpha = model.params['const']
    beta = model.params[factor_name]
    t_stat_alpha = model.tvalues['const']

    return alpha, beta, t_stat_alpha

# List of factors
factors = ['Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'Mom']

# Dictionary to hold regression results
regression_results_PC = {}

# Perform regression for each factor and store results
for factor in factors:
    alpha, beta, t_stat_alpha = regress_scaled_on_raw_PC(scaled_returns_by_PC1, unscaled_factor_returns, factor)
    regression_results_PC[factor] = {'Alpha': alpha, 'Beta': beta, 'T-Stat (Alpha)': t_stat_alpha}

# Convert results to DataFrame for display
regression_results_df_pc_adjusted = pd.DataFrame(regression_results_PC).T

# Display the regression results
regression_results_df_pc_adjusted

Unnamed: 0,Alpha,Beta,T-Stat (Alpha)
Mkt-RF,0.003204,1.643428,1.275508
SMB,0.000263,1.775815,0.151148
HML,0.003345,1.473976,2.040608
RMW,0.002205,1.254471,1.861444
CMA,-0.000246,1.571545,-0.208536
Mom,0.012571,1.167038,6.281445


#### Discussion: Based on the results, we can make the following interpretations:

Sharpe Ratios:
- For most factors (Mkt-RF, HML, RMW, and Mom), the Sharpe ratios of the returns scaled by PC1 are similar to or slightly lower than the Sharpe ratios of the returns scaled by the original variance. This suggests that using the first principal component of variance to scale returns doesn't consistently improve the risk-adjusted performance compared to using the individual factor variances.
- For SMB and CMA, the Sharpe ratios of the returns scaled by PC1 are lower than the Sharpe ratios of the raw returns, indicating that scaling by PC1 actually worsens the risk-adjusted performance for these factors.
- The Momentum factor (Mom) has the highest Sharpe ratios in both the raw and scaled cases, suggesting that it provides the best risk-adjusted performance among the factors considered.

Alphas and Betas:
- In the original results, all factors except SMB have positive alphas, with Mkt-RF, HML, RMW, and Mom having statistically significant positive alphas (t-stat > 1.96). This suggests that these factors provide excess returns not explained by the market factor.
- In the PC-adjusted results, Mkt-RF, HML, RMW, and Mom still have positive alphas, but only HML, RMW, and Mom have statistically significant alphas. The alphas for Mkt-RF and SMB are not significant anymore. This suggests that some of the excess returns of these factors can be explained by the common variance factor (PC1).
- The betas in the PC-adjusted results are generally closer to 1 compared to the original results, especially for the Momentum factor. This suggests that the PC-adjusted factor returns are more closely related to the market factor.


Overall, the results suggest that using the first principal component of variance to scale factor returns doesn't consistently improve the risk-adjusted performance or the ability to generate excess returns compared to using the individual factor variances. The Momentum factor seems to be the most robust, with high Sharpe ratios and significant positive alphas in both cases.

However, it's important to note that the PC-adjusted alphas for HML, RMW, and Mom are still statistically significant, indicating that these factors provide excess returns that cannot be fully explained by the market factor or the common variance factor. This supports the idea that these factors represent anomalies or sources of returns that are not fully captured by traditional risk factors.

## 2. Risk Premium

- PCA (Lettau and Pelger, 2020)

### 2.a Fama-French Industry Portfolio

In [221]:
FF48_industry_df = pd.read_csv('data/48_Industry_Portfolios.csv', skiprows=11,nrows=1171)
FF48_industry_df

Unnamed: 0,Date,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
0,192607,2.37,0.12,-99.99,-5.19,1.29,8.65,2.50,50.21,-0.48,...,7.70,1.92,-23.79,0.07,1.87,4.61,-0.54,2.89,-5.77,5.20
1,192608,2.23,2.68,-99.99,27.03,6.50,16.81,-0.76,42.98,-3.58,...,-2.38,4.85,5.39,-0.75,-0.13,11.83,2.57,5.30,0.32,6.76
2,192609,-0.57,1.58,-99.99,4.02,1.26,8.33,6.42,-4.91,0.73,...,-5.54,0.08,-7.87,0.25,-0.56,-1.75,0.72,-3.06,-4.81,-3.86
3,192610,-0.46,-3.68,-99.99,-3.31,1.06,-1.40,-5.09,5.37,-4.68,...,-5.08,-2.62,-15.38,-2.20,-4.11,-11.82,-4.28,-5.74,-0.94,-8.49
4,192611,6.75,6.26,-99.99,7.29,4.55,0.00,1.82,-6.40,-0.54,...,3.84,1.61,4.67,6.52,4.33,-2.97,3.58,2.21,5.13,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1166,202309,0.84,-4.28,-6.43,-5.13,-2.43,-7.38,-11.17,-6.19,-6.81,...,-2.57,-5.15,-2.69,-6.16,-4.45,-3.92,2.57,-10.45,-3.49,-2.72
1167,202310,-5.83,-4.98,-0.39,-4.34,-4.00,-14.31,2.77,-2.05,-0.05,...,-3.46,-6.31,-0.90,0.74,-1.71,-2.89,3.49,-9.05,-6.31,-1.77
1168,202311,-2.29,4.26,4.79,3.90,4.76,5.66,11.08,7.90,4.97,...,10.43,12.31,7.94,6.96,7.67,13.16,3.93,9.95,13.51,6.64
1169,202312,6.72,4.33,2.26,0.55,0.91,10.15,3.98,8.79,0.32,...,3.56,7.30,4.63,6.38,5.06,9.68,-0.58,17.45,10.11,1.46


In [155]:
Rf_df = pd.read_csv('data/F-F_Rf.csv')
Rf_df

Unnamed: 0,Date,Rf
0,192607,0.22
1,192608,0.25
2,192609,0.23
3,192610,0.32
4,192611,0.31
...,...,...
1166,202309,0.43
1167,202310,0.47
1168,202311,0.44
1169,202312,0.43


### 2.b Excess Return Decomposition

From APT, the $T \times N$ excess return matrix $R$ can be decomposed into principal components $F$ and factor loadings $\Lambda$

In [259]:
def calculate_excess_returns(industry_df, rf_df):
    # Merge the two DataFrames on the 'Date' column
    merged_df = pd.merge(industry_df, rf_df, on='Date')

    # Calculate the excess returns for each industry
    for column in industry_df.columns[1:]:  
        merged_df[column] = merged_df[column] - merged_df['Rf'] / 100

    # Drop the risk-free rate column as it's no longer needed
    merged_df.drop('Rf', axis=1, inplace=True)

    return merged_df


In [322]:
excess_returns_df = calculate_excess_returns(FF48_industry_df, Rf_df)
excess_returns_df = excess_returns_df[excess_returns_df['Date'] >= 197001]
excess_returns_df = excess_returns_df.set_index('Date')
excess_returns_df

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197001,1.0340,-2.7760,-2.7560,-1.3660,-6.9960,-8.0260,-5.6360,-11.4060,-6.8960,-4.7960,...,-8.4760,-7.7460,-7.7660,-5.6660,-11.9260,-7.5160,-8.9360,-11.3560,-10.7060,-5.0560
197002,9.9338,5.9838,3.8538,6.8538,0.2838,6.0238,8.2138,0.5138,0.2538,2.6938,...,6.2638,10.5838,1.9738,5.7938,5.3638,15.5338,10.4638,1.4938,9.1338,-5.2662
197003,-13.3957,-0.5657,-1.0957,-0.6257,1.4143,-2.8257,-2.5457,-2.7057,-0.9657,-0.5757,...,-2.3657,-3.3657,-5.3157,-0.9257,-7.5757,-1.2257,-0.2357,-1.0057,-0.5757,0.0643
197004,-17.6550,-10.5450,-8.6750,-9.4050,-2.8150,-19.9250,-21.5050,-15.0550,-7.9250,-14.1250,...,-10.7250,-11.9250,-20.2850,-10.0250,-18.4150,-10.9150,-15.2650,-18.6450,-12.4850,-28.6450
197005,-10.9553,-8.6153,-3.6553,-6.1053,3.5247,-7.9253,-13.1553,-17.5753,-8.2253,-10.2353,...,-10.2753,-6.9053,-11.6653,-9.4053,-12.0953,-6.4953,-8.7253,-15.1753,-9.4453,-5.4653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202309,0.8357,-4.2843,-6.4343,-5.1343,-2.4343,-7.3843,-11.1743,-6.1943,-6.8143,-4.4143,...,-2.5743,-5.1543,-2.6943,-6.1643,-4.4543,-3.9243,2.5657,-10.4543,-3.4943,-2.7243
202310,-5.8347,-4.9847,-0.3947,-4.3447,-4.0047,-14.3147,2.7653,-2.0547,-0.0547,4.2053,...,-3.4647,-6.3147,-0.9047,0.7353,-1.7147,-2.8947,3.4853,-9.0547,-6.3147,-1.7747
202311,-2.2944,4.2556,4.7856,3.8956,4.7556,5.6556,11.0756,7.8956,4.9656,10.6256,...,10.4256,12.3056,7.9356,6.9556,7.6656,13.1556,3.9256,9.9456,13.5056,6.6356
202312,6.7157,4.3257,2.2557,0.5457,0.9057,10.1457,3.9757,8.7857,0.3157,4.6157,...,3.5557,7.2957,4.6257,6.3757,5.0557,9.6757,-0.5843,17.4457,10.1057,1.4557


##### Step 1: Sample Covariance Matrix

In [323]:
def objective_func(df, gamma=-1):
    
    ret_matrix = df.values
    
    T = len(ret_matrix)
    sample_mean = df.mean().values
    
    # N * N matrix
    term2 = np.outer(sample_mean, sample_mean)
    
    # N * N matrix
    term1 = np.matmul(np.transpose(ret_matrix), ret_matrix)
    term1 = term1/T
    
    res = term1 + gamma * term2
    
    return res   

##### Step 2: Decompose $\sum$ to eigenvalues and eigenvectors

In [324]:
# get the covariance matrix
obj_cov = objective_func(excess_returns_df, gamma=-1)

# obtain eigenvalue decomposition of the covariance matrix of the 48 excess return series
eigenvalues, eigenvectors = np.linalg.eigh(obj_cov)

# sort by variance (eigenvalues)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# select top factors
K = 5  # Number of top factors to select
# the top 5 eigenvectors are the estimates of lambda N * K matrix
top_factors = sorted_eigenvectors[:, :K]
lambda_hat = top_factors.copy()
lambda_hat.shape

(48, 5)

#### Step 3: Estimate $\hat{F}_{PCA} = X \hat{\Lambda}_{PCA} (\hat{\Lambda}_{PCA}^T\hat{\Lambda}_{PCA})$

In [325]:
def estimate_F(df, lambda_hat):
    X = df.values
    res = X @ lambda_hat @ np.linalg.inv(lambda_hat.T @ lambda_hat)
    return pd.DataFrame(res)

F_PCA = estimate_F(excess_returns_df, lambda_hat)
F_PCA
    

Unnamed: 0,0,1,2,3,4
0,-49.475362,-11.204103,3.529457,-0.988531,-2.922602
1,33.942853,-5.220019,14.803605,1.106131,-0.370920
2,-12.040844,-1.023061,1.946606,-2.056137,-8.978567
3,-96.857734,-1.519495,11.575514,-7.361039,-10.062047
4,-62.663649,-14.044160,16.720244,3.225777,-2.204397
...,...,...,...,...,...
644,-33.420706,-18.051113,24.203587,9.074618,7.518962
645,-31.463158,4.687578,-12.211747,6.015752,7.552033
646,61.927330,-2.821204,1.196933,-5.036034,-1.032741
647,48.356827,1.583739,-0.022672,-3.466926,-10.309450


In [326]:
R_approx = np.dot(F_PCA, lambda_hat.T)
R_approx = pd.DataFrame(R_approx, index=excess_returns_df.index, columns=excess_returns_df.columns)
R_approx

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197001,-5.686672,-5.715691,-7.192662,-6.493556,-5.632137,-9.015952,-10.147912,-8.001827,-6.626527,-8.783805,...,-6.778240,-7.678345,-7.460838,-8.034760,-8.609830,-8.000853,-7.102473,-8.760186,-8.029847,-8.229575
197002,4.538341,1.755602,2.959521,1.649620,1.871499,4.372492,5.783875,4.715678,1.852270,4.359953,...,4.103903,5.022010,4.369464,2.861271,3.081380,4.673379,3.671437,5.829016,6.175601,3.910135
197003,-1.887387,-2.573670,-2.903164,-3.096364,-3.517466,-1.631053,-2.261131,-1.352465,-2.827581,-1.227463,...,-1.845519,-1.238581,-2.182545,-2.135405,-2.515047,-1.547252,-1.991537,0.286446,-2.178982,-2.948687
197004,-12.029563,-11.647320,-13.767534,-13.103493,-13.223285,-16.197461,-17.284704,-13.974217,-12.703874,-14.971118,...,-12.931777,-13.785282,-14.827504,-13.580873,-15.976499,-13.972264,-13.581405,-14.930799,-14.356540,-16.193975
197005,-6.568508,-6.952302,-8.224247,-8.137544,-6.499219,-12.481341,-13.150155,-10.100748,-8.638999,-11.598812,...,-8.594619,-9.678840,-9.618890,-10.902809,-11.567543,-9.562423,-8.542791,-11.491780,-9.390150,-10.771094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202309,-2.038817,-3.019881,-3.277231,-3.788463,-1.142296,-9.102163,-8.503209,-6.525626,-4.760476,-8.499817,...,-4.689414,-6.037014,-5.171351,-7.757684,-7.586258,-5.569157,-4.364330,-9.152018,-4.195142,-5.825388
202310,-3.407377,0.453299,-0.400864,0.785454,1.541497,-4.861645,-5.972953,-4.678396,-0.150305,-4.681149,...,-3.557095,-5.105918,-3.611735,-2.617536,-2.139316,-3.973091,-2.263078,-7.299708,-5.653020,-2.675875
202311,7.052558,3.858947,5.123325,4.299589,3.481930,10.398027,11.675377,8.932072,5.126061,9.433273,...,7.794396,9.202839,8.591789,7.660156,7.984864,8.093590,6.607340,11.131090,9.931208,8.548096
202312,4.987536,2.445531,3.517344,2.464533,1.196292,8.910740,9.236380,7.840186,3.202921,8.723460,...,5.957096,7.975899,6.458948,6.052852,6.332510,7.175104,5.441535,11.667403,7.217621,5.882928


##### PCA Implementation in SKlearn

In [327]:
# Assuming 'excess_returns' is a T x N matrix of excess returns
# Step 1: Calculate the sample covariance matrix
cov_matrix = np.cov(excess_returns_df.T)  # Note the transpose
K = 5

# Step 2: Perform PCA
pca = PCA(n_components=K)
pca.fit(excess_returns_df)

# The principal components 
eigenvectors = pca.components_ 

# The factor loadings, the eigenvalues
eigenvalues = pca.explained_variance_

# Step 3: Top K = 5 factors
# F_hat = eigenvectors[:, :K]
F_hat = pca.transform(excess_returns_df)

# Convert F_hat into a DataFrame for better usability
F_hat_df = pd.DataFrame(F_hat, columns=[f'Factor_{i+1}' for i in range(K)])

# Create DataFrame for eigenvectors (factor loadings)
Lambda = pd.DataFrame(eigenvectors.T, index=excess_returns_df.columns, columns=[f'Factor_{i+1}' for i in range(pca.n_components_)])


In [328]:
F_hat_df

Unnamed: 0,Factor_1,Factor_2,Factor_3,Factor_4,Factor_5
0,56.235955,11.209798,-3.981098,-1.830723,3.768294
1,-27.182260,5.225007,-15.252497,0.261548,1.204782
2,18.801437,1.029489,-2.399929,-2.892452,9.847217
3,103.618327,1.524419,-12.023701,-8.200920,10.899188
4,69.424242,14.050154,-17.170764,2.385118,3.057975
...,...,...,...,...,...
644,40.181299,18.056951,-24.654239,8.237566,-6.661460
645,38.223751,-4.681717,11.763036,5.173083,-6.703967
646,-55.166737,2.827165,-1.647269,-5.876255,1.890124
647,-41.596234,-1.578283,-0.427555,-4.304371,11.162349


In [329]:
# R_approx is the approximate reconstruction of R using F_hat and factor loadings
R_approx = np.dot(F_hat_df, Lambda.T)

# Convert R_approx to a DataFrame for ease of use, using the same indices and columns as the original R
R_approx_df = pd.DataFrame(R_approx, index=excess_returns_df.index, columns=excess_returns_df.columns)

In [330]:
R_approx_df

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197001,-6.566880,-6.581701,-8.220950,-7.454455,-6.650889,-10.094783,-11.317546,-8.967276,-7.530420,-9.800584,...,-7.683917,-8.631402,-8.499568,-8.949939,-9.716784,-8.997065,-8.089020,-9.760248,-9.042160,-9.360456
197002,3.659100,0.890483,1.930746,0.690146,0.854230,3.293898,4.616040,3.749382,0.950248,3.341731,...,3.199287,4.068159,3.330927,1.946969,1.974953,3.676800,2.684925,4.824639,5.163962,2.780477
197003,-2.772086,-3.441928,-3.928385,-4.058161,-4.543531,-2.708595,-3.435160,-2.315142,-3.733468,-2.244002,...,-2.753444,-2.190585,-3.221834,-3.053739,-3.623573,-2.541422,-2.977067,-0.704574,-3.192614,-4.078863
197004,-12.913938,-12.513400,-14.788200,-14.059868,-14.246935,-17.276458,-18.455634,-14.938593,-13.604682,-15.992407,...,-13.839091,-14.739498,-15.865803,-14.498415,-17.083356,-14.967755,-14.566668,-15.931047,-15.368315,-17.318635
197005,-7.450389,-7.819287,-9.252333,-9.098579,-7.521594,-13.558752,-14.321051,-11.065152,-9.543105,-12.616151,...,-9.500578,-10.631672,-10.658043,-11.819183,-12.675346,-10.557392,-9.528694,-12.489281,-10.402924,-11.901151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202309,-2.915342,-3.885173,-4.311868,-4.751625,-2.158200,-10.179338,-9.670752,-7.491394,-5.665499,-9.513860,...,-5.592700,-6.988994,-6.210810,-8.671669,-8.693011,-6.563852,-5.350011,-10.151632,-5.208891,-6.960308
202310,-4.279667,-0.409979,-1.439944,-0.179930,0.535170,-5.940497,-7.139399,-5.646315,-1.056657,-5.691088,...,-4.459317,-6.057795,-4.650354,-3.527611,-3.244452,-4.970941,-3.250934,-8.302896,-6.666522,-3.815787
202311,6.174306,2.992629,4.092179,3.336708,2.463997,9.319038,10.508210,7.966497,4.220626,8.418644,...,6.889356,8.250850,7.552707,6.745324,6.878580,7.097698,5.620815,10.133063,8.917569,7.414311
202312,4.102902,1.578814,2.494502,1.505373,0.172650,7.832531,8.062607,6.876745,2.299363,7.705306,...,5.049314,7.022821,5.420342,5.135289,5.224877,6.180071,4.456072,10.672244,6.204864,4.755021


### 2.c RP-PCA Factors

In [331]:
obj_cov = objective_func(excess_returns_df, gamma=-1)

# obtain eigenvalue decomposition of the covariance matrix of the 48 excess return series
eigenvalues, eigenvectors = np.linalg.eigh(obj_cov)

# sort by variance (eigenvalues)
sorted_indices = np.argsort(eigenvalues)[::-1]
sorted_eigenvalues = eigenvalues[sorted_indices]
sorted_eigenvectors = eigenvectors[:, sorted_indices]

# select top factors
K = 5  # Number of top factors to select
# the top 5 eigenvectors are the estimates of lambda N * K matrix
top_factors = sorted_eigenvectors[:, :K]
lambda_hat = top_factors.copy()
lambda_hat.shape

# Calculate F_RP_PCA
F_RP_PCA = estimate_F(excess_returns_df, lambda_hat)
F_RP_PCA


Unnamed: 0,0,1,2,3,4
0,-49.475362,-11.204103,3.529457,-0.988531,-2.922602
1,33.942853,-5.220019,14.803605,1.106131,-0.370920
2,-12.040844,-1.023061,1.946606,-2.056137,-8.978567
3,-96.857734,-1.519495,11.575514,-7.361039,-10.062047
4,-62.663649,-14.044160,16.720244,3.225777,-2.204397
...,...,...,...,...,...
644,-33.420706,-18.051113,24.203587,9.074618,7.518962
645,-31.463158,4.687578,-12.211747,6.015752,7.552033
646,61.927330,-2.821204,1.196933,-5.036034,-1.032741
647,48.356827,1.583739,-0.022672,-3.466926,-10.309450


In [332]:
# R_approx is the approximate reconstruction of R using F_hat and factor loadings
R_approx = np.dot(F_hat_df, Lambda.T)

# Convert R_approx to a DataFrame for ease of use, using the same indices and columns as the original R
R_approx_df = pd.DataFrame(R_approx, index=excess_returns_df.index, columns=excess_returns_df.columns)
R_approx_df

Unnamed: 0_level_0,Agric,Food,Soda,Beer,Smoke,Toys,Fun,Books,Hshld,Clths,...,Boxes,Trans,Whlsl,Rtail,Meals,Banks,Insur,RlEst,Fin,Other
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
197001,-6.566880,-6.581701,-8.220950,-7.454455,-6.650889,-10.094783,-11.317546,-8.967276,-7.530420,-9.800584,...,-7.683917,-8.631402,-8.499568,-8.949939,-9.716784,-8.997065,-8.089020,-9.760248,-9.042160,-9.360456
197002,3.659100,0.890483,1.930746,0.690146,0.854230,3.293898,4.616040,3.749382,0.950248,3.341731,...,3.199287,4.068159,3.330927,1.946969,1.974953,3.676800,2.684925,4.824639,5.163962,2.780477
197003,-2.772086,-3.441928,-3.928385,-4.058161,-4.543531,-2.708595,-3.435160,-2.315142,-3.733468,-2.244002,...,-2.753444,-2.190585,-3.221834,-3.053739,-3.623573,-2.541422,-2.977067,-0.704574,-3.192614,-4.078863
197004,-12.913938,-12.513400,-14.788200,-14.059868,-14.246935,-17.276458,-18.455634,-14.938593,-13.604682,-15.992407,...,-13.839091,-14.739498,-15.865803,-14.498415,-17.083356,-14.967755,-14.566668,-15.931047,-15.368315,-17.318635
197005,-7.450389,-7.819287,-9.252333,-9.098579,-7.521594,-13.558752,-14.321051,-11.065152,-9.543105,-12.616151,...,-9.500578,-10.631672,-10.658043,-11.819183,-12.675346,-10.557392,-9.528694,-12.489281,-10.402924,-11.901151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202309,-2.915342,-3.885173,-4.311868,-4.751625,-2.158200,-10.179338,-9.670752,-7.491394,-5.665499,-9.513860,...,-5.592700,-6.988994,-6.210810,-8.671669,-8.693011,-6.563852,-5.350011,-10.151632,-5.208891,-6.960308
202310,-4.279667,-0.409979,-1.439944,-0.179930,0.535170,-5.940497,-7.139399,-5.646315,-1.056657,-5.691088,...,-4.459317,-6.057795,-4.650354,-3.527611,-3.244452,-4.970941,-3.250934,-8.302896,-6.666522,-3.815787
202311,6.174306,2.992629,4.092179,3.336708,2.463997,9.319038,10.508210,7.966497,4.220626,8.418644,...,6.889356,8.250850,7.552707,6.745324,6.878580,7.097698,5.620815,10.133063,8.917569,7.414311
202312,4.102902,1.578814,2.494502,1.505373,0.172650,7.832531,8.062607,6.876745,2.299363,7.705306,...,5.049314,7.022821,5.420342,5.135289,5.224877,6.180071,4.456072,10.672244,6.204864,4.755021


### 2.d Test

In [333]:
test = F_RP_PCA - F_PCA
test.mean()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
dtype: float64

### 2.e Maximal Sharpe

In [334]:
def calculate_sharpe(df, gamma):
    
    # step 1: get lambda_hat
    
    # get the covariance matrix
    obj_cov = objective_func(df, gamma)

    # obtain eigenvalue decomposition of the covariance matrix of the 48 excess return series
    eigenvalues, eigenvectors = np.linalg.eigh(obj_cov)

    # sort by variance (eigenvalues)
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    # select top factors
    K = 5  # Number of top factors to select
    # the top 5 eigenvectors are the estimates of lambda N * K matrix
    top_factors = sorted_eigenvectors[:, :K]
    lambda_hat = top_factors.copy()
    
    # step 2: get F
    F = estimate_F(df, lambda_hat)
    
    # step 3: calculate mean mu and sigma
    mu = F.mean().values
    sigma = objective_func(F, -1)
    
    # step 4: calculate sharpe ratio
    sharpe = np.sqrt(mu @ np.linalg.inv(sigma) @ mu)
    
    # step 5: calculate b_hat
    b_hat = np.linalg.inv(sigma) @ mu # K*K K*1 = K*1
    
    # step 6: calculate M_t
    M_t = 1- b_hat.T @ (F - mu).T
    
    return sharpe, pd.DataFrame(M_t, columns = ['M_t'])

In [337]:
for gamma in [0, 1, 2, 5, 10]:
    sharpe, M = calculate_sharpe(excess_returns_df, gamma)
    M.index = excess_returns_df.index
    print(f"gamma = {gamma}: ")
    print(f"Sharpe ratio: {sharpe}")
    print("Implied Stochastic Discount Factor M_t: ")
    display(M)
    print("\n")

gamma = 0: 
Sharpe ratio: 0.2517613536575152
Implied Stochastic Discount Factor M_t: 


Unnamed: 0_level_0,M_t
Date,Unnamed: 1_level_1
197001,1.407370
197002,0.916818
197003,1.290111
197004,1.857758
197005,1.471261
...,...
202309,1.158744
202310,0.985770
202311,0.802272
202312,0.998567




gamma = 1: 
Sharpe ratio: 0.25669062902969453
Implied Stochastic Discount Factor M_t: 


Unnamed: 0_level_0,M_t
Date,Unnamed: 1_level_1
197001,1.411519
197002,0.891434
197003,1.292488
197004,1.840364
197005,1.457955
...,...
202309,1.158869
202310,0.974393
202311,0.806670
202312,1.015793




gamma = 2: 
Sharpe ratio: 0.26116142757031285
Implied Stochastic Discount Factor M_t: 


Unnamed: 0_level_0,M_t
Date,Unnamed: 1_level_1
197001,1.414922
197002,0.866697
197003,1.293929
197004,1.822259
197005,1.444890
...,...
202309,1.159509
202310,0.964351
202311,0.810610
202312,1.031461




gamma = 5: 
Sharpe ratio: 0.27200825745077234
Implied Stochastic Discount Factor M_t: 


Unnamed: 0_level_0,M_t
Date,Unnamed: 1_level_1
197001,1.421676
197002,0.800504
197003,1.294843
197004,1.770007
197005,1.409896
...,...
202309,1.163174
202310,0.941114
202311,0.820033
202312,1.069594




gamma = 10: 
Sharpe ratio: 0.2837495842943952
Implied Stochastic Discount Factor M_t: 


Unnamed: 0_level_0,M_t
Date,Unnamed: 1_level_1
197001,1.426354
197002,0.720375
197003,1.292274
197004,1.702151
197005,1.368220
...,...
202309,1.170523
202310,0.917806
202311,0.830134
202312,1.110990




