# Emprical Asset Pricing - Problem Set 3: Mutual Fund

Group Member: Victor Xiao, Zi Wang, Sonny Song

### 0. Data Preprocessing

In [2]:
# Packages
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
import wrds
from datetime import datetime, timedelta
import warnings
from pandas.tseries.offsets import MonthEnd
warnings.simplefilter('ignore') # 
import statsmodels.api as sm

# Setups
pd.set_option("display.max_rows", 100)
pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [3]:
mf_df = pd.read_csv('data/fund_return.csv')
mf_df = mf_df[(mf_df['Date'] >= 198001) & (mf_df['Date'] <= 201903)]
mf_df.tail()

Unnamed: 0,wficn,Date,ret
409848,604492.0,201901,0.103757
409849,604492.0,201902,0.06483
409850,604492.0,201903,-0.019026
409851,604503.0,201902,0.025652
409852,604503.0,201903,0.011613


In [4]:
factor_df = pd.read_csv('data/8_factors.csv')
factor_df = factor_df[(mf_df['Date'] >= 198001) & (factor_df['Date'] <= 201903)]
factor_df.tail()

Unnamed: 0,Date,Mkt-RF,SMB,HML,RMW,CMA,LT_Rev,Mom,ST_Rev,RF
664,201811,1.69,-0.82,0.25,-0.6,0.32,-4.36,-1.42,1.02,0.18
665,201812,-9.55,-3.05,-1.47,-0.13,0.16,-3.27,1.83,-1.81,0.19
666,201901,8.41,3.09,-0.62,-0.69,-1.36,2.73,-8.68,7.45,0.21
667,201902,3.4,1.78,-2.84,0.24,-1.47,-0.68,0.79,-0.01,0.18
668,201903,1.1,-3.54,-4.07,0.93,-1.02,-0.79,2.18,0.16,0.19


In [5]:
merged_df = pd.merge(mf_df, factor_df, on='Date')
merged_df.columns

Index(['wficn', 'Date', 'ret', 'Mkt-RF', 'SMB', 'HML', 'RMW', 'CMA', 'LT_Rev',
       'Mom', 'ST_Rev', 'RF'],
      dtype='object')

### 1. Summary Statistics

#### 1.a Mean Return

In [6]:
# Calculate Mean Return for the mutual fund
summary_stats_mean_return = merged_df.groupby('wficn')['ret'].mean()
summary_stats_mean_return.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count   3275.000000
mean       0.006214
std        0.012657
min       -0.244165
5%        -0.007682
10%       -0.001362
25%        0.004353
50%        0.007352
75%        0.009288
90%        0.011910
95%        0.015491
max        0.382000
Name: ret, dtype: float64

#### 1.b Mean Return in Excess of VWRET

In [7]:
merged_df['ret-Mkt'] = merged_df['ret'] - merged_df['Mkt-RF'] - merged_df['RF']
summary_stats_excess_return = merged_df.groupby('wficn')['ret-Mkt'].mean()
summary_stats_excess_return.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count   3275.000000
mean      -0.736504
std        0.872167
min       -8.103626
5%        -1.646086
10%       -1.278912
25%       -1.011495
50%       -0.811093
75%       -0.588826
90%       -0.097022
95%        0.441391
max       16.905835
Name: ret-Mkt, dtype: float64

#### 1.c stdev of the return

In [8]:
summary_stats_stdev_return = merged_df.groupby('wficn')['ret'].std()
summary_stats_stdev_return.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count   3253.000000
mean       0.050607
std        0.019064
min        0.000226
5%         0.028149
10%        0.034227
25%        0.041241
50%        0.048105
75%        0.056344
90%        0.069331
95%        0.080197
max        0.564978
Name: ret, dtype: float64

#### 1.d stdev of the return in excess of the vw stock market return

In [9]:
summary_stdev_excess_return = merged_df.groupby('wficn')['ret-Mkt'].std()
summary_stdev_excess_return.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count   3253.000000
mean       4.285483
std        0.891849
min        0.007297
5%         2.720664
10%        3.346559
25%        3.944459
50%        4.303713
75%        4.621945
90%        5.281344
95%        5.729365
max        9.259108
Name: ret-Mkt, dtype: float64

#### 1.e Sharpe ratio

In [10]:
merged_df['ret-RF'] = merged_df.ret - merged_df.RF
summary_sharpe_ratio = merged_df.groupby('wficn')['ret-RF'].mean() / merged_df.groupby('wficn')['ret-RF'].std()
summary_sharpe_ratio.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count   3253.000000
mean      -1.523593
std        2.392753
min      -93.565632
5%        -4.805105
10%       -3.021397
25%       -1.742700
50%       -1.028146
75%       -0.633305
90%       -0.298554
95%        0.010450
max        2.974699
Name: ret-RF, dtype: float64

#### 1.f CAPM beta

In [11]:
y = merged_df.groupby('wficn')['ret-RF']
X = merged_df['Mkt-RF']
X = sm.add_constant(X)
beta_values = {}
alpha_values = {}
resid_vol = {}
resid = {}
alpha_tstat = {}
for name, group in y:
    model = sm.OLS(group, X.loc[group.index]).fit()
    beta_values[name] = model.params[1]
    alpha_values[name] = model.params[0]
    resid_vol[name] = model.resid.std()
    resid[name] = model.resid
    alpha_tstat[name] = model.tvalues['const']
beta_df = pd.DataFrame.from_dict(beta_values, orient='index', columns=['Beta'])
beta_description = beta_df.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

beta_description

Unnamed: 0,Beta
count,3254.0
mean,0.011722
std,0.007734
min,-0.10049
5%,0.00565
10%,0.007604
25%,0.009997
50%,0.01185
75%,0.013499
90%,0.015242


#### 1.g CAPM alpha

In [12]:
alpha_df = pd.DataFrame.from_dict(alpha_values, orient='index', columns=['Alpha'])
alpha_description = alpha_df.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

alpha_description

Unnamed: 0,Alpha
count,3254.0
mean,-0.177657
std,0.126191
min,-0.795507
5%,-0.419301
10%,-0.362155
25%,-0.25514
50%,-0.15508
75%,-0.088629
90%,-0.031765


#### 1.h Idiosyncratic volatility (time-series stdev of the CAPM residual)

In [13]:
resid_vol_df = pd.DataFrame.from_dict(resid_vol, orient='index', columns=['resid_vol'])
resid_vol_df.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

Unnamed: 0,resid_vol
count,3233.0
mean,0.112965
std,0.062101
min,0.0
5%,0.015239
10%,0.02749
25%,0.05878
50%,0.130181
75%,0.155565
90%,0.176675


#### 1.i Information ratio (CAPM alpha divided by idiosyncratic volatility)

In [14]:
IR = alpha_df.Alpha / resid_vol_df.resid_vol
IR.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

count                3233.000000
mean                         inf
std                          NaN
min     -7201331264412249.000000
5%                     -7.536965
10%                    -4.326008
25%                    -2.046909
50%                    -1.155747
75%                    -0.779397
90%                    -0.598071
95%                    -0.514230
max                          inf
dtype: float64

#### 1.j FF-3factor model alpha

In [15]:
X_ff3 = merged_df[['Mkt-RF', 'SMB', 'HML']]
X_ff3 = sm.add_constant(X_ff3)
y = merged_df.groupby('wficn')['ret-RF']
alpha_values_ff3 = {}


for name, group in y:
    # Perform OLS regression
    model_ff3 = sm.OLS(group, X_ff3.loc[group.index]).fit()
    # The alpha value is the intercept of the regression
    alpha_values_ff3[name] = model_ff3.params[0]


alpha_df_ff3 = pd.DataFrame.from_dict(alpha_values_ff3, orient='index', columns=['Alpha_ff3'])
alpha_description_ff3 = alpha_df_ff3.describe([0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

alpha_description_ff3

Unnamed: 0,Alpha_ff3
count,3254.0
mean,-0.176598
std,0.127238
min,-0.790652
5%,-0.419942
10%,-0.360476
25%,-0.254017
50%,-0.154265
75%,-0.085916
90%,-0.029452


#### 1.k FF-Carhart 4factor model alpha

In [16]:
X_ff4 = merged_df[['Mkt-RF', 'SMB', 'HML', 'Mom']]  # Assuming 'Mom' is included in merged_df
X_ff4 = sm.add_constant(X_ff4)

# Prepare the dependent variable
y = merged_df.groupby('wficn')['ret-RF']

# Dictionary to hold FF-Carhart 4-factor model alphas
alpha_values_ff4 = {}

for name, group in y:
    # Perform OLS regression
    model_ff4 = sm.OLS(group, X_ff4.loc[group.index]).fit()
    # The alpha value is the intercept of the regression
    alpha_values_ff4[name] = model_ff4.params['const']

# Convert FF-Carhart 4-factor model alpha values to a DataFrame for description
alpha_df_ff4 = pd.DataFrame.from_dict(alpha_values_ff4, orient='index', columns=['Alpha_ff4'])

# Describe the alphas
alpha_description_ff4 = alpha_df_ff4.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

alpha_description_ff4

Unnamed: 0,Alpha_ff4
count,3254.0
mean,-0.174752
std,0.127909
min,-0.797219
5%,-0.415978
10%,-0.359584
25%,-0.253389
50%,-0.151674
75%,-0.085757
90%,-0.028798


#### 1.I FF 5-factor model alpha

In [17]:
# Prepare data for FF 5-factor model regression, assuming 'RMW' and 'CMA' are included in merged_df
X_ff5 = merged_df[['Mkt-RF', 'SMB', 'HML', 'Mom', 'RMW', 'CMA']]
X_ff5 = sm.add_constant(X_ff5)

# Dictionary to hold FF 5-factor model alphas
alpha_values_ff5 = {}
resid_vol_ff5 = {}

for name, group in y:
    # Perform OLS regression
    model_ff5 = sm.OLS(group, X_ff5.loc[group.index]).fit()
    # The alpha value is the intercept of the regression
    resid_vol_ff5[name] = model_ff5.resid.std()
    alpha_values_ff5[name] = model_ff5.params['const']

# Convert FF 5-factor model alpha values to a DataFrame for description
alpha_df_ff5 = pd.DataFrame.from_dict(alpha_values_ff5, orient='index', columns=['Alpha_ff5'])
resid_vol_df_ff5 = pd.DataFrame.from_dict(resid_vol_ff5, orient='index', columns=['resid_vol_ff5'])
# Describe the alphas
alpha_description_ff5 = alpha_df_ff5.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])

alpha_description_ff5

Unnamed: 0,Alpha_ff5
count,3254.0
mean,-0.17599
std,0.136006
min,-2.688798
5%,-0.416357
10%,-0.359675
25%,-0.254326
50%,-0.151066
75%,-0.084243
90%,-0.026327


#### 1.m Idiosyncratic volatility (time-series stdev of the FF 5-factor model residual)

In [18]:
resid_vol_description_ff5 = resid_vol_df_ff5.describe(percentiles=[0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95])
resid_vol_description_ff5

Unnamed: 0,resid_vol_ff5
count,3233.0
mean,0.105743
std,0.064367
min,0.0
5%,0.005303
10%,0.012673
25%,0.053976
50%,0.126193
75%,0.149857
90%,0.172985


#### Discussion:

Our analysis reveals mutual fund active investment strategies predominantly underperform, with the 95th percentile of alphas negative across various risk models, indicating value destruction by fund managers. High idiosyncratic volatilities in top-performing funds suggest these funds' returns are unstable and unreliable over time, despite occasional high returns. In contrast, poorly performing funds show lower volatilities, highlighting consistent underachievement. This dichotomy underscores the complex challenge mutual fund managers face in balancing risk and return, suggesting a critical reevaluation of active investment strategies within mutual funds due to their unstable returns and the consistent failure to generate positive alpha.

### 2. Simulate firm returns under CAPM

To set α to zero, we subtract a fund’s α estimate from its monthly returns.

In [19]:
for j, alpha in alpha_values.items():
    merged_df.loc[merged_df['wficn'] == j, 'alpha'] = alpha

merged_df['ret-alpha'] = merged_df['ret-RF'] - merged_df['alpha']
# len(merged_df.wficn.unique())  ##3275
merged_df

Unnamed: 0,wficn,Date,ret,Mkt-RF,SMB,HML,RMW,CMA,LT_Rev,Mom,ST_Rev,RF,ret-Mkt,ret-RF,alpha,ret-alpha
0,100003.000000,199601,-0.002677,2.260000,-2.590000,0.310000,-0.640000,2.270000,1.350000,0.570000,1.060000,0.430000,-2.692677,-0.432677,-0.178326,-0.254352
1,100003.000000,199602,0.047651,1.330000,1.820000,-1.420000,0.400000,-1.800000,-1.330000,0.570000,2.140000,0.390000,-1.672349,-0.342349,-0.178326,-0.164023
2,100003.000000,199603,0.007687,0.730000,1.540000,1.010000,1.300000,-0.970000,-0.230000,-1.880000,-0.060000,0.390000,-1.112313,-0.382313,-0.178326,-0.203987
3,100003.000000,199604,0.087095,2.060000,4.640000,-3.910000,0.180000,-2.200000,-0.780000,-0.910000,0.720000,0.460000,-2.432905,-0.372905,-0.178326,-0.194580
4,100003.000000,199605,0.037427,2.360000,3.180000,-1.200000,0.430000,-0.220000,1.250000,1.550000,-1.050000,0.420000,-2.742573,-0.382573,-0.178326,-0.204247
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409848,604492.000000,201901,0.103757,8.410000,3.090000,-0.620000,-0.690000,-1.360000,2.730000,-8.680000,7.450000,0.210000,-8.516243,-0.106243,-0.123510,0.017267
409849,604492.000000,201902,0.064830,3.400000,1.780000,-2.840000,0.240000,-1.470000,-0.680000,0.790000,-0.010000,0.180000,-3.515170,-0.115170,-0.123510,0.008340
409850,604492.000000,201903,-0.019026,1.100000,-3.540000,-4.070000,0.930000,-1.020000,-0.790000,2.180000,0.160000,0.190000,-1.309026,-0.209026,-0.123510,-0.085516
409851,604503.000000,201902,0.025652,3.400000,1.780000,-2.840000,0.240000,-1.470000,-0.680000,0.790000,-0.010000,0.180000,-3.554348,-0.154348,-0.189884,0.035537


A simulation run is a random sample (with replacement) of all months 

In [20]:
n_months = len(merged_df.Date.unique())  # 471
# n_months = 30
df_pool = merged_df[['Date', 'wficn', 'ret-alpha', 'Mkt-RF']].dropna()
df_pool

Unnamed: 0,Date,wficn,ret-alpha,Mkt-RF
0,199601,100003.000000,-0.254352,2.260000
1,199602,100003.000000,-0.164023,1.330000
2,199603,100003.000000,-0.203987,0.730000
3,199604,100003.000000,-0.194580,2.060000
4,199605,100003.000000,-0.204247,2.360000
...,...,...,...,...
409848,201901,604492.000000,0.017267,8.410000
409849,201902,604492.000000,0.008340,3.400000
409850,201903,604492.000000,-0.085516,1.100000
409851,201902,604503.000000,0.035537,3.400000


In [23]:
import numpy as np
import statsmodels.api as sm
from concurrent.futures import ProcessPoolExecutor

def simulate_alpha_tstats(all_dates, df, n_months_per_simulation):
    np.random.seed()  # Remove the fixed seed here to ensure different seeds in each process
    sampled_dates = np.random.choice(all_dates, n_months_per_simulation, replace=True)
    sampled_df = df[df['Date'].isin(sampled_dates)]
    alpha_tstats = []
    for _, fund_df in sampled_df.groupby('wficn'):
        if len(fund_df) >= 8: 
            # CAPM: ret-alpha ~ Mkt-RF
            X = sm.add_constant(fund_df[['Mkt-RF']])
            y = fund_df['ret-alpha']
            model = sm.OLS(y, X).fit()
            alpha_tstats.append(model.tvalues['const'])
    return alpha_tstats

def bootstrap_capm_alpha(df, n_simulations=2500, n_months_per_simulation=None):
    all_dates = df['Date'].unique()
    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(simulate_alpha_tstats, all_dates, df, n_months_per_simulation) for _ in range(n_simulations)]
        results = [future.result() for future in futures]
    # Flatten the list of lists to a single list
    alpha_tstats = [item for sublist in results for item in sublist]
    return alpha_tstats


In [24]:
simulation_alphas = bootstrap_capm_alpha(df_pool)

In [None]:
np.mean(simulation_alphas), np.std(simulation_alphas), np.mean(simulation_alphas)/np.std(simulation_alphas)
# np.max(simulation_alphas)

In [None]:
plt.figure(figsize=(10, 6))
plt.hist(simulation_alphas, bins=300, density=True, alpha=0.6, color='b')

In [None]:
mean_tstat = np.mean([tstat for tstat in alpha_tstat.values() if not np.isnan(tstat)])
mean_tstat

In [None]:
mean_alpha = merged_df['alpha'].mean()

We can conclude that there is some close to zero positive simulated alpha, different from the actual data that there is consistently negative alpha on average. The original conclusion about fund skills on average is that most managers do not have skills in creating value and in fact destroys value. And the tail of the distribution is very extreme. 