In [16]:
# CFM 301 W23
# Final Project
# Keshav, Andrew, Gavin, Anson

import pandas as pd
import numpy as np
from pandasql import sqldf
import statsmodels.api as sm
import seaborn as sns
from scipy import stats
from sklearn import linear_model
import math

# Note the the code from the ML lectures provided was utilized here in order to build our model

# Python Kernal Version: 3.11.1

In [17]:
# Code as provided in ML lecture exercise 1

dataset = pd.read_csv("dataset_files/provided_dataset_merged_df.csv")
dataset['permno'] = dataset['permno'].astype(np.int64)
factors = list(dataset.columns[9:59]) + list(dataset.columns[81:])
ret_cols = list(dataset.columns[61:75])

#Filtering stocks based on stock price and market cap
dataset['PRC'] = abs(dataset['PRC'])
dataset['size'] = dataset['PRC'] * dataset['SHROUT'] * 1000
dataset['botm_size'] = dataset.groupby(["permno"])["size"].shift(1)
dataset['botm_prc'] = dataset.groupby(["permno"])["PRC"].shift(1)
dataset['permno_year'] = dataset['permno'].astype(str) + '_' + dataset['yyyymm'].astype(str).str.slice(0,4)

dates = np.sort(pd.unique(dataset["yyyymm"]))
januaries = list(filter(lambda x: x % 100 == 1, dates))

indices_to_drop = set()

for date in januaries:
  df = dataset.loc[dataset["yyyymm"] == date]
  for index, row in df.iterrows():
    if row["botm_prc"] < 5 or row['botm_size'] < 10 ** 8:
      indices_to_drop.add(row["permno_year"])
indices_to_drop
dataset = dataset[~dataset['permno_year'].isin(indices_to_drop)].reset_index(drop=True)
dataset

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,STOCK_SPLIT,DIVAMT,SPREAD,O_SCORE_Q,HIRING_RATE,INVESTMENT_GROWTH_3Y,size,botm_size,botm_prc,permno_year
0,10026,198603.0,75.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,-0.183465,...,1.000000,,1.5,,,,4.133125e+07,,,10026_1986
1,10026,198604.0,76.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.636488,...,1.000000,,1.5,-1.288174,,,4.424875e+07,4.133125e+07,21.250000,10026_1986
2,10026,198605.0,77.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,,...,1.000000,,,-1.288174,,,6.224000e+07,4.424875e+07,22.750000,10026_1986
3,10026,198606.0,78.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,,...,0.666667,0.00,,-1.288174,,,5.252400e+07,6.224000e+07,32.000000,10026_1986
4,10026,198607.0,79.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,,...,1.000000,,,-1.273626,,,3.282750e+07,5.252400e+07,18.000000,10026_1986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364446,93429,201802.0,458.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.145937,...,1.000000,,,-3.977201,0.0,,1.262409e+10,1.515126e+10,134.389999,93429_2018
364447,93429,201803.0,459.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.193968,...,1.000000,0.27,,-3.977201,0.0,,1.284230e+10,1.262409e+10,112.010002,93429_2018
364448,93429,201804.0,460.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.175603,...,1.000000,,,-4.053557,0.0,,1.201168e+10,1.284230e+10,114.099998,93429_2018
364449,93429,201805.0,461.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.081776,...,1.000000,0.27,,-4.053557,0.0,,1.097452e+10,1.201168e+10,106.779999,93429_2018


In [18]:
# Keshav: Imputing data as provided the code for in ML lecture exercise 1

# NEW IMPUTE METHOD
dataset.dropna(subset= ['ticker', 'conm', 'gvkey', 'cusip', 'naics', 'gsubind'], inplace = True)

# .groupby crates a Pandas groupby object, where I've grouped the rows by their permno
#grouped_med = merged_df.groupby(by= 'permno')
grouped_med = dataset.groupby(by= 'monthid')
# the lambda function gets the median per group in the groupby object, and fills the NaN values with the median per group
imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))

# This line assigns the values of the medians 
dataset = dataset.assign(**imputed_grouped.to_dict(orient='series'))

# Some NAN VALUES STILL IN DATA, DROPNA TO TAKE THEM OUT  - Figuring out what's going on with Python difficult.
# Suggest using other programs such as SAS if issue is severe
dataset.dropna(subset=dataset.columns.difference(['noOfMAActivity','STOCK_SPLIT','DIVAMT','SPREAD','O_SCORE_Q','HIRING_RATE','INVESTMENT_GROWTH_3Y']), inplace = True)

  imputed_grouped = grouped_med.transform(lambda y: y.fillna(y.median()))


KeyboardInterrupt: 

In [None]:
# Data Cleaning/Factor Selection
# 1. Reduce stocks based on stock price/market cap
# 2. Imputing
# 3. Winsorizing
# 4. Correlation matrix
# 5. Normalization
# 6. Split into training/test/validation sets
# 7. Fama-Macbeth regression on test data --> determine which factors to finalize/keep based on t-stats

In [None]:
# Out of Sample Testing
# 1. Get predicted returns based on alphas/factor betas (for Fama-Macbeth method)
# 2. ML Method

In [None]:
#Winsorizing the factors by month
for monthid, group in dataset.groupby('monthid'):
    for factor in factors:
        max_cutoff = group[factor].quantile(0.01)
        min_cutoff = group[factor].quantile(0.99)
        group[factor] = group[factor].clip(min_cutoff, max_cutoff)
        dataset.loc[dataset['monthid'] == monthid, factor] = group[factor].values
dataset[factors].describe()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group[factor] = group[factor].clip(min_cutoff, max_cutoff)


Unnamed: 0,IM,range_20,log_vol_dollar_20,range_120,log_vol_dollar_120,xret_5,xret_10,xret_20,xret_indsize_20,xret_indsize_std20,...,Cto,pe_ttm,lag_log_size,noOfMAActivity,STOCK_SPLIT,DIVAMT,SPREAD,O_SCORE_Q,HIRING_RATE,INVESTMENT_GROWTH_3Y
count,363585.0,363585.0,363585.0,363585.0,363585.0,363585.0,363585.0,363585.0,363585.0,363585.0,...,363585.0,363585.0,363585.0,269345.0,363585.0,363585.0,142068.0,363585.0,363585.0,363585.0
mean,0.067284,0.028395,15.903513,0.028431,15.883744,0.002446,0.003252,0.005692,0.005587,0.017776,...,0.26146,41.560535,7.591516,1.433046,0.998697,0.202284,0.768966,-3.457481,0.061954,0.642025
std,0.176663,0.01581,2.142866,0.01401,2.123494,0.042671,0.058884,0.083692,0.080569,0.010238,...,0.197799,58.070272,1.647206,1.543648,0.022332,0.098856,0.603356,1.029877,0.177384,1.676695
min,-0.855087,0.003852,8.286666,0.005321,8.222638,-0.305643,-0.421193,-0.61241,-0.592648,0.003682,...,0.005305,2.496501,1.747901,1.0,0.5,0.0,0.015,-6.39624,-0.444543,-0.923538
25%,-0.024333,0.018247,14.5176,0.01913,14.496957,-0.01889,-0.02746,-0.039769,-0.037208,0.010764,...,0.117106,13.300014,6.400958,1.0,1.0,0.15,0.375,-3.946266,0.0,-0.014319
50%,0.067203,0.024426,16.000691,0.024938,15.971158,0.000897,0.001734,0.003887,0.003745,0.01527,...,0.228226,18.739761,7.432022,1.0,1.0,0.1775,0.5625,-3.454973,0.020377,0.291795
75%,0.147772,0.033983,17.455465,0.033914,17.412338,0.022344,0.032843,0.049381,0.046909,0.022018,...,0.351078,30.363902,8.634589,1.5,1.0,0.24,1.0,-3.042077,0.078541,0.618244
max,2.318851,0.229597,20.569824,0.147269,20.423303,0.492096,0.490022,0.684095,0.64978,0.135778,...,1.237098,200.0,12.517654,24.0,1.0,2.516,4.0,0.502599,2.216023,30.467908


In [None]:
ff3_monthly_df_original = pd.read_excel('dataset_files/ff3_monthly.xlsx', sheet_name='ff3_monthly')
ff3_monthly_df = ff3_monthly_df_original.copy()
ff3_monthly_df['monthid'] = ff3_monthly_df['month_id'] = (ff3_monthly_df['DATE'].dt.year - 1980) * 12 + ff3_monthly_df['DATE'].dt.month

dataset = pd.merge(dataset, ff3_monthly_df, how='left', on='monthid').reset_index(drop=True)
dataset

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,botm_size,botm_prc,permno_year,RETX,SMB,HML,RF,MOM,DATE,month_id
0,10026,198603.0,75.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,-0.183465,...,7.992216e+08,39.000000,10026_1986,0.0488,-0.0059,-0.0039,0.0060,0.0241,1986-03-31,75
1,10026,198604.0,76.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.636488,...,4.133125e+07,21.250000,10026_1986,-0.0131,0.0278,-0.0287,0.0052,-0.0037,1986-04-30,76
2,10026,198605.0,77.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.354652,...,4.424875e+07,22.750000,10026_1986,0.0462,-0.0135,-0.0021,0.0049,0.0205,1986-05-30,77
3,10026,198606.0,78.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.308972,...,6.224000e+07,32.000000,10026_1986,0.0103,-0.0096,0.0128,0.0052,0.0507,1986-06-30,78
4,10026,198607.0,79.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.273834,...,5.252400e+07,18.000000,10026_1986,-0.0645,-0.0336,0.0470,0.0052,0.0183,1986-07-31,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363580,93429,201802.0,458.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.145937,...,1.515126e+10,134.389999,93429_2018,-0.0365,0.0023,-0.0107,0.0011,0.0358,2018-02-28,458
363581,93429,201803.0,459.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.193968,...,1.262409e+10,112.010002,93429_2018,-0.0235,0.0405,-0.0023,0.0011,-0.0113,2018-03-29,459
363582,93429,201804.0,460.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.175603,...,1.284230e+10,114.099998,93429_2018,0.0028,0.0114,0.0054,0.0014,0.0036,2018-04-30,460
363583,93429,201805.0,461.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.081776,...,1.201168e+10,106.779999,93429_2018,0.0265,0.0526,-0.0318,0.0014,0.0389,2018-05-31,461


In [None]:
#Chosen portfolio factors
portfolio_factors = ['lag_log_size', 'deviation_pct20', 'roe', 'O_SCORE_Q', 'pe_ttm', 'INVESTMENT_GROWTH_3Y', 'RSI_20', 'IV_capm', 'profitability', 'beta_5y', 'sales_g_q', 'HIRING_RATE']

# Code as provided in ML lecture exercise 1

pd.set_option("mode.chained_assignment", None)
corr_matrix = dataset[portfolio_factors].corr().abs()

# creates an upper triangular matrix of the corr matrix (since it's symmetric about the diagonal)
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(bool))

# gets the columns that fit the criteria to be dropped
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.7)]
print(to_drop)
for factor in to_drop:
    portfolio_factors.remove(factor)

# drops those columns
#dataset.drop(dataset[to_drop], axis=1, inplace= True)

['RSI_20']


In [None]:
#Normalize by calculating z-scores for each chosen factor
for monthid, group in dataset.groupby('monthid'):
    zscore_cols = []
    for factor in portfolio_factors:
        group[factor] = group[factor].apply(lambda x: (x - group[factor].mean()) / group[factor].std())
        zscore_cols.append(factor)
    dataset.loc[dataset['monthid'] == monthid, zscore_cols] = group[zscore_cols].values

dataset

Unnamed: 0,permno,yyyymm,monthid,ticker,conm,gvkey,cusip,naics,gsubind,IM,...,botm_size,botm_prc,permno_year,RETX,SMB,HML,RF,MOM,DATE,month_id
0,10026,198603.0,75.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,-0.183465,...,7.992216e+08,39.000000,10026_1986,0.0488,-0.0059,-0.0039,0.0060,0.0241,1986-03-31,75
1,10026,198604.0,76.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.636488,...,4.133125e+07,21.250000,10026_1986,-0.0131,0.0278,-0.0287,0.0052,-0.0037,1986-04-30,76
2,10026,198605.0,77.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.354652,...,4.424875e+07,22.750000,10026_1986,0.0462,-0.0135,-0.0021,0.0049,0.0205,1986-05-30,77
3,10026,198606.0,78.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.308972,...,6.224000e+07,32.000000,10026_1986,0.0103,-0.0096,0.0128,0.0052,0.0507,1986-06-30,78
4,10026,198607.0,79.0,JJSF,J & J SNACK FOODS CORP,12825,466032109,311812,30202030,0.273834,...,5.252400e+07,18.000000,10026_1986,-0.0645,-0.0336,0.0470,0.0052,0.0183,1986-07-31,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363580,93429,201802.0,458.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.145937,...,1.515126e+10,134.389999,93429_2018,-0.0365,0.0023,-0.0107,0.0011,0.0358,2018-02-28,458
363581,93429,201803.0,459.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.193968,...,1.262409e+10,112.010002,93429_2018,-0.0235,0.0405,-0.0023,0.0011,-0.0113,2018-03-29,459
363582,93429,201804.0,460.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.175603,...,1.284230e+10,114.099998,93429_2018,0.0028,0.0114,0.0054,0.0014,0.0036,2018-04-30,460
363583,93429,201805.0,461.0,CBOE,CBOE GLOBAL MARKETS INC,184500,12503M108,523210,40203040,0.081776,...,1.201168e+10,106.779999,93429_2018,0.0265,0.0526,-0.0318,0.0014,0.0389,2018-05-31,461


In [None]:
dataset.to_csv('dataset_files/merged_df.csv', index=False)

In [None]:
# Keshav: Splitting the dataset into each respective period based on the in-sample / out-of-sample recomendation
TRAINING_SPLIT = 0.50
VALIDATE_SPLIT = 0.75

train, validate, test = \
                        np.split(dataset.sample(frac=1, random_state=301), 
                        [int(TRAINING_SPLIT*len(dataset))-1, int(VALIDATE_SPLIT*len(dataset))])

print("Training", train.shape)
print("Validating", validate.shape)
print("Testing", test.shape)

Training (181791, 99)
Validating (90897, 99)
Testing (90897, 99)


In [21]:
#Fama-Macbeth Regression (NOTE: only use training set for regression - 1980-2010)

def famaMacbethRegression(portfolio_factors_fama_macbeth):
    month_groups = train.groupby('monthid')

    cross_sect_dict = {}
    for factor in portfolio_factors_fama_macbeth:
        cross_sect_dict[factor + '_beta'] = []
    cross_sect_dict['xret'] = []

    for month, group in month_groups:
        ret = group['RET']
        explanatory_variates = group[portfolio_factors_fama_macbeth]
        w = sm.add_constant(explanatory_variates, has_constant='add')
        cross_sect_reg = sm.OLS(ret, w, missing='drop').fit()
        for i in range(len(portfolio_factors_fama_macbeth)):
            #Note: params[0] is the alpha estimate
            beta_val = cross_sect_reg.params[i+1]
            cross_sect_dict[portfolio_factors_fama_macbeth[i] + '_beta'].append(beta_val)
        cross_sect_dict['xret'].append(ret.mean())

    cross_sect_df = pd.DataFrame(cross_sect_dict)
    print("T-Statistics:")
    for factor in portfolio_factors_fama_macbeth:
        print('{} T-stat: '.format(factor), stats.ttest_1samp(cross_sect_df['{}_beta'.format(factor)], 0)[0])

In [22]:
# Original 11 Factors
famaMacbethRegression(portfolio_factors)

T-Statistics:
lag_log_size T-stat:  -7.22601107392784
deviation_pct20 T-stat:  -9.804105585572547
roe T-stat:  2.00599472619291
O_SCORE_Q T-stat:  -6.945638021434757
pe_ttm T-stat:  2.2490275893687466
INVESTMENT_GROWTH_3Y T-stat:  -3.2897909820522124
IV_capm T-stat:  -0.7838581796587386
profitability T-stat:  -0.08496428981298729
beta_5y T-stat:  0.07603489747688816
sales_g_q T-stat:  4.912445623233632
HIRING_RATE T-stat:  -2.05065457876826


In [23]:
# Final Factors
portfolio_factors_finalized = ['lag_log_size', 'O_SCORE_Q', 'INVESTMENT_GROWTH_3Y', 'deviation_pct20', 'sales_g_q', 'HIRING_RATE']
famaMacbethRegression(portfolio_factors_finalized)

T-Statistics:
lag_log_size T-stat:  -6.3108307964026
O_SCORE_Q T-stat:  -5.623707619425542
INVESTMENT_GROWTH_3Y T-stat:  -2.834180801230732
deviation_pct20 T-stat:  -7.574604372052687
sales_g_q T-stat:  4.147352934756886
HIRING_RATE T-stat:  -2.4807749638271543
