# Notebook to get, store, and clean our data both from Fama French and CRSP

### Get FF factors data

In [22]:
import sys
from pathlib import Path

In [23]:
project_root = Path.cwd().parents[0]
sys.path.append(str(project_root))


In [24]:
import src.data.load_factors as lf

#### Daily FF factors

In [25]:
ff_daily = lf.build_factors_dataset('daily')
ff_daily.head()

Unnamed: 0_level_0,MKT_RF,SMB,HML,RMW,CMA,RF,MOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1963-07-01,-0.0067,0.0,-0.0034,-0.0001,0.0016,0.0001,-0.0023
1963-07-02,0.0079,-0.0026,0.0026,-0.0007,-0.002,0.0001,0.0044
1963-07-03,0.0063,-0.0017,-0.0009,0.0018,-0.0034,0.0001,0.0038
1963-07-05,0.004,0.0008,-0.0027,0.0009,-0.0034,0.0001,0.0006
1963-07-08,-0.0063,0.0004,-0.0018,-0.0029,0.0014,0.0001,-0.0045


#### Monthly FF factors

In [26]:
ff_monthly= lf.build_factors_dataset('monthly')
ff_monthly.head()

Unnamed: 0_level_0,MKT_RF,SMB,HML,RMW,CMA,RF,MOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1963-07-01,-0.0039,-0.0048,-0.0081,0.0064,-0.0115,0.0027,0.0101
1963-08-01,0.0508,-0.008,0.017,0.004,-0.0038,0.0025,0.01
1963-09-01,-0.0157,-0.0043,0.0,-0.0078,0.0015,0.0027,0.0012
1963-10-01,0.0254,-0.0134,-0.0004,0.0279,-0.0225,0.0029,0.0313
1963-11-01,-0.0086,-0.0085,0.0173,-0.0043,0.0227,0.0027,-0.0078


### Load CRSP data

In [6]:
import src.data.load_data as ld

#### Monthly CRSP Data

In [27]:
rets_m, caps_m = ld.build_crsp_dataset(frequency="daily")


WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [8]:
## expect same # or rows and # of columns
rets_m.shape
caps_m.shape


(660, 1830)

#### Daily CRSP Data

In [9]:
rets_d, caps_d = ld.build_crsp_dataset(start = "2018-01-01", end = "2024-12-31",frequency="daily")


WRDS recommends setting up a .pgpass file.
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [10]:
## expect same # or rows and # of columns
rets_d.shape
caps_d.shape


(1761, 677)

In [11]:
# should be true
rets_d.index.equals(caps_d.index)


True

In [12]:
rets_d.columns.equals(caps_d.columns)


False

In [13]:
rets_d.isna().sum().sum()
caps_d.isna().sum().sum()


304250

In [14]:
rets_d.iloc[:5, :5]
caps_d.iloc[:5, :5]


ticker,A,AAL,AAP,AAPL,ABBV
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-02,21836016.8,25197168.92,7839838.82,876296215.68944,157104676.3
2018-01-03,22391607.76,24888088.72,7910780.9,876143654.88,159563178.5
2018-01-04,22223638.4,25045006.36,8202678.0,880213299.68,158653213.4
2018-01-05,22578958.2,25035496.2,8289877.64,890234800.0,161415037.3
2018-01-08,22627410.9,24788232.04,8231498.22,886928264.47056,158828820.7


### Combining Data and Building CW-EW Preference

In [15]:
from src.preference_factors.build_preference_dataset import (
    build_preference_factor_dataset
)

#### Monthy Preference Dataset

In [16]:
df_monthly = build_preference_factor_dataset(
    returns_file="sp500_returns_monthly_with_tickers.csv",
    market_caps_file="sp500_market_caps_monthly.csv",
    ff_factors_file="ff_factors_monthly.csv",
    frequency="monthly",
)


In [17]:
df_monthly

Unnamed: 0_level_0,CW,EW,CW-EW,MKT_RF,SMB,HML,RMW,CMA,RF,MOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1970-01,-0.071530,-0.058388,-0.013141,-0.0811,0.0308,0.0317,-0.0175,0.0396,0.0060,0.0056
1970-02,0.063582,0.068592,-0.005010,0.0514,-0.0257,0.0369,-0.0241,0.0287,0.0062,0.0027
1970-03,0.005553,0.003084,0.002469,-0.0106,-0.0242,0.0408,-0.0106,0.0442,0.0057,-0.0036
1970-04,-0.085739,-0.108800,0.023062,-0.1099,-0.0634,0.0614,-0.0063,0.0590,0.0050,-0.0083
1970-05,-0.049981,-0.082317,0.032337,-0.0691,-0.0446,0.0332,-0.0129,0.0379,0.0053,-0.0266
...,...,...,...,...,...,...,...,...,...,...
2024-08,0.028116,0.024943,0.003173,0.0161,-0.0355,-0.0110,0.0076,0.0082,0.0048,0.0481
2024-09,0.025856,0.023297,0.002560,0.0173,-0.0092,-0.0277,0.0020,-0.0029,0.0040,-0.0062
2024-10,-0.004786,-0.016410,0.011624,-0.0100,-0.0089,0.0086,-0.0148,0.0098,0.0039,0.0300
2024-11,0.066461,0.062051,0.004410,0.0649,0.0459,0.0015,-0.0231,-0.0205,0.0040,0.0100


In [18]:
df_monthly.shape

(660, 10)

#### Daily Preference Dataset

In [19]:
df_daily = build_preference_factor_dataset(
    returns_file="sp500_returns_daily_with_tickers.csv",
    market_caps_file="sp500_market_caps_daily.csv",
    ff_factors_file="ff_factors_daily.csv",
    frequency="daily",
)


In [20]:
df_daily

Unnamed: 0_level_0,CW,EW,CW-EW,MKT_RF,SMB,HML,RMW,CMA,RF,MOM
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-01-02,0.008494,0.008031,0.000463,0.0085,0.0032,-0.0021,-0.0051,0.0022,0.0001,-0.0064
2018-01-03,0.006608,0.005611,0.000997,0.0059,-0.0047,-0.0019,-0.0071,-0.0007,0.0001,0.0021
2018-01-04,0.004364,0.003066,0.001298,0.0042,-0.0022,0.0024,-0.0002,0.0030,0.0001,-0.0020
2018-01-05,0.007134,0.005368,0.001766,0.0066,-0.0037,-0.0025,0.0039,-0.0038,0.0001,0.0061
2018-01-08,0.001901,0.003099,-0.001198,0.0019,-0.0020,0.0004,-0.0006,0.0004,0.0001,0.0016
...,...,...,...,...,...,...,...,...,...,...
2024-12-24,0.011481,0.007806,0.003675,0.0111,-0.0012,-0.0006,-0.0011,-0.0036,0.0002,0.0067
2024-12-26,-0.000346,0.001047,-0.001393,0.0001,0.0107,-0.0018,-0.0042,0.0036,0.0002,0.0000
2024-12-27,-0.011024,-0.007075,-0.003949,-0.0117,-0.0044,0.0057,0.0039,0.0004,0.0002,-0.0085
2024-12-30,-0.010723,-0.009846,-0.000877,-0.0109,0.0025,0.0074,0.0055,0.0014,0.0002,0.0009


In [21]:
df_daily.shape

(1761, 10)