# Data Preparation

## Imports

In [3]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
import numpy as np
import scipy as sp
import pandas as pd

import warnings
import tqdm

import pandas_datareader.data as web
import datetime as dt

import statsmodels.api as sm
import linearmodels as lm
import arch as am

import matplotlib.pyplot as plt
import seaborn as sns

import mypack.data as data
import mypack.calc as calc
import mypack.plot as plot
import mypack.clas as clas

In [5]:
# Display floats with four decimal places in every pandas repr.
pd.set_option('display.float_format', '{:.4f}'.format)

In [6]:
# Apply the default seaborn theme and set the default figure size in one call.
# A preceding bare sns.set() is redundant: this call re-applies the same
# defaults and then layers the rc overrides on top.
sns.set(rc={'figure.figsize': (17, 6)})

In [7]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## Data

Data is imported from [Kenneth French's data library](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html) and [FRED](https://fred.stlouisfed.org/series/). Currently, this notebook uses the 49 industry portfolios and the Fama-French three-factor (FF3) data. Additionally, NBER recession indicators complement the data.

In [8]:
# Sample window used to filter the downloaded data sets (both bounds inclusive).
# Alternative start dates: dt.datetime(1962,1,1) or dt.datetime(1945,1,1).
# Alternative end date: dt.datetime.today() to extend the sample to the present.
sample_start = dt.datetime(year=1900, month=1, day=1)
sample_end = dt.datetime(year=2019, month=12, day=31)

### Factor data

In [7]:
# Download the factor tables at daily ('D') and monthly ('M') frequency and
# divide every value by 100 (presumably percent -> decimal returns; confirm
# against mypack.data.download_factor_data).
factors_daily = data.download_factor_data('D').div(100)
factors_monthly = data.download_factor_data('M').div(100)

# Keep only the rows whose index lies inside [sample_start, sample_end].
factors_daily = factors_daily.loc[
    lambda frame: (frame.index >= sample_start) & (frame.index <= sample_end)
]
factors_monthly = factors_monthly.loc[
    lambda frame: (frame.index >= sample_start) & (frame.index <= sample_end)
]

### Market data

In [8]:
# Each market series is the 'Mkt-RF' column of the matching factor table.
market_daily, market_monthly = (
    factors['Mkt-RF'] for factors in (factors_daily, factors_monthly)
)

### Industry data

In [9]:
# Download the industry portfolio returns (requested with excessreturns=True)
# at daily ('D') and monthly ('M') frequency and divide every value by 100
# (presumably percent -> decimal returns; confirm against mypack.data).
industries_daily = data.download_industry_data('D', excessreturns=True).div(100)
industries_monthly = data.download_industry_data('M', excessreturns=True).div(100)

In [10]:
# Keep only the rows whose index lies inside [sample_start, sample_end]
# (both bounds inclusive). Re-running this cell is harmless: filtering an
# already filtered frame returns the same rows.
industries_daily = industries_daily.loc[
    lambda frame: (frame.index >= sample_start) & (frame.index <= sample_end)
]
industries_monthly = industries_monthly.loc[
    lambda frame: (frame.index >= sample_start) & (frame.index <= sample_end)
]

## Export Data Files

In [11]:
# Persist the two market series as pickle files, relative to the working directory.
market_daily.to_pickle(path='s_market_daily.pkl')
market_monthly.to_pickle(path='s_market_monthly.pkl')

In [11]:
# Persist the two industry return tables as pickle files, relative to the working directory.
industries_daily.to_pickle(path='df_industries_daily.pkl')
industries_monthly.to_pickle(path='df_industries_monthly.pkl')

In [13]:
# Debugging aid: list the variables currently defined in the interactive namespace.
%whos

Variable             Type         Data/Info
-------------------------------------------
am                   module       <module 'arch' from 'C:\\<...>ages\\arch\\__init__.py'>
calc                 module       <module 'mypack.calc' fro<...>n Code\\mypack\\calc.py'>
clas                 module       <module 'mypack.clas' fro<...>n Code\\mypack\\clas.py'>
data                 module       <module 'mypack.data' fro<...>n Code\\mypack\\data.py'>
dt                   module       <module 'datetime' from '<...>onda3\\lib\\datetime.py'>
factors_daily        DataFrame                Mkt-RF     SM<...>n[19115 rows x 5 columns]
factors_monthly      DataFrame                Mkt-RF     SM<...>n\n[900 rows x 4 columns]
industries_daily     DataFrame    Industry     Agric   Food<...>[19115 rows x 49 columns]
industries_monthly   DataFrame    Industry     Agric   Food<...>\n[900 rows x 49 columns]
lm                   module       <module 'linearmodels' fr<...>nearmodels\\__init__.py'>
market_daily