# 00 - Raw Data Download
## Description

This notebook downloads daily stock file data from CRSP and writes output tables containing the following variables:
- date
- permno as unique identifier
- mcap as shares outstanding times price
- return
- intraday extreme-value volatility estimate $\bar{\sigma}_{i,t} = \sqrt{0.3607}\,(p_{i,t}^{high}-p_{i,t}^{low})$ based on Parkinson (1980), where $p_{i,t}^{high}$ and $p_{i,t}^{low}$ are the natural logarithms of the daily high and low dollar prices


## TO DO
- The same PERMCO (company identifier) can map to multiple PERMNOs (security identifiers); decide how such companies should be handled

## Imports

In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the local `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import datetime as dt
import os
import sys

import numpy as np
import pandas as pd
import wrds

# Make the project-local `src` package (one directory up) importable.
sys.path.append('../')
import src

## Set up WRDS Connection

In [3]:
# Open the WRDS connection. The username is taken from the WRDS_USERNAME
# environment variable so the notebook is not tied to one personal account;
# the previously hard-coded value is kept as the fallback, so behaviour is
# unchanged when the variable is not set.
wrds_conn = wrds.Connection(wrds_username=os.environ.get('WRDS_USERNAME', 'felixbru'))
# wrds_conn.create_pgpass_file()
# wrds_conn.close()

Loading library list...
Done


#### Explore database

In [4]:
# Library (schema) that the exploration cells below look into.
library = 'crsp'
# Listing of WRDS libraries as reported by the connection, kept for inspection.
libraries = wrds_conn.list_libraries()

In [5]:
# Table of interest within the chosen library: the CRSP daily stock file.
table = 'dsf'
# Listing of the tables the connection reports for the chosen library.
library_tables = wrds_conn.list_tables(library=library)

In [6]:
# Describe the selected table; the call also prints an approximate row count.
table_description = wrds_conn.describe_table(
    library=library,
    table=table,
)

Approximately 96285900 rows in crsp.dsf.


## Download CRSP data

### Daily stock data

EXCHCD:
- 1: NYSE
- 2: NYSE MKT
- 3: NASDAQ

SHRCD:
- 10: Ordinary common share, no special status found
- 11: Ordinary common share, no special status necessary

In [7]:
# NOTE: commented-out draft kept for reference only. The live download cell
# calls src.crsp.download_crsp_year, so this query presumably now lives in
# src.crsp (not visible from this notebook) -- TODO confirm, then delete.
#
# Builds the SQL text for one calendar year of crsp.dsf, joined to the name
# history (crsp.msenames, date-bounded) and to delisting events
# (crsp.msedelist, matched on the delisting date).
#
# NOTE(review): in PostgreSQL, LOG(x) is the base-10 logarithm and LN(x) is the
#   natural logarithm. Parkinson's estimator is defined on natural-log prices,
#   so if the live query still uses LOG, `vola` is scaled by 1/ln(10) ~ 0.434.
#   Verify against src.crsp.
# NOTE(review): the WHERE conditions on b.exchcd / b.shrcd discard rows without
#   a msenames match, so the first LEFT JOIN effectively acts as an inner join.
# NOTE(review): CRSP is understood to report prc with a negative sign when it
#   is a bid/ask average, which would make mcap negative -- confirm whether
#   ABS(a.prc) is needed or handled downstream.
# def make_crsp_query(year):
#     query = '''
#     SELECT 
#     a.permno,
#     b.ticker,
#     a.date,
#     a.ret,
#     c.dlret,
#     a.shrout * a.prc AS mcap,
#     SQRT(0.3607) * POWER(LOG(NULLIF(a.askhi, 0)) - LOG(NULLIF(a.bidlo, 0)), 1) AS vola
    
#     FROM crsp.dsf AS a
    
#     LEFT JOIN crsp.msenames AS b
#     ON a.permno=b.permno
#     AND b.namedt<=a.date
#     AND a.date<=b.nameendt
    
#     LEFT JOIN crsp.msedelist AS c
#     ON a.permno=c.permno
#     AND a.date=c.dlstdt
    
#     WHERE a.date BETWEEN '01/01/{}' AND '12/31/{}'
#     AND b.exchcd BETWEEN 1 AND 3
#     AND b.shrcd BETWEEN 10 AND 11
#     '''\
#     .format(year, year)
#     return query

In [8]:
# NOTE: commented-out draft kept for reference only. The live download cell
# calls src.crsp.download_crsp_year instead; presumably this logic was moved
# into src.crsp (not visible from this notebook) -- TODO confirm, then delete.
#
# Draft behaviour: run the yearly CRSP query, normalise dtypes, fold delisting
# returns into a single `retadj` column, index by (date, permno), pickle the
# result and return the frame.
# def download_crsp_year(connection, year):
#     # SQL query
#     query = make_crsp_query(year)
#     df = src.loader.query_SQL(connection, query)
    
#     # edit data formats
#     df['date'] = pd.to_datetime(df.date, yearfirst=True)
#     df['permno'] = df.permno.astype(int)
    
#     # adjust returns for delisting: compound ret and dlret, treating a missing
#     # one as 0; retadj stays NaN only when both ret and dlret are missing
#     df['retadj'] = (1+df['ret'].fillna(0))*(1+df['dlret'].fillna(0))-1
#     df['retadj'] = df['retadj'].where(df['ret'].notna() | df['dlret'].notna())
#     df = df.drop(columns=['ret', 'dlret'])
    
#     # declare index & sort
#     df.set_index(['date','permno'], inplace=True)
#     df = df.sort_index()
    
#     # save
#     df.to_pickle(path='../data/raw/crsp_{}.pkl'.format(year))
    
#     return df

In [10]:
# Fetch CRSP daily data one calendar year at a time (1993 through 2019) and
# write each year to its own pickle under ../data/raw/.
# src.crsp.download_crsp_year is defined outside this notebook.
for year in range(1993, 2019 + 1):
    target = '../data/raw/crsp_{}.pkl'.format(year)
    df = src.crsp.download_crsp_year(wrds_conn, year)
    df.to_pickle(target)

collected 217.31 MB on 2020-10-05 20:56:12.389834 in 31 seconds
collected 235.08 MB on 2020-10-05 20:56:48.259062 in 32 seconds
collected 240.97 MB on 2020-10-05 20:57:27.736546 in 35 seconds
collected 257.01 MB on 2020-10-05 20:58:10.428337 in 39 seconds
collected 264.94 MB on 2020-10-05 20:58:53.253316 in 38 seconds
collected 257.68 MB on 2020-10-05 20:59:35.595328 in 37 seconds
collected 240.31 MB on 2020-10-05 21:00:21.410822 in 40 seconds
collected 232.54 MB on 2020-10-05 21:01:12.233252 in 46 seconds
collected 208.39 MB on 2020-10-05 21:01:59.675220 in 43 seconds
collected 192.13 MB on 2020-10-05 21:03:43.902275 in 100 seconds
collected 177.62 MB on 2020-10-05 21:04:35.989116 in 48 seconds
collected 171.78 MB on 2020-10-05 21:05:22.853649 in 43 seconds
collected 169.76 MB on 2020-10-05 21:06:14.027251 in 48 seconds
collected 166.43 MB on 2020-10-05 21:06:43.371404 in 26 seconds
collected 163.86 MB on 2020-10-05 21:07:12.032965 in 26 seconds
collected 159.73 MB on 2020-10-05 21:07

In [11]:
# start_date = '12/01/1993'
# end_date = '12/31/2019'

# sql_crsp = '''
#     SELECT 
#     a.permno,
#     a.date,
#     a.ret,
#     a.shrout * a.prc AS mcap,
#     SQRT(0.3607) * POWER(LOG(NULLIF(a.askhi, 0)) - LOG(NULLIF(a.bidlo, 0)), 1) AS vola
    
#     FROM crsp.dsf AS a
    
#     LEFT JOIN crsp.msenames AS b
#     ON a.permno=b.permno
#     AND b.namedt<=a.date
#     AND a.date<=b.nameendt
    
#     WHERE a.date BETWEEN '{}' AND '{}'
#     AND b.exchcd BETWEEN 1 AND 3
#     AND b.shrcd BETWEEN 10 AND 11
#     '''\
#     .format(start_date, end_date)

In [12]:
# df_crsp_raw = src.loader.query_SQL(wrds_conn, sql_crsp)
# #df_crsp_raw.to_pickle(path='../data/raw/crsp_raw.pkl')

### Delisting Returns

In [13]:
# sql_delisting = '''
#     SELECT
#     permno,
#     dlret,
#     dlstdt AS date
    
#     FROM crsp.msedelist
#     '''

In [14]:
# Retrieve the delisting table through the project helper (defined in
# src.crsp, outside this notebook) and store it next to the other raw files.
df_delist = src.crsp.download_delisting(wrds_conn)
df_delist.to_pickle('../data/raw/delisting.pkl')

collected 1.88 MB on 2020-10-05 21:14:02.418003 in 0 seconds


### Descriptive Data

In [15]:
# sql_descriptive = '''
#     SELECT
#     permno,
#     comnam, 
#     ticker,
#     st_date,
#     end_date,
#     exchcd
    
#     FROM crsp_a_stock.stocknames
#     '''

In [16]:
# sql_descriptive = '''
#     SELECT
#     permno,
#     comnam, 
#     ticker,
#     namedt,
#     nameendt,
#     exchcd
    
#     FROM crsp.msenames
#     '''

In [17]:
# Retrieve the descriptive (name/ticker/exchange) table through the project
# helper (defined in src.crsp, outside this notebook) and pickle it.
df_descriptive = src.crsp.download_descriptive(wrds_conn)
df_descriptive.to_pickle('../data/raw/descriptive.pkl')

collected 24.02 MB on 2020-10-05 21:14:04.716836 in 1 seconds


## Download FF data

### SQL Query

In [18]:
# sql_ff = '''
#     SELECT *
    
#     FROM ff_all.factors_daily 
    
#     WHERE date BETWEEN '01/01/2000' AND '12/31/2019'
#     '''

In [19]:
# Retrieve the Fama-French factor table through the project helper (defined in
# src.crsp, outside this notebook) and pickle it.
df_ff = src.crsp.download_famafrench(wrds_conn)
df_ff.to_pickle('../data/raw/ff_factors.pkl')

collected 1.99 MB on 2020-10-05 21:14:05.796680 in 0 seconds


## SPDR Trust SPY Index data

In [20]:
# NOTE: commented-out draft kept for reference only. The live cell calls
# src.crsp.download_SPY; presumably this query now lives in src.crsp (not
# visible from this notebook) -- TODO confirm, then delete.
#
# NOTE(review): the constant here is SQRT(0.361), while the stock query and the
#   notebook description use 0.3607 -- align if the live query still differs.
# NOTE(review): unlike the stock query, this join has no namedt/nameendt date
#   bounds, so every dsf row is paired with each msenames record of the permno
#   that passes the WHERE filter; dates may be duplicated if more than one
#   name record matches -- verify.
# NOTE(review): in PostgreSQL, LOG(x) is base 10 and LN(x) is the natural
#   logarithm; Parkinson's estimator uses natural logs -- verify in src.crsp.
# sql_spy = '''
#     SELECT
#     a.date,
#     a.prc,
#     a.ret,
#     SQRT(0.361) * POWER(LOG(NULLIF(a.askhi, 0)) - LOG(NULLIF(a.bidlo, 0)), 1) AS vola
    
#     FROM crsp.dsf AS a
    
#     LEFT JOIN crsp.msenames AS b
#     ON a.permno=b.permno
    
#     WHERE b.exchcd=4
#     AND b.ticker='SPY'
#     AND b.comnam='SPDR TRUST'
#     '''

In [21]:
# Retrieve the SPY series through the project helper (defined in src.crsp,
# outside this notebook) and pickle it.
df_spy = src.crsp.download_SPY(wrds_conn)
df_spy.to_pickle('../data/raw/spy.pkl')

collected 0.43 MB on 2020-10-05 21:14:06.230889 in 0 seconds
