# Description

This notebook downloads the daily stock file data from CRSP to output tables containing the following variables:
- date
- permno as unique identifier
- mcap as shares outstanding times price
- return
- intraday extreme value volatility estimate $\bar{\sigma}_{i,t} = \sqrt{0.3607}\,(p_{i,t}^{high}-p_{i,t}^{low})$ based on Parkinson (1980), where $p_{i,t}^{high}$ and $p_{i,t}^{low}$ are the logarithms of the daily high and low prices


## TODO
- Same permco can have multiple permno

## Imports

In [1]:
import wrds
import pandas as pd
import numpy as np
import datetime as dt
import sys

## Set up WRDS Connection

In [2]:
# Open the WRDS session that the download cells below run their queries through.
WRDS_USERNAME = 'felixbru'
wrds_conn = wrds.Connection(wrds_username=WRDS_USERNAME)
# Optional housekeeping calls, kept disabled:
#wrds_conn.create_pgpass_file()
#wrds_conn.close()

Loading library list...
Done


## Explore database

In [3]:
# Library (schema) to explore, and the full list of libraries the connection exposes.
library = 'crsp'
libraries = wrds_conn.list_libraries()

In [4]:
# Table to explore, and every table available in the chosen library.
table = 'dsf'
library_tables = wrds_conn.list_tables(library=library)

In [5]:
# Column-level description of the selected table in the selected library.
table_description = wrds_conn.describe_table(table=table, library=library)

Approximately 96285900 rows in crsp.dsf.


## Download CRSP data

### Daily stock data

EXCHCD:
- 1: NYSE
- 2: NYSE MKT
- 3: NASDAQ

SHRCD:
- 10: Ordinary common share, no special status found
- 11: Ordinary common share, no special status necessary

In [6]:
# Daily CRSP stock file (crsp.dsf) joined to the name history (crsp.msenames)
# that is valid on each trading date, restricted to EXCHCD 1-3 and SHRCD 10-11.
#
# mcap: CRSP stores PRC as a negative number when it is a bid/ask average
#       rather than a trade price, so the absolute value is used to keep the
#       market capitalisation positive.
# vola: Parkinson (1980) range estimator, which is defined on natural
#       logarithms. PostgreSQL's LOG() is base 10, so LN() is used instead.
#       NULLIF(..., 0) turns zero prices into NULL instead of raising an error.
# The WHERE conditions on b.* discard unmatched rows, so the LEFT JOIN
# effectively behaves like an inner join.
sql_crsp = '''
    SELECT 
    a.permno,
    a.date,
    a.ret,
    a.shrout * ABS(a.prc) as mcap,
    SQRT(0.3607) * (LN(NULLIF(a.askhi, 0)) - LN(NULLIF(a.bidlo, 0))) as vola
    
    FROM crsp.dsf as a
    
    LEFT JOIN crsp.msenames as b
    ON a.permno=b.permno
    AND b.namedt<=a.date
    AND a.date<=b.nameendt
    
    WHERE a.date BETWEEN '12/01/1999' AND '12/31/2019'
    AND b.exchcd BETWEEN 1 AND 3
    AND b.shrcd BETWEEN 10 AND 11
    '''

In [7]:
# Run the daily-stock query and report download size, completion time and duration.
t0 = dt.datetime.today()
df_crsp_raw = wrds_conn.raw_sql(sql_crsp)
t1 = dt.datetime.today()
crsp_raw_mbyte = sys.getsizeof(df_crsp_raw) / 1e+6
crsp_raw_seconds = (t1 - t0).seconds
print('downloaded', crsp_raw_mbyte, 'MByte on', dt.datetime.today(), 'in', crsp_raw_seconds, 'seconds')

downloaded 1614.336272 MByte on 2020-05-21 13:11:45.973682 in 1222 seconds


In [8]:
# Persist the raw daily-stock download.
df_crsp_raw.to_pickle('../data/raw/df_crsp_raw.pkl')

### Delisting Returns

In [9]:
# Delisting query: permno, delisting return (dlret) and delisting date.
# dlstdt is aliased to `date` so the result shares the ['permno', 'date']
# keys used when merging with the daily stock data.
sql_crsp_delist = '''
    SELECT
    permno,
    dlret,
    dlstdt AS date
    FROM crsp.msedelist
    '''

In [10]:
# Run the delisting query and report download size, completion time and duration.
t0 = dt.datetime.today()
df_crsp_delist = wrds_conn.raw_sql(sql_crsp_delist)
t1 = dt.datetime.today()
crsp_delist_mbyte = sys.getsizeof(df_crsp_delist) / 1e+6
crsp_delist_seconds = (t1 - t0).seconds
print('downloaded', crsp_delist_mbyte, 'MByte on', dt.datetime.today(), 'in', crsp_delist_seconds, 'seconds')

downloaded 1.879288 MByte on 2020-05-21 13:14:42.024547 in 2 seconds


In [19]:
# Persist the raw delisting download.
df_crsp_delist.to_pickle('../data/raw/df_crsp_delist.pkl')

### Descriptive Data

In [12]:
# Descriptive query: company name, ticker, st_date/end_date and exchange code
# per permno, read from crsp_a_stock.stocknames (no date or exchange filter).
sql_crsp_desc = '''
    SELECT
    permno,
    comnam, 
    ticker,
    st_date,
    end_date,
    exchcd
    
    FROM crsp_a_stock.stocknames
    '''

In [13]:
# Run the descriptive query and report download size, completion time and duration.
t0 = dt.datetime.today()
df_crsp_desc = wrds_conn.raw_sql(sql_crsp_desc)
t1 = dt.datetime.today()
crsp_desc_mbyte = sys.getsizeof(df_crsp_desc) / 1e+6
crsp_desc_seconds = (t1 - t0).seconds
print('downloaded', crsp_desc_mbyte, 'MByte on', dt.datetime.today(), 'in', crsp_desc_seconds, 'seconds')

downloaded 14.425234 MByte on 2020-05-21 13:14:46.577521 in 3 seconds


In [15]:
# Persist the raw descriptive download.
df_crsp_desc.to_pickle('../data/raw/df_crsp_desc.pkl')

## Download FF data

### SQL Query

In [16]:
# Daily factor query: mktrf, smb, hml, rf and umd from ff_all.factors_daily
# for dates between 01/01/2000 and 12/31/2019 (inclusive, via BETWEEN).
sql_ff = '''
    SELECT
    date, 
    mktrf,
    smb,
    hml,
    rf,
    umd
    
    FROM ff_all.factors_daily 
    
    WHERE date BETWEEN '01/01/2000' AND '12/31/2019'
    '''

In [17]:
# Run the factor query and report download size, completion time and duration.
t0 = dt.datetime.today()
df_ff_raw = wrds_conn.raw_sql(sql_ff)
t1 = dt.datetime.today()
ff_raw_mbyte = sys.getsizeof(df_ff_raw) / 1e+6
ff_raw_seconds = (t1 - t0).seconds
print('downloaded', ff_raw_mbyte, 'MByte on', dt.datetime.today(), 'in', ff_raw_seconds, 'seconds')

downloaded 0.402632 MByte on 2020-05-21 13:16:22.551584 in 0 seconds


In [18]:
# Persist the raw factor download.
df_ff_raw.to_pickle('../data/raw/df_ff_raw.pkl')

## Analytics

In [10]:
# Number of distinct permno values in the raw download.
df_crsp_raw['permno'].unique().shape[0]

11395

In [13]:
# Summary statistics of the raw CRSP download, displayed as a quick sanity check.
df_crsp_raw.describe()

## Transform CRSP data to tidy data format

#### Return data

In [6]:
# Start the tidy frame from the raw download; the old index is kept as an
# 'index' column (it is dropped again once the final index is set).
# NOTE(review): this cell needs df_crsp_raw in memory; on a fresh kernel it
# presumably has to be restored from '../data/raw/df_crsp_raw.pkl' first - confirm.
df_crsp_tidy = df_crsp_raw.reset_index(drop=False)
# Release the name of the raw frame; re-running this cell alone will fail.
del df_crsp_raw

In [7]:
# edit data formats
df_crsp_tidy['date'] = pd.to_datetime(df_crsp_tidy['date'], yearfirst=True)
df_crsp_tidy[['permno']] = df_crsp_tidy[['permno']].astype(int)

df_crsp_delist['permno'] = df_crsp_delist['permno'].astype(int)
df_crsp_delist['date'] = pd.to_datetime(df_crsp_delist['date'], yearfirst=True)

In [8]:
# merge
df_crsp_tidy = df_crsp_tidy.merge(df_crsp_delist, how='left', on=['permno','date'])

In [9]:
# adjusted returns (for delisting)
df_crsp_tidy['retadj'] = (1+df_crsp_tidy['ret'].fillna(0))*(1+df_crsp_tidy['dlret'].fillna(0))-1
df_crsp_tidy['retadj'] = df_crsp_tidy['retadj'].where(df_crsp_tidy['ret'].notna() | df_crsp_tidy['dlret'].notna())

In [10]:
# declare index & sort
df_crsp_tidy.set_index(['date','permno'], inplace=True)
df_crsp_tidy = df_crsp_tidy.drop(columns=['index', 'dlret'])
df_crsp_tidy = df_crsp_tidy.sort_index()

In [11]:
# Persist the tidy CRSP panel as an interim data set.
df_crsp_tidy.to_pickle('../data/interim/df_crsp_tidy.pkl')

In [172]:
#df_crsp_tidy = pd.read_pickle('../data/interim/df_crsp_tidy.pkl')

#### Descriptive data

In [25]:
# One descriptive row per permno.
# groupby(...).last() returns the last non-null value per column in row order,
# and the download query has no ORDER BY, so the rows are sorted by permno and
# end_date first. This makes "last" deterministically the entry with the
# latest end_date instead of whatever order the database happened to return.
df_crsp_desc['permno'] = df_crsp_desc['permno'].astype(int)
df_aux = df_crsp_desc\
                    .sort_values(['permno', 'end_date'])\
                    .groupby('permno').last()\
                    .drop(columns=['exchcd'])

In [97]:
def _flag_any_missing(column, flag_name):
    """Return a per-permno boolean Series named `flag_name`.

    The column is pivoted to a date x permno table; a permno is flagged True
    if any cell in its column is NaN. Dates on which a permno has no row at
    all also appear as NaN after the pivot and therefore count as missing.
    """
    flags = df_crsp_tidy[column].unstack().isna().any()
    flags.name = flag_name
    return flags


# Smallest size rank each permno ever had.
# NOTE(review): 'size_rank' is not created in any cell visible above - confirm
# where it is added to df_crsp_tidy before this cell runs.
s_min_rank = df_crsp_tidy['size_rank'].groupby('permno').min()
s_min_rank.name = 'min_rank'

# Per-permno flags for incomplete return / volatility histories.
s_ret_missing = _flag_any_missing('ret', 'ret_missing')
s_vola_missing = _flag_any_missing('vola', 'vola_missing')

In [98]:
# Add the per-permno series as columns, one outer join on the index at a time,
# so permnos present on either side are kept.
for extra_column in (s_min_rank, s_ret_missing, s_vola_missing):
    df_aux = df_aux.merge(extra_column, how='outer', left_index=True, right_index=True)