# WRDS Data

## Verify installation & Login

https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/jupyterhub-wrds/

In [1]:
# Confirm that the `wrds` client library can be imported in this kernel.
# On failure a message is printed instead of raising, so the later cells that
# reference `wrds` will fail with a NameError until the package is installed.
try:
    import wrds
except ImportError:
    print("WRDS not installed")


In [2]:
import os
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load its entries into the process environment.
dotenv_file = find_dotenv()
load_dotenv(dotenv_file)

# Open the WRDS session for the account named in WRDS_USERNAME.
# Original note: ~/.pgpass (presumably where the password is read from).
wrds_user = os.getenv("WRDS_USERNAME")
db = wrds.Connection(wrds_username=wrds_user)

Loading library list...
Done


## Check Data Availability

In [3]:
import pandas as pd

In [4]:
# Every table in the `optionm` library whose name begins with "option".
price_table_list = [
    table_name
    for table_name in db.list_tables('optionm')
    if table_name.startswith('option')
]
price_table_list

['option',
 'option_history',
 'option_price_2002',
 'option_price_2003',
 'option_price_2004',
 'option_price_2005',
 'option_price_2006',
 'option_price_2007',
 'option_price_2008',
 'option_price_2009',
 'option_price_2010',
 'option_price_2011',
 'option_price_2012',
 'option_price_2013',
 'option_price_2014',
 'option_price_2015',
 'option_price_2016',
 'option_price_2017',
 'option_price_2018',
 'option_price_2019',
 'option_price_2020',
 'option_price_2021',
 'option_price_2022',
 'option_price_2023',
 'option_price_view',
 'optionmeurnames',
 'optionmnames']

## Get Data

In [5]:
# Parameters for the download cells below.
ticker_name = "SPY"  # matched against the `ticker` column of optionm.optionmnames
date = "2023-01-04"  # 'YYYY-MM-DD' string; its first four characters select the per-year tables


In [6]:
# Get security ID
#
# Look up the security id(s) (`secid`) recorded for `ticker_name` in
# optionm.optionmnames. The ticker is sent as a bound parameter instead of
# being concatenated into the SQL text, so quoting is left to the driver.
name_to_secid_df = db.raw_sql(
    "SELECT * FROM optionm.optionmnames WHERE ticker = %(ticker)s",
    params={"ticker": ticker_name},
)
# Keep only rows that carry an option id, then collect the distinct secids.
name_to_secid_df = name_to_secid_df.dropna(subset=["optionid"])
secid_list = name_to_secid_df["secid"].unique().tolist()
secid_list


[109820.0]

In [7]:
# Get ticker info dataframe on certain date

def get_data_df(secid: float, date: str, label: str):
    """
    Fetch rows from one ``optionm`` table through the notebook-level WRDS
    connection ``db``.

    Parameters
    ----------
    secid : float
        Security id to filter on. Not used when ``label`` is "zerocd".
    date : str
        Date string whose first four characters are the year, e.g.
        "2023-01-04". Not used when ``label`` is "securd".
    label : str
        Which table to query:
        opprcd, secprd, vsurfd, stdopd, distrprojd
            -> ``optionm.<label><year>``, filtered by secid and date
        zerocd
            -> ``optionm.zerocd``, filtered by date only
        securd
            -> ``optionm.securd``, filtered by secid only

    Returns
    -------
    Whatever ``db.raw_sql`` returns for the query.

    Raises
    ------
    ValueError
        If ``label`` is not one of the names above, or if a per-year table is
        requested and ``date`` does not start with a four-digit year.
    """
    yearly_labels = ("opprcd", "secprd", "vsurfd", "stdopd", "distrprojd")

    if label in yearly_labels:
        # The year becomes part of the table name, which cannot be sent as a
        # bound parameter, so only a plain four-digit year is accepted.
        year = date[:4]
        if len(year) != 4 or any(ch not in "0123456789" for ch in year):
            raise ValueError(f"Invalid date: {date!r}. Expected it to start with a four-digit year.")
        return db.raw_sql(
            f"SELECT * FROM optionm.{label}{year} WHERE secid = %(secid)s AND date = %(date)s",
            params={"secid": float(secid), "date": date},
        )

    if label == "zerocd":
        return db.raw_sql(
            "SELECT * FROM optionm.zerocd WHERE date = %(date)s",
            params={"date": date},
        )

    if label == "securd":
        return db.raw_sql(
            "SELECT * FROM optionm.securd WHERE secid = %(secid)s",
            params={"secid": float(secid)},
        )

    raise ValueError(f"Invalid label: {label}.")


In [8]:
raw_data_path = os.getenv("RAW_DATA_PATH")
if raw_data_path is None:
    # Fail with a clear message instead of a TypeError from `None + str` below.
    raise RuntimeError("RAW_DATA_PATH is not set; define it in the environment or the .env file.")
if not secid_list:
    # Nothing to query: pd.concat([]) and secid_list[0] would both fail further down.
    raise ValueError(f"No secid found for ticker {ticker_name!r}; nothing to download.")
date_name = date.replace("-", "")  # "2023-01-04" -> "20230104", used in file names


def _fetch_for_all_secids(label: str):
    """Query `label` once per secid in `secid_list` and stack the results into one frame."""
    frames = [get_data_df(secid=secid, date=date, label=label) for secid in secid_list]
    return pd.concat(frames, ignore_index=True)


def _save_parquet(frame, file_name: str) -> None:
    """Write `frame` to `raw_data_path + file_name` (pyarrow engine, snappy compression)."""
    # Plain concatenation (not os.path.join) so the path is exactly the one the
    # Data Preview cells read back; RAW_DATA_PATH is expected to end with a
    # path separator.
    frame.to_parquet(raw_data_path + file_name, engine="pyarrow", compression="snappy")


# (label passed to get_data_df, file-name suffix) for the datasets stored per
# ticker and date.
dated_outputs = [
    ("opprcd", "_option_price.parquet"),        # Option price
    ("secprd", "_security_price.parquet"),      # Security price
    ("vsurfd", "_volatility_surface.parquet"),  # Volatility surface
    ("stdopd", "_stdoption_price.parquet"),     # Standard Option Price
    ("distrprojd", "_distr_proj.parquet"),      # Distribution Projection
]
for label, suffix in dated_outputs:
    _save_parquet(_fetch_for_all_secids(label), ticker_name + date_name + suffix)

# Zero Curve: the query filters on date only, so it is fetched once and the
# file name carries no ticker.
zero_curve_df = get_data_df(secid=secid_list[0], date=date, label="zerocd")
_save_parquet(zero_curve_df, date_name + "_zero_curve.parquet")

# Securd: the query filters on secid only, so the file name carries no date.
_save_parquet(_fetch_for_all_secids("securd"), ticker_name + "_securd.parquet")


In [9]:
# Log out: close the WRDS connection opened in the login cell. The remaining
# cells only read the parquet files written above.

db.close()

## Data Preview

In [16]:
# Same ticker/date as in "Get Data", declared again here, presumably so the
# preview section can be run without re-running the download cells (TODO confirm).
ticker_name = "SPY"
date = "2023-01-04"

In [17]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from datetime import timedelta, datetime

# Load .env before reading RAW_DATA_PATH so this section also works when it is
# run on its own; with load_dotenv's defaults, variables already present in the
# environment are left untouched.
load_dotenv(find_dotenv())
raw_data_path = os.getenv("RAW_DATA_PATH")

# Accept `date` either as the 'YYYY-MM-DD' string set above or as the datetime
# left behind by a previous run of this cell, so re-running the cell is safe.
date_str = date if isinstance(date, str) else date.strftime('%Y-%m-%d')
date_name = date_str.replace("-", "")  # e.g. "20230104", used in file names
date = datetime.strptime(date_str, '%Y-%m-%d')

In [18]:
# Option price: read back the parquet file written by the download cell
# (label "opprcd") and preview its first rows.
op_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_option_price.parquet")
op_df.head()

Unnamed: 0,secid,date,symbol,symbol_flag,exdate,last_date,cp_flag,strike_price,best_bid,best_offer,...,theta,optionid,cfadj,am_settlement,contract_size,ss_flag,forward_price,expiry_indicator,root,suffix
0,109820.0,2023-01-04,SPY 230104C310000,1,2023-01-04,2023-01-04,C,310000.0,73.43,73.57,...,,151713856.0,1.0,0.0,100.0,0,,w,,
1,109820.0,2023-01-04,SPY 230104C320000,1,2023-01-04,2023-01-04,C,320000.0,63.43,63.57,...,,151613017.0,1.0,0.0,100.0,0,,w,,
2,109820.0,2023-01-04,SPY 230104C324000,1,2023-01-04,2023-01-04,C,324000.0,59.43,59.57,...,,151782809.0,1.0,0.0,100.0,0,,w,,
3,109820.0,2023-01-04,SPY 230104C325000,1,2023-01-04,2022-12-27,C,325000.0,57.87,59.06,...,,151674691.0,1.0,0.0,100.0,0,,w,,
4,109820.0,2023-01-04,SPY 230104C326000,1,2023-01-04,,C,326000.0,56.87,58.06,...,,151674692.0,1.0,0.0,100.0,0,,w,,


In [19]:
# Full column list of the option-price frame (the head() preview above elides
# the middle columns).
list(op_df.columns)

['secid',
 'date',
 'symbol',
 'symbol_flag',
 'exdate',
 'last_date',
 'cp_flag',
 'strike_price',
 'best_bid',
 'best_offer',
 'volume',
 'open_interest',
 'impl_volatility',
 'delta',
 'gamma',
 'vega',
 'theta',
 'optionid',
 'cfadj',
 'am_settlement',
 'contract_size',
 'ss_flag',
 'forward_price',
 'expiry_indicator',
 'root',
 'suffix']

In [20]:
# Security price: read back the parquet file written by the download cell
# (label "secprd") and preview its first rows.
sp_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_security_price.parquet")
sp_df.head()

Unnamed: 0,secid,date,low,high,close,volume,return,cfadj,open,cfret,shrout
0,109820.0,2023-01-04,380.0,385.88,383.76,85934098.0,0.00772,1.0,383.18,1.62005,932632.0


In [21]:
# Volatility surface: read back the parquet file written by the download cell
# (label "vsurfd") and preview its first rows.
vs_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_volatility_surface.parquet")
vs_df.head()

Unnamed: 0,secid,date,days,delta,impl_volatility,impl_strike,impl_premium,dispersion,cp_flag
0,109820.0,2023-01-04,10.0,-90.0,0.321296,411.0635,28.06339,0.143597,P
1,109820.0,2023-01-04,10.0,-85.0,0.274703,402.6119,19.97429,0.113671,P
2,109820.0,2023-01-04,10.0,-80.0,0.237204,396.9987,14.66986,0.068136,P
3,109820.0,2023-01-04,10.0,-75.0,0.223466,393.8376,11.92302,0.030408,P
4,109820.0,2023-01-04,10.0,-70.0,0.221669,391.6418,10.26653,0.017412,P


In [22]:
# Standard Option Price: read back the parquet file written by the download
# cell (label "stdopd") and preview its first rows.
sop_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_stdoption_price.parquet")
sop_df.head()

Unnamed: 0,secid,date,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag
0,109820.0,2023-01-04,10.0,384.093581,384.093581,5.665285,0.223612,0.507262,0.028066,-109.28202,25.332821,C
1,109820.0,2023-01-04,10.0,384.093581,384.093581,5.769759,0.226936,-0.494697,0.027871,-99.442789,25.31437,P
2,109820.0,2023-01-04,30.0,384.970901,384.970901,9.208946,0.209882,0.511874,0.017259,-63.132999,43.865672,C
3,109820.0,2023-01-04,30.0,384.970901,384.970901,9.343285,0.210843,-0.49444,0.017581,-49.893364,43.760624,P
4,109820.0,2023-01-04,60.0,386.450983,386.450983,13.133916,0.211658,0.517091,0.012099,-47.785831,62.01814,C


In [23]:
# Distribution Projection: read back the parquet file written by the download
# cell (label "distrprojd") and preview its first rows.
dp_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_distr_proj.parquet")
dp_df.head()

Unnamed: 0,secid,date,exdate,amount
0,109820.0,2023-01-04,2023-03-17,0.004642
1,109820.0,2023-01-04,2023-06-16,0.004642
2,109820.0,2023-01-04,2023-09-15,0.004642
3,109820.0,2023-01-04,2023-12-15,0.004642
4,109820.0,2023-01-04,2024-03-15,0.004642


In [24]:
# Zero Curve: read back the parquet file written by the download cell
# (label "zerocd"). The file name has no ticker part, only the date.
zc_df = pd.read_parquet(raw_data_path+date_name+"_zero_curve.parquet")
zc_df.head()

Unnamed: 0,date,days,rate
0,2023-01-04,10.0,4.012935
1,2023-01-04,30.0,4.121626
2,2023-01-04,60.0,4.272607
3,2023-01-04,91.0,4.413859
4,2023-01-04,122.0,4.540607


In [25]:
# Securd: read back the parquet file written by the download cell
# (label "securd"). The file name has no date part, only the ticker.
sc_df = pd.read_parquet(raw_data_path+ticker_name+"_securd.parquet")
sc_df.head()

Unnamed: 0,secid,cusip,ticker,sic,index_flag,exchange_d,class,issue_type,industry_group
0,109820.0,78462F10,SPY,,0,1.0,,%,
