# WRDS Data

## Verify Installation & Log In

WRDS guide to using Python on the WRDS JupyterHub: https://wrds-www.wharton.upenn.edu/pages/support/programming-wrds/programming-python/jupyterhub-wrds/

In [None]:
# Check that the `wrds` package can be imported in this kernel.
try:
    import wrds
except ImportError:
    # Only a message is printed here; `wrds` stays undefined, so the later
    # `wrds.Connection(...)` call will fail until the package is installed.
    print("WRDS not installed")


In [None]:
import os
from dotenv import load_dotenv, find_dotenv

# Load the .env file located by find_dotenv() so that the os.getenv calls in
# this notebook (WRDS_USERNAME, RAW_DATA_PATH) can see its variables.
load_dotenv(find_dotenv())

# Open the WRDS connection used by every query cell below.
# NOTE(review): the "~/.pgpass" remark presumably means the password is read
# from that file rather than passed here — confirm.
db = wrds.Connection(wrds_username=os.getenv("WRDS_USERNAME")) # ~/.pgpass

## Check Data Availability

In [None]:
import pandas as pd

In [None]:
# Keep only the tables of the `optionm` library whose name begins with "option".
price_table_list = [
    table_name
    for table_name in db.list_tables('optionm')
    if table_name.startswith('option')
]
price_table_list

## Get Data

In [None]:
# Parameters shared by the query cells below.
ticker_name = "SPY"  # matched against the `ticker` column of optionm.optionmnames
date = "2023-01-03"  # YYYY-MM-DD string; its first four characters select the yearly table


In [None]:
# Resolve the ticker to its security IDs (secid).
# NOTE: the ticker is spliced directly into the SQL text, so it must remain a
# trusted constant.
name_to_secid_df = db.raw_sql(
    f"SELECT * FROM optionm.optionmnames WHERE ticker = '{ticker_name}'"
).dropna(subset=["optionid"])  # drop name records that carry no optionid

# Distinct secids, in order of first appearance, as a plain Python list.
secid_list = name_to_secid_df["secid"].unique().tolist()
secid_list


In [None]:
# Get ticker info dataframe on certain date

def get_data_df(secid: float, date: str, label: str):
    """
    Fetch one optionm table for a security and/or a date via `db.raw_sql`.

    secid: security ID; converted with str() before being placed in the SQL.
    date:  date string whose first four characters are the year, e.g. "2023-01-03".
    label: opprcd, secprd, zerocd, securd, vsurfd, stdopd, distrprojd

    Returns the result of `db.raw_sql` for the generated query.
    Raises ValueError for any other label.
    """
    secid_sql = str(secid)
    table_year = date[:4]
    date_sql = "'" + date + "'"

    yearly_labels = ("opprcd", "secprd", "vsurfd", "stdopd", "distrprojd")
    if label in yearly_labels:
        # One table per year (<label><year>), filtered on security and date.
        query = f"SELECT * FROM optionm.{label}{table_year} WHERE secid = {secid_sql} AND date = {date_sql}"
    elif label == "zerocd":
        # Filtered on date only; the secid argument is not used.
        query = f"SELECT * FROM optionm.{label} WHERE date = {date_sql}"
    elif label == "securd":
        # Filtered on secid only; the date argument is not used.
        query = f"SELECT * FROM optionm.{label} WHERE secid = {secid_sql}"
    else:
        raise ValueError(f"Invalid label: {label}.")

    return db.raw_sql(query)


In [None]:
# Download every optionm table for `ticker_name` on `date` and store each
# result as a snappy-compressed parquet file under RAW_DATA_PATH.
raw_data_path = os.getenv("RAW_DATA_PATH")
if raw_data_path is None:
    # Stop before any WRDS query is issued; otherwise the first file write
    # fails with an opaque "NoneType + str" TypeError after the query has run.
    raise RuntimeError("Environment variable RAW_DATA_PATH is not set.")
if not secid_list:
    # pd.concat([]) and secid_list[0] below would fail with less helpful errors.
    raise ValueError(f"No secid found for ticker {ticker_name!r}.")

date_name = date.replace("-", "")  # "2023-01-03" -> "20230103"


def _fetch_per_secid(label):
    """Run the `label` query once for every secid and return the list of results."""
    return [get_data_df(secid=secid, date=date, label=label) for secid in secid_list]


def _save_parquet(frame, file_name):
    """Write `frame` to raw_data_path + file_name with pyarrow and snappy.

    The path is built by plain concatenation, so RAW_DATA_PATH must already
    end with a path separator.
    """
    frame.to_parquet(raw_data_path + file_name, engine="pyarrow", compression="snappy")


# Option price
option_price_dfs = _fetch_per_secid("opprcd")
_save_parquet(pd.concat(option_price_dfs, ignore_index=True),
              ticker_name + date_name + "_option_price.parquet")

# Security price
security_price_dfs = _fetch_per_secid("secprd")
_save_parquet(pd.concat(security_price_dfs, ignore_index=True),
              ticker_name + date_name + "_security_price.parquet")

# Volatility surface
vk_surface_dfs = _fetch_per_secid("vsurfd")
_save_parquet(pd.concat(vk_surface_dfs, ignore_index=True),
              ticker_name + date_name + "_volatility_surface.parquet")

# Standard Option Price (own variable, so the volatility-surface list is not overwritten)
std_option_dfs = _fetch_per_secid("stdopd")
_save_parquet(pd.concat(std_option_dfs, ignore_index=True),
              ticker_name + date_name + "_stdoption_price.parquet")

# Distribution Projection
distr_proj_dfs = _fetch_per_secid("distrprojd")
_save_parquet(pd.concat(distr_proj_dfs, ignore_index=True),
              ticker_name + date_name + "_distr_proj.parquet")

# Zero Curve: the zerocd query filters on date only, so a single call is
# enough and the file name carries no ticker.
zero_curve_df = get_data_df(secid=secid_list[0], date=date, label="zerocd")
_save_parquet(zero_curve_df, date_name + "_zero_curve.parquet")

# Securd: the securd query filters on secid only, so the file name carries no date.
securd_dfs = _fetch_per_secid("securd")
_save_parquet(pd.concat(securd_dfs, ignore_index=True),
              ticker_name + "_securd.parquet")


In [None]:
# Log out: close the WRDS connection held in `db` once the downloads are done.

db.close()

## Data Preview

In [1]:
# Parameters identifying which saved parquet files the preview cells load.
ticker_name = "SPY"  # ticker part of the file names
date = "2023-01-03"  # YYYY-MM-DD string; turned into the date part of the file names below

In [2]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from datetime import timedelta, datetime

# Load the .env file before reading RAW_DATA_PATH. Without this call the
# variable is only found when it was exported outside the notebook.
load_dotenv(find_dotenv())

raw_data_path = os.getenv("RAW_DATA_PATH")
# Compact form of the date ("2023-01-03" -> "20230103") used in the parquet file names.
date_name = date.translate(str.maketrans('', '', "-"))
# From here on `date` is a datetime rather than a string, so this cell cannot
# be re-run without first re-running the cell that assigns the date string.
date = datetime.strptime(date, '%Y-%m-%d')

In [3]:
# Option price: reload the parquet file that the download cell wrote from the
# `opprcd` query and show its first rows.
op_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_option_price.parquet")
op_df.head()

Unnamed: 0,secid,date,symbol,symbol_flag,exdate,last_date,cp_flag,strike_price,best_bid,best_offer,...,theta,optionid,cfadj,am_settlement,contract_size,ss_flag,forward_price,expiry_indicator,root,suffix
0,109820.0,2023-01-03,SPY 230120P410000,1,2023-01-20,2023-01-03,P,410000.0,28.8,29.06,...,,136011273.0,1.0,0.0,100.0,0,,,,
1,109820.0,2023-01-03,SPY 230120P411000,1,2023-01-20,2023-01-03,P,411000.0,29.8,30.06,...,,150272659.0,1.0,0.0,100.0,0,,,,
2,109820.0,2023-01-03,SPY 230120P412000,1,2023-01-20,2022-12-30,P,412000.0,30.8,31.06,...,,150272660.0,1.0,0.0,100.0,0,,,,
3,109820.0,2023-01-03,SPY 230120P413000,1,2023-01-20,2023-01-03,P,413000.0,31.8,32.06,...,,150272661.0,1.0,0.0,100.0,0,,,,
4,109820.0,2023-01-03,SPY 230120P414000,1,2023-01-20,2022-12-29,P,414000.0,32.8,33.06,...,,150272662.0,1.0,0.0,100.0,0,,,,


In [15]:
# List every column of the option-price frame; the head() preview elides the
# middle columns with "...".
list(op_df.columns)

['secid',
 'date',
 'symbol',
 'symbol_flag',
 'exdate',
 'last_date',
 'cp_flag',
 'strike_price',
 'best_bid',
 'best_offer',
 'volume',
 'open_interest',
 'impl_volatility',
 'delta',
 'gamma',
 'vega',
 'theta',
 'optionid',
 'cfadj',
 'am_settlement',
 'contract_size',
 'ss_flag',
 'forward_price',
 'expiry_indicator',
 'root',
 'suffix']

In [8]:
# Security price: reload the parquet file that the download cell wrote from
# the `secprd` query and show its first rows.
sp_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_security_price.parquet")
sp_df.head()

Unnamed: 0,secid,date,low,high,close,volume,return,cfadj,open,cfret,shrout
0,109820.0,2023-01-03,377.831,386.43,380.82,74850731.0,-0.00421,1.0,384.37,1.62005,932632.0


In [7]:
# Volatility surface: reload the parquet file that the download cell wrote
# from the `vsurfd` query and show its first rows.
vs_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_volatility_surface.parquet")
vs_df.head()

Unnamed: 0,secid,date,days,delta,impl_volatility,impl_strike,impl_premium,dispersion,cp_flag
0,109820.0,2023-01-03,10.0,-90.0,0.194341,396.5254,16.09188,0.016883,P
1,109820.0,2023-01-03,10.0,-85.0,0.20004,394.1617,14.08037,0.016961,P
2,109820.0,2023-01-03,10.0,-80.0,0.205494,392.1157,12.463,0.016847,P
3,109820.0,2023-01-03,10.0,-75.0,0.210444,390.2253,11.07216,0.016971,P
4,109820.0,2023-01-03,10.0,-70.0,0.214993,388.4226,9.840296,0.017357,P


In [9]:
# Standard Option Price: reload the parquet file that the download cell wrote
# from the `stdopd` query and show its first rows.
sop_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_stdoption_price.parquet")
sop_df.head()

Unnamed: 0,secid,date,days,forward_price,strike_price,premium,impl_volatility,delta,gamma,theta,vega,cp_flag
0,109820.0,2023-01-03,10.0,381.237881,381.237881,6.175776,0.24559,0.508104,0.025756,-120.143617,25.143701,C
1,109820.0,2023-01-03,10.0,381.237881,381.237881,5.805164,0.229798,-0.495411,0.027808,-98.697524,25.121911,P
2,109820.0,2023-01-03,30.0,382.10917,382.10917,9.701881,0.222775,0.512732,0.016389,-66.607316,43.537459,C
3,109820.0,2023-01-03,30.0,382.10917,382.10917,9.3854,0.213227,-0.494903,0.017553,-49.733154,43.419682,P
4,109820.0,2023-01-03,60.0,383.497985,383.497985,13.774894,0.223701,0.518081,0.011535,-49.679531,61.538878,C


In [10]:
# Distribution Projection: reload the parquet file that the download cell
# wrote from the `distrprojd` query and show its first rows.
dp_df = pd.read_parquet(raw_data_path+ticker_name+date_name+"_distr_proj.parquet")
dp_df.head()

Unnamed: 0,secid,date,exdate,amount
0,109820.0,2023-01-03,2023-03-17,0.004678
1,109820.0,2023-01-03,2023-06-16,0.004678
2,109820.0,2023-01-03,2023-09-15,0.004678
3,109820.0,2023-01-03,2023-12-15,0.004678
4,109820.0,2023-01-03,2024-03-15,0.004678


In [16]:
# Zero Curve: reload the parquet file written from the `zerocd` query. Its
# file name carries only the date (no ticker).
zc_df = pd.read_parquet(raw_data_path+date_name+"_zero_curve.parquet")
zc_df.head()

Unnamed: 0,date,days,rate
0,2023-01-03,10.0,4.002951
1,2023-01-03,30.0,4.111739
2,2023-01-03,60.0,4.26291
3,2023-01-03,91.0,4.404409
4,2023-01-03,122.0,4.531454


In [12]:
# Securd: reload the parquet file written from the `securd` query. Its file
# name carries only the ticker (no date).
sc_df = pd.read_parquet(raw_data_path+ticker_name+"_securd.parquet")
sc_df.head()

Unnamed: 0,secid,cusip,ticker,sic,index_flag,exchange_d,class,issue_type,industry_group
0,109820.0,78462F10,SPY,,0,1.0,,%,
