In [2]:
import requests
import zipfile
import io

In [2]:
url = "https://dachxiu.chicagobooth.edu/download/datashare.zip"

# Download the zip file
response = requests.get(url)
response.raise_for_status()   # ensure download success

# Extract the CSV from the ZIP
with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    # List files in the zip (optional)
    print("Files in ZIP:", z.namelist())

    # Extract the CSV
    z.extract("datashare.csv", path=".")
    print("Saved datashare.csv!")

Files in ZIP: ['datashare.csv', '__MACOSX/._datashare.csv', 'readme.txt', '__MACOSX/._readme.txt']
Saved datashare.csv!


In [3]:
import pandas as pd

In [4]:
df_head = pd.read_csv("datashare.csv", nrows=5)
print(df_head)

   permno      DATE      mvel1      beta    betasq     chmom    dolvol  \
0   10006  19570131  82249.000  1.122846  1.260784  0.047180  9.569953   
1   10014  19570131   3903.375  0.426734  0.182102 -0.275641  6.237836   
2   10022  19570131   9273.250  1.066449  1.137313 -0.025490  7.008844   
3   10030  19570131  54465.875  0.926038  0.857547  0.018171  9.825337   
4   10057  19570131  40250.000  1.247748  1.556875  0.025785  7.901007   

    idiovol    indmom     mom1m  ...  stdcf  ms  baspread           ill  \
0  0.025742  0.046433  0.044843  ...    NaN NaN  0.013234  9.411565e-08   
1  0.072103  0.046433 -0.086957  ...    NaN NaN  0.033305  6.610609e-06   
2  0.027648  0.046433 -0.060377  ...    NaN NaN  0.016023  2.286832e-06   
3  0.021700  0.046433  0.044633  ...    NaN NaN  0.015295  1.464273e-07   
4  0.025506  0.046433  0.086667  ...    NaN NaN  0.005954  1.380375e-06   

     maxret    retvol  std_dolvol  std_turn     zerotrade  sic2  
0  0.015453  0.008058    0.355638  0.4

In [3]:
df_head.columns

Index(['permno', 'DATE', 'mvel1', 'beta', 'betasq', 'chmom', 'dolvol',
       'idiovol', 'indmom', 'mom1m', 'mom6m', 'mom12m', 'mom36m', 'pricedelay',
       'turn', 'absacc', 'acc', 'age', 'agr', 'bm', 'bm_ia', 'cashdebt',
       'cashpr', 'cfp', 'cfp_ia', 'chatoia', 'chcsho', 'chempia', 'chinv',
       'chpmia', 'convind', 'currat', 'depr', 'divi', 'divo', 'dy', 'egr',
       'ep', 'gma', 'grcapx', 'grltnoa', 'herf', 'hire', 'invest', 'lev',
       'lgr', 'mve_ia', 'operprof', 'orgcap', 'pchcapx_ia', 'pchcurrat',
       'pchdepr', 'pchgm_pchsale', 'pchquick', 'pchsale_pchinvt',
       'pchsale_pchrect', 'pchsale_pchxsga', 'pchsaleinv', 'pctacc', 'ps',
       'quick', 'rd', 'rd_mve', 'rd_sale', 'realestate', 'roic', 'salecash',
       'saleinv', 'salerec', 'secured', 'securedind', 'sgr', 'sin', 'sp',
       'tang', 'tb', 'aeavol', 'cash', 'chtx', 'cinvest', 'ear', 'nincr',
       'roaq', 'roavol', 'roeq', 'rsup', 'stdacc', 'stdcf', 'ms', 'baspread',
       'ill', 'maxret', 'retvol', '

In [5]:
df = pd.read_csv("datashare.csv")

# 1. Convert DATE (int like 19570131) → datetime
df['date'] = pd.to_datetime(df['DATE'].astype(str), format="%Y%m%d")

# 2. Drop or keep the original column (optional)
df.drop(columns=['DATE'], inplace=True)

# 3. Add end-of-month column
df['eom'] = df['date'] + pd.offsets.MonthEnd(0)

df.head()

Unnamed: 0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,baspread,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,date,eom
0,10006,82249.0,1.122846,1.260784,0.04718,9.569953,0.025742,0.046433,0.044843,-0.059517,...,0.013234,9.411565e-08,0.015453,0.008058,0.355638,0.46042,1.120996e-07,37.0,1957-01-31,1957-01-31
1,10014,3903.375,0.426734,0.182102,-0.275641,6.237836,0.072103,0.046433,-0.086957,-0.115385,...,0.033305,6.610609e-06,0.047619,0.033495,1.152126,1.16961,9.229146e-08,,1957-01-31,1957-01-31
2,10022,9273.25,1.066449,1.137313,-0.02549,7.008844,0.027648,0.046433,-0.060377,-0.03955,...,0.016023,2.286832e-06,0.020833,0.015589,0.815777,0.679803,1.181757e-07,,1957-01-31,1957-01-31
3,10030,54465.875,0.926038,0.857547,0.018171,9.825337,0.0217,0.046433,0.044633,0.05047,...,0.015295,1.464273e-07,0.039326,0.015849,0.739302,1.333656,6.126699e-08,,1957-01-31,1957-01-31
4,10057,40250.0,1.247748,1.556875,0.025785,7.901007,0.025506,0.046433,0.086667,0.055247,...,0.005954,1.380375e-06,0.056856,0.019945,0.75551,0.410391,3.31579,,1957-01-31,1957-01-31


In [6]:
chars = df.copy()

In [7]:
import wrds

In [8]:
def create_wrds_client(username=None, password=None):
    """
    Create WRDS client connection
    
    Parameters:
    -----------
    username : str, optional
        WRDS username. If not provided, will prompt.
    password : str, optional
        WRDS password. If not provided, will prompt.
    
    Returns:
    --------
    wrds.Connection object
    
    Example:
    --------
    # Method 1: Let it prompt for credentials
    client = create_wrds_client()
    
    # Method 2: Provide username, prompt for password
    client = create_wrds_client(username='your_wrds_username')
    
    # Method 3: Provide both (not recommended for security)
    client = create_wrds_client(username='your_username', password='your_password')
    """    
    try:
        print("Connecting to WRDS...")
        
        if username and password:
            db = wrds.Connection(wrds_username=username, wrds_password=password)
        elif username:
            db = wrds.Connection(wrds_username=username)
        else:
            # Will prompt for both username and password
            db = wrds.Connection()
            
        print("Successfully connected to WRDS")
        
        return db
        
    except Exception as e:
        print(f"Failed to connect to WRDS: {e}")
        return None

In [9]:
client = create_wrds_client()

Connecting to WRDS...
Enter your WRDS username [datalore]: daisytian
Enter your password: ··············
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?:  y
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
Successfully connected to WRDS








In [10]:
ff = client.raw_sql("""
    SELECT date, rf
    FROM ff.factors_monthly
    where date >= '1957-01-01'
""")

In [11]:
ff['date'] = pd.to_datetime(ff['date'], errors='coerce')
ff['date'] = ff['date'] + pd.offsets.MonthEnd(0)
ff.head()

Unnamed: 0,date,rf
0,1957-01-31,0.0027
1,1957-02-28,0.0024
2,1957-03-31,0.0023
3,1957-04-30,0.0025
4,1957-05-31,0.0026


In [12]:
crsp = client.raw_sql("""
    SELECT permno, date, ret
    FROM crsp.msf
    where date >= '1957-01-01'
""")
crsp.head()

Unnamed: 0,permno,date,ret
0,10000,1985-12-31,
1,10000,1986-01-31,
2,10000,1986-02-28,-0.257143
3,10000,1986-03-31,0.365385
4,10000,1986-04-30,-0.098592


In [13]:
crsp['date'] = pd.to_datetime(crsp['date'])
crsp = crsp.merge(ff, on="date", how="left")

crsp['ret'] = pd.to_numeric(crsp['ret'], errors='coerce')
crsp['rf'] = pd.to_numeric(crsp['rf'], errors='coerce') / 100  # FF provided in %

# Target variable
crsp['ret_excess'] = crsp['ret'] - crsp['rf']

crsp.head()

Unnamed: 0,permno,date,ret,rf,ret_excess
0,10000,1985-12-31,,6.5e-05,
1,10000,1986-01-31,,5.6e-05,
2,10000,1986-02-28,-0.257143,5.3e-05,-0.257196
3,10000,1986-03-31,0.365385,6e-05,0.365325
4,10000,1986-04-30,-0.098592,5.2e-05,-0.098644


In [14]:
df = chars.merge(crsp[['permno', 'date', 'ret_excess']], on=['permno', 'date'], how='inner')

df.head()

Unnamed: 0,permno,mvel1,beta,betasq,chmom,dolvol,idiovol,indmom,mom1m,mom6m,...,ill,maxret,retvol,std_dolvol,std_turn,zerotrade,sic2,date,eom,ret_excess
0,10006,82249.0,1.122846,1.260784,0.04718,9.569953,0.025742,0.046433,0.044843,-0.059517,...,9.411565e-08,0.015453,0.008058,0.355638,0.46042,1.120996e-07,37.0,1957-01-31,1957-01-31,0.064351
1,10014,3903.375,0.426734,0.182102,-0.275641,6.237836,0.072103,0.046433,-0.086957,-0.115385,...,6.610609e-06,0.047619,0.033495,1.152126,1.16961,9.229146e-08,,1957-01-31,1957-01-31,0.095211
2,10022,9273.25,1.066449,1.137313,-0.02549,7.008844,0.027648,0.046433,-0.060377,-0.03955,...,2.286832e-06,0.020833,0.015589,0.815777,0.679803,1.181757e-07,,1957-01-31,1957-01-31,0.102014
3,10030,54465.875,0.926038,0.857547,0.018171,9.825337,0.0217,0.046433,0.044633,0.05047,...,1.464273e-07,0.039326,0.015849,0.739302,1.333656,6.126699e-08,,1957-01-31,1957-01-31,-0.047118
4,10057,40250.0,1.247748,1.556875,0.025785,7.901007,0.025506,0.046433,0.086667,0.055247,...,1.380375e-06,0.056856,0.019945,0.75551,0.410391,3.31579,,1957-01-31,1957-01-31,-0.090089


In [15]:
df.to_csv("merged.csv", index=False)