# Fama-French Risk Factors SMB & HML
by Dr Liang Jin

Part of AcF701 Python Sessions: [github.com/drliangjin/mini-python-book](https://github.com/drliangjin/mini-python-book)

Based on the Python example on WRDS by Qingyi Song Drechsler: [Fama-French Factors (Python)](https://wrds-www.wharton.upenn.edu/pages/support/applications/risk-factors-and-industry-benchmarks/fama-french-factors-python/)

### Import external packages

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import wrds
import psycopg2 
import matplotlib.pyplot as plt
from dateutil.relativedelta import *
from pandas.tseries.offsets import *
from scipy import stats

  """)


### Connect to WRDS

In [None]:
# explicitly assign the username as a string and store it in a variable
my_wrds_username = "your_username" # <= change to your username

# WRDS Connection: `conn` is the handle the SQL queries below are run through
conn = wrds.Connection(wrds_username=my_wrds_username)

### Retrieving Compustat Data

In [None]:
# Postgres query against comp.funda.
# Pulls the columns used further down to build preferred stock
# (pstkrv, pstkl, pstk) and book equity (seq, txditc), plus gvkey/datadate
# as firm and date keys. Rows are restricted by the indfmt/datafmt/popsrc/consol
# flags and to datadate on or after 1 January 1959.
stmt = """
          SELECT gvkey, datadate, at, pstkl, txditc, pstkrv, seq, pstk
          FROM comp.funda
          WHERE indfmt='INDL' 
          AND datafmt='STD'
          AND popsrc='D'
          AND consol='C'
          AND datadate >= '01/01/1959'
       """
# run the query through the WRDS connection and keep the result in `comp`
comp = conn.raw_sql(stmt)

In [None]:
# basic info on the data (info() prints its summary itself)
comp.info()

# a closer look at the data; printed explicitly because a notebook cell only
# displays the value of its last expression, so a bare describe() in the
# middle of the cell would be computed and then thrown away
print(comp.describe())

# first few rows (left as the last expression so the notebook renders it)
comp.head()

### Work on Compustat Data

In [None]:
# parse datadate once into pandas datetimes, the format pandas and other
# packages recognise
parsed_dates = pd.to_datetime(comp['datadate'])
comp['datadate'] = parsed_dates

# calendar year of each datadate, stored as its own column
comp['year'] = parsed_dates.dt.year

In [None]:
# check data type: `dtype` is an attribute, not a method, so it must not be
# called. Printed explicitly because it is not the last expression in the cell.
print(comp['datadate'].dtype, comp['year'].dtype)

# preview the frame with the converted datadate and the new year column
comp.head()

### Deal with preferred stock

In [None]:
# preferred stock (ps), taken from the first non-missing value in this order:
#   1. pstkrv
#   2. pstkl
#   3. pstk
#   4. 0 when all three are missing
comp['ps'] = (
    comp['pstkrv']
    .fillna(comp['pstkl'])
    .fillna(comp['pstk'])
    .fillna(0)
)

In [None]:
# again, check the preferred stock column (ps) we just created
comp['ps'].describe()

### Book Value of Equity

In [None]:
# missing txditc is treated as zero
comp['txditc'] = comp['txditc'].fillna(0)

# book value of equity: seq + txditc - ps
book_equity = comp['seq'] + comp['txditc'] - comp['ps']

# keep only strictly positive book equity; zero, negative and missing
# values all become NaN
comp['be'] = book_equity.where(book_equity > 0)

In [None]:
# check book value of equity (summary statistics of the be column)
comp['be'].describe()

### Housekeeping

In [None]:
# One pass over the frame:
#   - order rows by firm id (gvkey) and then by date
#   - count = running observation number within each gvkey, 0 .. group size - 1
#   - keep only the columns needed from here on
comp = (
    comp
    .sort_values(by=['gvkey', 'datadate'])
    .assign(count=lambda df: df.groupby(['gvkey']).cumcount())
    [['gvkey', 'datadate', 'year', 'be', 'count']]
)

In [None]:
# summary statistics of the trimmed frame; printed explicitly because a
# notebook cell only displays the value of its last expression
print(comp.describe())

# first few rows (last expression, rendered by the notebook)
comp.head()

### Retrieving CRSP stock data

In [None]:
# wrap things in a function so that we can specify sample period we want
# (can you think of other arguments we can put here?)
def crspmerge(begdt="01/01/1959", enddt="12/31/2017", db=None):
    """Download CRSP monthly stock data merged with the names file.

    Joins crsp.msf to crsp.msenames on permno, matching each monthly record
    to the name record whose namedt..nameendt window contains its date, and
    keeps rows dated between begdt and enddt whose exchcd is between 1 and 3.

    Args:
        begdt: Start of the sample period; anything ``pd.to_datetime`` can
            parse (e.g. "01/01/1959", "1959-01-01", a date object).
        enddt: End of the sample period (inclusive); same formats as begdt.
        db: Object exposing ``raw_sql(sql)`` to run the query on. When
            omitted, the module-level ``conn`` is used if it exists,
            otherwise a new ``wrds.Connection()`` is opened.

    Returns:
        The result of ``db.raw_sql`` for the query.

    Raises:
        Whatever ``pd.to_datetime`` raises when a date cannot be parsed.
    """
    # Normalise both bounds to ISO yyyy-mm-dd text. This validates the input
    # and lets the dates be embedded as *quoted* SQL literals; unquoted,
    # 01/01/1959 would be read by Postgres as integer division.
    start = pd.to_datetime(begdt).strftime("%Y-%m-%d")
    end = pd.to_datetime(enddt).strftime("%Y-%m-%d")

    stmt = """
              SELECT a.permno, a.permco, a.date, a.ret, a.retx, a.shrout, a.prc,
                     b.shrcd, b.exchcd
              FROM crsp.msf AS a
              LEFT JOIN crsp.msenames AS b
              ON a.permno=b.permno
              AND b.namedt<=a.date
              AND a.date<=b.nameendt
              WHERE a.date BETWEEN '{}' AND '{}'
              AND b.exchcd BETWEEN 1 AND 3
           """.format(start, end)

    # connection checker: look the global up explicitly instead of assigning
    # to `conn` here, which would make it a local name and raise
    # UnboundLocalError before the check could even run
    if db is None:
        db = globals().get("conn")
    if db is None:
        db = wrds.Connection()

    return db.raw_sql(stmt)

# download the CRSP monthly data using crspmerge's default begdt/enddt
crsp_m = crspmerge() # default values

### Let's try to check the data

### Data cleaning

In [None]:
# identifier and code columns that should be stored as integers
id_cols = ['permco', 'permno', 'shrcd', 'exchcd']
crsp_m[id_cols] = crsp_m.loc[:, id_cols].astype(int)

# personally, I like to set all the identifications to be strings...because of the annoying trailing zeros

In [None]:
# parse the date column into pandas datetimes
crsp_m['date'] = pd.to_datetime(crsp_m['date'])

# MonthEnd is an offset from pandas.tseries.offsets:
#   MonthEnd(0)  -> end of the current month
#   MonthEnd(-1) -> move backwards by 1 month (last month end)
#   MonthEnd(1)  -> next month end
# jdate is the date pushed to its month end, for easier merging later
to_month_end = MonthEnd(0)
crsp_m['jdate'] = crsp_m['date'] + to_month_end