# 01 - Data Preparation

## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sys

## Load raw data

In [2]:
# Raw CRSP panel; later cells read its 'date', 'permno' and 'ret' columns.
# NOTE: pd.read_pickle unpickles arbitrary objects - only load trusted files.
df_crsp_raw = pd.read_pickle('../data/raw/df_crsp_raw.pkl')

In [3]:
# Delisting table; merged onto the CRSP panel on ('permno', 'date') further below.
df_crsp_delist = pd.read_pickle('../data/raw/df_crsp_delist.pkl')

In [4]:
# Descriptive table with a 'permno' column; reduced to one row per permno further below.
df_crsp_desc = pd.read_pickle('../data/raw/df_crsp_desc.pkl')

In [5]:
# FF data with a 'date' column, which becomes the index in the formatting step.
df_ff_raw = pd.read_pickle('../data/raw/df_ff_raw.pkl')

## Format FF data

In [6]:
# parse the 'date' column (year-first) and make it the index
df_ff_raw = df_ff_raw.assign(
    date=pd.to_datetime(df_ff_raw['date'], yearfirst=True)
).set_index('date')

In [7]:
# persist the date-indexed FF data for the later notebooks
df_ff_raw.to_pickle('../data/interim/df_ff.pkl')

## Transform CRSP data to tidy data format & adjust returns for delisting

In [8]:
# move the existing index into ordinary columns, then release the raw frame
df_crsp_tidy = df_crsp_raw.reset_index(drop=False)
del df_crsp_raw

In [9]:
# give both frames identical key dtypes (datetime 'date', integer 'permno')
df_crsp_delist['date'] = pd.to_datetime(df_crsp_delist['date'], yearfirst=True)
df_crsp_delist['permno'] = df_crsp_delist['permno'].astype(int)

df_crsp_tidy['date'] = pd.to_datetime(df_crsp_tidy['date'], yearfirst=True)
df_crsp_tidy['permno'] = df_crsp_tidy['permno'].astype(int)

In [10]:
# left-join the delisting table so every CRSP row is kept
df_crsp_tidy = pd.merge(
    df_crsp_tidy,
    df_crsp_delist,
    how='left',
    on=['permno', 'date'],
)

In [11]:
# delisting-adjusted return: compound 'ret' with 'dlret', treating a missing
# component as 0; the result stays missing only where both are missing
df_crsp_tidy['retadj'] = (
    df_crsp_tidy['ret'].fillna(0).add(1)
    .mul(df_crsp_tidy['dlret'].fillna(0).add(1))
    .sub(1)
)
df_crsp_tidy['retadj'] = df_crsp_tidy['retadj'].mask(
    df_crsp_tidy['ret'].isna() & df_crsp_tidy['dlret'].isna()
)

In [12]:
# index by (date, permno), drop the helper columns and sort the index
df_crsp_tidy = (
    df_crsp_tidy
    .set_index(['date', 'permno'])
    .drop(columns=['index', 'dlret'])
    .sort_index()
)

In [13]:
# persist the tidy, delisting-adjusted CRSP panel
df_crsp_tidy.to_pickle('../data/interim/df_crsp_tidy.pkl')

## Format descriptive data

In [14]:
# integer permno keys, then keep the last row per permno (all columns,
# including 'exchcd', are retained)
df_crsp_desc = df_crsp_desc.astype({'permno': int})
df_aux = df_crsp_desc.groupby(by='permno').last()

In [15]:
# persist the per-permno descriptive table
df_aux.to_pickle('../data/interim/df_aux.pkl')

## Filter biggest Assets per Year

In [16]:
# TEMPORARY CELL
# Reloads the two interim pickles written by the cells above - presumably so
# the notebook can be resumed from here without re-running the preparation.
df_crsp_tidy = pd.read_pickle('../data/interim/df_crsp_tidy.pkl')
df_aux = pd.read_pickle('../data/interim/df_aux.pkl')

In [17]:
# parameters
# number of assets kept per year (passed to select_assets as n_assets)
N_LARGEST = 100
# length, in calendar years, of the estimation window that ends in `year`
ESTIMATION_YEARS = 1
# length, in calendar years, of the analysis window that follows `year`
ANALYSIS_YEARS = 1

In [18]:
# select years
# calendar year of every observation, computed once
obs_years = df_crsp_tidy.index.get_level_values('date').year
sample_years = list(obs_years.unique())

# Drop the first year when it holds fewer than half as many observations as
# the second one. The length guard avoids an IndexError when the sample
# covers a single calendar year.
if len(sample_years) > 1:
    n_first = (obs_years == sample_years[0]).sum()
    n_second = (obs_years == sample_years[1]).sum()
    if n_first < n_second*0.5:
        sample_years = sample_years[1:]

In [19]:
# select assets function
def select_assets(df_estimation, n_assets):
    """Return the permnos of the largest assets with a complete return history.

    Parameters
    ----------
    df_estimation : pandas.DataFrame
        Observations indexed by a ('date', 'permno') MultiIndex that provide
        at least the columns 'retadj' and 'mcap'.
    n_assets : int
        Largest size rank that is still selected.

    Returns
    -------
    list
        The permno values whose size rank is <= n_assets; empty when
        df_estimation has no rows.
    """
    if df_estimation.empty:
        return []

    # number of distinct dates in the window (no need to pivot the whole
    # return panel just to count its rows)
    year_obs = df_estimation.index.get_level_values('date').nunique()

    df_select = pd.DataFrame()
    # an asset qualifies only if it has an adjusted return on more than 99%
    # of the dates in the window
    df_select['full_year'] = (
        df_estimation['retadj'].groupby('permno').count() > year_obs*0.99
    )
    # size is the market cap on the last date; a missing value there may be
    # replaced by the value of the preceding date only (limit=1)
    df_select['size'] = (
        df_estimation['mcap']
        .unstack()
        .sort_index()
        .ffill(limit=1)
        .iloc[-1]
    )
    # rank 1 is the largest asset; non-qualifying assets receive no rank
    df_select['size_rank'] = (
        df_select['size']
        .where(df_select['full_year'])
        .rank(ascending=False)
    )

    return list(df_select.index[df_select['size_rank'] <= n_assets])

In [52]:
# Per-year outputs are gathered in lists and concatenated once after the
# loop: Series.append was removed in pandas 2.0 and copied the accumulated
# data on every call.
estimation_parts = []
analysis_parts = []
df_indices = pd.DataFrame()

# loop invariant: calendar year of every observation
obs_years = df_crsp_tidy.index.get_level_values('date').year

# NOTE: this slice visits a single year only; the alternative slice that was
# left commented out here is sample_years[ESTIMATION_YEARS-1:-ANALYSIS_YEARS].
for year in sample_years[ESTIMATION_YEARS-1:ESTIMATION_YEARS]:
    # slice time dimension: the estimation window ends in `year`, the
    # analysis window covers the ANALYSIS_YEARS that follow it
    df_estimation = df_crsp_tidy[(obs_years > year-ESTIMATION_YEARS) & (obs_years <= year)]
    df_analysis = df_crsp_tidy[(obs_years > year) & (obs_years <= year+ANALYSIS_YEARS)]

    # slice assets (vectorised membership test instead of a per-row list scan)
    selected_assets = select_assets(df_estimation, N_LARGEST)
    df_estimation = df_estimation[df_estimation.index.get_level_values('permno').isin(selected_assets)]
    df_analysis = df_analysis[df_analysis.index.get_level_values('permno').isin(selected_assets)]

    # output adjusted returns data (one row per date, one column per permno);
    # gaps are zero-filled in the estimation window only
    df_estimation = df_estimation['retadj'].unstack().fillna(0)
    df_analysis = df_analysis['retadj'].unstack()
    df_descriptive = df_aux.loc[selected_assets]

    # save
    df_estimation.to_csv('../data/processed/yearly/df_estimation_'+str(year)+'.csv')
    df_analysis.to_csv('../data/processed/yearly/df_analysis_'+str(year)+'.csv')
    df_descriptive.to_csv('../data/processed/yearly/df_descriptive_'+str(year)+'.csv')

    # collect full timeline
    estimation_parts.append(df_estimation.stack())
    analysis_parts.append(df_analysis.stack())
    df_indices[year] = selected_assets

    print(year, dt.datetime.today())

# fall back to an empty float Series when the loop did not run at all
if estimation_parts:
    df_estimation_tidy = pd.concat(estimation_parts)
else:
    df_estimation_tidy = pd.Series(dtype='float', index=pd.MultiIndex.from_arrays([[],[]]))
if analysis_parts:
    df_analysis_tidy = pd.concat(analysis_parts)
else:
    df_analysis_tidy = pd.Series(dtype='float', index=pd.MultiIndex.from_arrays([[],[]]))

2000 2020-05-25 15:23:04.772932


In [26]:
# rebuild the unfiltered estimation / analysis windows for the `year` left
# over from the loop above (all assets, all columns)
df_estimation = df_crsp_tidy.loc[
    (df_crsp_tidy.index.get_level_values('date').year <= year)
    & (df_crsp_tidy.index.get_level_values('date').year > year-ESTIMATION_YEARS)
]
df_analysis = df_crsp_tidy.loc[
    (df_crsp_tidy.index.get_level_values('date').year <= year+ANALYSIS_YEARS)
    & (df_crsp_tidy.index.get_level_values('date').year > year)
]

In [35]:
# for every selected permno: does it occur in the estimation window?
in_est = [permno in df_estimation.index.get_level_values('permno') for permno in selected_assets]

In [36]:
# for every selected permno: does it occur in the analysis window?
in_ana = [permno in df_analysis.index.get_level_values('permno') for permno in selected_assets]

In [44]:
# True where a permno is present in the estimation window but absent from
# the analysis window
diff = [present_est and not present_ana for present_est, present_ana in zip(in_est, in_ana)]

In [48]:
# display the selected permnos flagged in `diff` (the row label is the
# position within selected_assets)
pd.DataFrame(selected_assets).loc[diff]

Unnamed: 0,0
59,48071


In [51]:
# look up the descriptive record of the permno flagged above
df_aux.loc[48071]

comnam      MORGAN J P & CO INC
ticker                      JPM
st_date              1969-04-30
end_date             2000-12-29
exchcd                        1
Name: 48071, dtype: object

In [None]:
# persist the stacked estimation / analysis returns and the per-year
# selection table
df_estimation_tidy.to_pickle('../data/processed/df_estimation.pkl')
df_analysis_tidy.to_pickle('../data/processed/df_analysis.pkl')
df_indices.to_pickle('../data/processed/df_indices.pkl')