# 01 - Data Preparation

## Imports

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sys

## Load raw data

In [2]:
# Load the pickled frame stored at ../data/raw/df_crsp_raw.pkl.
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
df_crsp_raw = pd.read_pickle('../data/raw/df_crsp_raw.pkl')

In [3]:
# Delisting data; merged into the CRSP frame on ['permno', 'date'] further down.
df_crsp_delist = pd.read_pickle('../data/raw/df_crsp_delist.pkl')

In [4]:
# Descriptive data; reduced to one row per permno further down.
df_crsp_desc = pd.read_pickle('../data/raw/df_crsp_desc.pkl')

In [5]:
# FF data; its 'date' column is parsed and declared as the index further down.
df_ff_raw = pd.read_pickle('../data/raw/df_ff_raw.pkl')

## Format FF data

In [6]:
# parse the 'date' column into datetimes (year-first ordering)
parsed_ff_dates = pd.to_datetime(df_ff_raw['date'], yearfirst=True)
df_ff_raw['date'] = parsed_ff_dates

# use the parsed dates as the index
df_ff_raw = df_ff_raw.set_index('date')

In [7]:
# Persist the date-indexed FF frame to the interim data folder.
df_ff_raw.to_pickle(path='../data/interim/df_ff.pkl')

## Transform CRSP data to tidy data format & adjust returns for delisting

In [8]:
# Move the existing index of the raw frame into regular columns.
df_crsp_tidy = df_crsp_raw.reset_index()
# Remove the reference to the raw frame; only df_crsp_tidy is used below.
del df_crsp_raw

In [9]:
# edit data formats: both frames are merged on ['permno', 'date'] below,
# so the key columns get the same dtypes (datetime / int) in each of them
df_crsp_tidy['date'] = pd.to_datetime(df_crsp_tidy['date'], yearfirst=True)
# single-column assignment, consistent with the other permno casts in this notebook
df_crsp_tidy['permno'] = df_crsp_tidy['permno'].astype(int)

df_crsp_delist['permno'] = df_crsp_delist['permno'].astype(int)
df_crsp_delist['date'] = pd.to_datetime(df_crsp_delist['date'], yearfirst=True)

In [10]:
# merge: attach the delisting columns to every CRSP row with the same permno and date;
# how='left' keeps all CRSP rows, rows without a delisting match get NaN in the new columns
# NOTE(review): a duplicated (permno, date) key in df_crsp_delist would duplicate CRSP rows - confirm the keys are unique
df_crsp_tidy = df_crsp_tidy.merge(df_crsp_delist, how='left', on=['permno','date'])

In [11]:
# adjusted returns (for delisting): compound 'ret' with 'dlret', treating a
# missing component as 0, and keep NaN where both components are missing
has_any_return = df_crsp_tidy['ret'].notna() | df_crsp_tidy['dlret'].notna()
gross_ret = 1 + df_crsp_tidy['ret'].fillna(0)
gross_dlret = 1 + df_crsp_tidy['dlret'].fillna(0)
df_crsp_tidy['retadj'] = (gross_ret * gross_dlret - 1).where(has_any_return)

In [12]:
# declare (date, permno) as the index, drop the 'index' and 'dlret' columns, then sort
df_crsp_tidy = (
    df_crsp_tidy
    .set_index(['date', 'permno'])
    .drop(columns=['index', 'dlret'])
    .sort_index()
)

In [13]:
# Persist the tidy, (date, permno)-indexed CRSP frame to the interim data folder.
df_crsp_tidy.to_pickle(path='../data/interim/df_crsp_tidy.pkl')

## Format descriptive data

In [16]:
# integer permno, matching the key dtype used for the CRSP frames
df_crsp_desc['permno'] = df_crsp_desc['permno'].astype(int)

# one row per permno; GroupBy.last() takes the last non-null entry of each column.
# All columns are kept (a former drop of 'exchcd' is disabled).
grouped_desc = df_crsp_desc.groupby('permno')
df_aux = grouped_desc.last()

In [17]:
# Persist the per-permno descriptive frame to the interim data folder.
df_aux.to_pickle(path='../data/interim/df_aux.pkl')

## Filter largest assets per year

In [172]:
#df_crsp_tidy = pd.read_pickle('../data/interim/df_crsp_tidy.pkl')

In [18]:
# parameters
N_LARGEST = 100  # passed to select_assets as n_assets (size-rank cutoff)
ESTIMATION_YEARS = 1  # calendar years in the estimation window that ends in each sample year
ANALYSIS_YEARS = 1  # calendar years in the analysis window that follows each sample year

In [19]:
# select years
# calendar year of every observation, computed once and reused below
obs_years = df_crsp_tidy.index.get_level_values('date').year
sample_years = list(obs_years.unique())

# Drop the first year when it holds fewer than half as many observations as
# the second year. The length guard avoids an IndexError when the sample
# covers a single year only.
if len(sample_years) > 1:
    n_first_year = (obs_years == sample_years[0]).sum()
    n_second_year = (obs_years == sample_years[1]).sum()
    if n_first_year < n_second_year * 0.5:
        sample_years = sample_years[1:]

In [25]:
# select assets function
def select_assets(df_estimation, n_assets):
    """Return the permnos of the largest assets with a nearly complete return history.

    Parameters
    ----------
    df_estimation : pandas.DataFrame
        Frame indexed by a ('date', 'permno') MultiIndex that provides the
        columns 'retadj' and 'mcap'.
    n_assets : int
        Size-rank cutoff; assets ranked <= n_assets are returned.

    Returns
    -------
    list
        Selected permnos in ascending index order. Ties in size share
        pandas' default average rank, so the number of returned assets can
        differ slightly from n_assets. An empty frame yields an empty list.
    """
    if df_estimation.empty:
        return []

    # number of distinct dates in the estimation window
    n_dates = df_estimation.index.get_level_values('date').nunique()

    # an asset has a "full year" when 'retadj' is present on more than 99% of the dates
    n_returns = df_estimation['retadj'].groupby(level='permno').count()
    df_select = pd.DataFrame({'full_year': n_returns > n_dates * 0.99})

    # size = 'mcap' on the last date; a value missing on that date may be carried
    # forward from the directly preceding date only (limit=1).
    # .ffill() replaces the deprecated fillna(method='ffill').
    df_select['size'] = (
        df_estimation['mcap']
        .unstack()
        .sort_index()
        .ffill(limit=1)
        .iloc[-1]
    )

    # rank by size (1 = largest); assets without a full year are masked out and get no rank
    df_select['size_rank'] = (
        df_select['size']
        .where(df_select['full_year'])
        .rank(ascending=False)
    )

    selected_assets = list(df_select.index[df_select['size_rank'] <= n_assets])
    return selected_assets

In [24]:
# For every sample year: pick the largest assets in the estimation window and
# export estimation returns, analysis returns and descriptive data as CSV.

# calendar year of every observation; loop-invariant, so computed once
obs_year = df_crsp_tidy.index.get_level_values('date').year

for year in sample_years[ESTIMATION_YEARS-1:-ANALYSIS_YEARS]:
    # slice time dimension: the estimation window ends in `year`,
    # the analysis window covers the ANALYSIS_YEARS that follow it
    in_estimation = (obs_year > year - ESTIMATION_YEARS) & (obs_year <= year)
    in_analysis = (obs_year > year) & (obs_year <= year + ANALYSIS_YEARS)
    df_estimation = df_crsp_tidy[in_estimation]
    df_analysis = df_crsp_tidy[in_analysis]

    # slice assets (vectorised membership test instead of scanning the list once per row)
    selected_assets = select_assets(df_estimation, N_LARGEST)
    df_estimation = df_estimation[df_estimation.index.get_level_values('permno').isin(selected_assets)]
    df_analysis = df_analysis[df_analysis.index.get_level_values('permno').isin(selected_assets)]

    # output adjusted returns data (date x permno); missing estimation returns become 0,
    # missing analysis returns stay NaN
    df_estimation = df_estimation['retadj'].unstack().fillna(0)
    df_analysis = df_analysis['retadj'].unstack()
    # descriptive rows for the permnos present in the analysis window
    df_descriptive = df_aux.loc[df_analysis.columns]

    # save
    df_estimation.to_csv(f'../data/processed/yearly/df_estimation_{year}.csv')
    df_analysis.to_csv(f'../data/processed/yearly/df_analysis_{year}.csv')
    df_descriptive.to_csv(f'../data/processed/yearly/df_descriptive_{year}.csv')

    # NOTE: combined frame needed?

    # progress log
    print(year, dt.datetime.today())

2000 2020-05-21 15:12:33.876234
2001 2020-05-21 15:12:50.566604
2002 2020-05-21 15:13:07.179467
2003 2020-05-21 15:13:25.922381
2004 2020-05-21 15:13:44.766776
2005 2020-05-21 15:14:00.930631
2006 2020-05-21 15:14:17.061496
2007 2020-05-21 15:14:32.524949
2008 2020-05-21 15:14:50.611583
2009 2020-05-21 15:15:05.664330
2010 2020-05-21 15:15:20.724060
2011 2020-05-21 15:15:35.466806
2012 2020-05-21 15:15:53.660156
2013 2020-05-21 15:16:08.593250
2014 2020-05-21 15:16:23.636004
2015 2020-05-21 15:16:42.097759
2016 2020-05-21 15:16:58.044672
2017 2020-05-21 15:17:13.502338
2018 2020-05-21 15:17:28.389432
