In [5]:
import pickle
import pandas as pd
import numpy as np


In [17]:
# Input/working directories, relative to the notebook's working directory.
data_dir = './data/tnic3_data/'  # not referenced by any other cell visible in this notebook
tmp_data_path = './data/tmp'  # intermediate pickles are read from / written to this folder


# Inclusive sample window; used for the gvkey-year panel and for pickle file names.
s_year = 1997
e_year = 2019 ### keep in mind: the panel ends in 2019, while the financial-variable pickle loaded later is named with 2020

# The k in the "top k" peer file name (tnic_top_{top_k}.pickle).
top_k = 10


# helper funcs

In [50]:
def gvkey_year_table(gvkey_lst_topk, start_year=None, end_year=None):
    '''
    Build a balanced gvkey-year panel: one row per (gvkey, year) pair.

    gvkey_lst_topk: list of gvkeys; each one is repeated once per year
    start_year:     first year of the panel (inclusive); defaults to the
                    notebook-level `s_year`
    end_year:       last year of the panel (inclusive); defaults to the
                    notebook-level `e_year`

    returns: DataFrame with columns ['tnic_topk_gvkey', 'year'], laid out
             gvkey by gvkey with the years ascending inside each gvkey
    '''
    # Fall back to the notebook configuration only when no explicit range is given.
    if start_year is None:
        start_year = s_year
    if end_year is None:
        end_year = e_year

    # Derive the number of years from the range itself rather than reading the
    # global `n_years`, which is only assigned in a later cell and therefore
    # breaks a fresh top-to-bottom run.
    years = np.arange(start_year, end_year + 1)
    gvkeys = pd.Series(gvkey_lst_topk, name='tnic_topk_gvkey').to_numpy()

    table = pd.DataFrame({
        # each gvkey repeated len(years) times in a row ...
        'tnic_topk_gvkey': np.repeat(gvkeys, len(years)),
        # ... paired with the full year sequence, cycled once per gvkey
        'year': np.tile(years, len(gvkeys)),
    })

    assert table.shape[0] == len(gvkeys) * len(years)
    return table
    


In [None]:
def get_lags(sub_pd):
    '''
    Add one-row lags of year, sale and at.

    sub_pd: DataFrame containing at least the columns gvkey, year, sale, at

    returns: a new DataFrame with the columns
             gvkey, year, sale, at, lag_year, lag_sale, lag_at

    The shift is by one row in the frame's existing row order; no sorting or
    grouping is done here, so the first row always gets missing lags.
    '''
    # Take an explicit copy: adding columns to a bare column-subset slice is
    # chained assignment and triggers pandas' SettingWithCopyWarning.
    lagged = sub_pd[['gvkey', 'year', 'sale', 'at']].copy()
    lagged[['lag_year', 'lag_sale', 'lag_at']] = lagged[['year', 'sale', 'at']].shift(1)
    return lagged

In [18]:
def get_unique(lst1, lst2):
    '''
    Return the distinct elements of lst1 and lst2 combined, as a list.

    lst1, lst2: iterables of hashable values

    returns: list of unique elements in first-seen order (lst1 first, then lst2)

    Also prints the number of unique elements.
    '''
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible across runs; list(set(...)) order is arbitrary and can
    # change between kernel sessions for string keys.
    uniq = list(dict.fromkeys([*lst1, *lst2]))
    print('the length of unique elements is:', len(uniq))
    return uniq

In [43]:
def create_var(df):
    '''
    Build firm-year financial ratios and growth rates.

    df:  financial var, must contain:
        - gvkey
        - datadate (datetime-like; the year is taken via `.dt.year`)
        - at, sale, prcc_f, csho, ceq, txdb, dlc, dltt, ib, ppent, ch, act, lct

    The input frame is left unmodified.

    returns: DataFrame keeping the last-dated row of each (gvkey, year), with columns
        gvkey, year, at, sale, m2b, lev, roa, ppe, cash2asset, cash2sale,
        sale2asset, cr, d_sale, d_at
    '''
    #### pre
    # create year and sort; assign() returns a new frame, so the caller's
    # DataFrame does not silently gain a 'year' column
    pd_afr = df.assign(year=df['datadate'].dt.year)
    pd_afr = pd_afr.sort_values(['gvkey', 'year', 'datadate'], ascending=True)

    # each firm-year observation should only be observed once:
    # keep the row with the latest datadate within every (gvkey, year)
    pd_afr = pd_afr.groupby(['gvkey', 'year'], sort=False).tail(1)

    #### create
    # keep at and sale
    ratio_pd = pd_afr[['gvkey', 'year', 'at', 'sale']].copy()

    # market to book ratio
    ratio_pd['m2b'] = (pd_afr['at']+pd_afr['prcc_f']*pd_afr['csho']-pd_afr['ceq']-pd_afr['txdb'])/(pd_afr['at'])

    # leverage
    ratio_pd['lev'] = (pd_afr['dlc']+pd_afr['dltt'])/(pd_afr['at'])

    # return on asset
    ratio_pd['roa'] = pd_afr['ib']/(pd_afr['at'])

    # various ratios
    ratio_pd['ppe'] = pd_afr['ppent']/(pd_afr['at'])
    ratio_pd['cash2asset'] = pd_afr['ch']/(pd_afr['at'])
    ratio_pd['cash2sale'] = pd_afr['ch']/(pd_afr['sale'])
    ratio_pd['sale2asset'] = pd_afr['sale']/(pd_afr['at'])

    # current ratio
    ratio_pd['cr'] = pd_afr['act']/(pd_afr['lct'])

    # sale / asset growth relative to the previous row of the same gvkey.
    # NOTE(review): the lag is the previous available observation, which is not
    # necessarily the immediately preceding calendar year if a firm has gaps;
    # lag_year is computed but not used to check this.
    growth_pd = pd_afr[['gvkey', 'year', 'sale', 'at']].copy()
    growth_pd[['lag_year', 'lag_sale', 'lag_at']] = growth_pd.groupby('gvkey', sort=False)[['year', 'sale', 'at']].shift(1)
    growth_pd['d_sale'] = (growth_pd['sale'] - growth_pd['lag_sale'])/growth_pd['lag_sale']
    growth_pd['d_at'] = (growth_pd['at'] - growth_pd['lag_at'])/growth_pd['lag_at']

    # spell out the join keys instead of relying on "all shared column names"
    ratio_pd = ratio_pd.merge(
        growth_pd[['gvkey', 'year', 'd_sale', 'd_at']],
        on=['gvkey', 'year'],
    )

    print('check df created ok: \n', ratio_pd.head(1))

    print('\n variable lists of ratio pd: ', ratio_pd.columns)

    return ratio_pd
    
    
    


In [19]:
# Load the top-k peer table from the tmp folder (presumably written by an earlier notebook — TODO confirm).
# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.
tnic_top_k = pd.read_pickle(f'{tmp_data_path}/tnic_top_{top_k}.pickle')

In [20]:
# Pool the 'gvkey1' and 'gvkey2' columns into one de-duplicated list of gvkeys.
gvkey_lst_topk = get_unique(list(tnic_top_k['gvkey1']), list(tnic_top_k['gvkey2']))

the length of unique elements is: 13681


# verify that those GVKEYs can retrieve financial variables from Compustat

In [21]:
# Number of years in the inclusive window [s_year, e_year].
n_years = e_year - s_year + 1

In [51]:
# Expand the pooled gvkeys into a gvkey-year table with columns ['tnic_topk_gvkey', 'year'].
gvkey_table = gvkey_year_table(gvkey_lst_topk)

In [64]:

# Load the table that is merged on ['gvkey', 'year'] below (presumably firm-year financial variables — TODO confirm).
# NOTE(review): the end year in this file name is hard-coded to 2020 while e_year is 2019; confirm this is the intended input.
ratios_nona = pd.read_pickle(f'{tmp_data_path}/fv_nona_{s_year}_2020.pickle')


## gvkey_table merge with fin_var

### change type to match

In [65]:
## verify the type is the same before merge
# Spot check only: compares the Python type of the single value at index label 1
# in each frame, not the dtype of the whole key columns.

type(gvkey_table.loc[1,'tnic_topk_gvkey']) == type(ratios_nona.loc[1, 'gvkey'])

True

In [69]:
# Inner join: keep only the gvkey-year rows of the panel that also appear in ratios_nona.
merged = gvkey_table.merge(
    ratios_nona,
    how='inner',
    left_on=list(gvkey_table.columns),
    right_on=['gvkey', 'year'],
)

In [72]:
merged.shape # (rows, columns) of the top-k peer gvkey-years that found a matching row in ratios_nona

(102988, 16)

In [74]:
# Name the output after the configured window (s_year..e_year) so the file name
# cannot drift from the year range the panel was actually built with.
merged.to_pickle(f'{tmp_data_path}/tnic_topk_finvar_{s_year}_{e_year}.pickle')