In [14]:
import numpy as np
import pandas as pd
import wrds
import pickle
from scipy.stats.mstats import winsorize
from os.path import join as pjoin


In [15]:
# Directory (relative to this notebook) where intermediate pickles are read and written.
tmp_data_path = '../MA_data/data/tmp'

# First and last year of the sample; both appear in every intermediate file name.
s_year, e_year = 1997, 2019

In [24]:
# Open exactly one WRDS session. Connecting twice would leave the first session
# unused and never closed, and would prompt for credentials a second time.
db = wrds.Connection(wrds_username='dayuyang1999')

Enter your WRDS username [dalab5]:dayuyang1999
Enter your password:········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
Loading library list...
Done
Loading library list...
Done


# add financial variables

## get raw data

According to Appendix A of Bernard et al. (2020), we need the following `COMPUSTAT` variables to compute financial ratios.

| compustat code | definition                                  |
|----------------|---------------------------------------------|
| at             | Assets - Total                              |
| ceq            | Common/Ordinary Equity - Total              |
| csho           | Number of Common Shares Outstanding         |
| prcc_f         | Price Close - Annual - Fiscal               |
| txdb           | Deferred Taxes (Balance Sheet)              |
| dlc            | Debt in Current Liabilities - Total         |
| dltt           | Long-Term Debt - Total                      |
| ib             | Income Before Extraordinary Items           |
| sale           | Net Sales                                   |
| ch             | Cash                                        |
| ppent          | Property, Plant and Equipment - Total (Net) |
| re             | Retained Earnings                           |
| act            | Current Assets - Total                      |
| lct            | Current Liabilities - Total                 |


Bernard, Darren, Terrence Blackburne, and Jacob Thornock. 2020. “Information Flows among Rivals and Corporate Investment.” Journal of Financial Economics 136 (3): 760–79.

In [None]:
from dataloader_helpers import get_firm_annual_data

In [75]:
# Build the raw annual financial data for s_year..e_year under tmp_data_path.
# NOTE(review): the helper lives in dataloader_helpers (not shown here); presumably it
# writes fin_raw_{s_year}_{e_year}.pickle, which is read right after this call — confirm.
get_firm_annual_data(tmp_data_path, s_year, e_year)

In [76]:
# Load the raw financial inputs for the sample window from the tmp directory.
fin_var = pd.read_pickle(f"{tmp_data_path}/fin_raw_{s_year}_{e_year}.pickle")

## create variables

Based on the above raw input variables, we construct the following financial ratios.

| variable               | formula                          | definition                                                          |
|------------------------|----------------------------------|---------------------------------------------------------------------|
|                        |        Bernard et al. (2020) Appendix A     |                                                                     |
| size_i                 | at                               | Firm i’s total assets                                               |
| market-to-book ratio_i | (at+prcc_f*csho-ceq-txdb)/at     | Market-to-book assets ratio of firm i                               |
| leverage_i             | (dlc+dltt)/at                    | Book leverage of firm i                                             |
| roa_i                  | ib/at                            | Return-on-assets of firm i                                          |
| sales growth_i         | (sale_{t}-sale_{t-1})/sale_{t-1} | Sales growth of firm i                                              |
| ppe_i                  | ppent/at                         | Firm i’s net plant, property, and equipment, scaled by total assets |
|                        |  Yang et al (2014)  Table 2     |                                                                     |
| sale_i                 | sale                             | Firm i’s net sales                                                  |
| cash-to-asset ratio_i  | ch/at                            | Cash to total assets ratio of firm i                                |
| cash-to-sales ratio_i  | ch/sale                          | Cash to sales ratio of firm i                                       |
| sales-to-asset ratio_i | sale/at                          | Net sales/total assets                                              |
| current ratio_i        | act/lct                          | Current assets of firm i scaled by its current liabilities          |
| asset growth_i         | (at_{t}-at_{t-1})/at_{t-1}       | Total asset growth of firm i                                        |


Bernard, Darren, Terrence Blackburne, and Jacob Thornock. 2020. “Information Flows among Rivals and Corporate Investment.” Journal of Financial Economics 136 (3): 760–79.

Yang, Chin-Sheng, Chih-Ping Wei, and Yu-Hsun Chiang. 2014. “Exploiting Technological Indicators for Effective Technology Merger and Acquisition (M&A) Predictions.” Decision Sciences 45 (1): 147–74.

In [77]:
from fin_var_helpers import get_lags, create_var

In [78]:
# Turn the raw inputs into the ratio table (gvkey, year, then the financial variables).
# create_var is imported from fin_var_helpers; it also prints a one-row sanity check
# and the resulting column list.
ratio_pd_w_raw = create_var(fin_var)

check df created ok: 
    gvkey  year       at     sale       m2b      lev       roa       ppe  \
0  10000  1997  577.137  559.823  1.187368  0.41477 -0.002807  0.218361   

   cash2asset  cash2sale  sale2asset       cr  d_sale  d_at  
0    0.029125   0.030026        0.97  1.83279     NaN   NaN  

 variable lists of ratio pd:  Index(['gvkey', 'year', 'at', 'sale', 'm2b', 'lev', 'roa', 'ppe', 'cash2asset',
       'cash2sale', 'sale2asset', 'cr', 'd_sale', 'd_at'],
      dtype='object')


In [79]:
# Report the table size, then persist the un-cleaned ratio table so later cells
# can reload it from disk.
print(f"saving raw financial variable tables from {s_year} to {e_year}; table size: ", ratio_pd_w_raw.shape)
ratio_pd_w_raw.to_pickle(f'{tmp_data_path}/fv_raw_{s_year}_{e_year}.pickle')


saving raw financial variable tables from 1997 to 2019; table size:  (267134, 14)


### check NAs

1. Because the ratios involve division, dividing by 0 produces `inf` and `-inf`.
    - replace `inf` / `-inf` with `na`

2. Keep only the rows whose number of NAs is at most the threshold (`n_na <= thres`).
3. Replace the remaining NAs with the mean of the corresponding variable.

Check how much of the data is missing:

- in total, 288k rows have at least one missing value
- if we are OK with dropping roughly 25% of the data, set thres = 5


However, we do not want to drop too many rows, because doing so may leave us with no positive samples for MA.

To decide which columns and how many rows to drop, refer to [Appendix 2](./Appendix2_merge.ipynb)

In [80]:
# Maximum number of missing values tolerated per row; passed to deal_na as na_thres,
# which keeps rows with n_na <= thres.
thres = 7

In [88]:
# Display the column names of the raw ratio table (ids first, then the financial variables).
ratio_pd_w_raw.columns

Index(['gvkey', 'year', 'at', 'sale', 'm2b', 'lev', 'roa', 'ppe', 'cash2asset',
       'cash2sale', 'sale2asset', 'cr', 'd_sale', 'd_at'],
      dtype='object')

In [82]:
# def helper fun
def deal_na(df, na_thres):
    '''
    Clean the raw financial-variable table: drop sparse rows, winsorize, impute.

    Steps, in order:
      1. replace +/-inf (produced by division by zero) with NaN;
      2. store the number of NaNs of each row (counted over all columns) in a
         new `n_na` column and keep only rows with `n_na <= na_thres`;
      3. winsorize every financial variable with limits of 1% on each tail;
      4. fill the NaNs left in each variable with that variable's mean.

    The input frame is never modified: all work happens on a copy, so the
    function can be called repeatedly on the same table.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw financial variable table.
        - the first 2 columns are `gvkey` and `year`
        - the rest are financial variables
    na_thres : int
        Largest number of missing values a row may have and still be kept.

    Returns
    -------
    pandas.DataFrame
        Cleaned table with a fresh 0..n-1 index, the original columns followed
        by `n_na`, and no missing values.

    Raises
    ------
    AssertionError
        If a NaN survives imputation (e.g. a variable that is missing in every
        retained row has no mean to impute with).
    '''
    n_features = len(df.columns) - 2  # exclude gvkey and year
    print(f"totally {n_features} number of financial features, tolerance of num of missing is:", na_thres)

    # Non-inplace replace returns a new frame: the caller's table keeps its
    # original values and does not gain an `n_na` column.
    ratio_pd_w = df.replace([np.inf, -np.inf], np.nan)
    # count na of each row
    ratio_pd_w['n_na'] = ratio_pd_w.isna().sum(axis=1)
    # only retain rows whose number of NAs is within the tolerance
    ratio_pd_w = ratio_pd_w[ratio_pd_w['n_na'] <= na_thres].reset_index(drop=True)

    if ratio_pd_w.empty:
        # Nothing to winsorize or impute; winsorize would fail on empty input.
        return ratio_pd_w

    for colname in ratio_pd_w.columns[2:(2 + n_features)]:
        # remove outliers (NaNs are left in place by nan_policy='omit')
        winsorized = winsorize(
            ratio_pd_w[colname].to_numpy(copy=True),
            limits=[0.01, 0.01],
            nan_policy='omit',
        )
        cleaned = pd.Series(np.asarray(winsorized), index=ratio_pd_w.index)
        # impute na with mean; assign the result back explicitly instead of a
        # chained inplace fillna, which silently does nothing under Copy-on-Write
        ratio_pd_w[colname] = cleaned.fillna(cleaned.mean(skipna=True))

    assert ratio_pd_w.isna().sum().sum() == 0, "missing values remain after mean imputation"
    return ratio_pd_w


def merge_fv_ma(df_fv_nona, df_ma):
    '''
    Attach acquirer and target financial variables to every M&A deal.

    The financial table is joined twice with inner joins: once on
    (`AGVKEY`, `YEAR`) and once on (`TGVKEY`, `YEAR`). Every column of the
    financial table is upper-cased and suffixed with `_A` (acquirer side) or
    `_T` (target side). Columns are renamed by name before merging, so the
    result does not depend on how many columns the financial table has.

    Parameters
    ----------
    df_fv_nona : pandas.DataFrame
        Financial variable table with no single missing value; must contain
        the key columns `gvkey` and `year`.
    df_ma : pandas.DataFrame
        Deal table; must contain `AGVKEY`, `TGVKEY` and `YEAR`.

    Returns
    -------
    pandas.DataFrame
        Columns of `df_ma`, followed by the `*_A` columns, followed by the
        `*_T` columns. Deals lacking a match on either side are dropped.

    Raises
    ------
    AssertionError
        If `df_fv_nona` contains any missing value.
    '''
    assert df_fv_nona.isna().sum().sum() == 0

    def _suffixed(suffix):
        # Upper-case + suffix every financial column, keys included
        # (gvkey -> GVKEY_A / GVKEY_T, year -> YEAR_A / YEAR_T).
        return df_fv_nona.rename(columns=lambda name: name.upper() + suffix)

    merge_a = df_ma.merge(
        _suffixed('_A'),
        how='inner',
        left_on=['AGVKEY', 'YEAR'],
        right_on=['GVKEY_A', 'YEAR_A'],
    )
    merge_t = merge_a.merge(
        _suffixed('_T'),
        how='inner',
        left_on=['TGVKEY', 'YEAR'],
        right_on=['GVKEY_T', 'YEAR_T'],
    )

    return merge_t


In [83]:
# Reload both inputs of the merge from disk: the deal table (sdc_gvkey_*) and the
# raw ratio table saved earlier in this notebook (fv_raw_*).
df_ma = pd.read_pickle(pjoin(tmp_data_path , f'sdc_gvkey_{s_year}_{e_year}.pickle'))
df_fv_raw = pd.read_pickle(f'{tmp_data_path}/fv_raw_{s_year}_{e_year}.pickle')

In [84]:
# Drop rows with more than `thres` missing values, winsorize, and mean-impute the rest.
df_fv_nona = deal_na(df_fv_raw, thres)

In [85]:
# Inner-join the cleaned financial variables onto each deal, once for the acquirer
# (AGVKEY) and once for the target (TGVKEY).
merged_fv_ma = merge_fv_ma(df_fv_nona, df_ma)

In [86]:
# Report the size of the merged table. Note that the object pickled here is the
# cleaned financial table (df_fv_nona), not the merged one.
print("fin_var merged sdc df shape = ", merged_fv_ma.shape)
df_fv_nona.to_pickle(f'{tmp_data_path}/fv_nona_{s_year}_{e_year}.pickle')

fin_var merged sdc df shape =  (26203, 71)


In [219]:
# Remove the join keys duplicated by the merge (AGVKEY / TGVKEY / YEAR already carry them).
# Reassigning with errors='ignore' instead of an in-place drop keeps this cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
merged_fv_ma = merged_fv_ma.drop(columns=['GVKEY_A', 'GVKEY_T', 'YEAR_A', 'YEAR_T'], errors='ignore')

# arrange

In [220]:
# Display the columns of the merged table: deal fields first, then *_A and *_T variables.
merged_fv_ma.columns

Index(['ACU', 'ASIC2', 'ABL', 'ANL', 'APUBC', 'AUP', 'AUPSIC', 'AUPBL',
       'AUPNAMES', 'AUPPUB', 'BLOCK', 'CREEP', 'DA', 'DE', 'STATC', 'SYNOP',
       'VAL', 'PCTACQ', 'PSOUGHTOWN', 'PSOUGHT', 'PHDA', 'PCTOWN', 'PSOUGHTT',
       'PRIVATIZATION', 'DEAL_NO', 'TCU', 'TSIC2', 'TBL', 'TNL', 'TPUBC',
       'TUP', 'TUPSIC', 'TUPBL', 'TUPNAMES', 'TUPPUB', 'AGVKEY', 'TGVKEY',
       'GVKEY_OVERALL', 'YEAR', 'SIC_A', 'SIC_T', 'AT_A', 'SALE_A', 'M2B_A',
       'LEV_A', 'ROA_A', 'PPE_A', 'CASH2ASSET_A', 'CASH2SALE_A',
       'SALE2ASSET_A', 'CR_A', 'D_SALE_A', 'D_AT_A', 'N_NA_A', 'AT_T',
       'SALE_T', 'M2B_T', 'LEV_T', 'ROA_T', 'PPE_T', 'CASH2ASSET_T',
       'CASH2SALE_T', 'SALE2ASSET_T', 'CR_T', 'D_SALE_T', 'D_AT_T', 'N_NA_T'],
      dtype='object')

In [221]:
# Report the final shape and persist the deal table enriched with acquirer/target
# financial variables.
print(f'df contains ma and fin var from {s_year} to {e_year}, save to pickle, shape = ', merged_fv_ma.shape)
merged_fv_ma.to_pickle(f'{tmp_data_path}/ma_fv_{s_year}_{e_year}.pickle')

df contains ma and fin var from 1997 to 2020, save to pickle, shape =  (7045, 67)
