In [4]:
import pandas as pd
import pickle
import numpy as np
import datetime 
from os.path import join as pjoin
import os
import wrds

#import argparse
#import yamlb

In [5]:
tmp_data_path =  '../MA_data/data/tmp/'
data_path = '../MA_data/data/'
s_year = 1997
e_year = 2016

# Download SDC data

variable description could see [appendix 1.1 variable description](./Appendix_1.1_variable_description.ipynb)

In [6]:
name_lst = [
                'ACU', 'ASIC2', 'ABL', 'ANL', 'APUBC', 'AUP', 'AUPSIC', 'AUPBL', 'AUPNAMES', 'AUPPUB',
                'BLOCK','CREEP','DA','DE','STATC','SYNOP','VAL','PCTACQ','PSOUGHTOWN','PSOUGHT','PHDA','PCTOWN','PSOUGHTT','PRIVATIZATION','DEAL_NO',
                'TCU', 'TSIC2', 'TBL', 'TNL', 'TPUBC', 'TUP', 'TUPSIC', 'TUPBL', 'TUPNAMES', 'TUPPUB'    
            ]

In [7]:
def concat_data(st, end, name_lst, data_path):
    df_l = []
    for year in range(st, end+1, 1):
        df = pd.read_excel(f"{data_path}/SDC/{year}.xlsx", header=1, engine='openpyxl')
        #df = df.drop(df.columns[4], axis=1) # this column is duplicate with column 3 
        #print(len(df.columns))
        df.columns = name_lst
        
        # check date var loading ok
        check = df[df['DA'] == datetime.time(0, 0)]
        if check.shape[0] == 0 :
            print('date variables loading ok \n')
        else:
            print('date variables loading fail, please manually check. number of failed records: ', check.shape[0])
        
        df_l.append(df)
        print(f'{year} data shape:', df.shape)
        del df
    df = pd.concat(df_l)
    return df

def get_sic(df):
    '''
    df: the sdc table contains sic variable named as `ASIC2`
    
    '''
    x = df.ASIC2.str.split('/')
    x = x.transform(lambda x: x[0] if not isinstance(x, float) else np.nan)
    df['SIC_A'] = x

    x = df.ASIC2.str.split('/')
    x = x.transform(lambda x: x[0] if not isinstance(x, float) else np.nan)
    df['SIC_T'] = x
    
    return df 

In [8]:
def read_sdc(name_lst, from_cache=False):
    pickle_it = True
    from_cache = False
#     #tmp_data_path = tmp_data_path
#     s_year = s_year
#     e_year = e_year


    if (from_cache) & ~(os.path.isfile(pjoin(tmp_data_path , f'sdc_{s_year}_{e_year}.pickle'))):
        print(f"WARNING, Cannot load from cache; \n No compabible sdc_df cache file named 'sdc_{s_year}_{e_year}.pickle' exists in {tmp_data_path} \n")

    if (from_cache) & (os.path.isfile(pjoin(tmp_data_path , f'sdc_{s_year}_{e_year}.pickle'))):
        sdc_df = pd.read_pickle(pjoin(tmp_data_path , f'sdc_{s_year}_{e_year}.pickle'))
        print("loading data from previous download. Did not download again")
    else:                 
        sdc_df = concat_data(s_year, e_year, name_lst, data_path)
        sdc_df = sdc_df.reset_index()

        # change var type and fillna
        sdc_df = sdc_df.dropna(subset=['ACU','TCU']) # actually nothing drops
        sdc_df['DEAL_NO'] = sdc_df['DEAL_NO'].fillna(-1)
        print("ATTENTION, DEAL_NO NAs in sdc_df are interpolated as '-1'. \n ")
        sdc_df['DEAL_NO'] = sdc_df['DEAL_NO'].astype(str)
        sdc_df['TCU']  = sdc_df['TCU'].astype('str')
        sdc_df['ACU']  = sdc_df['ACU'].astype('str')
        sdc_df['TUP']  = sdc_df['TUP'].astype('str')
        sdc_df['AUP']  = sdc_df['AUP'].astype('str')


        # add sdc variable
        sdc_df = get_sic(sdc_df)
        # add year varibale
        sdc_df['YEAR'] = sdc_df.DA.dt.year
        # update name lst

        if pickle_it:
            print(f'saving sdc table ranging from {s_year} to {e_year} to {tmp_data_path}')
            sdc_df.to_pickle(pjoin(tmp_data_path , f'sdc_{s_year}_{e_year}.pickle'))

    if True:
        print("SDC Data looks like:", sdc_df.sample(3).T)

    name_lst += ['SIC_A', 'SIC_T', 'YEAR'] 

    return sdc_df, name_lst

In [None]:
sdc_df, name_lst1 = read_sdc(name_lst)

remove:
1. self merge self
    - ACU = TCU or TUP
    - AUP = TCU or TUP
2. MA is incomplete

In [20]:
def filter1(df):
    '''
    remove self merge
    remove incomplete deal

    '''
    cond0 = (df.ACU == df.TCU) | (df.ACU == df.TUP)
    cond1 = cond0 | (df.AUP == df.TCU) | (df.AUP == df.TUP)
    cond2 = cond1 | ~df.STATC.isin(["C"])
    return df[~cond2]

In [21]:
sdc_df2 = filter1(sdc_df)

In [22]:
sdc_df2.shape

(170711, 39)

# Filter Majority MA

The logic behind Filtering see [appendix1.2 majority takeover]()

In [23]:

def majority_filter(df):
    '''
    Description: used for filter "majority" takeover; Following “The Importance of Industry Links in Merger Waves.” The Journal of Finance 69 (2): 527–76. https://doi.org/10.1111/jofi.12122.
    Use: use for sdc_df
    input: sdc_df
    output: a new filtered sdc_df  to replace the old one
    '''
    
    cond1 = ((df.PCTACQ > 20.0) | pd.isna(df.PCTACQ))
    cond2 = cond1 & ((df.PCTOWN > 51.0) | (pd.isna(df.PCTOWN)))
    cond3 = cond2 & ((df.VAL > 1) | pd.isna(df.VAL))
    cond4 = cond3 & (df.BLOCK !='Yes') & (df.CREEP !='Yes') & (df.PRIVATIZATION !='Yes')
    df_new = df[cond3]
    print('original df shape: ', df.shape, '\n')
    print('filtered df shape: ', df_new.shape)
    return df_new.reset_index(drop=True)

In [24]:
sdc_df3 = majority_filter(sdc_df2)

original df shape:  (170711, 39) 

filtered df shape:  (159139, 39)


# Prepare Linking

make linking variables' type and format are the same

In [25]:
def var_type_checker(df1, df2):
    '''
    df1: sdc_df
    df2: linking table
    
    checking cusip types are the same
    '''
    # gvkey, cusip match
    assert type(df1.ACU[0]) == type(df1.TCU[0]) == type(df1.AUP[0]) == type(df1.TUP[0]) 
    
    assert type(df1.TCU[0]) == type(df2.HCUSIP[0]) 

    print("variable type checking finished, No error Found. \n")

## load linking tables

In [26]:
link1 = pd.read_pickle(data_path+"hcusip_permno.pickle")
link2 = pd.read_pickle(data_path+"permno_gvkey.pickle")

# Linking PERMNO

run the following after run [Appendix_1.3_CRSP_all_CUSIP](./Appendix_1.3_CRSP_all_CUSIP.ipynb)

It is normal to has a lot of NAs. Since Priv. do not have PRERMNO

## merge with PERMNO

In [27]:
merged1_1 = sdc_df3.merge(crsp_name_history, left_on='ACU', right_on = 'HCUSIP', how = 'left')

NameError: name 'crsp_name_history' is not defined

In [28]:
merged1_2 = merged1.merge(crsp_name_history, left_on='AUP', right_on = 'HCUSIP', how = 'left')

NameError: name 'merged1' is not defined

In [29]:
merged1_3 = merged2.merge(crsp_name_history, left_on='TCU', right_on = 'HCUSIP', how = 'left')

NameError: name 'merged2' is not defined

In [None]:
merged1_4 = merged3.merge(crsp_name_history, left_on='TUP', right_on = 'HCUSIP', how = 'left')

In [None]:
merged1_4 = merged1_4.drop(['index'], axis=1).reset_index(drop=True)

In [None]:
merged1_4.columns

In [None]:
tmp = []
for f in ['A', 'T']: # 
    for p in ['S', 'P']: # self, parent
        tmp += [ f+p+'_HCUSIP', f+p+'_PERMNO']

In [None]:
name_lst2 = name_lst1 + tmp

In [None]:
merged1_4.columns = name_lst2

In [None]:
merged1_4.columns

In [None]:

print(
    merged1_4.AS_PERMNO.isna().sum(),
    merged1_4.AP_PERMNO.isna().sum(),
    merged1_4.TS_PERMNO.isna().sum(),
    merged1_4.TP_PERMNO.isna().sum())
    
    

# Linking with GVKEY

check [appendix 1.4](./Appendix_1.4_PERMNO_GVKEY.ipynb)

In [None]:
merged2_1 = merged1_4.merge(link2, left_on='AS_PERMNO', right_on = 'PERMNO', how = 'left')

In [None]:
merged2_2 = merged2_1.merge(link2, left_on='AP_PERMNO', right_on = 'PERMNO', how = 'left')

In [None]:
merged2_3 = merged2_2.merge(link2, left_on='TS_PERMNO', right_on = 'PERMNO', how = 'left')

In [None]:
merged2_4 = merged2_3.merge(link2, left_on='TP_PERMNO', right_on = 'PERMNO', how = 'left')

In [None]:
merged2_4.columns

In [None]:
tmp = []
for f in ['A', 'T']: # 
    for p in ['S', 'P']: # self, parent
        tmp += [ f+p+'_PERMNO_2',f+p+'_GVKEY']

In [None]:
name_lst3 = name_lst2 + tmp

In [None]:
merged2_4.columns = name_lst3

In [None]:
tmp = []
for f in ['A', 'T']: # 
    for p in ['S', 'P']: # self, parent
        tmp += [f+p+'_HCUSIP', f+p+'_PERMNO_2']

In [None]:
merged_gvkey = merged2_4.drop(tmp, axis=1)

In [None]:
merged_gvkey.columns

In [None]:
merged_gvkey.shape

# Arrange linking result


the following conditions are marked as GVKEY merged successfully:

`ok` = `Corresponding GVKEY Found`

num of succcess condition = (C22 + C21) * (C22 + C21) = 9

| ACU ok | AUP  ok | TCU ok | TUP ok | mark as                                           |
|------------------|-------------------|------------------|-------------------|---------------------------------------------------|
| 1                | 1                 | 1                | 1                 | 1                                                 |
| 1                | 1                 | 1                | 0                 | 2                                                 |
| 1                | 1                 | 0                | 1                 | 3                                                 |
| 1                | 0                 | 1                | 1                 | 4                                                 |
| 1                | 0                 | 1                | 0                 | 5                                                 |
| 1                | 0                 | 0                | 1                 | 6                                                 |
| 0                | 1                 | 1                | 1                 | 7                                                 |
| 0                | 1                 | 1                | 0                 | 8                                                 |
| 0                | 1                 | 0                | 1                 | 9                                                 |
|                  |                   |                  |                   | all other combination is certainly unanalysiable |


Here I just mark those senarios.

Since at final, only the firms show up in TNIC is analysiable. However:
- TNIC: only contains those firm with regular updated 10-K (public firms)
- GVKEY: may contains private firms



In [None]:
def gvkey_checker(df):
    '''
    marker is str
    '''
    def mark(row):
        if pd.notna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '1'
        if pd.notna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.isna(row.TP_GVKEY):
            return '2'
        if pd.notna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.isna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '3'
        if pd.notna(row.AS_GVKEY) & pd.isna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '4'
        if pd.notna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.isna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '3'
        if pd.notna(row.AS_GVKEY) & pd.isna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.isna(row.TP_GVKEY):
            return '5'
        if pd.notna(row.AS_GVKEY) & pd.isna(row.AP_GVKEY) & pd.isna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '6'
        if pd.isna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '7'
        if pd.isna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.notna(row.TS_GVKEY) & pd.isna(row.TP_GVKEY):
            return '8'
        if pd.isna(row.AS_GVKEY) & pd.notna(row.AP_GVKEY) & pd.isna(row.TS_GVKEY) & pd.notna(row.TP_GVKEY):
            return '9'
        else:
            return '0'
    df['GVKEY_STATUS'] = df.apply(mark, axis=1)
    
    print('Number of Conditions: \n', df['GVKEY_STATUS'].value_counts(),'\n')
    
    return df
        
        

In [None]:
merged_gvkey2 = gvkey_checker(merged_gvkey)

# Link with EWENS

- EWENS' strategy is linking by name. It's not as accurate as linking via the method above
- And you have no idea if it was match with "self" or "parent".

- Here I did not apply this



The following condition may be saved


| AP | AS | TS | TP | EWENS agvkey | EWENS tgvkey |
|----|----|----|----|--------------|--------------|
| 1  | 0  | 0  | 0  | ~            | 1            |
| 0  | 1  | 0  | 0  | ~            | 1            |
| 0  | 0  | 1  | 0  | 1            | ~            |
| 0  | 0  | 0  | 1  | 1            | ~            |
| 0  | 0  | 0  | 0  | 1            | 1            |

before run EWENS, run [Appendix 1.5](./Appendix_1.5_EWENS.ipynb)

In [None]:
ewens = pd.read_pickle(tmp_data_path+"/ewens.pickle")
merged3_1 = merged_gvkey2.merge(ewens, left_on = "DEAL_NO", right_on = "DealNumber", how = "left")

In [None]:
merged3_1.columns

Since we are unable to identify if the matched GVKEY in EWENS is "self" or "parent", I naively classfied them as "self".

In [None]:
def attach_ewens(df):
    '''
    
    '''
    for i, row in df.iterrows():
        if (row.GVKEY_STATUS == '0') & (pd.isna(row.AS_GVKEY) & pd.isna(row.AP_GVKEY)) & pd.notna(row.agvkey):
            df.loc[i, 'AS_GVKEY'] = row.agvkey
        if (row.GVKEY_STATUS == '0') & (pd.isna(row.TS_GVKEY) & pd.isna(row.TP_GVKEY)) & pd.notna(row.tgvkey):
            df.loc[i, 'TS_GVKEY'] = row.tgvkey
        
    
    df = df.drop(['agvkey', 'DealNumber', 'tgvkey'], axis=1)
    
    return df
            
    
    

In [None]:
merged_gvkey3 = attach_ewens(merged3_1.reset_index(drop=True))

In [None]:
merged_gvkey4 = gvkey_checker(merged_gvkey3)

# Store result 

to tmp data path

In [None]:
merged_gvkey4.to_pickle(tmp_data_path+'/merged_gvkey2.pickle')

In [None]:
analysable = merged_gvkey4[merged_gvkey4.GVKEY_STATUS != '0']
print("final data shape:", analysable.shape)
analysable.to_pickle(tmp_data_path+'/merged_analysable.pickle')