In [90]:
import pandas as pd
import numpy as np
import warnings

In [144]:
tmp_data_path = '../MA_data/data/tmp/'
data_path = '../MA_data/data/'

s_year = 1997-1
e_year = 2020-1

In [92]:
def load_tnic(data_path, s_year, e_year):
    tnic3 =  pd.read_csv(data_path+'tnic3_data/tnic3_data.txt', delimiter = '\t' )
    tnic = tnic3[(tnic3['year'] <= e_year) & (tnic3['year'] >= s_year)]


    tnic['gvkey1'] = tnic['gvkey1'].astype(str)
    tnic['gvkey2'] = tnic['gvkey2'].astype(str)

    tnic = tnic.dropna(subset=['score'])
    return tnic
    

In [108]:
def get_gvkeylst_adj(tnic, year, top_peer=10, get_adj=False, weighted=False):

    def add_rank(df):
        df['rank'] = df.groupby('gvkey1')['score'].rank('dense', ascending=False)
        df['rank'] = df['rank'].astype(int)
        return df
    
    tnic_year = tnic[tnic.year == year]

    tnic_year = add_rank(tnic_year)
    tnic_year_top = tnic_year[tnic_year['rank'] <= top_peer].reset_index(drop=True)
    
    
    gvkey_arr = np.unique(tnic_year_top.gvkey1.unique())
    gvkey_lst = list(gvkey_arr)
    N = len(gvkey_lst)
    
    gvkey_index_mapping = {value:index for index, value in enumerate(gvkey_lst)}
    index_gvkey_mapping = {index:value for index, value in enumerate(gvkey_lst)}
    #return tnic_year_top, gvkey_arr gvkey_index_mapping, index_gvkey_mapping
    if get_adj:
        adj = np.zeros((N,N))
        
        for _, row in tnic_year_top.iterrows():
            
            gvkey1 = row['gvkey1']
            gvkey2 = row['gvkey2']
            score = row['score']
            
            index1 = gvkey_index_mapping[gvkey1]
            index2 = gvkey_index_mapping[gvkey2]
#            print(index1, index2)
            
            
            
            if weighted:
                adj[index1, index2] = score
            else:
                adj[index1, index2] = 1
            
        
        return gvkey_lst, gvkey_index_mapping, index_gvkey_mapping,   adj    
    else:
        return gvkey_lst, gvkey_index_mapping, index_gvkey_mapping
    
    
    
    

# Decide 4 types of MA

- SS
- PP
- PS
- SP

if  P, then the influence of deal is minor for P.
if  S, then the influence of deal is major for S.

Recall Master_1:

| ACU ok | AUP  ok | TCU ok | TUP ok | mark as                                           |
|------------------|-------------------|------------------|-------------------|---------------------------------------------------|
| 1                | 1                 | 1                | 1                 | 1                                                 |
| 1                | 1                 | 1                | 0                 | 2                                                 |
| 1                | 1                 | 0                | 1                 | 3                                                 |
| 1                | 0                 | 1                | 1                 | 4                                                 |
| 1                | 0                 | 1                | 0                 | 5                                                 |
| 1                | 0                 | 0                | 1                 | 6                                                 |
| 0                | 1                 | 1                | 1                 | 7                                                 |
| 0                | 1                 | 1                | 0                 | 8                                                 |
| 0                | 1                 | 0                | 1                 | 9                                                 |
|                  |                   |                  |                   | all other combination is certainly unanalysiable |

$\begin{array}{cl}0 & 130066 \\ 2 & 32609 \\ 6 & 10990 \\ 5 & 9219 \\ 8 & 7693 \\ 3 & 5720 \\ 1 & 3136 \\ 9 & 2277 \\ 4 & 1691 \\ 7 & 374\end{array}$


For those deal has multiple "identify" choices (like condition 1, it could be identified by any of 4 conditions), we want to make sure those GVKEY could be found in TNIC.


**Only 1, 2, 3, 4 has multiple choices**


## create list of gvkey list of each year

### Load data

In [145]:
sdc_df = pd.read_pickle(tmp_data_path+f'/sdc_analysable_{s_year+1}_{e_year+1}.pickle')

In [146]:
sdc_df.YEAR.unique()

array([1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
       2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018,
       2019, 2020])

In [147]:
tnic = load_tnic(data_path, s_year, e_year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tnic['gvkey1'] = tnic['gvkey1'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tnic['gvkey2'] = tnic['gvkey2'].astype(str)


In [148]:
gvkey_lsts = {}
key_ind_maps = {}
ind_key_maps = {}


with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for year in range(s_year, e_year+1, 1): # year are integers
        gvkey_lsts[year], key_ind_maps[year], ind_key_maps[year] = get_gvkeylst_adj(tnic, year)
    

In [149]:
sdc_df.columns

Index(['ACU', 'ASIC2', 'ABL', 'ANL', 'APUBC', 'AUP', 'AUPSIC', 'AUPBL',
       'AUPNAMES', 'AUPPUB', 'BLOCK', 'CREEP', 'DA', 'DE', 'STATC', 'SYNOP',
       'VAL', 'PCTACQ', 'PSOUGHTOWN', 'PSOUGHT', 'PHDA', 'PCTOWN', 'PSOUGHTT',
       'PRIVATIZATION', 'DEAL_NO', 'TCU', 'TSIC2', 'TBL', 'TNL', 'TPUBC',
       'TUP', 'TUPSIC', 'TUPBL', 'TUPNAMES', 'TUPPUB', 'SIC_A', 'SIC_T',
       'YEAR', 'AS_PERMNO', 'AP_PERMNO', 'TS_PERMNO', 'TP_PERMNO', 'AS_GVKEY',
       'AP_GVKEY', 'TS_GVKEY', 'TP_GVKEY', 'GVKEY_STATUS'],
      dtype='object')

In [150]:

for y in range(s_year, e_year+1):
    print(len(gvkey_lsts[y]))

7192
7157
6973
6734
6431
5855
5408
5023
4880
4804
4778
4704
4449
4197
4055
3954
3850
3932
4085
4046
3911
3871
3845
3825


priority(1st to last): 
+ SS
+ SP
+ PS
+ PP

## mark

In [164]:
def mark_ma_type(df):
    '''
    SDC should match with most-recent-previous-year-update TNIC and Compustat
    
    '''
    
    def helper(row):
        if row.GVKEY_STATUS == '1': # 4 Scenarios, there is priority
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TS_GVKEY):
                return 'SS'
            elif (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TP_GVKEY):
                return "SP"
            elif (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TS_GVKEY):
                return "PS"
            elif (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TP_GVKEY):
                return "PP"
            else:
                return "XX"
            
            
        elif row.GVKEY_STATUS == "2": # 2 Scenarios 
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TS_GVKEY):
                return "SS"
            elif (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TS_GVKEY):
                return "PS"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "3": # 2 Scenarios 
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TP_GVKEY):
                return "SP"
            elif (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TP_GVKEY):
                return "PP"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "4": # 2 Scenarios 
            if (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TS_GVKEY):
                return "PS"
            elif (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TP_GVKEY):
                return "PP"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "7": # 2 Scenarios 
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TS_GVKEY):
                return "SS"
            elif (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TP_GVKEY):
                return "SP"
            else:
                return "XX"
            
            
            
        elif row.GVKEY_STATUS == "5": # 1 scenarios
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TS_GVKEY):
                return "SS"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "6": # 1 scenarios
            if (row.AS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AP_GVKEY != row.TP_GVKEY):
                return "SP"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "8": # 1 scenarios
            if (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TS_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TS_GVKEY):
                return "PS"
            else:
                return "XX"
        elif row.GVKEY_STATUS == "9": # 1 scenarios
            if (row.AP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.TP_GVKEY in gvkey_lsts[row.YEAR-1]) & (row.AS_GVKEY != row.TP_GVKEY):
                return "PP"
            else:
                return "XX"
        
    df['MA_TYPE'] = df.apply(helper, axis=1)
        
    return df
        
        

In [165]:
sdc_df2 = mark_ma_type(sdc_df)

In [166]:
sdc_df2.MA_TYPE.value_counts()

XX    6222
SP    4097
SS    3499
PP    1404
PS     167
Name: MA_TYPE, dtype: int64

In [167]:
for y in range(s_year+1, e_year+2):
    print(y, sdc_df2[(sdc_df2.YEAR == y) & (sdc_df2.MA_TYPE == "XX")].MA_TYPE.value_counts()/ sdc_df2[(sdc_df2.YEAR == y)].shape[0])

1997 XX    0.34949
Name: MA_TYPE, dtype: float64
1998 XX    0.34996
Name: MA_TYPE, dtype: float64
1999 XX    0.368715
Name: MA_TYPE, dtype: float64
2000 XX    0.413547
Name: MA_TYPE, dtype: float64
2001 XX    0.392553
Name: MA_TYPE, dtype: float64
2002 XX    0.389764
Name: MA_TYPE, dtype: float64
2003 XX    0.410691
Name: MA_TYPE, dtype: float64
2004 XX    0.451872
Name: MA_TYPE, dtype: float64
2005 XX    0.429568
Name: MA_TYPE, dtype: float64
2006 XX    0.472141
Name: MA_TYPE, dtype: float64
2007 XX    0.435364
Name: MA_TYPE, dtype: float64
2008 XX    0.432049
Name: MA_TYPE, dtype: float64
2009 XX    0.438384
Name: MA_TYPE, dtype: float64
2010 XX    0.499014
Name: MA_TYPE, dtype: float64
2011 XX    0.486667
Name: MA_TYPE, dtype: float64
2012 XX    0.425963
Name: MA_TYPE, dtype: float64
2013 XX    0.377315
Name: MA_TYPE, dtype: float64
2014 XX    0.355263
Name: MA_TYPE, dtype: float64
2015 XX    0.41629
Name: MA_TYPE, dtype: float64
2016 XX    0.354331
Name: MA_TYPE, dtype: float64
201

In [171]:
sdc_df2[sdc_df2['MA_TYPE'] == "XX"].to_csv('../MA_data/test3.csv')