In [103]:
import os
import pickle
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from os.path import join as pjoin
import matplotlib.pyplot as plt
import warnings


In [104]:
tmp_data_path = '../MA_data/data/tmp/'
data_path = '../MA_data/data/'

s_year = 1996
e_year = 2019
k = 10

In [105]:
def load_tnic(data_path, s_year, e_year):
    tnic3 =  pd.read_csv(data_path+'tnic3_data/tnic3_data.txt', delimiter = '\t' )
    tnic = tnic3[(tnic3['year'] <= e_year) & (tnic3['year'] >= s_year)]


    tnic['gvkey1'] = tnic['gvkey1'].astype(str)
    tnic['gvkey2'] = tnic['gvkey2'].astype(str)

    tnic = tnic.dropna(subset=['score'])
    return tnic
    

In [106]:
def get_gvkeylst_adj(tnic, year, top_peer=10, get_adj=False, weighted=False):

    def add_rank(df):
        df['rank'] = df.groupby('gvkey1')['score'].rank('dense', ascending=False)
        df['rank'] = df['rank'].astype(int)
        return df
    
    tnic_year = tnic[tnic.year == year]

    tnic_year = add_rank(tnic_year)
    tnic_year_top = tnic_year[tnic_year['rank'] <= top_peer].reset_index(drop=True)
    
    
    gvkey_arr = np.unique(np.concatenate((tnic_year_top.gvkey1.unique(), tnic_year_top.gvkey2.unique()), axis = 0))
    gvkey_lst = list(gvkey_arr)
    N = len(gvkey_lst)
    
    gvkey_index_mapping = {value:index for index, value in enumerate(gvkey_lst)}
    index_gvkey_mapping = {index:value for index, value in enumerate(gvkey_lst)}
    #return tnic_year_top, gvkey_arr gvkey_index_mapping, index_gvkey_mapping
    if get_adj:
        adj = np.zeros((N,N))
        
        for _, row in tnic_year_top.iterrows():
            
            gvkey1 = row['gvkey1']
            gvkey2 = row['gvkey2']
            score = row['score']
            
            index1 = gvkey_index_mapping[gvkey1]
            index2 = gvkey_index_mapping[gvkey2]
#            print(index1, index2)
            
            
            
            if weighted:
                adj[index1, index2] = score
            else:
                adj[index1, index2] = 1
            
        
        return gvkey_lst, gvkey_index_mapping, index_gvkey_mapping,   adj    
    else:
        return gvkey_lst, gvkey_index_mapping, index_gvkey_mapping
    
    
    
    

In [107]:
def sdc_in_tnic(df, gvkey_lst):
    '''
    df: yearly sdc_df
    gvkey_lst: yearly
    
    return integer 1 , 0 !! not str
    '''
    
    # totally 4 conditions are analysable
    def mark1(row):
        if (row.AS_GVKEY in gvkey_lst) & (row.TS_GVKEY in gvkey_lst):
                return 1
        else:
            return 0
    def mark2(row):
        if (row.AS_GVKEY in gvkey_lst) & (row.TP_GVKEY in gvkey_lst):
                return 1
        else:
            return 0
    def mark3(row):
        if (row.AP_GVKEY in gvkey_lst) & (row.TS_GVKEY in gvkey_lst):
                return 1
        else:
            return 0
    def mark4(row):
        if (row.AP_GVKEY in gvkey_lst) & (row.TP_GVKEY in gvkey_lst):
                return 1
        else:
            return 0
    def mark_all(row):
        if (row['AS_TS_TNIC'] == 1) | (row['AS_TP_TNIC'] == 1) | (row['AP_TS_TNIC'] == 1) | (row['AP_TP_TNIC'] == 1) :
            return 1
        else:
            return 0
    
    df['AS_TS_TNIC'] = df.apply(mark1, axis=1)
    df['AS_TP_TNIC'] = df.apply(mark2, axis=1)
    df['AP_TS_TNIC'] = df.apply(mark3, axis=1)
    df['AP_TP_TNIC'] = df.apply(mark4, axis=1)
    
    df['A_T_TNIC'] = df.apply(mark_all, axis=1) # analysable
    
    return df

In [108]:
def check_coverage(sdc_df, year, k):
    '''
    check portion of deals matched with GVKEY, could be found in graph
    
    '''
    sdc_df_year = sdc_df[sdc_df.YEAR == year]
    gvkey_year, _, _ = get_gvkeylst_adj(tnic, year=year, top_peer=k, get_adj=False)
    sdc_df_year2 = sdc_in_tnic(sdc_df_year, gvkey_year)
    print(f"total number of deals with GVKEY match is: {sdc_df_year2.shape[0]}")
    answer = sdc_df_year2.A_T_TNIC.value_counts().values[0]/ sdc_df_year2.shape[0]
    print(f"coverage in year {year} is {str(round(answer, 3))}")
    
    

# Load data

In [109]:
sdc_df = pd.read_pickle(tmp_data_path+'/merged_analysable.pickle')

In [110]:
tnic = load_tnic(data_path, s_year, e_year)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tnic['gvkey1'] = tnic['gvkey1'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tnic['gvkey2'] = tnic['gvkey2'].astype(str)


# What is the portion of SDC deals can be found in the TNIC graph
- around 50% - 60%
- roughly totally 12000*0.55 = 6600 deals 


In [111]:

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for year in range(s_year, e_year+1, 1):
        check_coverage(sdc_df, year, k)

total number of deals with GVKEY match is: 715
coverage in year 1996 is 0.648
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1263
coverage in year 1998 is 0.508
total number of deals with GVKEY match is: 1253
coverage in year 1999 is 0.515
total number of deals with GVKEY match is: 1122
coverage in year 2000 is 0.587
total number of deals with GVKEY match is: 940
coverage in year 2001 is 0.576
total number of deals with GVKEY match is: 762
coverage in year 2002 is 0.538
total number of deals with GVKEY match is: 767
coverage in year 2003 is 0.574
total number of deals with GVKEY match is: 748
coverage in year 2004 is 0.564
total number of deals with GVKEY match is: 717
coverage in year 2005 is 0.533
total number of deals with GVKEY match is: 682
coverage in year 2006 is 0.585
total number of deals with GVKEY match is: 673
coverage in year 2007 is 0.557
total number of deals with GVKEY match is: 493
coverage in y

# If increase k of top-k peers could increase coverage?
- No, but a single event can update more firms

## Link with SDC

How many SDC deals could be found in the graph (Both side could be found in this list)

Only the following 4 conditons are analysable:

| AP | AS | TS | TP |
|----|----|----|----|
|    | 1  | 1  |    |
|    | 1  |    | 1  |
| 1  |    | 1  |    |
| 1  |    |    | 1  |

- the blank place could be either 0 or 1


In [112]:
year = 1997
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for k in range(10, 100, 10):
        check_coverage(sdc_df, year, k)

total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
total number of deals with GVKEY match is: 1176
coverage in year 1997 is 0.509
