In [5]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import dask
from dask.diagnostics import ProgressBar
import os

In [4]:
tmp_data_path = '../MA_data/data/tmp'
data_path = '../MA_data/data'

s_year = 1997
e_year = 2020

In [3]:
sdc_tnic = pd.read_pickle(tmp_data_path+f"/sdc_tnic_{s_year}_{e_year}")

In [4]:
tnic3 = pd.read_pickle(tmp_data_path+f"/tnic3_{s_year-1}_{e_year-1}") # use tnic 3
tnic2 = pd.read_pickle(tmp_data_path+f"/tnic2_{s_year-1}_{e_year-1}") # use tnic 3

In [5]:
with open(tmp_data_path+f"/tnic_info_3_pairs_{s_year-1}_{e_year-1}", 'rb') as f:
    gvkey_lsts, key_ind_maps , ind_key_maps = pickle.load(f)

# Choose "Frequent Acquirer"

In [6]:
A_freq = pd.DataFrame(sdc_tnic.AGVKEY.value_counts()).reset_index(drop=False)
A_freq = A_freq[A_freq.AGVKEY >= 5]
A_freq.columns = ["GVKEY", "freq"]
print(f"totally {A_freq.shape[0]} numbers of frequent Acquirers")

totally 511 numbers of frequent Acquirers


## obtain acquirer's top peers
1. obtain top peers of frequent acquirer every year (only consider freq acquirer universe? -- NO, consider the entire universe)
    - top 10 and thres = 0.4

In [None]:
def check_if_exist(gvkey):
    for y in range(1996, 2019+1, 1):
        print(y, '5047' in gvkey_lsts[y])

In [12]:
def create_top_peer(tnic, tnic_name):
    '''
    tnic = tnic data
    tnic_name = "tnic2" or "tnic3"
    '''
    def find_top_peer(tnic, gvkey, year, k=10, thres=0.2):
        '''
        
        Obtain top peers 
        tnic has: year, gvkey1, gvkey2, score, rank

        return: a data frame, top peer list is .gvkey2.tolist()
        '''
        top_peers_subdf =  tnic[(tnic.year == year)]
        top_peers_subdf = top_peers_subdf[(top_peers_subdf.gvkey1 == gvkey)] 
        top_peers = top_peers_subdf[(top_peers_subdf['score'] >= thres)]

        if top_peers.shape[0] < k:
            top_peers_10 = top_peers_subdf[(top_peers_subdf['yearly_rank'] <= k)]
            if top_peers.shape[0] < top_peers_10.shape[0]:
                top_peers = top_peers_10

        #top_peers_lst = top_peers
        return top_peers
    
    @dask.delayed
    def create_top_peer_map(tnic, tnic_name, year, gvkey_lsts, k=10, thres=0.2):
        y = year
        year_lst = {}
        #print("processing year ", y)
        for gvkey1 in gvkey_lsts:
            year_lst[gvkey1] = find_top_peer(tnic, gvkey1, y, k, thres)
        with open(tmp_data_path+f"/a5_top_10_peers_{tnic_name}_{y}.pickle", 'wb') as f:
            pickle.dump(year_lst, f)
    
    # main
    
    A_freq_gvkey_list = A_freq.GVKEY.tolist()
    
    if tnic_name == "tnic2":
        peer_map_tasks = [create_top_peer_map(tnic, "tnic2", yy, A_freq_gvkey_list, k=10, thres=0.2) for yy in range(s_year-1, e_year)]
    else:
        peer_map_tasks = [create_top_peer_map(tnic,"tnic3", yy, A_freq_gvkey_list, k=10, thres=0.2) for yy in range(s_year-1, e_year)]
    
    with ProgressBar():
        dask.compute(peer_map_tasks, num_workers = os.cpu_count())
    

In [14]:

#create_top_peer(tnic2, "tnic2")

[########################################] | 100% Completed |  1hr 12min  1.0s


In [289]:
focal_gvkey = "5047"


def get_focal_df(focal_gvkey):
    '''
    output: will be a df contains 3 columns: DATE, AGVKEY, EVENT_TYPE, SCORE
        DATE: datetime.dt object
        AGVKEY: str: 4 - 6 digits
        EVENT_TYPE: 1:self 0:peer (integer)
        SCORE: TNIC similarity last year for event type 0, otherwise 1
    
    '''
    def helper1(row):
        if row.AGVKEY == focal_gvkey:
            return 1 # integer 1
        else:
            return 0 # integer 0   
    sdc_lst = []
    for focal_year in range(s_year-1, e_year):  
        with open(tmp_data_path+f"/a5_top_10_peers_tnic2_{focal_year}.pickle", 'rb') as f:
            top_peers = pickle.load(f)
        try:
            top_peers = top_peers[focal_gvkey] # a dataframe
 #           print(top_peers)
            top_peers_lst = top_peers.gvkey2.tolist()
            selected_sdc_tnic = sdc_tnic[ (sdc_tnic['AGVKEY'].isin(top_peers_lst + [focal_gvkey])) & (sdc_tnic.YEAR == focal_year+1) ] 
            selected_sdc_tnic.reset_index(drop=True)
            if selected_sdc_tnic.shape[0] > 0:
                #print(selected_sdc_tnic[['DE', 'AGVKEY']] , top_peers[['gvkey2', 'score']])
                df = selected_sdc_tnic[['DE', 'AGVKEY']]
                
                
                df['EVENT_TYPE'] = df.apply(helper1, axis=1)
                #print(df)
                
                score_df = top_peers[['gvkey2', 'score']]
                
                df = df.merge(score_df, left_on='AGVKEY', right_on = 'gvkey2', how = 'left')
                df = df[['DE','AGVKEY', 'EVENT_TYPE', 'score']]
#                print(df)
                df = df.fillna(1)
                df.columns = ['UPDATE_DATE','AGVKEY','EVENT_TYPE', 'SCORE']
                df = df.reset_index(drop=True)
                sdc_lst.append(df)
            #print(len(sdc_lst))
        except:
            pass

    focal_df = pd.concat(sdc_lst, axis=0)
    focal_c = focal_df.reset_index(drop=True) 
    focal_c = focal_c.sort_values(by = ['UPDATE_DATE']) # date time is unsortable..
    focal_c.reset_index(drop=True, inplace=True)
    return focal_c


In [290]:
focal_c = get_focal_df(focal_gvkey)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EVENT_TYPE'] = df.apply(helper1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EVENT_TYPE'] = df.apply(helper1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EVENT_TYPE'] = df.apply(helper1, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [291]:
def convert_date(df):
    def datetime_converter(date_time):
        base_time = np.datetime64('1997-01-01')
        days_diff = np.datetime64(date_time.date()) - base_time
        return days_diff.astype(int)
    for idx, row in df.iterrows():
        df.loc[idx, 'UPDATE_DATE'] = datetime_converter(df.loc[idx, 'UPDATE_DATE'])
    
    df.sort_values(by = ['UPDATE_DATE']).reset_index(drop=True, inplace=True)
    return df
    

In [307]:
focal_c2 = convert_date(focal_c.copy())

In [304]:
def making_time_diff(focal_c2):
    '''
    df = focal_c; update date is the integer form that count the date from base_date (1997 01 01)
    
    WARNING: the No.1 event set time-diff = 0
    '''
    tmp_columns = focal_c2.columns.tolist()
    focal_c2['UPDATE_DATE'] = [0] + [1 if timediff==0 else timediff for timediff in focal_c2.UPDATE_DATE.diff().tolist()[1:] ]
    focal_c2.columns = ['time_diff'] + tmp_columns[1:]
    return focal_c2
    

In [308]:
focal_c3 = making_time_diff(focal_c2.copy())

In [309]:
focal_c3

Unnamed: 0,time_diff,AGVKEY,EVENT_TYPE,SCORE
0,0,5047,1,1.0000
1,44,5047,1,1.0000
2,28,5047,1,1.0000
3,4,10519,0,0.0242
4,1,5047,1,1.0000
...,...,...,...,...
193,323,5047,1,1.0000
194,200,5047,1,1.0000
195,218,3243,0,0.0105
196,260,5047,1,1.0000


In [311]:
arr_c = np.array(focal_c3[['time_diff', 'EVENT_TYPE', 'SCORE']])

In [312]:
arr_c

array([[0.00e+00, 1.00e+00, 1.00e+00],
       [4.40e+01, 1.00e+00, 1.00e+00],
       [2.80e+01, 1.00e+00, 1.00e+00],
       [4.00e+00, 0.00e+00, 2.42e-02],
       [1.00e+00, 1.00e+00, 1.00e+00],
       [1.00e+01, 0.00e+00, 5.12e-02],
       [7.00e+00, 1.00e+00, 1.00e+00],
       [4.60e+01, 1.00e+00, 1.00e+00],
       [1.00e+00, 1.00e+00, 1.00e+00],
       [8.00e+00, 0.00e+00, 3.59e-02],
       [5.00e+01, 1.00e+00, 1.00e+00],
       [1.00e+00, 1.00e+00, 1.00e+00],
       [4.00e+00, 1.00e+00, 1.00e+00],
       [3.80e+01, 0.00e+00, 3.59e-02],
       [6.00e+00, 0.00e+00, 5.12e-02],
       [3.40e+01, 1.00e+00, 1.00e+00],
       [1.30e+01, 1.00e+00, 1.00e+00],
       [2.80e+01, 1.00e+00, 1.00e+00],
       [2.00e+00, 0.00e+00, 3.97e-02],
       [9.20e+01, 0.00e+00, 2.78e-02],
       [4.60e+01, 1.00e+00, 1.00e+00],
       [1.30e+01, 1.00e+00, 1.00e+00],
       [7.00e+00, 0.00e+00, 2.67e-02],
       [3.00e+00, 0.00e+00, 2.61e-02],
       [3.30e+01, 1.00e+00, 1.00e+00],
       [4.40e+01, 1.00e+0

# obtain fv of freq acquirers



### add Date to fv

In [88]:
def add_datetime(df):
    def helper(row):
        return np.datetime64(str(row.year+1)+'-01-01')
    df['UPDATE_DATE'] = df.apply(helper, axis=1)
    return df

In [89]:
with open(tmp_data_path+"/afreq_full_fv.pickle", "rb") as f:
    fv = pickle.load(f)
fv = add_datetime(fv)

In [132]:
def obtain_fv(focal_gvkey, focal_c, fv):
    year_min, year_max = min([date.year for date in focal_c.UPDATE_DATE.tolist()]), max([date.year for date in focal_c.UPDATE_DATE.tolist()])
    fv_subset = fv[(fv.year >= year_min-1) & (fv.year <= year_max-1) & (fv.gvkey == focal_gvkey)]
    fv_subset = fv_subset[['gvkey', 'year','UPDATE_DATE', 'at', 'sale', 'ch', 'm2b', 'lev', 'roa', 'ppe',
       'cash2asset', 'cash2sale', 'sale2asset', 'de', 'roe', 'd_sale', 'd_at']]
    fv_subset.columns=['AGVKEY', 'year','UPDATE_DATE', 'at', 'sale', 'ch', 'm2b', 'lev', 'roa', 'ppe',
       'cash2asset', 'cash2sale', 'sale2asset', 'de', 'roe', 'd_sale', 'd_at']
    return fv_subset
    

In [133]:
focal_b = obtain_fv("5047", tmp, fv)

In [321]:
focal_c

Unnamed: 0,UPDATE_DATE,AGVKEY,EVENT_TYPE,SCORE
0,1997-04-16,5047,1,1.0000
1,1997-05-30,5047,1,1.0000
2,1997-06-27,5047,1,1.0000
3,1997-07-01,10519,0,0.0242
4,1997-07-01,5047,1,1.0000
...,...,...,...,...
193,2016-09-08,5047,1,1.0000
194,2017-03-27,5047,1,1.0000
195,2017-10-31,3243,0,0.0105
196,2018-07-18,5047,1,1.0000


In [158]:
focal_b

Unnamed: 0,AGVKEY,year,UPDATE_DATE,at,sale,ch,m2b,lev,roa,ppe,cash2asset,cash2sale,sale2asset,de,roe,d_sale,d_at
7782,5047,1996,1997-01-01,272402.0,78541.0,37003.0,1.452295,0.475202,0.026725,0.105708,0.053577,0.25035,0.288328,4.158908,0.233896,0.001172,0.09342
7783,5047,1997,1998-01-01,304012.0,88540.0,37003.0,1.646193,0.475896,0.026982,0.106298,0.053577,0.25035,0.291239,4.201115,0.238196,0.127309,0.116042
7784,5047,1998,1999-01-01,355935.0,99820.0,37003.0,1.801978,0.491778,0.026117,0.100383,0.053577,0.25035,0.280444,4.502083,0.239095,0.1274,0.170793
7785,5047,1999,2000-01-01,405200.0,110832.0,37003.0,2.126689,0.497959,0.026449,0.101239,0.053577,0.25035,0.273524,4.741241,0.251827,0.110319,0.13841
7786,5047,2000,2001-01-01,437006.0,128051.0,8195.0,1.954068,0.460662,0.029141,0.091566,0.018753,0.063998,0.293019,3.987008,0.252218,0.155361,0.078495
7787,5047,2001,2002-01-01,495023.0,125679.0,9082.0,1.674469,0.470447,0.02854,0.085127,0.018347,0.072263,0.253885,4.247811,0.257697,-0.018524,0.13276
7788,5047,2002,2003-01-01,575244.0,130685.0,8910.0,1.289519,0.485719,0.026307,0.082059,0.015489,0.068179,0.227182,4.385882,0.237544,0.039832,0.162055
7789,5047,2003,2004-01-01,647483.0,133585.0,12664.0,1.339667,0.470933,0.024076,0.082445,0.019559,0.094801,0.206314,3.850985,0.196881,0.022191,0.12558
7790,5047,2004,2005-01-01,499118.142857,115853.333333,37003.0,1.494505,0.527306,0.022999,0.10579,0.053577,0.25035,0.25883,4.1455,0.201941,0.001172,0.09342
7791,5047,2005,2006-01-01,499118.142857,115853.333333,37003.0,1.494505,0.527306,0.022999,0.10579,0.053577,0.25035,0.25883,4.1455,0.201941,0.001172,0.09342


In [315]:
arr_b = np.array(focal_b.iloc[:, 3:])

In [320]:
arr_b

array([[ 2.72402000e+05,  7.85410000e+04,  3.70030000e+04,
         1.45229510e+00,  4.75202091e-01,  2.67252076e-02,
         1.05707741e-01,  5.35767172e-02,  2.50350296e-01,
         2.88327545e-01,  4.15890763e+00,  2.33895582e-01,
         1.17151393e-03,  9.34195773e-02],
       [ 3.04012000e+05,  8.85400000e+04,  3.70030000e+04,
         1.64619304e+00,  4.75895688e-01,  2.69824875e-02,
         1.06298436e-01,  5.35767172e-02,  2.50350296e-01,
         2.91238504e-01,  4.20111505e+00,  2.38196179e-01,
         1.27309303e-01,  1.16041732e-01],
       [ 3.55935000e+05,  9.98200000e+04,  3.70030000e+04,
         1.80197843e+00,  4.91777993e-01,  2.61171281e-02,
         1.00383497e-01,  5.35767172e-02,  2.50350296e-01,
         2.80444463e-01,  4.50208333e+00,  2.39094650e-01,
         1.27400045e-01,  1.70792600e-01],
       [ 4.05200000e+05,  1.10832000e+05,  3.70030000e+04,
         2.12668918e+00,  4.97959033e-01,  2.64486673e-02,
         1.01238894e-01,  5.35767172e-02,  2.

# create main timeline

- b and c combine together

- '1' = 'self'
- '2' = 'peer'
- '3' = 'fv'


In [184]:
def create_main_timeline(focal_b, focal_c):
    '''
    WARNING: GLOBAL and LOCAL time both start from 0!
    
    '''
    def helper(row):
        if (row.EVENT_TYPE == 1) or (row.EVENT_TYPE == 0):
            return 'past'
        else:
            return "fv"
        
    def helper2(row):
        if row.EVENT_TYPE == 1:
            return "1"
        elif row.EVENT_TYPE == 0:
            return "2"
        else:
            return "3"
        
    tmp = pd.concat([focal_c, focal_b]).sort_values(by=['UPDATE_DATE'])
    tmp['EVENT_TYPE_countcreater'] = tmp.apply(helper, axis=1)
    tmp['EVENT_TYPE_true'] = tmp.apply(helper2, axis=1)
    tmp['LOCAL_IDX'] = tmp.groupby(['EVENT_TYPE_countcreater']).cumcount()
    
    tmp_columns = tmp.columns
    tmp.reset_index(drop=True, inplace=True)
    tmp.reset_index(drop=False, inplace=True)
    
    tmp.columns = ['GLOBAL_IDX']+ tmp_columns.tolist() # rename global index
    
    tmp = tmp[['GLOBAL_IDX', 'LOCAL_IDX', 'UPDATE_DATE', 'EVENT_TYPE_true']]

    tmp.columns = ['GLOBAL_IDX', 'LOCAL_IDX', 'UPDATE_DATE', 'EVENT_TYPE'] # rename
    
    
    
    return tmp
    

    

In [185]:
timeline = create_main_timeline(focal_b, focal_c)

In [186]:
timeline

Unnamed: 0,GLOBAL_IDX,LOCAL_IDX,UPDATE_DATE,EVENT_TYPE
0,0,0,1997-01-01,3
1,1,0,1997-04-16,1
2,2,1,1997-05-30,1
3,3,2,1997-06-27,1
4,4,3,1997-07-01,1
...,...,...,...,...
215,215,194,2017-03-27,1
216,216,195,2017-10-31,2
217,217,21,2018-01-01,3
218,218,196,2018-07-18,1
