In [2]:
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
import warnings

In [3]:
tmp_data_path = '../MA_data/data/tmp'
data_path = '../MA_data/data'

s_year = 1997
e_year = 2020

In [150]:
sdc_tnic = pd.read_pickle(tmp_data_path+f"/sdc_tnic_{s_year}_{e_year}")

In [332]:

with open(data_path+"/dataset_top10_freq5_1997_2020.pickle", "rb") as f:
    arr_cs, arr_bs, timelines = pickle.load(f)

In [330]:
with open(data_path+"/freq_acquirer_lst.pickle", 'rb') as f:
    corresponding_gvkey_lst = pickle.load( f)

In [148]:
len(arr_cs)

511

In [None]:
with open(tmp_data_path+f"/tnic_info_3_pairs_{s_year-1}_{e_year-1}", 'rb') as f:
    gvkey_lsts, key_ind_maps , ind_key_maps = pickle.load(f)

# arr_b, arr_c, arr_delta_time


## arr_c and  arr_delta_time

- you have to ignore the first event (if it's self event, you cannot model it since there's no "time shift" of the 1st self/peer event)
- arr_delta_time is +1 shift of time in arr_c (so, the first self event you can model is idx = 2 (if you look at original arr_c, but later I drop the first row of arr_c so the "past event" could start from idx=0, and "self-event" start from idx=1))

In [373]:
with open(tmp_data_path+"/testing1.pickle", "rb") as f:
    arr_b, sample_arr_c, sample_df = pickle.load(f)

In [374]:
arr_c = sample_arr_c[1:]
arr_delta_time = arr_c[:, 0]

## arr_b

arr_b has nothing to modify

In [375]:
arr_b = arr_bs[0]

# Event data

+ event_data: 
   + arr_b_idx: lst of indeces, [3, 3, 4, 4, 4, 5, 9 ...]
       + length = L3
       + element: integer as row number in arr_b; for true event
   + arr_c_idx: lst of indeces, [3, 3, 4, 4, 4, 5, 9 ...]
       + length = L3
       + element: integer as row number in arr_c; for true event
   + arr_t_idx
       + length = L3



## arr_b_idx

for every self event, which row of fv should you use as b?

qualify:

1. Event_type = 1 (self event)
2. local_idx >= 2

Idea:
- get event_type = 3 right before the global idx of the qualified self event:... and retreive the local idx of that row





In [376]:

def get_arr_b_odx(df):
    '''
    df is the a timeline table of a single firm (the 3rd output of preprocess function)
    
    output is a list
    
    '''
    sample_df = df.copy()
    sub1 = sample_df[(sample_df.EVENT_TYPE == '1')& (sample_df.LOCAL_IDX >=2)]

    global_idxs = sub1.GLOBAL_IDX.values # array

    arr_b_idxs = []
    for global_idx in global_idxs:
        sub2 = sample_df[(sample_df.EVENT_TYPE == '3') & (sample_df.GLOBAL_IDX < global_idx)]
        arr_b_idx = sub2.iloc[-1, 1]
        arr_b_idxs.append(arr_b_idx)

    return arr_b_idxs


In [377]:
tmp_b_idx = get_arr_b_odx(sample_df)

## arr_c_idx and arr_t_idx

when you model the i-th event in c (which is a self event, but keep in mind c contains both self and peer event), get the hidden state of i-1 event

In [378]:
def get_c_t_idx(df):
    sample_df = df.copy()
    sub1 = sample_df[(sample_df.EVENT_TYPE == '1')& (sample_df.LOCAL_IDX >=2)] 
    local_idxs = sub1.LOCAL_IDX.values # array
    
    arr_c_idxs = []
    arr_t_idxs = []
    for local_idx in local_idxs:
        sub2 = sample_df[(sample_df.EVENT_TYPE.isin(['1','2'])) & (sample_df.LOCAL_IDX < local_idx)]
        arr_c_idx = sub2.iloc[-1, 1] -1
        arr_t_idx = sub2.iloc[-1, 1]
        arr_c_idxs.append(arr_c_idx)
        arr_t_idxs.append(arr_t_idx)
    return arr_c_idxs, arr_t_idxs
        
        
    

In [379]:
tmp_c_idx, tmp_t_idx = get_c_t_idx(sample_df)

In [380]:
print(tmp_b_idx, '\n')
print(tmp_c_idx, '\n')
print(tmp_t_idx, '\n')

[0.0, 2.0, 2.0, 12.0] 

[2.0, 6.0, 8.5, 31.0] 

[3.0, 7.0, 9.5, 32.0] 



# non_event_data

- non_event_data: for negative sampling in timing model only; idea: draw time point from Unif(0, MAX_T). pick up the corresponding b, c, delta_t.
    - arr_b_idx:  lst of indeces, 
      -  length: L_Neg
    - arr_c_idx: 
    - estimate_length: scalar, = max(time)  

In [387]:
tmp_df = convert_date(sample_df)

In [382]:
def convert_date(df):
    df = df.copy()
    def datetime_converter(date_time):
        base_time = np.datetime64('1997-01-01')
        days_diff = np.datetime64(date_time.date()) - base_time
        return days_diff.astype(int)
    for idx, row in df.iterrows():
        df.loc[idx, 'UPDATE_DATE_int'] = datetime_converter(df.loc[idx, 'UPDATE_DATE'])

    #df.sort_values(by = ['UPDATE_DATE']).reset_index(drop=True, inplace=True)
    return df

In [383]:
def sample_negative_time_point(df, base_n_sample=10):
    '''
    df is timeline + 'UPDATE_DATE_int'
    
    number of negative samples is corresponding to the number of positive samples (follow the idea of negative smapling in skip-gram)
        each word, approx 10 negative samples.
    
    '''
    df = df.copy()
    max_time = df.UPDATE_DATE_int.values[-1]
    sub_df = df[df.EVENT_TYPE.isin(['1', '2']) & (df.LOCAL_IDX >=2)]
    min_time = sub_df.UPDATE_DATE_int.values[0]
    n_samples = base_n_sample * df.GLOBAL_IDX.values[-1]
    samples = np.random.uniform(low=min_time, high=max_time, size=n_samples)
    return samples, max_time - min_time
    
    

In [388]:
samples, estimate_time_length = sample_negative_time_point(tmp_df)

## arr_b_idx


In [389]:
def get_arr_b_idx_neg(time_samples, df):
    '''
    df is timeline + 'UPDATE_DATE_int'
    
    '''
    df = df.copy()
    df_b = df[df.EVENT_TYPE == '3']
    arr_b_idxs = []
    for time in time_samples:
        df_b_sub = df_b[df_b.UPDATE_DATE_int<time]
        arr_b_idxs.append(df_b_sub.iloc[-1, 1])
    
    return arr_b_idxs
        

In [390]:
get_arr_b_idx_neg(samples, tmp_df)

[17.0,
 7.0,
 8.0,
 9.0,
 8.0,
 0.0,
 4.0,
 14.0,
 10.0,
 1.0,
 20.0,
 1.0,
 4.0,
 18.0,
 6.0,
 6.0,
 14.0,
 20.0,
 3.0,
 8.0,
 4.0,
 17.0,
 3.0,
 7.0,
 10.0,
 17.0,
 6.0,
 15.0,
 5.0,
 11.0,
 12.0,
 0.0,
 3.0,
 1.0,
 19.0,
 0.0,
 18.0,
 7.0,
 11.0,
 11.0,
 5.0,
 20.0,
 9.0,
 0.0,
 0.0,
 10.0,
 16.0,
 13.0,
 7.0,
 12.0,
 9.0,
 20.0,
 12.0,
 10.0,
 12.0,
 6.0,
 4.0,
 3.0,
 5.0,
 15.0,
 17.0,
 18.0,
 16.0,
 16.0,
 18.0,
 6.0,
 2.0,
 16.0,
 19.0,
 4.0,
 14.0,
 19.0,
 13.0,
 12.0,
 15.0,
 9.0,
 2.0,
 4.0,
 5.0,
 12.0,
 8.0,
 14.0,
 13.0,
 3.0,
 10.0,
 20.0,
 20.0,
 11.0,
 1.0,
 5.0,
 12.0,
 12.0,
 14.0,
 11.0,
 14.0,
 6.0,
 18.0,
 10.0,
 16.0,
 14.0,
 2.0,
 8.0,
 19.0,
 2.0,
 1.0,
 1.0,
 9.0,
 19.0,
 1.0,
 8.0,
 0.0,
 19.0,
 1.0,
 5.0,
 16.0,
 18.0,
 3.0,
 0.0,
 18.0,
 14.0,
 2.0,
 15.0,
 19.0,
 5.0,
 6.0,
 0.0,
 9.0,
 13.0,
 18.0,
 15.0,
 8.0,
 17.0,
 14.0,
 10.0,
 13.0,
 0.0,
 5.0,
 6.0,
 6.0,
 18.0,
 13.0,
 16.0,
 20.0,
 15.0,
 5.0,
 15.0,
 0.0,
 12.0,
 13.0,
 13.0,
 15.0,
 13.0,
 1.0,


## arr_c_idx and arr_t

In [391]:
def get_arr_c_t_idx_neg(time_samples, df):
    '''
    df is timeline + 'UPDATE_DATE_int'
    total columns are: [GLOBAL_IDX  LOCAL_IDX UPDATE_DATE EVENT_TYPE  UPDATE_DATE_int]
    '''
    df = df.copy()
    df_c = df[df.EVENT_TYPE.isin(['1', '2']) & (df.LOCAL_IDX >=2)]

    arr_c_idxs_neg = []
    arr_t_neg = []
    for time in time_samples:
        df_before = df_c[df_c.UPDATE_DATE_int < time]
        #print(time)
        
        arr_c_idx_neg = df_before.iloc[-1, 1] # here do not -1!
        previous_time = df_before.iloc[-1, 5]
        print(time, previous_time)
        arr_c_idxs_neg.append(arr_c_idx_neg)
        arr_t_neg.append(time - previous_time)
    
    return arr_c_idxs_neg, np.array(arr_t_neg)
        
        
        
        
        
        

In [392]:
tmp_df

Unnamed: 0,GLOBAL_IDX,LOCAL_IDX,UPDATE_DATE,EVENT_TYPE,TGVKEY,UPDATE_DATE_int
0,0,0.0,2000-01-01,3,,1095.0
1,1,1.0,2000-03-06,1,3105,1160.0
2,2,1.0,2000-03-06,2,3105,1160.0
3,3,1.0,2000-03-06,2,3105,1160.0
4,4,3.0,2000-03-13,2,10580,1167.0
...,...,...,...,...,...,...
61,61,42.0,2018-12-03,2,10726,8006.0
62,62,19.0,2019-01-01,3,,8035.0
63,63,43.0,2019-02-13,2,112030,8078.0
64,64,20.0,2020-01-01,3,,8400.0


In [393]:
arr_c_idxs_neg, arr_t_neg = get_arr_c_t_idx_neg(samples, tmp_df)

7593.308352265808 7321.0
3929.6910979205763 3900.0
4174.991586461352 4165.0
4714.762569579885 4625.0
4304.507262000441 4206.0
1359.5113640226896 1279.0
2655.9752304571216 2339.0
6430.307308389162 6419.0
4956.63100104246 4625.0
1484.8107775347682 1279.0
8533.1611434728 8078.0
1506.4972715273975 1279.0
2717.8343249552627 2339.0
7804.326808446769 7609.0
3600.706019842716 3535.0
3579.9021111507786 3535.0
6551.134632960025 6550.0
8694.51213101231 8078.0
2512.038222663421 2339.0
4364.730542452634 4206.0
2657.7867417114803 2339.0
7638.756388687758 7609.0
2433.613451218386 2339.0
3852.1035619431755 3821.0
4857.1686462072175 4625.0
7489.305660502155 7321.0
3595.73008472824 3535.0
6706.145487092548 6550.0
3150.4472663467445 3128.0
5117.682225643928 4625.0
5661.342144861919 4625.0
1191.633044129765 1175.0
2329.4802948078127 2184.0
1668.0989887708702 1279.0
8120.110487510821 8078.0
1272.7840380194284 1175.0
7808.849568467784 7609.0
3778.8826795140167 3535.0
5447.276619041891 4625.0
5358.9680580778

7314.954195870948 7041.0
5149.9442434966495 4625.0
7822.101991036349 7609.0
7903.198553504887 7609.0
4202.661175386771 4165.0
7826.213046161904 7609.0
5792.893511861902 5774.0
6255.940737240474 6015.0
7258.887926437913 7041.0
7082.885119083325 7041.0
7251.420957984981 7041.0
8151.2656512188905 8078.0
2852.513027669762 2806.0
4090.7469315965477 3935.0
7412.956257216654 7321.0
3506.4001722178896 3442.0
7752.233974055633 7609.0
5375.14114100286 4625.0
2130.661867897034 1988.0
6929.6797042189455 6550.0
3314.6308529199982 3128.0
6538.740604061246 6440.0
6089.59168292427 6015.0
5133.501030438991 4625.0
2431.8371909359753 2339.0
5278.982645654889 4625.0
1564.5342975396684 1279.0
7788.231557692833 7609.0
4375.335832093967 4206.0
3497.7416876217435 3442.0
5063.788464091121 4625.0
5524.525351004439 4625.0
6831.468124513819 6550.0
7272.324348819051 7041.0
3243.402773334417 3128.0
2945.608914967178 2892.0
3969.1654947179927 3935.0
5090.134596627203 4625.0
4447.023031913348 4206.0
8493.446653143794

In [394]:
arr_t_neg

array([2.72308352e+02, 2.96910979e+01, 9.99158646e+00, 8.97625696e+01,
       9.85072620e+01, 8.05113640e+01, 3.16975230e+02, 1.13073084e+01,
       3.31631001e+02, 2.05810778e+02, 4.55161143e+02, 2.27497272e+02,
       3.78834325e+02, 1.95326808e+02, 6.57060198e+01, 4.49021112e+01,
       1.13463296e+00, 6.16512131e+02, 1.73038223e+02, 1.58730542e+02,
       3.18786742e+02, 2.97563887e+01, 9.46134512e+01, 3.11035619e+01,
       2.32168646e+02, 1.68305661e+02, 6.07300847e+01, 1.56145487e+02,
       2.24472663e+01, 4.92682226e+02, 1.03634214e+03, 1.66330441e+01,
       1.45480295e+02, 3.89098989e+02, 4.21104875e+01, 9.77840380e+01,
       1.99849568e+02, 2.43882680e+02, 8.22276619e+02, 7.33968058e+02,
       1.19977841e+02, 6.04752400e+02, 4.15154330e+02, 5.09386777e+01,
       7.30591781e+01, 3.65595591e+02, 1.85414059e+02, 6.41013388e+01,
       2.67702974e+02, 1.10654944e+03, 3.46219156e+02, 5.93368407e+02,
       9.25033321e+02, 4.82166851e+02, 1.07762468e+03, 2.56490055e+02,
      

# chocie data dict



- choice_data_dict: for choice model only, a dict contains:(invariant for firms,  variant by year)
                - dict_idx: A dict split arr_b_idx and arr_c_idx to year. e.g. {1997: [true_tar_idxs, arr_b_idx, arr_c_idx], '1998': [arr_b_idx, arr_c_idx], ...} 
                            year is "the year that self event happens" (so the size varies for differnt firms)
                    
                - true_tar_idxs_i: a list, length = N_i_2 
                            contains the idx of acquirer in the graph (not gvkey)
                    - arr_b_idx_i: lst, N_i_2 
                    - arr_c_idx_i: lst, N_i_2 

                - node features array: [N_i, in_channels_i]
                - network structure array(edges): [2, N_edges_i] # the idx here is corresponding to node feature array

                



## dict_idx



### arr_c_idx_i + arr_b_idx_i

```python
# create a new year var by update_date
def get_arr_b_c_idx_i()
    df['year'] = update_date.year
    sub = df[event_type == 1, local_idx >= 2]


    yearly = {}
    for year in range(s_year, e_year+1):
        b_idxs = []
        c_idxs = []
        sub2 = sub[year==year] # yearly self event
        for row in iterrows:
            time = get_time(row)
            # back to global df
            df_b_before = df[update-date-int < time, EVENT_type =='3']
            df_c_before = df[update-date-int < time, EVENT_type.isin(['1', '2'])]
            idx_c = df_c_before.iloc[-1, 1] - 1 # since remove the 1st row = 0
            idx_b = df_b_before.iloc[-1, 1]
            b_idxs.append(idx_b)
            c_idxs.append(idx_c)

        yearly[year]=b_idxs

    return yearly
        
    
    
    
    
```

In [395]:
def get_arr_b_c_idx_i(df, s_year, e_year):
    '''
    df is timeline + 'UPDATE_DATE_int'
    total columns are: [GLOBAL_IDX  LOCAL_IDX UPDATE_DATE EVENT_TYPE  UPDATE_DATE_int]
    '''
    df = df.copy()
    # create a year variable
    def helper(row):
        return row.UPDATE_DATE.year
    df['year'] = df.apply(helper, axis=1)
    
    # qualified self event
    sub = df[(df.EVENT_TYPE == '1') & (df.LOCAL_IDX >= 2)]
    
    yearly = {}
    for year in range(s_year, e_year+1):
        b_idxs = []
        c_idxs = []
        sub2 = sub[sub.year == year] # self event at particular year
        for _, row in sub2.iterrows():
            
            time = row.UPDATE_DATE_int # float
            print(time)
            # back to global df
            df_b_before = df[(df.UPDATE_DATE_int < time)&(df.EVENT_TYPE == '3')]
            df_c_before = df[(df.UPDATE_DATE_int < time)&(df.EVENT_TYPE.isin(['1','2']))]
            idx_b = df_b_before.iloc[-1, 1]
            print(df_c_before)
            idx_c = df_c_before.iloc[-1, 1] -1 # 
            b_idxs.append(idx_b)
            c_idxs.append(idx_c)
            
        yearly[year] = (b_idxs, c_idxs)
    
    return yearly
        
        
        
    

In [399]:
tmp_df = convert_date(sample_df)

In [400]:
tmp_df


Unnamed: 0,GLOBAL_IDX,LOCAL_IDX,UPDATE_DATE,EVENT_TYPE,TGVKEY,UPDATE_DATE_int
0,0,0.0,2000-01-01,3,,1095.0
1,1,1.0,2000-03-06,1,3105,1160.0
2,2,1.0,2000-03-06,2,3105,1160.0
3,3,1.0,2000-03-06,2,3105,1160.0
4,4,3.0,2000-03-13,2,10580,1167.0
...,...,...,...,...,...,...
61,61,42.0,2018-12-03,2,10726,8006.0
62,62,19.0,2019-01-01,3,,8035.0
63,63,43.0,2019-02-13,2,112030,8078.0
64,64,20.0,2020-01-01,3,,8400.0


In [401]:
temp_dict_idx = get_arr_b_c_idx_i(tmp_df, 1997, 2020)

1175.0
   GLOBAL_IDX  LOCAL_IDX UPDATE_DATE EVENT_TYPE TGVKEY  UPDATE_DATE_int  year
1           1        1.0  2000-03-06          1   3105           1160.0  2000
2           2        1.0  2000-03-06          2   3105           1160.0  2000
3           3        1.0  2000-03-06          2   3105           1160.0  2000
4           4        3.0  2000-03-13          2  10580           1167.0  2000
1832.0
   GLOBAL_IDX  LOCAL_IDX UPDATE_DATE EVENT_TYPE  TGVKEY  UPDATE_DATE_int  year
1           1        1.0  2000-03-06          1    3105           1160.0  2000
2           2        1.0  2000-03-06          2    3105           1160.0  2000
3           3        1.0  2000-03-06          2    3105           1160.0  2000
4           4        3.0  2000-03-13          2   10580           1167.0  2000
5           5        4.0  2000-03-21          1    3105           1175.0  2000
6           6        5.0  2000-07-03          2  120716           1279.0  2000
8           8        6.0  2001-10-08       

## true_tar_idxs_i 

already have:
- tnic raw + rank
- gvkey_lsts, key_ind_maps , ind_key_maps 


need: 
- TNIC2
- MA event


```python
gvkey == '5047'

sub_sdc_tnic = sdc_tnic[['AGVKEY', 'TGVKEY', 'DE', 'YEAR']]
sub_sdc_tnic = sdc_tnic[sub_sdc_tnic.AGVKEY == gvkey].sort_values(by = ['DE'])
 
def choice_data_dict(sdc_tnic, timeline):
    
    '''
    timeline is timeline + 'UPDATE_DATE_int'
    total columns are: [GLOBAL_IDX  LOCAL_IDX UPDATE_DATE EVENT_TYPE  UPDATE_DATE_int]
    
    sdc_tnic, only has ['AGVKEY', 'TGVKEY', 'DE', 'YEAR']
    '''
    
    
    
    ###### get true_tar_idxs_i ######## 
    df = timeline.copy()
    # create a year variable
    def helper(row):
        return row.UPDATE_DATE.year
    
    df['year'] = df.apply(helper, axis=1)
    
    # qualified self event
    df = df[(df.EVENT_TYPE == '1') & (df.LOCAL_IDX >= 2)]

    

    yearly = {}
    for year in range(s_year, e_year):
        
        ######
        
        # watchout! self event should use "most recent year" TNIC structure
        
        gvkey_lsts, key_ind_maps , ind_key_maps # invariant for all acquirer, variant by year
        N_i_1 = len(gvkey_lsts) # invariant for all acquirer
      
        b_idxs, c_idxs = dict_idx[year] 
        N_i_2 = len(b_idxs) 
        sub_sdc_tnic = sdc_tnic[sdc_tnic.YEAR == year]
         
        assert N_i_2 == sub_sdc_tnic.shape[0], "timeline length dismatch sdc_tnic df shape[0]"
        
        tgvkey_lst = sub_sdc_tnic.TGVKEY.values.tolist()
        target_idx_lst = [key_ind_maps[tgvkey] for tgvkey in tgvkey_lst]
        true_tar_idxs_i = F.one_hot(torch.LongTensor(target_idx_lst), num_classes = N_i_2)
    
        yearly[year] = true_tar_idxs_i
        

    return yearly    
    
        
    




```

In [402]:
import torch.nn.functional as F
import torch

In [403]:
def true_tar_idxs_i(timeline, dict_idx):
    '''
    year loop by self-event year
        TNIC related data use year-1
    
    '''
    # add year to timeline data
    df = timeline.copy()
    # create a year variable
    def helper(row):
        return row.UPDATE_DATE.year
    df['year'] = df.apply(helper, axis=1)
    
    # qualified self event
    sub = df[(df.EVENT_TYPE == '1') & (df.LOCAL_IDX >= 2)]
    
    yearly = {}
    # loop over self-merge year
    for year in range(s_year, e_year+1):
        '''
        N_i_1 = num of candidate target
        N_i_2 = num of self event
        '''
        N_i_1 = len(gvkey_lsts[year-1]) # all target candidate in TNIC net
        b_idxs, c_idxs = dict_idx[year] # the output of ...
        N_i_2 = len(b_idxs)
        timeline_i = sub[sub.year == year] # only ith year
        targets_lst = timeline_i.TGVKEY.values.tolist() # length = N_i_2
        print(len(targets_lst), N_i_2)
        assert len(targets_lst) == N_i_2, "length dismatch with larget lists and N_i_2"
        idx_lst = [key_ind_maps[year-1][tgvkey] for tgvkey in targets_lst]
        one_hot_i = F.one_hot(torch.LongTensor(idx_lst), num_classes = N_i_1)
        yearly[year] = one_hot_i
    return yearly

In [404]:
# def true_tar_idxs_i(sdc_tnic, dict_idx,  timeline, gvkey_lsts, key_ind_maps , ind_key_maps, s_year=1997, e_year=2020):
#     '''
#     sdc_tnic: full raw data
#     dict_idx: output for a single gvkey
#     timeline: output for a single gvkey
#     gvkey_lsts, key_ind_maps , ind_key_maps: full raw data
    
#     '''
#     sub_sdc_tnic = sdc_tnic[['AGVKEY', 'TGVKEY', 'DA', 'YEAR']]
#     sub_sdc_tnic = sub_sdc_tnic[sub_sdc_tnic.AGVKEY == gvkey].sort_values(by = ['DA']) # in time order
    
#     df = timeline.copy()
#     timeline_self = df[(df.EVENT_TYPE == '1') & (df.LOCAL_IDX >= 2)] # qualified self event
#     merged_timeline =  timeline_self.merge(sub_sdc_tnic, left_on = 'UPDATE_DATE', right_on = 'DA', how = 'inner')
#     merged_timeline = merged_timeline[['GLOBAL_IDX', 'LOCAL_IDX', 'UPDATE_DATE', 'EVENT_TYPE', 'UPDATE_DATE_int', 'TGVKEY', 'YEAR']]
    
    
    
    
#     yearly = {}
#     for year in range(s_year, e_year+1):
#         # WATCHOUT, self event in ith year should use i-1th year TNIC structure
#         N_i_1 = len(gvkey_lsts[year])
#         b_idxs, c_idxs = dict_idx[year] # the output of ...
#         N_i_2 = len(b_idxs)
#         merged_timeline_i =  merged_timeline[merged_timeline.YEAR == year]
#         targets_lst = merged_timeline_i.TGVKEY.values.tolist() # list
#         print(merged_timeline_i)
#         assert len(targets_lst) == N_i_2, "length dismatch with larget lists and N_i_2"
#         idx_lst = [key_ind_maps[year][tgvkey] for tgvkey in targets_lst]
#         one_hot_i = F.one_hot(torch.LongTensor(idx_lst), num_classes = N_i_1)
#         yearly[year] = one_hot_i
    
#     return yearly
        
        
        
        
        

In [405]:
tmp_df

Unnamed: 0,GLOBAL_IDX,LOCAL_IDX,UPDATE_DATE,EVENT_TYPE,TGVKEY,UPDATE_DATE_int
0,0,0.0,2000-01-01,3,,1095.0
1,1,1.0,2000-03-06,1,3105,1160.0
2,2,1.0,2000-03-06,2,3105,1160.0
3,3,1.0,2000-03-06,2,3105,1160.0
4,4,3.0,2000-03-13,2,10580,1167.0
...,...,...,...,...,...,...
61,61,42.0,2018-12-03,2,10726,8006.0
62,62,19.0,2019-01-01,3,,8035.0
63,63,43.0,2019-02-13,2,112030,8078.0
64,64,20.0,2020-01-01,3,,8400.0


In [406]:
foo = true_tar_idxs_i(tmp_df, temp_dict_idx)

0 0
0 0
0 0
1 1
0 0
2 2
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0


## node feature array 

- fv_full contains full year, all gvkey shows in TNIC that year

- self event happened in i-th year 
    - should use:
        - i-1 year TNIC network
        - and fv


- **WATCHOUT: that feature used for frequent acquirer and target are different! since generally, more features are avaliable for frequent acquirers**


(N_i_1, fv)



In [407]:
with open(data_path+"/fv_raw_full_1996_2019.pickle", 'rb') as f:
    fv_full = pickle.load(f)

In [408]:
fv_full.columns

Index(['gvkey', 'year', 'at', 'sale', 'ch', 'rdip', 'm2b', 'lev', 'roa', 'ppe',
       'cash2asset', 'cash2sale', 'sale2asset', 'cr', 'gsi', 'de', 'roe',
       'd_sale', 'd_at'],
      dtype='object')

In [409]:
def get_node_features(fv_full, gvkey_lsts, key_ind_maps , ind_key_maps, s_year=1997, e_year=2020):
    '''
    fv_full: raw
    gvkey_lsts, key_ind_maps , ind_key_maps: raw
    
    WARNING: the output yearly's year is self-event's year!!! 
    '''
    # loop self-merge year
    yearly = {}
    for year in range(s_year, e_year+1): 
        df_gvkeys = pd.DataFrame({'gvkeys': gvkey_lsts[year-1]})
        fv_candidate = fv_full[fv_full.gvkey.isin(gvkey_lsts[year-1]) & (fv_full.year == year-1)]
        fv_i = df_gvkeys.merge(fv_candidate, left_on='gvkeys', right_on = 'gvkey', how = "left")
        fv_i.reset_index(drop=True, inplace=True)
        #print(fv_i[:5])
        arr = fv_i.iloc[:, 3:].to_numpy()
        yearly[year] = arr
        assert len(gvkey_lsts[year-1]) == arr.shape[0], "list and arr shape dismatch"
    
    return yearly
        
    

In [410]:
tmp_node_feature = get_node_features(fv_full, gvkey_lsts, key_ind_maps , ind_key_maps, s_year=1997, e_year=2020)

## network structure array(edges)

what I want:

(2, E), unweighted edge


use I use:
tnic network structure (top-10)

In [411]:
def get_net_structure(tmp_data_path, gvkey_lsts, key_ind_maps , ind_key_maps, s_year=1997, e_year=2020):
    
    yearly = {}    
    # loop over self-event year! not TNIC !
    for year in range(s_year, e_year+1):     
        with open(tmp_data_path+f'/a5_top_10_peers_tnic2_{year-1}.pickle', 'rb') as f:
            tnic = pickle.load(f)   
            df_all_lst = []
            for _,value in tnic.items():
                df_all_lst.append(value)
            df_all = pd.concat(df_all_lst)
            df_net = df_all[['gvkey1', 'gvkey2']]
            lst1 = df_net.gvkey1.values.tolist()
            lst2 = df_net.gvkey2.values.tolist()
            idx1 = [key_ind_maps[year-1][gvkey1] for gvkey1 in lst1]
            idx2 = [key_ind_maps[year-1][gvkey2] for gvkey2 in lst2]
            arr = np.array([idx1, idx2])
            assert arr.shape[0] == 2, "the dim of output is wrong"
        yearly[year] = arr
    return yearly
            
                
        
    

In [412]:
tmp_structure = get_net_structure(tmp_data_path, gvkey_lsts, key_ind_maps , ind_key_maps, s_year=1997, e_year=2020)