In [23]:
import pickle
import pandas as pd
import numpy as np

## helpers

In [17]:
def same_day_only_one(sdc_tnic_raw):
    """Collapse events that share an acquirer and a date into a single row.

    Rows are grouped on ('AGVKEY', 'DA') and reduced with ``GroupBy.first``,
    so every remaining column keeps its first non-null value inside the
    group. The two keys come back as ordinary leading columns and the result
    is ordered by 'DA'. The frame shape is printed before and after.

    Parameters
    ----------
    sdc_tnic_raw : pandas.DataFrame
        Event table with at least the 'AGVKEY' and 'DA' columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        One row per ('AGVKEY', 'DA') pair, sorted by 'DA'.
    """
    print("shape before removing same-day multi events:", sdc_tnic_raw.shape)
    working = sdc_tnic_raw.copy()
    per_acquirer_day = working.groupby(['AGVKEY', 'DA'])
    deduped = per_acquirer_day.first().reset_index(drop=False)
    print("shape after removing same-day multi events:", deduped.shape)
    return deduped.sort_values(by=['DA'], axis=0)

# load data


In [3]:

# Root folder of the M&A data and its `tmp` sub-folder.
data_path = '../MA_data/data'
tmp_data_path = data_path + '/tmp'

# First and last year of the sample; both appear in dataset file names.
s_year, e_year = 1997, 2020

In [9]:
# Load the pickled frequent-acquirer bundle (a_freq).
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(data_path + "/freq_a_info_1997_2020.pickle", mode="rb") as fh:
    a_freq_info = pickle.load(fh)

In [11]:
# Split the loaded bundle into its four components.
(
    A_freq,
    a_freq_lst,
    a_freq_idx_to_gvkey_mapping,
    a_freq_gvkey_to_idx_mapping,
) = a_freq_info

In [18]:
# Always read the full 1997-2020 table, whatever s_year / e_year are set to.
# A point process allows at most one event per time point, so same-day
# events of an acquirer are collapsed into one right after loading.
sdc_tnic = same_day_only_one(pd.read_pickle(tmp_data_path + "/sdc_tnic_1997_2020"))


shape before removing same-day multi events: (9661, 50)
shape after removing same-day multi events: (9448, 50)


In [21]:
# List the column labels of the de-duplicated event table.
sdc_tnic.columns

Index(['AGVKEY', 'DA', 'ACU', 'ASIC2', 'ABL', 'ANL', 'APUBC', 'AUP', 'AUPSIC',
       'AUPBL', 'AUPNAMES', 'AUPPUB', 'BLOCK', 'CREEP', 'DE', 'STATC', 'SYNOP',
       'VAL', 'PCTACQ', 'PSOUGHTOWN', 'PSOUGHT', 'PHDA', 'PCTOWN', 'PSOUGHTT',
       'PRIVATIZATION', 'DEAL_NO', 'TCU', 'TSIC2', 'TBL', 'TNL', 'TPUBC',
       'TUP', 'TUPSIC', 'TUPBL', 'TUPNAMES', 'TUPPUB', 'SIC_A', 'SIC_T',
       'YEAR', 'AS_PERMNO', 'AP_PERMNO', 'TS_PERMNO', 'TP_PERMNO', 'AS_GVKEY',
       'AP_GVKEY', 'TS_GVKEY', 'TP_GVKEY', 'GVKEY_STATUS', 'MA_TYPE',
       'TGVKEY'],
      dtype='object')

In [44]:
# Show the YEAR column (rows are ordered by 'DA' after de-duplication).
sdc_tnic.YEAR

5055    1997
9392    1997
6442    1997
149     1997
6164    1997
        ... 
4869    2020
2344    2020
3106    2020
7232    2020
7979    2020
Name: YEAR, Length: 9448, dtype: int64

# Q: how many frequent acquirers have a self-event in 2020?

- 496 frequent acquirers (roughly 500; 32 / 496 ≈ 0.0645, matching the ratio computed below)
- 32 trigger a self event in 2020

In [22]:
# Events from 2020 whose acquirer is one of the frequent acquirers.
sdc_2020 = sdc_tnic.loc[sdc_tnic['AGVKEY'].isin(a_freq_lst) & sdc_tnic['YEAR'].eq(2020)]

In [25]:
# Share of frequent acquirers with at least one event in 2020.
np.unique(sdc_2020.AGVKEY.values).size / len(a_freq_lst)

0.06451612903225806

In [27]:
# Number of distinct frequent acquirers with an event in 2020.
np.unique(sdc_2020.AGVKEY.values).size

32

## How about choosing another year to predict, one that has more self-events?

In [37]:
# For each year 2010-2020, count the distinct frequent acquirers with an event.
for year in range(2010, 2021):
    sdc_year = sdc_tnic.loc[sdc_tnic['AGVKEY'].isin(a_freq_lst) & sdc_tnic['YEAR'].eq(year)]
    print(
        "number of self event in year{} is {}".format(
            year, np.unique(sdc_year.AGVKEY.values).size
        )
    )

number of self event in year2010 is 114
number of self event in year2011 is 90
number of self event in year2012 is 114
number of self event in year2013 is 99
number of self event in year2014 is 97
number of self event in year2015 is 99
number of self event in year2016 is 88
number of self event in year2017 is 87
number of self event in year2018 is 71
number of self event in year2019 is 63
number of self event in year2020 is 32


**Predictions should be made on a rolling basis!**
- Make sure the code is compatible with such a design.

# Q: build delta_time_t for MC estimation for prediction

## helpers

In [31]:

def convert_date(df, base_date='1997-01-01'):
    """Return a copy of ``df`` with an 'UPDATE_DATE_int' day-offset column.

    'UPDATE_DATE_int' holds the number of whole calendar days between
    ``base_date`` and each row's 'UPDATE_DATE'. Time of day is ignored, and
    dates before ``base_date`` give negative values.

    The offsets are computed for the whole column at once instead of row by
    row. This means the column is also created for an empty frame, and frames
    whose index contains repeated labels are handled.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'UPDATE_DATE' column of datetime-like values. It is
        not modified.
    base_date : str or datetime-like, default '1997-01-01'
        Day that maps to offset 0.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the extra 'UPDATE_DATE_int' column (float64).

    Raises
    ------
    KeyError
        If ``df`` has no 'UPDATE_DATE' column.
    """
    out = df.copy()
    dates = pd.to_datetime(out['UPDATE_DATE'])
    if dates.dt.tz is not None:
        # Drop the timezone but keep the wall-clock date, like `.date()` does.
        dates = dates.dt.tz_localize(None)
    origin = pd.Timestamp(base_date).normalize()
    elapsed = dates.dt.normalize() - origin
    # NOTE(review): stored as float64 because the earlier row-by-row `.loc`
    # assignment appears to have produced a float column (new columns start
    # as NaN). Switch to int64 once downstream code is confirmed not to rely
    # on floats.
    out['UPDATE_DATE_int'] = elapsed.dt.days.astype('float64')
    return out

## load data

In [32]:
# Load the prepared dataset tuple for the configured year range.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(f"{data_path}/dataset_top10_freq5_{s_year}_{e_year}.pickle", mode="rb") as fh:
    arr_cs, arr_bs, timelines = pickle.load(fh)

## main

In [33]:
# Add the 'UPDATE_DATE_int' day-offset column to a copy of the first timeline.
tmp_df = convert_date(timelines[0])

In [50]:
# Print the length of each of the first ten entries of arr_bs.
for i in range(10):
    print(len(arr_bs[i]))

22
24
13
23
24
23
24
24
24
24


In [30]:
# Display the first timeline frame.
timelines[0]

Unnamed: 0,GLOBAL_IDX,LOCAL_IDX,UPDATE_DATE,EVENT_TYPE,TGVKEY
0,0,0,1997-01-01,3,
1,1,0,1997-01-06,2,10499
2,2,1,1997-01-16,2,5073
3,3,2,1997-01-23,1,11636
4,4,3,1997-03-10,1,29132
...,...,...,...,...,...
211,211,190,2017-03-27,1,13440
212,212,191,2017-10-31,2,162894
213,213,21,2018-01-01,3,
214,214,192,2018-02-07,1,135990


# Q: is the set of frequent acquirers invariant?
- A frequent acquirer should be determined not only by the number of events it has encountered, but also by the length of its "active" period.

What is the earliest year I could use for prediction?

In [53]:
# Collect the first event year of every frequent acquirer, then print the
# latest of those first years.
years = []
for gvkey in a_freq_lst:
    sub = sdc_tnic.loc[sdc_tnic['AGVKEY'].eq(gvkey)]
    year = sub['YEAR'].values.min()
    years.append(year)
print(max(years))

2017


If I want to predict 2017-01-01 ~ 2018-01-01

# create dataset 

4 steps:
- finish the evaluation function
- create a dataset that ends at `year-01-01` (inclusive)
- get the `delta_time_before`
- get the label


In [39]:
# Load the pickled M&A dataset (N01 variant).
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(data_path + "/ma_dataset_N01.pickle", mode="rb") as fh:
    ma_dataset = pickle.load(fh)

In [41]:
# Inspect element [2] of the first dataset record.
# NOTE(review): presumably the inter-event time deltas — confirm.
ma_dataset[0][2]

array([  7.,  46.,  37.,  51.,   3.,   1.,   1.,  39.,   3.,  10.,  14.,
        40.,  76.,  54.,  10.,   3.,  36.,  24.,  42.,  38.,  46.,  26.,
        15.,  33.,   1.,  15.,  14.,   5.,   7.,  18.,  32.,   7.,  44.,
        50.,   1.,  26.,  13.,   7.,   4.,   2.,  60.,  76.,   1.,  27.,
        29.,  35.,   3.,  64.,  13.,  26.,  22.,  13.,  11.,  30.,  50.,
         6.,  97.,  43.,  29.,  20.,   4.,  13.,  21., 109.,   7.,   4.,
        30.,   1.,  68.,  23.,  66.,  15.,   7.,  40.,  59., 108.,  37.,
         6.,  98.,  17.,  54.,   3., 248.,   8.,  29.,   1.,  16.,  37.,
        97.,  59., 101.,  10., 263.,  14., 402., 323., 200.,  99., 119.])