In [6]:
import pandas as pd
import numpy as np
import random
import pickle

In [7]:
# Load the source CSV; `announcement_date` is parsed into datetimes while reading.
ma_data_path = '../../MA_PREDICTOR/data/ma_data_car.csv'
data_orig = pd.read_csv(ma_data_path, parse_dates=['announcement_date'])

In [8]:
# Removing outliers: keep only rows whose `car_*` value lies within +/-0.3
# for EVERY window listed in `cars`.
# Re-filtering `data_orig` from scratch on each iteration would leave only the
# last window's filter in effect, so the per-window masks are combined first.
cars = [1, 3, 5, 10]
within_bounds = pd.Series(True, index=data_orig.index)
for car in cars:
    # Missing values compare as False and are therefore dropped as well.
    within_bounds &= data_orig[f'car_{car}'].abs() <= 0.3
data = data_orig[within_bounds]

In [10]:
# Display the column labels of the filtered frame.
data.columns

Index(['id', 'announcement_date', 'deal_value', 'acquiror_total_assets',
       'acquiror_name', 'acquiror_ticker', 'target_name', 'target_nation',
       'acquiror_nation', 'target_status', 'acquiror_industry',
       'target_industry', 'shares_at_announcement', 'shares_acquired',
       'consideration_offered', 'attitude', 'acquisition_technique',
       'acquiror_financial_advisor', 'target_financial_advisor', 'purpose',
       'acquiror_code', 'target_code', 'acquisition_count', 'bidder_count',
       'car_1', 'car_3', 'car_5', 'car_10'],
      dtype='object')

In [12]:
# Work on an independent copy that holds only the columns used below,
# so the transformations that follow never modify `data`.
selected_columns = [
    'consideration_offered', 'shares_acquired', 'shares_at_announcement',
    'acquiror_code', 'target_code', 'acquiror_nation', 'target_nation',
    'announcement_date', 'target_status', 'acquisition_count', 'bidder_count',
    'car_1', 'car_3', 'car_5', 'car_10',
]
tmp = data.loc[:, selected_columns].copy()

# Consideration

In [13]:
# Turn each pipe-delimited consideration string into a list of its parts.
def list_gen(x):
    """Split a pipe-delimited string into a list of its components."""
    return x.split('|')

tmp['consideration_offered'] = tmp['consideration_offered'].map(list_gen)

# Cluster considerations
def cash(considerations):
    """Classify a list of consideration labels as 'Cash' or 'Other'.

    Returns 'Cash' only when every entry contains the substring 'Cash';
    a single entry without it makes the whole deal 'Other'.
    An empty list yields 'Cash'.
    """
    all_cash = all('Cash' in consideration for consideration in considerations)
    return 'Cash' if all_cash else 'Other'

# Apply: collapse each list of considerations into its 'Cash' / 'Other' label.
tmp['consideration_offered'] = tmp['consideration_offered'].map(cash)

# Public_vs_others

In [14]:
# Show the distinct target status values before they are collapsed to two classes.
tmp['target_status'].unique()

array(['Private', 'Public', 'Subsidiary', 'Joint Venture', 'Government'],
      dtype=object)

In [15]:
# Apply: reduce the status to a binary label, "public" versus everything else.
is_public = tmp['target_status'] == 'Public'
tmp['target_status'] = is_public.map({True: "public", False: "others"})

# Month

In [17]:
# Extract the calendar month (1-12) of each announcement date.
announcement_dates = pd.DatetimeIndex(tmp['announcement_date'])
tmp['month'] = announcement_dates.month

# Full vs not_full acquisition

In [18]:
# Render the acquired-share values as text for the label step that follows.
tmp = tmp.astype({'shares_acquired': str})

In [19]:
# Apply: label rows where the acquired share equals 100 as "full", all others
# as "not_full".
# The comparison is numeric rather than against the literal text "100.0", so the
# label does not depend on how the value was rendered as a string
# (e.g. "100" vs "100.0"). Missing or unparseable values become "not_full".
acquired_share = pd.to_numeric(tmp['shares_acquired'], errors='coerce')
tmp['shares_acquired'] = (acquired_share == 100).map({True: "full", False: "not_full"})

# Shares at announcement

In [20]:
# Render the share-at-announcement values as text for the label step that follows.
tmp = tmp.astype({'shares_at_announcement': str})

In [21]:
# Apply: label rows whose share at announcement equals 0 as "no", all others
# as "yes".
# The comparison is numeric rather than against the literal text "0.0", so the
# label does not depend on how the value was rendered as a string
# (e.g. "0" vs "0.0"). Missing or unparseable values stay "yes", as before.
share_at_announcement = pd.to_numeric(tmp['shares_at_announcement'], errors='coerce')
tmp['shares_at_announcement'] = (share_at_announcement == 0).map({True: "no", False: "yes"})

# Cross-border vs National

In [22]:
# A deal is national when acquiror and target share the same nation and
# cross-border otherwise. (The labels were previously swapped.)
# Rows where either nation is missing compare as unequal and are labelled
# 'cross_border'.
same_nation = tmp['acquiror_nation'] == tmp['target_nation']
tmp['cross_border'] = same_nation.map({True: 'national', False: 'cross_border'})

# Industry relatedness

In [23]:
# Transform codes

def relatedness(acquiror, target):
    """Return the finest classification level shared by two code strings.

    Identical codes give 'industry'. Otherwise the leading 6, 4 and 2
    characters are compared in turn, giving 'industry_group',
    'business_sector' or 'economic_sector' on the first match.
    If not even the first two characters agree the result is 'not_related'.
    """
    if acquiror == target:
        return 'industry'

    # Ordered from the most specific shared prefix to the least specific one.
    prefix_levels = (
        (6, 'industry_group'),
        (4, 'business_sector'),
        (2, 'economic_sector'),
    )
    for prefix_len, level in prefix_levels:
        if acquiror[:prefix_len] == target[:prefix_len]:
            return level
    return 'not_related'

In [24]:
# Compare acquiror and target codes pairwise (as strings) and keep the labels
# on `data`'s index so the assignment lines up with `tmp` row by row.
code_pairs = zip(data['acquiror_code'], data['target_code'])
tmp['relatedness'] = pd.Series(
    [relatedness(str(acquiror), str(target)) for acquiror, target in code_pairs],
    index=data.index,
)

# Declassifier

In [25]:
# Split a classification code into the two coarsest prefix levels used as columns.
def declassifier(x):
    """Return the first two and the first four characters of `str(x)` as a tuple."""
    code = str(x)
    return tuple(code[:width] for width in (2, 4))

In [26]:
# Derive the 2- and 4-character code prefixes of the acquiror and store each
# in its own column.
acquiror_prefixes = data['acquiror_code'].map(declassifier)
economic_ac, business_ac = zip(*acquiror_prefixes)
tmp['economic_sector_ac'] = economic_ac
tmp['business_sector_ac'] = business_ac

In [27]:
# Derive the 2- and 4-character code prefixes of the target and store each
# in its own column.
target_prefixes = data['target_code'].map(declassifier)
economic_target, business_target = zip(*target_prefixes)
tmp['economic_sector_target'] = economic_target
tmp['business_sector_target'] = business_target

# Columns modification

In [28]:
# The raw nation and date columns have been replaced by derived features
# (`cross_border`, `month`), so remove them.
tmp = tmp.drop(columns=['acquiror_nation', 'target_nation', 'announcement_date'])

In [30]:
# Display the column labels after the raw nation/date columns were dropped.
tmp.columns

Index(['consideration_offered', 'shares_acquired', 'shares_at_announcement',
       'acquiror_code', 'target_code', 'target_status', 'acquisition_count',
       'bidder_count', 'car_1', 'car_3', 'car_5', 'car_10', 'month',
       'cross_border', 'relatedness', 'economic_sector_ac',
       'business_sector_ac', 'economic_sector_target',
       'business_sector_target'],
      dtype='object')

In [35]:
# Move the `car_*` columns to the end of the frame, keeping every other column
# in its current relative order.
# Columns are selected by name rather than by position, so the cell gives the
# same result when re-run and when columns are added or removed upstream.
car_columns = {'car_1', 'car_3', 'car_5', 'car_10'}
old_order = tmp.columns.tolist()
new_order = (
    [col for col in old_order if col not in car_columns]
    + [col for col in old_order if col in car_columns]
)

tmp = tmp[new_order]

# Implementation of the declassifier function

In [36]:
# Load the pickled classification lookup table read by `get_info_trbc`.
# The context manager closes the file handle once loading is done.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it comes from a trusted source.
with open("../../MA_PREDICTOR/data/declassification.pkl", "rb") as pickle_in:
    results = pickle.load(pickle_in)

In [37]:
# Preview the first rows of the loaded lookup table.
results.head()

Unnamed: 0,economic_sector,business_sector,industry_group,industry,activity,hierarchical_id
0,Energy,Energy - Fossil Fuels,Coal,Coal,Coal (NEC),50101010
1,Energy,Energy - Fossil Fuels,Coal,Coal,Coal Mining Support,50101010
2,Energy,Energy - Fossil Fuels,Coal,Coal,Coal Wholesale,50101010
3,Energy,Energy - Fossil Fuels,Oil & Gas,Integrated Oil & Gas,Integrated Oil & Gas,50102010
4,Energy,Energy - Fossil Fuels,Oil & Gas,Oil & Gas Exploration and Production,Oil & Gas Exploration and Production (NEC),50102020


In [38]:
def get_info_trbc(hierarchical_id):
    """Translate a hierarchical ID prefix into its name from `results`.

    The length of ``str(hierarchical_id)`` selects the column that is read:
    2 -> 'economic_sector', 4 -> 'business_sector', 6 -> 'industry_group',
    8 -> 'industry'. The value returned is taken from the first row of
    `results` whose 'hierarchical_id' starts with the given prefix.

    Returns:
        The matching name, or None (after printing a message) when the length
        is not 2/4/6/8, when no row matches, or when the lookup fails.
    """
    # The prefix length decides which level of the hierarchy is requested.
    level_by_length = {
        2: 'economic_sector',
        4: 'business_sector',
        6: 'industry_group',
        8: 'industry',
    }
    str_id = str(hierarchical_id)
    level = level_by_length.get(len(str_id))
    if level is None:
        print("Please enter a valid hierarchical ID.")
        return None

    try:
        # Cast to str so the prefix match also works when the IDs are stored
        # as integers; a no-op when they are already strings.
        id_matches = results['hierarchical_id'].astype(str).str.startswith(str_id)
        return results.loc[id_matches, level].iloc[0]
    except Exception:
        # Best-effort lookup: report and return None instead of aborting, but
        # let KeyboardInterrupt / SystemExit propagate (a bare `except:` would
        # swallow them too).
        print("Please enter a valid hierarchical ID.")
        return None

In [39]:
# Replace the acquiror's 2-character code prefix with the name looked up for it.
tmp['economic_sector_ac'] = tmp['economic_sector_ac'].map(get_info_trbc)

In [40]:
# Replace the acquiror's 4-character code prefix with the name looked up for it.
tmp['business_sector_ac'] = tmp['business_sector_ac'].map(get_info_trbc)

In [41]:
# Replace the target's 2-character code prefix with the name looked up for it.
tmp['economic_sector_target'] = tmp['economic_sector_target'].map(get_info_trbc)

In [42]:
# Replace the target's 4-character code prefix with the name looked up for it.
tmp['business_sector_target'] = tmp['business_sector_target'].map(get_info_trbc)

# Uploading

In [44]:
# Write the cleaned frame next to the source data, without the index column.
clean_data_path = '../../MA_PREDICTOR/data/ma_data_car_clean.csv'
tmp.to_csv(clean_data_path, index=False, date_format="%d/%m/%Y")