In [74]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [75]:
import pandas as pd
import numpy as np
from datetime import datetime

# Raw data

In [76]:
# Load the raw deal export and preview its first two rows.
data = pd.read_csv(filepath_or_buffer='../raw_data/210908_LeWagon_finalproject_v3.csv')
data.head(n=2)

Unnamed: 0,SDC Deal No,Date Announced,"Deal Value\r\n(USD, Millions)","Acquiror Total Assets Last 12 Months\r\n(USD, Millions)",Acquiror Full Name,Acquiror Primary Ticker Symbol,Target Full Name,Target Nation,Acquiror Nation,Target Public Status,Acquiror TRBC Industry,Target TRBC Industry,Percentage of Shares Held at Announcement,Percentage of Shares Acquired in Transaction,Consideration Offered\r\n('|'),Deal Attitude,Acquisition Techniques\r\n('|'),Acquiror Financial Advisors Name\r\n('|'),Target Financial Advisors Name\r\n('|'),Purpose\r\n('|')
0,1631732020,01/01/2005,12.0,3160.41,Plains All American Pipeline LP,PAA,Shell Pipeline Co LP-Crude Oil Pipeline Assets...,United States,United States,Subsidiary,Oil & Gas Refining and Marketing,Oil & Gas Transportation Services,,100.0,Cash|Cash Only,Friendly,Financial Acquiror|Divestiture,,,Strengthen existing operations/expand presence...
1,1653652020,01/01/2005,,1680.57,Regis Corp,RGS,Scot Lewis Schools,United States,United States,Private,Personal Services,Professional & Business Education,,100.0,Unspecified,Friendly,Not Applicable,,,


# Adjust data

## Column names

In [77]:
# Inspect the raw column headers (several contain embedded line breaks) before renaming them.
data.columns

Index(['SDC Deal No', 'Date Announced', 'Deal Value\r\n(USD, Millions)',
       'Acquiror Total Assets Last 12 Months\r\n(USD, Millions)',
       'Acquiror Full Name', 'Acquiror Primary Ticker Symbol',
       'Target Full Name', 'Target Nation', 'Acquiror Nation',
       'Target Public Status', 'Acquiror TRBC Industry',
       'Target TRBC Industry', 'Percentage of Shares Held at Announcement',
       'Percentage of Shares Acquired in Transaction',
       'Consideration Offered\r\n('|')', 'Deal Attitude',
       'Acquisition Techniques\r\n('|')',
       'Acquiror Financial Advisors Name\r\n('|')',
       'Target Financial Advisors Name\r\n('|')', 'Purpose\r\n('|')'],
      dtype='object')

In [78]:
# Snake_case replacements for the raw export headers.
# The names are applied positionally, so they are listed in the raw file's column order.
new_columns = [
    # deal identification
    'id',
    'announcement_date',
    'deal_value',
    # parties
    'acquiror_total_assets',
    'acquiror_name',
    'acquiror_ticker',
    'target_name',
    'target_nation',
    'acquiror_nation',
    'target_status',
    'acquiror_industry',
    'target_industry',
    # deal terms
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'attitude',
    'acquisition_technique',
    # advisors and rationale
    'acquiror_financial_advisor',
    'target_financial_advisor',
    'purpose',
]

In [79]:
# Positional rename: new_columns must hold exactly one name per column, in the raw file's column order.
data.columns = new_columns

## Adding TRBC (industry classifier)

In [80]:
# Load the TRBC lookup table (industry Title -> Hierarchical_Code) and preview it.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files from a trusted source.
trbc = pd.read_pickle('../MA_PREDICTOR/data/trbc.pkl')
trbc.head()

Unnamed: 0,Title,Hierarchical_Code
0,Coal,50101010
1,Integrated Oil & Gas,50102010
2,Oil & Gas Exploration and Production,50102020
3,Oil & Gas Refining and Marketing,50102030
4,Oil & Gas Drilling,50103010


In [81]:
# Attach the TRBC hierarchical code of the acquiror and target industries.
# A Title -> code lookup is mapped onto each industry column instead of joining
# and renaming in place:
#   * re-running the cell just overwrites the two code columns (the join/rename
#     version ended up with duplicate column names on a second run);
#   * a repeated Title in `trbc` cannot multiply deal rows (the first code per
#     Title is kept).
code_by_title = (
    trbc.drop_duplicates(subset='Title')
    .set_index('Title')['Hierarchical_Code']
)

for industry_col, code_col in [('acquiror_industry', 'acquiror_code'),
                               ('target_industry', 'target_code')]:
    # Industries without a match in `trbc` get NaN, as with the previous left join.
    data[code_col] = data[industry_col].map(code_by_title)

## dtypes

In [82]:
# Check the inferred dtypes to see which columns still need conversion.
data.dtypes

id                              int64
announcement_date              object
deal_value                     object
acquiror_total_assets          object
acquiror_name                  object
acquiror_ticker                object
target_name                    object
target_nation                  object
acquiror_nation                object
target_status                  object
acquiror_industry              object
target_industry                object
shares_at_announcement        float64
shares_acquired               float64
consideration_offered          object
attitude                       object
acquisition_technique          object
acquiror_financial_advisor     object
target_financial_advisor       object
purpose                        object
acquiror_code                 float64
target_code                   float64
dtype: object

In [83]:
# Convert the day/month/year strings in announcement_date to datetime values.
data = data.assign(
    announcement_date=pd.to_datetime(data['announcement_date'], format="%d/%m/%Y")
)

In [84]:
# Transforming deal_value and acquiror_total_assets to floats
def rem_com(x):
    """Return ``x`` without commas when it is a string; return it unchanged otherwise."""
    # isinstance also accepts str subclasses, which the exact `type(x) == str` test rejected.
    return x.replace(',', '') if isinstance(x, str) else x


num_feat = ['deal_value', 'acquiror_total_assets']
for feat in num_feat:
    # Strings lose their commas before the cast; floats (including NaN) pass through untouched.
    data[feat] = data[feat].apply(rem_com).astype(float)

In [85]:
# Transforming trbc codes
code_cols = ['acquiror_code', 'target_code']

# Drop deals with a missing code in either column, then cast the codes to int.
# dropna and astype each return a new frame, so no column is assigned into a
# filtered slice of `data` (the previous per-column casts did that, which can
# trigger pandas' SettingWithCopyWarning).
data = data.dropna(subset=code_cols).astype({col: int for col in code_cols})

# Modifying data

In [86]:
# Count the missing values per column to decide how each column is handled.
data.isna().sum()

id                                0
announcement_date                 0
deal_value                    17813
acquiror_total_assets          8738
acquiror_name                     0
acquiror_ticker                  73
target_name                       1
target_nation                     1
acquiror_nation                   0
target_status                     0
acquiror_industry                 0
target_industry                   0
shares_at_announcement        40009
shares_acquired                   0
consideration_offered             7
attitude                          0
acquisition_technique             3
acquiror_financial_advisor    35036
target_financial_advisor      32109
purpose                       19147
acquiror_code                     0
target_code                       0
dtype: int64

In [87]:
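# Planned handling of missing values, per column:
# - deal_value: not considered (NaNs are left as they are)
# - acquiror_ticker: remove rows with NaN
# - target_name / target_nation / target_industry: remove rows with NaN
# - target_status: remove rows with NaN
# - shares_at_announcement: fill NaN with 0
# - consideration_offered: remove rows with NaN
# - acquiror_financial_advisor / target_financial_advisor: not considered (NaNs are left as they are)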

## Fill up missing values

In [88]:
# NA in 'shares_at_announcement' implies 0.
# The filled frame is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place call works on an intermediate
# Series and, with pandas Copy-on-Write, does not update `data`.
data = data.fillna({'shares_at_announcement': 0})

## Removing NAs

In [89]:
# Columns in which a missing value disqualifies the whole deal row.
drop_na = [
    'acquiror_ticker',
    'target_name',
    'target_nation',
    'target_status',
    'target_industry',
    'consideration_offered',
]

In [90]:
# Shape before dropping rows with missing values in the drop_na columns.
data.shape

(41209, 22)

In [91]:
# Keep only the deals that have a value in every drop_na column.
data = data.dropna(subset=drop_na)

## Optional: clean consideration offered

In [92]:
# Fraction of deals whose consideration_offered is 'Unspecified'.
# Removing them would cost about 38% of the data.
unspecified_deals = data.loc[data['consideration_offered'].eq('Unspecified')]
len(unspecified_deals) / len(data)

0.3874583606876261

In [93]:
# Shape after dropping the rows with missing values.
data.shape

(41127, 22)

In [94]:
# These rows are excluded only when building the detailed dataset (uncomment the filter below for that run)
# data = data[data['consideration_offered'] != 'Unspecified']

In [95]:
# Shape is unchanged because the 'Unspecified' filter is commented out.
data.shape

(41127, 22)

## US only

In [96]:
# The raw data was filtered on nation of incorporation, so acquirors from other
# nations are still present; list every value of acquiror_nation.
data['acquiror_nation'].unique()

array(['United States', 'Canada', 'China (Mainland)', 'Russia',
       'Hong Kong', 'Philippines', 'Poland', 'Japan', 'Netherlands',
       'France', 'Malaysia', 'United Kingdom', 'Australia', 'Taiwan',
       'Sweden', 'Bermuda', 'Switzerland', 'Hungary', 'Singapore',
       'Israel', 'Peru', 'Argentina', 'Colombia', 'United Arab Emirates',
       'Thailand', 'Brazil', 'Cambodia', 'New Zealand', 'Panama',
       'South Korea', 'Germany', 'Ireland', 'Italy', 'Ecuador', 'Vietnam',
       'Czech Republic', 'Dominican Republic', 'Fiji', 'Kenya', 'Denmark',
       'Ukraine', 'Latvia', 'Malta', 'Jamaica', 'Puerto Rico', 'Greece',
       'Indonesia', 'U.S. Virgin Islands', 'India'], dtype=object)

In [97]:
# Keep only the deals whose acquiror nation is the United States.
data = data.loc[data['acquiror_nation'].eq('United States')]

## Empty acquisitions

In [98]:
# Shape after restricting the data to US acquirors.
data.shape

(39851, 22)

In [99]:
# Empty acquisitions are deals with shares_acquired <= 0; there are 12579 of them.
data.loc[data['shares_acquired'].le(0.0)].shape

(12579, 22)

In [100]:
# Keep only the deals in which a positive share was acquired.
data = data.loc[data['shares_acquired'].gt(0.0)]

In [101]:
# Shape after removing the empty acquisitions.
data.shape

(27272, 22)

## Optional: No purpose given

In [102]:
# Deals with the undefined purpose 'Other': 673.
# Adding the 12533 deals whose purpose is NaN gives 13206 lost deals.
data.loc[data['purpose'].eq('Other')].shape

(673, 22)

In [103]:
# Shape before the optional purpose filter.
data.shape

(27272, 22)

In [104]:
# These rows are excluded only for the detailed model (uncomment the filters below for that run)
# data = data[data['purpose'].notna()]
# data = data[data['purpose'] != 'Other']

In [105]:
# Shape is unchanged because the purpose filters are commented out.
data.shape

(27272, 22)

## Optional: No acquisition_technique

In [106]:
# Deals without a specified acquisition technique ('Not Applicable'): 13940.
data.loc[data['acquisition_technique'].eq('Not Applicable')].shape

(13940, 22)

In [107]:
# Shape before the optional acquisition_technique filter.
data.shape

(27272, 22)

In [108]:
# These rows are excluded only when building the detailed dataset (uncomment the filter below for that run)
# data = data[data['acquisition_technique'] != 'Not Applicable']

In [109]:
# Shape is unchanged because the acquisition_technique filter is commented out.
data.shape

(27272, 22)

## Optional: missing financial data

In [110]:
# Financial columns that can contain NaN. The row filter below is left disabled,
# so deals with missing financial data are currently kept.
fin_feat = ['deal_value', 'acquiror_total_assets']
#for feat in fin_feat:
#    data = data[data[feat].notna()]

In [111]:
# Shape is unchanged because the financial-data filter is commented out.
data.shape

(27272, 22)

# Upload 1

In [112]:
# Final dtype check before the cleaned data is written to disk.
data.dtypes

id                                     int64
announcement_date             datetime64[ns]
deal_value                           float64
acquiror_total_assets                float64
acquiror_name                         object
acquiror_ticker                       object
target_name                           object
target_nation                         object
acquiror_nation                       object
target_status                         object
acquiror_industry                     object
target_industry                       object
shares_at_announcement               float64
shares_acquired                      float64
consideration_offered                 object
attitude                              object
acquisition_technique                 object
acquiror_financial_advisor            object
target_financial_advisor              object
purpose                               object
acquiror_code                          int64
target_code                            int64
dtype: obj

In [70]:
# detailed: 5047 deals, not_detailed: 27272 deals
# NOTE(review): the saved output of this cell, (5047, 22), comes from an out-of-order
# run (execution count 70), presumably with the optional filters enabled; with the
# filters commented out as they are now, the preceding cells report (27272, 22).
data.shape

(5047, 22)

In [126]:
# Save the cleaned deals without the index. announcement_date is written as
# day/month/year text, so anything reading this file has to parse dates day-first.
data.to_csv('../MA_PREDICTOR/data/ma_data.csv', index=False, date_format="%d/%m/%Y")

# Clean missing feature (CAR)

In [132]:
# Load the deal data that carries the additional `car` column.
# This notebook writes the file with date_format="%d/%m/%Y", so the dates are
# stored day-first and must be parsed with dayfirst=True. Without it, an
# ambiguous date such as 03/02/2005 can be read month-first.
data_car = pd.read_csv(
    '../MA_PREDICTOR/data/ma_data_car.csv',
    parse_dates=['announcement_date'],
    dayfirst=True,
)

In [137]:
data_car.isna().sum() # we will lose 8813 observations (rows without a car value)

id                                0
announcement_date                 0
deal_value                    12921
acquiror_total_assets          1882
acquiror_name                     0
acquiror_ticker                   0
target_name                       0
target_nation                     0
acquiror_nation                   0
target_status                     0
acquiror_industry                 0
target_industry                   0
shares_at_announcement            0
shares_acquired                   0
consideration_offered             0
attitude                          0
acquisition_technique             0
acquiror_financial_advisor    22135
target_financial_advisor      19527
purpose                       12533
acquiror_code                     0
target_code                       0
car                            8813
dtype: int64

In [139]:
# Discard the deals for which no car value is available.
data_car = data_car.dropna(subset=['car'])

In [140]:
data_car.shape # dropping the rows without car leaves us with 18459 observations

(18459, 23)

# Upload 2

In [141]:
# NOTE: this overwrites the same file that data_car was loaded from, now without
# the rows that lack a car value; dates are written as day/month/year text.
data_car.to_csv('../MA_PREDICTOR/data/ma_data_car.csv', index=False, date_format="%d/%m/%Y")