In [301]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [302]:
import pandas as pd
import numpy as np
from datetime import datetime

# Raw data

In [303]:
data = pd.read_csv('../raw_data/210908_LeWagon_finalproject_v3.csv')
data.head(2)

Unnamed: 0,SDC Deal No,Date Announced,"Deal Value\r\n(USD, Millions)","Acquiror Total Assets Last 12 Months\r\n(USD, Millions)",Acquiror Full Name,Acquiror Primary Ticker Symbol,Target Full Name,Target Nation,Acquiror Nation,Target Public Status,Acquiror TRBC Industry,Target TRBC Industry,Percentage of Shares Held at Announcement,Percentage of Shares Acquired in Transaction,Consideration Offered\r\n('|'),Deal Attitude,Acquisition Techniques\r\n('|'),Acquiror Financial Advisors Name\r\n('|'),Target Financial Advisors Name\r\n('|'),Purpose\r\n('|')
0,1631732020,01/01/2005,12.0,3160.41,Plains All American Pipeline LP,PAA,Shell Pipeline Co LP-Crude Oil Pipeline Assets...,United States,United States,Subsidiary,Oil & Gas Refining and Marketing,Oil & Gas Transportation Services,,100.0,Cash|Cash Only,Friendly,Financial Acquiror|Divestiture,,,Strengthen existing operations/expand presence...
1,1653652020,01/01/2005,,1680.57,Regis Corp,RGS,Scot Lewis Schools,United States,United States,Private,Personal Services,Professional & Business Education,,100.0,Unspecified,Friendly,Not Applicable,,,


# Adjust data

## Column names

In [304]:
data.columns

Index(['SDC Deal No', 'Date Announced', 'Deal Value\r\n(USD, Millions)',
       'Acquiror Total Assets Last 12 Months\r\n(USD, Millions)',
       'Acquiror Full Name', 'Acquiror Primary Ticker Symbol',
       'Target Full Name', 'Target Nation', 'Acquiror Nation',
       'Target Public Status', 'Acquiror TRBC Industry',
       'Target TRBC Industry', 'Percentage of Shares Held at Announcement',
       'Percentage of Shares Acquired in Transaction',
       'Consideration Offered\r\n('|')', 'Deal Attitude',
       'Acquisition Techniques\r\n('|')',
       'Acquiror Financial Advisors Name\r\n('|')',
       'Target Financial Advisors Name\r\n('|')', 'Purpose\r\n('|')'],
      dtype='object')

In [305]:
new_columns = [
    'id', 'announcement_date', 'deal_value', 'acquiror_total_assets', 'acquiror_name', 'acquiror_ticker', 'target_name',
    'target_nation', 'acquiror_nation', 'target_status', 'acquiror_industry', 'target_industry',
    'shares_at_announcement', 'shares_acquired', 'consideration_offered', 'attitude',
    'acquisition_technique', 'acquiror_financial_advisor', 'target_financial_advisor', 'purpose'
]

In [306]:
data.columns = new_columns

## Adding TRBC (industry classifier)

In [307]:
trbc = pd.read_pickle('../MA_PREDICTOR/data/trbc.pkl')
trbc.head()

Unnamed: 0,Title,Hierarchical_Code
0,Coal,50101010
1,Integrated Oil & Gas,50102010
2,Oil & Gas Exploration and Production,50102020
3,Oil & Gas Refining and Marketing,50102030
4,Oil & Gas Drilling,50103010


In [308]:
# Acquiror
data = data.join(trbc.set_index('Title'), on='acquiror_industry')
data.rename(columns={'Hierarchical_Code': 'acquiror_code'}, inplace=True)

# Target
data = data.join(trbc.set_index('Title'), on='target_industry')
data.rename(columns={'Hierarchical_Code': 'target_code'}, inplace=True)

## Adding acquisition_count

In [309]:
# Loading acquisition data from 1990-2021
hist_acquisition = pd.read_pickle('../MA_PREDICTOR/data/acquisition_count.pkl')
hist_acquisition.drop(columns=['announcement_date'], inplace=True)

In [310]:
data = data.join(hist_acquisition.set_index('id')['acquisition_count'], on='id')

In [311]:
# We have to fill Nan's with previous value for cumcount for this specific acuqiror

# Get all acquirors' name in dataset
acquirors = set(data.acquiror_name.unique())

# Get all acquirors' name in historical dataset
hist_acquirors = set(hist_acquisition.acquiror_name.unique())

# Iterate through set of acquirors and fill NaN values accordingly
for acquiror in acquirors:
    
    # Set deals per acquiror as variable
    series = data.acquisition_count[data.acquiror_name == acquiror]
    
    # Special fill if first deal in dataset is not completed (ffill not suited)
    if pd.isna(series.iloc[0]):
        
        # No completed acquisition can be observed from 2005-2021 dataset
        if all(pd.isna(data.acquisition_count[data.acquiror_name == acquiror])):
            
            # Take most recent value (happening before 2005) and manually accumulate
            if acquiror in hist_acquirors:
                series.iloc[0] = hist_acquisition[hist_acquisition.acquiror_name == acquiror
                                                 ].acquisition_count.max() + 1

            # If not available in historical dataset, completed mergers are assumed to be 0
            else:
                series.iloc[0] = 0
                
        # Find next available number of completed acquisition and manually decumulate
        else:
            for i in range(len(series)):
                if not pd.isna(series.iloc[i]):
                    series.iloc[0] = series.iloc[i] - 1
                    break
                    
    # With first observation filled we can forwardfill the rest
    series = series.ffill()
                
    data.acquisition_count[data.acquiror_name == acquiror] = series

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.acquisition_count[data.acquiror_name == acquiror] = series


In [312]:
# e.g. all NaNs
data.acquisition_count[data.acquiror_name == 'Fourth Wave Energy Inc']

39029    0.0
41188    0.0
Name: acquisition_count, dtype: float64

In [313]:
# e.g. first observations NaN
data.acquisition_count[data.acquiror_name == 'Post Properties Inc']

31       0.0
2678     1.0
2679     2.0
3539     3.0
4566     3.0
10490    3.0
14857    3.0
21363    4.0
25441    4.0
Name: acquisition_count, dtype: float64

## Adding bidder_count

In [314]:
# Importing dataframe with all bidders complete and extracting relevant columns
bidders = pd.read_csv('../raw_data/210914_bidder_count.csv')[['SDC Deal No', 'Number of Bidders']]
bidders.columns = ['id', 'bidder_count']

In [315]:
data = data.join(bidders.set_index('id'), on='id')

In [316]:
data.bidder_count.value_counts()

1.0    41037
2.0      152
3.0       25
4.0        2
Name: bidder_count, dtype: int64

In [317]:
# Assume that nas are 1 (based on above observation)
data.bidder_count.fillna(1, inplace=True)

## dtypes

In [318]:
data.dtypes

id                              int64
announcement_date              object
deal_value                     object
acquiror_total_assets          object
acquiror_name                  object
acquiror_ticker                object
target_name                    object
target_nation                  object
acquiror_nation                object
target_status                  object
acquiror_industry              object
target_industry                object
shares_at_announcement        float64
shares_acquired               float64
consideration_offered          object
attitude                       object
acquisition_technique          object
acquiror_financial_advisor     object
target_financial_advisor       object
purpose                        object
acquiror_code                 float64
target_code                   float64
acquisition_count             float64
bidder_count                  float64
dtype: object

In [319]:
# transforming announcement_date
data['announcement_date'] = pd.to_datetime(data['announcement_date'], format="%d/%m/%Y")

In [320]:
# transforming deal_value
rem_com = lambda x: x.replace(',', '') if type(x) == str else x # removing comma in string, leaving float
num_feat = ['deal_value', 'acquiror_total_assets']
for feat in num_feat:
    data[feat]= data[feat].apply(rem_com).astype(float)

In [321]:
# Transforming trbc codes

# Removing nas
data = data[data['acquiror_code'].notna()]
data = data[data['target_code'].notna()]

# Type transformation
data['acquiror_code'] = data['acquiror_code'].astype(int)
data['target_code'] = data['target_code'].astype(int)

In [322]:
# Type transformation
data['acquisition_count'] = data['acquisition_count'].astype('int')
data['bidder_count'] = data['bidder_count'].astype('int')

# Modifying data

In [323]:
data.isna().sum()

id                                0
announcement_date                 0
deal_value                    17813
acquiror_total_assets          8738
acquiror_name                     0
acquiror_ticker                  73
target_name                       1
target_nation                     1
acquiror_nation                   0
target_status                     0
acquiror_industry                 0
target_industry                   0
shares_at_announcement        40009
shares_acquired                   0
consideration_offered             7
attitude                          0
acquisition_technique             3
acquiror_financial_advisor    35036
target_financial_advisor      32109
purpose                       19147
acquiror_code                     0
target_code                       0
acquisition_count                 0
bidder_count                      0
dtype: int64

In [324]:
# deal_value=not considered
# acquiror_ticker=remove
# target_name/target_nation/target_industry=remove
# target_status=remove
# shares_at_announcement=0
# consideration_offered=remove
# acquiror/target_financial_advisor=not considered

## Fill up missing values

In [325]:
# NA in 'shares_at_announcement' implies 0
data['shares_at_announcement'].fillna(0, inplace=True)

## Removing NAs

In [326]:
drop_na = ['acquiror_ticker', 'target_name', 'target_nation', 'target_status', 'target_industry',
           'consideration_offered']

In [327]:
data.shape

(41209, 24)

In [328]:
for col in drop_na:
    data = data[data[col].notna()]

## Optional: clean consideration offered

In [329]:
# 'Unspecified' in consideration_offered has to be removed -> we would lose 38% of our data
len(data[data['consideration_offered'] == 'Unspecified']) / len(data)

0.3874583606876261

In [330]:
data.shape

(41127, 24)

In [331]:
# to be excluded in detailed dataset
data = data[data['consideration_offered'] != 'Unspecified']

In [332]:
data.shape

(25192, 24)

## US only

In [333]:
# Raw data was filtered for nation of incorporation, some nations stayed in
data['acquiror_nation'].unique()

array(['United States', 'Russia', 'Canada', 'Hong Kong', 'Philippines',
       'Poland', 'China (Mainland)', 'Japan', 'Netherlands', 'France',
       'Malaysia', 'United Kingdom', 'Australia', 'Taiwan', 'Bermuda',
       'Sweden', 'Switzerland', 'Singapore', 'Israel', 'Peru',
       'Argentina', 'United Arab Emirates', 'New Zealand', 'South Korea',
       'Brazil', 'Colombia', 'Ireland', 'Cambodia', 'Ecuador', 'Vietnam',
       'Hungary', 'Czech Republic', 'Dominican Republic', 'Thailand',
       'Malta', 'Denmark', 'Puerto Rico', 'Greece', 'U.S. Virgin Islands',
       'Italy', 'India'], dtype=object)

In [334]:
data = data[data['acquiror_nation'] == 'United States']

## Empty acquisitions

In [335]:
data.shape

(24410, 24)

In [336]:
data[data['shares_acquired'] <= 0.0].shape # we have 12579 empty acquisitions

(9223, 24)

In [337]:
data = data[data['shares_acquired'] > 0.0]

In [338]:
data.shape

(15187, 24)

## Optional: No purpose given

In [339]:
data[data['purpose'] == 'Other'].shape # 673 undefined purpose
# + 12533 NaN
# = 13206 lost deals

(527, 24)

In [340]:
data.shape

(15187, 24)

In [341]:
# to be excluded in detailed model
data = data[data['purpose'].notna()]
data = data[data['purpose'] != 'Other']

In [342]:
data.shape

(9319, 24)

## Optional: No acquisition_technique

In [343]:
data[data['acquisition_technique'] == 'Not Applicable'].shape # 13940 not specified

(3679, 24)

In [344]:
data.shape

(9319, 24)

In [345]:
# to be excluded in detailed dataset
data = data[data['acquisition_technique'] != 'Not Applicable']

In [346]:
data.shape

(5640, 24)

## Optional: missing financial data

In [347]:
fin_feat = ['deal_value', 'acquiror_total_assets']
for feat in fin_feat:
    data = data[data[feat].notna()]

In [348]:
data.shape

(5047, 24)

# Upload 1

In [349]:
data.dtypes

id                                     int64
announcement_date             datetime64[ns]
deal_value                           float64
acquiror_total_assets                float64
acquiror_name                         object
acquiror_ticker                       object
target_name                           object
target_nation                         object
acquiror_nation                       object
target_status                         object
acquiror_industry                     object
target_industry                       object
shares_at_announcement               float64
shares_acquired                      float64
consideration_offered                 object
attitude                              object
acquisition_technique                 object
acquiror_financial_advisor            object
target_financial_advisor              object
purpose                               object
acquiror_code                          int64
target_code                            int64
acquisitio

In [350]:
data.shape # detailed: 5047 deals, not_detailed: 27272 deals

(5047, 24)

In [351]:
data.to_csv('../MA_PREDICTOR/data/ma_detailed_data.csv', index=False, date_format="%d/%m/%Y")

# Clean missing feature (CAR)

In [352]:
data_car = pd.read_csv('../MA_PREDICTOR/data/ma_detailed_data_car.csv', parse_dates=['announcement_date'])

In [353]:
data_car.isna().sum() # we will loose 8813 observations and 1717 for detailed

id                               0
announcement_date                0
deal_value                       0
acquiror_total_assets            0
acquiror_name                    0
acquiror_ticker                  0
target_name                      0
target_nation                    0
acquiror_nation                  0
target_status                    0
acquiror_industry                0
target_industry                  0
shares_at_announcement           0
shares_acquired                  0
consideration_offered            0
attitude                         0
acquisition_technique            0
acquiror_financial_advisor    2463
target_financial_advisor      1987
purpose                          0
acquiror_code                    0
target_code                      0
acquisition_count                0
bidder_count                     0
car                           1717
dtype: int64

In [354]:
data_car = data_car[data_car['car'].notna()]

In [355]:
data_car.shape # feature clean will leave us with 18459 observations; detailed=3300 observations

(3330, 25)

# Upload 2

In [356]:
data_car.to_csv('../MA_PREDICTOR/data/ma_detailed_data_car.csv', index=False, date_format="%d/%m/%Y")