In [2]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.dummy import DummyRegressor
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.linear_model import Ridge

In [3]:
# Load the deal-level dataset (the file name suggests cleaned M&A data with CAR columns).
# The relative path is resolved against the notebook's current working directory.
data = pd.read_csv('../../MA_PREDICTOR/data/ma_detailed_data_car_clean.csv')

In [4]:
# Remove the 'month' column. The frame is rebound instead of mutated in place,
# and an already-missing column is tolerated, so re-running this cell does not
# raise a KeyError.
data = data.drop(columns='month', errors='ignore')

In [5]:
# Keep only deals whose cumulative abnormal return is within +/-0.3 for every
# horizon: accumulate one boolean mask over all CAR columns, then filter once.
horizons = [1, 3, 5, 10]
within_bounds = pd.Series(True, index=data.index)
for horizon in horizons:
    within_bounds &= data[f'car_{horizon}'].abs().le(0.3)
data = data[within_bounds]

In [6]:
# Display the filtered frame (pandas abbreviates long frames to the first and last rows).
data

Unnamed: 0,announcement_date,target_status,acquisition_count,shares_at_announcement,shares_acquired,consideration_offered,bidder_count,rel_deal_value,cross_border,relatedness,...,business_sector_ac,economic_sector_target,business_sector_target,cluster_category,a_fin_adv_count,t_fin_adv_count,car_1,car_3,car_5,car_10
0,04/01/2005,public,23,no,full,Cash,1,0.024922,cross_border,not_related,...,Food & Drug Retailing,Industrials,Transportation,divestiture,1,0,-0.009362,0.001308,0.018041,0.056428
1,11/01/2005,public,18,no,full,Other,1,0.008012,cross_border,industry,...,Banking & Investment Services,Financials,Banking & Investment Services,asset_driven_op,0,0,-0.011249,0.004454,0.012696,0.044807
2,12/01/2005,public,8,no,not_full,Cash,1,0.382792,national,industry,...,Mineral Resources,Basic Materials,Mineral Resources,divestiture,0,0,0.013061,0.056958,0.055917,0.016918
3,18/01/2005,public,40,no,full,Other,1,0.014665,cross_border,industry,...,Banking & Investment Services,Financials,Banking & Investment Services,asset_driven_op,0,0,0.060476,-0.010042,0.002116,0.040544
4,26/01/2005,public,0,no,full,Other,1,2.000163,cross_border,industry,...,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,asset_driven_op,0,0,-0.093760,-0.082288,-0.082288,-0.061740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,16/07/2021,others,9,no,full,Cash,1,0.004449,cross_border,not_related,...,Utilities,Energy,Energy - Fossil Fuels,divestiture,0,1,-0.015629,-0.045382,-0.025400,-0.054948
3021,19/07/2021,others,1,no,full,Other,1,0.104705,cross_border,industry,...,Software & IT Services,Technology,Software & IT Services,asset_driven_op,0,0,0.088395,0.122123,0.059973,0.066900
3022,19/07/2021,others,6,no,full,Other,1,0.270094,cross_border,industry,...,Software & IT Services,Technology,Software & IT Services,divestiture,0,0,0.068755,0.110112,0.102726,0.120972
3023,28/07/2021,others,21,no,full,Cash,1,0.232595,cross_border,not_related,...,Applied Resources,Consumer Cyclicals,Cyclical Consumer Services,divestiture,1,0,0.022186,0.041487,0.041487,0.016503


In [7]:
# List the remaining column names; features and targets are selected from these below.
data.columns

Index(['announcement_date', 'target_status', 'acquisition_count',
       'shares_at_announcement', 'shares_acquired', 'consideration_offered',
       'bidder_count', 'rel_deal_value', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target',
       'business_sector_target', 'cluster_category', 'a_fin_adv_count',
       't_fin_adv_count', 'car_1', 'car_3', 'car_5', 'car_10'],
      dtype='object')

In [8]:
# Target for the 1-horizon model: the 'car_1' column.
y_1 = data.loc[:, 'car_1']

In [9]:
# Target for the 3-horizon model: the 'car_3' column.
y_3 = data.loc[:, 'car_3']

In [10]:
# Target for the 5-horizon model: the 'car_5' column.
y_5 = data.loc[:, 'car_5']

In [11]:
# Target for the 10-horizon model: the 'car_10' column.
y_10 = data.loc[:, 'car_10']

In [16]:
# Feature matrix: every column except 'announcement_date' and the four CAR targets.
feature_columns = [
    'target_status',
    'acquisition_count',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'bidder_count',
    'rel_deal_value',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
    'a_fin_adv_count',
    't_fin_adv_count',
]
X = data[feature_columns]

# Car_1

## Splitting of the dataset

In [17]:
# Hold out 30% of the rows as a test set for the car_1 target; the fixed
# random_state keeps the split reproducible. train_test_split is already
# imported in the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_1, test_size=0.3, random_state=0
)

## Pipeline and GridSearch

In [21]:
# One-hot encoder for the categorical columns; handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [22]:
# Scaler for the numeric columns (MinMaxScaler with default settings).
num_transformer= MinMaxScaler()

In [23]:
# Columns that are one-hot encoded by the preprocessor.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [24]:
# Scale the two listed numeric columns, one-hot encode the categorical ones and
# pass every remaining column (e.g. bidder_count and the adviser counts)
# through unchanged.
numeric_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, numeric_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [25]:
# Baseline regressor configured with strategy="mean" (ignores the features).
baseline_model_mean = DummyRegressor(strategy="mean") 

In [26]:
# Mean 5-fold cross-validated R^2 of the mean-predicting baseline (car_1 target).
baseline_r2_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='r2'
)
baseline_r2_folds.mean()

-0.0028035919184601354

In [27]:
# Mean 5-fold cross-validated negative MSE of the baseline (car_1 target).
baseline_mse_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
baseline_mse_folds.mean()

-0.0023029481616530594

## ElasticNet

In [28]:
# Search space for the ElasticNet step; the 'model__' prefix routes each value
# to the pipeline step named 'model'.
# NOTE(review): l1_ratio=0 is a pure L2 penalty; the recorded fit output shows a
# coordinate-descent warning, so Ridge may be the better estimator for that
# corner of the grid -- confirm.
grid = dict(
    model__alpha=[0.1, 0.2, 0.3, 0.4],
    model__l1_ratio=[0, 0.05, 0.1, 0.15, 0.2],
    model__max_iter=[10000],
)

In [29]:
# Chain the column preprocessing with an ElasticNet regressor; the step names
# are the prefixes used by the hyperparameter grid.
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('model', ElasticNet()),
]
pipe = Pipeline(steps=pipeline_steps)

In [30]:
# 5-fold grid search over the pipeline. Three metrics are recorded per
# candidate; the best model is selected and refit by negative MSE.
cv_metrics = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring=cv_metrics,
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [31]:
# Run the grid search on the car_1 training split; the trailing ';' suppresses
# the notebook's display of the returned estimator.
search.fit(X_train,y_train);

  model = cd_fast.sparse_enet_coordinate_descent(


In [32]:
# Best mean cross-validated score of the refit metric (neg_mean_squared_error)
# for the car_1 search; compare with the baseline's negative MSE above.
search.best_score_

-0.002267369834701439

In [33]:
# Hyperparameter combination that achieved the best refit-metric score (car_1).
search.best_params_

{'model__alpha': 0.1, 'model__l1_ratio': 0, 'model__max_iter': 10000}

# Car_3

## Splitting of the dataset

In [34]:
# Hold out 30% of the rows as a test set for the car_3 target; the fixed
# random_state keeps the split reproducible. train_test_split is already
# imported in the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_3, test_size=0.3, random_state=0
)

## Pipeline and GridSearch

In [35]:
# Fresh one-hot encoder for the car_3 section; handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [36]:
# Fresh scaler for the numeric columns in the car_3 section (default settings).
num_transformer= MinMaxScaler()

In [37]:
# Columns that are one-hot encoded by the preprocessor (car_3 section).
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [38]:
# Scale the two listed numeric columns, one-hot encode the categorical ones and
# pass every remaining column through unchanged (car_3 section).
numeric_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, numeric_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [39]:
# Baseline regressor configured with strategy="mean" (car_3 section).
baseline_model_mean = DummyRegressor(strategy="mean") 

In [40]:
# Mean 5-fold cross-validated R^2 of the mean-predicting baseline (car_3 target).
baseline_r2_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='r2'
)
baseline_r2_folds.mean()

-0.0035816677480401625

In [41]:
# Mean 5-fold cross-validated negative MSE of the baseline (car_3 target).
baseline_mse_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
baseline_mse_folds.mean()

-0.003272686721817799

## ElasticNet

In [42]:
# Search space for the ElasticNet step (car_3 section); the 'model__' prefix
# routes each value to the pipeline step named 'model'.
# NOTE(review): l1_ratio=0 is a pure L2 penalty; the recorded fit output shows a
# coordinate-descent warning, so Ridge may be the better estimator -- confirm.
grid = dict(
    model__alpha=[0.1, 0.2, 0.3, 0.4],
    model__l1_ratio=[0, 0.05, 0.1, 0.15, 0.2],
    model__max_iter=[10000],
)

In [43]:
# Chain the column preprocessing with an ElasticNet regressor (car_3 section);
# the step names are the prefixes used by the hyperparameter grid.
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('model', ElasticNet()),
]
pipe = Pipeline(steps=pipeline_steps)

In [44]:
# 5-fold grid search over the pipeline (car_3 section). Three metrics are
# recorded per candidate; the best model is selected and refit by negative MSE.
cv_metrics = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring=cv_metrics,
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [45]:
# Run the grid search on the car_3 training split; the trailing ';' suppresses
# the notebook's display of the returned estimator.
search.fit(X_train,y_train);

  model = cd_fast.sparse_enet_coordinate_descent(


In [46]:
# Best mean cross-validated score of the refit metric (neg_mean_squared_error)
# for the car_3 search; compare with the baseline's negative MSE above.
search.best_score_

-0.0032326437611872342

In [47]:
# Hyperparameter combination that achieved the best refit-metric score (car_3).
search.best_params_

{'model__alpha': 0.2, 'model__l1_ratio': 0, 'model__max_iter': 10000}

# Car_5

## Splitting of the dataset

In [48]:
# Hold out 30% of the rows as a test set for the car_5 target; the fixed
# random_state keeps the split reproducible. train_test_split is already
# imported in the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_5, test_size=0.3, random_state=0
)

## Pipeline and GridSearch

In [49]:
# Fresh one-hot encoder for the car_5 section; handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [50]:
# Fresh scaler for the numeric columns in the car_5 section (default settings).
num_transformer= MinMaxScaler()

In [51]:
# Columns that are one-hot encoded by the preprocessor (car_5 section).
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [52]:
# Scale the two listed numeric columns, one-hot encode the categorical ones and
# pass every remaining column through unchanged (car_5 section).
numeric_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, numeric_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [53]:
# Baseline regressor configured with strategy="mean" (car_5 section).
baseline_model_mean = DummyRegressor(strategy="mean") 

In [54]:
# Mean 5-fold cross-validated R^2 of the mean-predicting baseline (car_5 target).
baseline_r2_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='r2'
)
baseline_r2_folds.mean()

-0.0027172066560102516

In [55]:
# Mean 5-fold cross-validated negative MSE of the baseline (car_5 target).
baseline_mse_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
baseline_mse_folds.mean()

-0.004008256293086424

## ElasticNet

In [56]:
# Search space for the ElasticNet step (car_5 section); the 'model__' prefix
# routes each value to the pipeline step named 'model'.
# NOTE(review): l1_ratio=0 is a pure L2 penalty; the recorded fit output shows a
# coordinate-descent warning, so Ridge may be the better estimator -- confirm.
grid = dict(
    model__alpha=[0.1, 0.2, 0.3, 0.4],
    model__l1_ratio=[0, 0.05, 0.1, 0.15, 0.2],
    model__max_iter=[10000],
)

In [57]:
# Chain the column preprocessing with an ElasticNet regressor (car_5 section);
# the step names are the prefixes used by the hyperparameter grid.
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('model', ElasticNet()),
]
pipe = Pipeline(steps=pipeline_steps)

In [58]:
# 5-fold grid search over the pipeline (car_5 section). Three metrics are
# recorded per candidate; the best model is selected and refit by negative MSE.
cv_metrics = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring=cv_metrics,
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [59]:
# Run the grid search on the car_5 training split; the trailing ';' suppresses
# the notebook's display of the returned estimator.
search.fit(X_train,y_train);

  model = cd_fast.sparse_enet_coordinate_descent(


In [60]:
# Best mean cross-validated score of the refit metric (neg_mean_squared_error)
# for the car_5 search; compare with the baseline's negative MSE above.
search.best_score_

-0.003987410810003885

In [61]:
# Hyperparameter combination that achieved the best refit-metric score (car_5).
search.best_params_

{'model__alpha': 0.3, 'model__l1_ratio': 0, 'model__max_iter': 10000}

# Car_10

## Splitting of the dataset

In [62]:
# Hold out 30% of the rows as a test set for the car_10 target; the fixed
# random_state keeps the split reproducible. train_test_split is already
# imported in the notebook's import cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_10, test_size=0.3, random_state=0
)

## Pipeline and GridSearch

In [63]:
# Fresh one-hot encoder for the car_10 section; handle_unknown='ignore' keeps
# transform from raising on categories that were not seen during fit.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [64]:
# Fresh scaler for the numeric columns in the car_10 section (default settings).
num_transformer= MinMaxScaler()

In [65]:
# Columns that are one-hot encoded by the preprocessor (car_10 section).
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [66]:
# Scale the two listed numeric columns, one-hot encode the categorical ones and
# pass every remaining column through unchanged (car_10 section).
numeric_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, numeric_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [67]:
# Baseline regressor configured with strategy="mean" (car_10 section).
baseline_model_mean = DummyRegressor(strategy="mean") 

In [68]:
# Mean 5-fold cross-validated R^2 of the mean-predicting baseline (car_10 target).
baseline_r2_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='r2'
)
baseline_r2_folds.mean()

-0.002114042773200575

In [69]:
# Mean 5-fold cross-validated negative MSE of the baseline (car_10 target).
baseline_mse_folds = cross_val_score(
    baseline_model_mean, X_train, y_train, cv=5, scoring='neg_mean_squared_error'
)
baseline_mse_folds.mean()

-0.005898614531676237

## ElasticNet

In [70]:
# Search space for the ElasticNet step (car_10 section); the 'model__' prefix
# routes each value to the pipeline step named 'model'.
# NOTE(review): l1_ratio=0 is a pure L2 penalty; the recorded fit output shows a
# coordinate-descent warning, so Ridge may be the better estimator -- confirm.
grid = dict(
    model__alpha=[0.1, 0.2, 0.3, 0.4],
    model__l1_ratio=[0, 0.05, 0.1, 0.15, 0.2],
    model__max_iter=[10000],
)

In [71]:
# Chain the column preprocessing with an ElasticNet regressor (car_10 section);
# the step names are the prefixes used by the hyperparameter grid.
pipeline_steps = [
    ('preprocessing', preprocessor),
    ('model', ElasticNet()),
]
pipe = Pipeline(steps=pipeline_steps)

In [72]:
# 5-fold grid search over the pipeline (car_10 section). Three metrics are
# recorded per candidate; the best model is selected and refit by negative MSE.
cv_metrics = ['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error']
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring=cv_metrics,
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [73]:
# Run the grid search on the car_10 training split; the trailing ';' suppresses
# the notebook's display of the returned estimator.
search.fit(X_train,y_train);

  model = cd_fast.sparse_enet_coordinate_descent(


In [74]:
# Best mean cross-validated score of the refit metric (neg_mean_squared_error)
# for the car_10 search; compare with the baseline's negative MSE above.
search.best_score_

-0.005866874444325098

In [75]:
# Hyperparameter combination that achieved the best refit-metric score (car_10).
search.best_params_

{'model__alpha': 0.3, 'model__l1_ratio': 0, 'model__max_iter': 10000}

  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
