In [81]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet

We will now create another pipeline, similar to the one written in v1, but selecting only the deals that correspond to the 3 most represented industries. Then, we will evaluate the model without the grid search.

In [82]:
# Load the deals CSV used throughout this notebook (path is relative to the
# notebook's working directory).
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the file was written together with its index -- consider reading it
# with index_col=0 after confirming nothing downstream relies on that column.
deals_csv = '../MA_PREDICTOR/data/ma_data_car_clean.csv'
data = pd.read_csv(deals_csv)

In [83]:
# Show the first five rows as a quick sanity check of the loaded columns.
data.head(n=5)

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,month,cross_border,relatedness,economic_sector_ac,business_sector_ac,economic_sector_target,business_sector_target,car
0,Cash,full,no,50102030,50103030,others,1,cross_border,business_sector,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,0.006854
1,Other,full,no,54201030,63103010,others,1,cross_border,not_related,Consumer Non-Cyclicals,Personal & Household Products & Services,Academic & Educational Services,Academic & Educational Services,-0.010266
2,Other,full,no,57201030,57201020,others,1,cross_border,industry_group,Technology,Software & IT Services,Technology,Software & IT Services,0.007746
3,Cash,full,no,52102010,51101010,others,1,national,not_related,Industrials,Industrial Goods,Basic Materials,Chemicals,-0.011133
4,Cash,not_full,no,50102030,50102030,public,1,cross_border,industry,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,-0.003971


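The three most represented acquiror economic sectors (first two digits of `acquiror_code`: sector name):

- 52: 'Industrials'
- 55: 'Financials'
- 57: 'Technology'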

# Filtering for top 3

In [130]:
# Keep only the deals whose acquiror belongs to one of the three most
# represented economic sectors, and renumber the rows from 0.
# `isin` compares whole values. A regex `str.match` is anchored only at the
# start of the string (so any sector name merely beginning with one of these
# words would also pass) and yields NaN mask entries -- hence an indexing
# error -- when the sector is missing; `isin` treats missing values as False.
top3_sectors = ['Industrials', 'Financials', 'Technology']
d_filtered = data[data['economic_sector_ac'].isin(top3_sectors)].reset_index(drop=True)

In [131]:
# Preview the filtered frame (the rendered output is truncated by pandas).
# NOTE(review): rows 2 and 3 of the rendered output are identical -- check
# whether the source data contains duplicated deals before modelling.
d_filtered

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,month,cross_border,relatedness,economic_sector_ac,business_sector_ac,economic_sector_target,business_sector_target,car
0,Other,full,no,57201030,57201020,others,1,cross_border,industry_group,Technology,Software & IT Services,Technology,Software & IT Services,0.007746
1,Cash,full,no,52102010,51101010,others,1,national,not_related,Industrials,Industrial Goods,Basic Materials,Chemicals,-0.011133
2,Other,full,no,55101010,55301010,others,1,cross_border,economic_sector,Financials,Banking & Investment Services,Financials,Insurance,-0.002947
3,Other,full,no,55101010,55301010,others,1,cross_border,economic_sector,Financials,Banking & Investment Services,Financials,Insurance,-0.002947
4,Other,full,no,52102050,53205020,others,1,cross_border,not_related,Industrials,Industrial Goods,Consumer Cyclicals,Cyclical Consumer Products,0.006169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9607,Cash,full,no,52102010,52203040,others,8,cross_border,economic_sector,Industrials,Industrial Goods,Industrials,Industrial & Commercial Services,0.015438
9608,Other,full,no,57201010,57201020,others,8,cross_border,industry_group,Technology,Software & IT Services,Technology,Software & IT Services,0.002050
9609,Other,full,no,57201030,52203030,others,8,cross_border,not_related,Technology,Software & IT Services,Industrials,Industrial & Commercial Services,0.031459
9610,Cash,full,no,52102010,59103010,others,8,cross_border,not_related,Industrials,Industrial Goods,Utilities,Utilities,0.020108


## Splitting 

In [132]:
# Regression target: the `car` column of the filtered frame.
target_column = 'car'
y = d_filtered[target_column]

In [133]:
# Predictor columns, listed one per line; the selection keeps this order.
feature_columns = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]
X = d_filtered[feature_columns]

In [134]:
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
# `train_test_split` is already imported in the notebook's top import cell, so
# the redundant cell-level import is not repeated here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Pipeline

In [135]:
# One-hot encoder for the categorical predictors; categories that were not
# seen while fitting are ignored instead of raising an error.
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [136]:
# Columns handed to the one-hot encoder. Every predictor is treated as
# categorical, including the integer-valued `month`.
cat_features = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]

In [137]:
# Send every column named in `cat_features` through the one-hot encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transformer', cat_transformer, cat_features),
    ],
)

In [138]:
# Two-step estimator: encode the features, then fit a linear regression.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ],
)

In [139]:
# 5-fold cross-validation on the training split, scored with negated MSE
# (values closer to 0 are better); the cell displays the mean over the folds.
neg_mse_per_fold = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
neg_mse_per_fold.mean()

-0.0020277133032756945

In [140]:
# 5-fold cross-validation on the training split, scored with R^2; the cell
# displays the mean over the folds.
r2_per_fold = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=5)
r2_per_fold.mean()

-0.014902799110255183

# Filtering for one industry:

## 'Industrials'

### Filtering

In [141]:
# Keep only the deals whose acquiror economic sector is exactly 'Industrials',
# and renumber the rows from 0.
# Exact comparison is used rather than `str.match`, which is a regex anchored
# only at the start of the string (it would also accept any sector name that
# merely begins with 'Industrials') and which yields NaN mask entries -- hence
# an indexing error -- when the sector is missing.
d_industrials = data[data['economic_sector_ac'] == 'Industrials'].reset_index(drop=True)

In [142]:
# Regression target for the Industrials subset: the `car` column.
target_column = 'car'
y = d_industrials[target_column]

In [143]:
# Predictor columns for the Industrials subset; the selection keeps this order.
# NOTE(review): `economic_sector_ac` should be constant here because the frame
# was filtered on it, so it is unlikely to add information -- confirm.
feature_columns = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]
X = d_industrials[feature_columns]

### Splitting

In [144]:
# Hold out 30% of the Industrials rows as a test set; the fixed random_state
# makes the split reproducible across runs.
# `train_test_split` is already imported in the notebook's top import cell, so
# the redundant cell-level import is not repeated here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Pipeline

In [145]:
# One-hot encoder for the categorical predictors; categories that were not
# seen while fitting are ignored instead of raising an error.
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [146]:
# Columns handed to the one-hot encoder. Every predictor is treated as
# categorical, including the integer-valued `month`.
cat_features = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]

In [147]:
# Send every column named in `cat_features` through the one-hot encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transformer', cat_transformer, cat_features),
    ],
)

In [148]:
# Two-step estimator: encode the features, then fit a linear regression.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ],
)

In [149]:
# 5-fold cross-validation on the Industrials training split, scored with
# negated MSE (values closer to 0 are better); displays the mean over folds.
neg_mse_per_fold = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
neg_mse_per_fold.mean()

-0.0023978933778701834

In [150]:
# 5-fold cross-validation on the Industrials training split, scored with R^2;
# displays the mean over folds.
r2_per_fold = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=5)
r2_per_fold.mean()

-0.036430491581535974

## 'Technology'

### Filtering

In [154]:
# Keep only the deals whose acquiror economic sector is exactly 'Technology',
# and renumber the rows from 0.
# Exact comparison is used rather than `str.match`, which is a regex anchored
# only at the start of the string (it would also accept any sector name that
# merely begins with 'Technology') and which yields NaN mask entries -- hence
# an indexing error -- when the sector is missing.
d_tech = data[data['economic_sector_ac'] == 'Technology'].reset_index(drop=True)

In [155]:
# Regression target for the Technology subset: the `car` column.
target_column = 'car'
y = d_tech[target_column]

In [156]:
# Predictor columns for the Technology subset; the selection keeps this order.
# NOTE(review): `economic_sector_ac` should be constant here because the frame
# was filtered on it, so it is unlikely to add information -- confirm.
feature_columns = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]
X = d_tech[feature_columns]

### Splitting

In [157]:
# Hold out 30% of the Technology rows as a test set; the fixed random_state
# makes the split reproducible across runs.
# `train_test_split` is already imported in the notebook's top import cell, so
# the redundant cell-level import is not repeated here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Pipeline

In [158]:
# One-hot encoder for the categorical predictors; categories that were not
# seen while fitting are ignored instead of raising an error.
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [159]:
# Columns handed to the one-hot encoder. Every predictor is treated as
# categorical, including the integer-valued `month`.
cat_features = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]

In [160]:
# Send every column named in `cat_features` through the one-hot encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transformer', cat_transformer, cat_features),
    ],
)

In [161]:
# Two-step estimator: encode the features, then fit a linear regression.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ],
)

In [162]:
# 5-fold cross-validation on the Technology training split, scored with
# negated MSE (values closer to 0 are better); displays the mean over folds.
neg_mse_per_fold = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
neg_mse_per_fold.mean()

-0.0024472570271191476

In [163]:
# 5-fold cross-validation on the Technology training split, scored with R^2;
# displays the mean over folds.
r2_per_fold = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=5)
r2_per_fold.mean()

-0.03873172307462773

## 'Financials'

### Baseline

### Filtering

In [165]:
# Keep only the deals whose acquiror economic sector is exactly 'Financials',
# and renumber the rows from 0.
# Exact comparison is used rather than `str.match`, which is a regex anchored
# only at the start of the string (it would also accept any sector name that
# merely begins with 'Financials') and which yields NaN mask entries -- hence
# an indexing error -- when the sector is missing.
d_fin = data[data['economic_sector_ac'] == 'Financials'].reset_index(drop=True)

In [166]:
# Regression target for the Financials subset: the `car` column.
target_column = 'car'
y = d_fin[target_column]

In [167]:
# Predictor columns for the Financials subset; the selection keeps this order.
# NOTE(review): `economic_sector_ac` should be constant here because the frame
# was filtered on it, so it is unlikely to add information -- confirm.
feature_columns = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]
X = d_fin[feature_columns]

### Splitting

In [168]:
# Hold out 30% of the Financials rows as a test set; the fixed random_state
# makes the split reproducible across runs.
# `train_test_split` is already imported in the notebook's top import cell, so
# the redundant cell-level import is not repeated here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

### Pipeline

In [169]:
# One-hot encoder for the categorical predictors; categories that were not
# seen while fitting are ignored instead of raising an error.
cat_transformer = OneHotEncoder(
    handle_unknown='ignore',
)

In [170]:
# Columns handed to the one-hot encoder. Every predictor is treated as
# categorical, including the integer-valued `month`.
cat_features = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'month',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]

In [171]:
# Send every column named in `cat_features` through the one-hot encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat_transformer', cat_transformer, cat_features),
    ],
)

In [172]:
# Two-step estimator: encode the features, then fit a linear regression.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LinearRegression()),
    ],
)

In [173]:
# 5-fold cross-validation on the Financials training split, scored with
# negated MSE (values closer to 0 are better); displays the mean over folds.
neg_mse_per_fold = cross_val_score(pipe, X_train, y_train, scoring='neg_mean_squared_error', cv=5)
neg_mse_per_fold.mean()

-0.0015304784917848212

In [174]:
# 5-fold cross-validation on the Financials training split, scored with R^2;
# displays the mean over folds.
r2_per_fold = cross_val_score(pipe, X_train, y_train, scoring='r2', cv=5)
r2_per_fold.mean()

-0.03633027193853826