In [13]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [2]:
# Print the notebook's working directory; the relative CSV path used when
# loading the data is resolved against it.
!pwd

/Users/manonlaffly/code/cobergmann/MA_PREDICTOR/notebooks


In [3]:
# Load the cleaned M&A dataset from a path relative to the notebook's
# working directory.
data = pd.read_csv("../MA_PREDICTOR/data/ma_data_car_clean.csv")

In [4]:
# Display the freshly loaded DataFrame.
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,month,cross_border,relatedness,economic_sector_ac,business_sector_ac,economic_sector_target,business_sector_target,car
0,Cash,full,no,50102030,50103030,others,1,cross_border,business_sector,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,0.006854
1,Other,full,no,54201030,63103010,others,1,cross_border,not_related,Consumer Non-Cyclicals,Personal & Household Products & Services,Academic & Educational Services,Academic & Educational Services,-0.010266
2,Other,full,no,57201030,57201020,others,1,cross_border,industry_group,Technology,Software & IT Services,Technology,Software & IT Services,0.007746
3,Cash,full,no,52102010,51101010,others,1,national,not_related,Industrials,Industrial Goods,Basic Materials,Chemicals,-0.011133
4,Cash,not_full,no,50102030,50102030,public,1,cross_border,industry,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,-0.003971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18219,Cash,full,no,52102010,59103010,others,8,cross_border,not_related,Industrials,Industrial Goods,Utilities,Utilities,0.020108
18220,Other,full,no,53203020,53205020,others,8,cross_border,business_sector,Consumer Cyclicals,Cyclical Consumer Products,Consumer Cyclicals,Cyclical Consumer Products,-0.040156
18221,Other,full,no,54301020,57201010,others,8,national,not_related,Consumer Non-Cyclicals,Food & Drug Retailing,Technology,Software & IT Services,-0.003545
18222,Other,full,no,55101010,52203030,others,8,cross_border,not_related,Financials,Banking & Investment Services,Industrials,Industrial & Commercial Services,-0.025992


# Adding one binary column

In [6]:
# Binary flag: 1 when `car` is strictly positive, 0 otherwise (zero and
# negative values both map to 0).
is_car_positive = data['car'] > 0
data['car_positive'] = np.where(is_car_positive, 1, 0)

In [7]:
# Display the DataFrame again to check the new `car_positive` column.
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,month,cross_border,relatedness,economic_sector_ac,business_sector_ac,economic_sector_target,business_sector_target,car,car_positive
0,Cash,full,no,50102030,50103030,others,1,cross_border,business_sector,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,0.006854,1
1,Other,full,no,54201030,63103010,others,1,cross_border,not_related,Consumer Non-Cyclicals,Personal & Household Products & Services,Academic & Educational Services,Academic & Educational Services,-0.010266,0
2,Other,full,no,57201030,57201020,others,1,cross_border,industry_group,Technology,Software & IT Services,Technology,Software & IT Services,0.007746,1
3,Cash,full,no,52102010,51101010,others,1,national,not_related,Industrials,Industrial Goods,Basic Materials,Chemicals,-0.011133,0
4,Cash,not_full,no,50102030,50102030,public,1,cross_border,industry,Energy,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,-0.003971,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18219,Cash,full,no,52102010,59103010,others,8,cross_border,not_related,Industrials,Industrial Goods,Utilities,Utilities,0.020108,1
18220,Other,full,no,53203020,53205020,others,8,cross_border,business_sector,Consumer Cyclicals,Cyclical Consumer Products,Consumer Cyclicals,Cyclical Consumer Products,-0.040156,0
18221,Other,full,no,54301020,57201010,others,8,national,not_related,Consumer Non-Cyclicals,Food & Drug Retailing,Technology,Software & IT Services,-0.003545,0
18222,Other,full,no,55101010,52203030,others,8,cross_border,not_related,Financials,Banking & Investment Services,Industrials,Industrial & Commercial Services,-0.025992,0


In [9]:
# List the available column names before choosing features and target.
data.columns

Index(['consideration_offered', 'shares_acquired', 'shares_at_announcement',
       'acquiror_code', 'target_code', 'target_status', 'month',
       'cross_border', 'relatedness', 'economic_sector_ac',
       'business_sector_ac', 'economic_sector_target',
       'business_sector_target', 'car', 'car_positive'],
      dtype='object')

# Splitting the dataset

We decided not to include `month` in the X features because the statistical study carried out previously did not find this feature to be statistically significant (at all!).

In [134]:
# Regression target: the continuous `car` column.
y = data['car']

In [135]:
# Feature matrix: the ten categorical deal descriptors (`month`, the
# acquiror/target codes and the target columns are left out).
feature_columns = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]
X = data[feature_columns]

In [136]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs. `train_test_split` is imported in the
# imports cell at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Pipeline

In [137]:
# One-hot encode the categorical features; categories not seen during fit
# are ignored at transform time instead of raising.
cat_transformer = OneHotEncoder(handle_unknown="ignore")

In [138]:
# Columns routed through the one-hot encoder (same columns, same order, as
# the feature matrix X).
cat_features = [
    'consideration_offered',
    'shares_acquired',
    'shares_at_announcement',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'target_status',
    'business_sector_target',
]

In [139]:
# Preprocessing step: apply the one-hot encoder to the categorical columns.
preprocessor = ColumnTransformer(
    transformers=[('cat_transformer', cat_transformer, cat_features)]
)

# ElasticNet

In [28]:
# Hyper-parameter grid for the ElasticNet step; the `model__` prefix targets
# the pipeline step named 'model'.
# NOTE(review): l1_ratio=0 is a pure L2 penalty; the coordinate-descent
# warning printed by the fit cell may be related — confirm.
grid = {
    'model__alpha': [0.15, 0.18, 0.2, 0.22, 0.24],
    'model__l1_ratio': [0, 0.05],
    'model__max_iter': [10000],
}

In [29]:
# Pipeline searched over: preprocessing followed by an ElasticNet with
# default settings (the grid search overrides its hyper-parameters).
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', ElasticNet()),
    ]
)

In [30]:
# 5-fold grid search tracking three metrics; the model is refit on the full
# training set using the parameters with the best negative MSE.
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring=['neg_mean_squared_error', 'r2', 'neg_mean_absolute_error'],
    refit='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

In [31]:
# Run the grid search on the training split (trailing `;` hides the repr).
search.fit(X_train, y_train);

  model = cd_fast.sparse_enet_coordinate_descent(


In [22]:
# Best score 1: mean cross-validated score of the best parameter set for the
# refit metric (negative MSE).
# NOTE(review): the "1" presumably refers to an earlier grid-search run — confirm.
search.best_score_

-0.002120288949990078

In [23]:
# Best params 1: parameter combination that achieved the best score above.
# NOTE(review): the "1" presumably refers to an earlier grid-search run — confirm.
search.best_params_

{'model__alpha': 0.2, 'model__l1_ratio': 0, 'model__max_iter': 10000}

In [32]:
# Best score 2: mean cross-validated score of the best parameter set for the
# refit metric (negative MSE).
# NOTE(review): the "2" presumably refers to a later grid-search run — confirm.
search.best_score_

-0.0021202583976565516

In [33]:
# Best params 2: parameter combination that achieved the best score above.
# NOTE(review): the "2" presumably refers to a later grid-search run — confirm.
search.best_params_

{'model__alpha': 0.15, 'model__l1_ratio': 0, 'model__max_iter': 10000}

# Estimating with the parameters

In [140]:
# Final pipeline with hard-coded ElasticNet hyper-parameters (the values
# shown in the "Best params 2" output).
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', ElasticNet(alpha=0.15, l1_ratio=0, max_iter=10000)),
    ]
)

In [141]:
# Fit the preprocessing and the model on the training split.
pipe_trained = pipe.fit(X_train, y_train)

  model = cd_fast.sparse_enet_coordinate_descent(


In [142]:
# Predict continuous `car` values for the held-out test split.
results = pipe_trained.predict(X_test)

In [143]:
# Score the fitted pipeline on the test split. For scikit-learn regressors
# `score` is the R^2 coefficient of determination.
pipe_trained.score(X_test,y_test)

0.0012461683723647976

# Encoding the results

In [144]:
# Actual test targets as a plain Python list (positional order preserved).
list_y_test = list(y_test)

In [145]:
# Number of test observations whose actual `car` is exactly 0.
list_y_test.count(0)

213

In [146]:
# Predicted values as a plain Python list, aligned with list_y_test.
list_y_pred = list(results)

In [147]:
# Number of predictions that are exactly 0.
list_y_pred.count(0)

0

In [148]:
# Keep only the (actual, predicted) pairs whose actual value is non-zero, so
# every remaining observation can be labelled as positive or negative.
nonzero_pairs = [
    (actual, predicted)
    for actual, predicted in zip(list_y_test, list_y_pred)
    if actual != 0
]
y_test_list = [actual for actual, _ in nonzero_pairs]
y_pred_list = [predicted for _, predicted in nonzero_pairs]
        

In [155]:
# Sanity check: the filtered actual and predicted lists must stay aligned.
len(y_test_list)==len(y_pred_list)

True

In [156]:
# Filtered actual values as a NumPy array for vectorised thresholding.
y_true_array = np.array(y_test_list)

In [157]:
# Filtered predictions as a NumPy array for vectorised thresholding.
y_pred_array = np.array(y_pred_list)

In [158]:
# Sanity check: the actual and predicted arrays must have the same length.
# (The previous version compared y_true_array with itself, which is always True.)
len(y_true_array) == len(y_pred_array)

True

In [159]:
# Binarise the actual values: 1 when strictly positive, 0 otherwise.
y_true_array = np.where(y_true_array > 0, 1, 0)

In [160]:
# Binarise the predictions with the same rule: 1 when strictly positive, 0 otherwise.
y_pred_array = np.where(y_pred_array > 0, 1, 0)

# Metrics

In [161]:
# Cross-tabulate actual labels (rows) against predicted labels (columns).
confusion_matrix = pd.crosstab(y_true_array, y_pred_array)

In [162]:
confusion_matrix
# Reading the table: columns (col_0) are the predicted labels and
# rows (row_0) are the actual labels; 1 = positive, 0 = not positive.

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,144,2367
1,142,2602


In [163]:
# Classification metrics on the binarised labels. The metric functions are
# imported in the imports cell at the top of the notebook.
metric_functions = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 score': f1_score,
}

# One line per metric, rounded to six decimals.
for metric_name, metric_function in metric_functions.items():
    print(f'{metric_name} =', round(metric_function(y_true_array, y_pred_array), 6))

Accuracy = 0.52255
Precision = 0.523647
Recall = 0.948251
F1 score = 0.674705
