In [18]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier

In [19]:
# CSV holding the deal features and the car_* columns used below (path is
# relative to this notebook).
DATA_PATH = '../../MA_PREDICTOR/data/ma_detailed_data_car_clean.csv'
data = pd.read_csv(DATA_PATH)

In [20]:
# 'month' is not referenced again in this notebook; drop it and rebind `data`
# instead of mutating the frame in place.
data = data.drop(columns=['month'])

In [21]:
# Keep only rows whose car_* value lies within [-0.3, 0.3] on every horizon.
# Rows with a missing car_* value fail the comparison and are dropped as well,
# exactly as with one filter per horizon.
horizons = [1, 3, 5, 10]
car_columns = [f'car_{horizon}' for horizon in horizons]
within_bounds = (data[car_columns].abs() <= 0.3).all(axis=1)
# .copy() turns the selection into an independent frame, so the column
# assignments in the following cells do not write into a slice of the frame
# read from disk (the situation pandas flags with SettingWithCopyWarning).
data = data.loc[within_bounds].copy()

In [22]:
# Inspect the frame after the |car_*| <= 0.3 filter.
data

Unnamed: 0,announcement_date,target_status,acquisition_count,shares_at_announcement,shares_acquired,consideration_offered,bidder_count,rel_deal_value,cross_border,relatedness,...,business_sector_ac,economic_sector_target,business_sector_target,cluster_category,a_fin_adv_count,t_fin_adv_count,car_1,car_3,car_5,car_10
0,04/01/2005,public,23,no,full,Cash,1,0.024922,cross_border,not_related,...,Food & Drug Retailing,Industrials,Transportation,divestiture,1,0,-0.009362,0.001308,0.018041,0.056428
1,11/01/2005,public,18,no,full,Other,1,0.008012,cross_border,industry,...,Banking & Investment Services,Financials,Banking & Investment Services,asset_driven_op,0,0,-0.011249,0.004454,0.012696,0.044807
2,12/01/2005,public,8,no,not_full,Cash,1,0.382792,national,industry,...,Mineral Resources,Basic Materials,Mineral Resources,divestiture,0,0,0.013061,0.056958,0.055917,0.016918
3,18/01/2005,public,40,no,full,Other,1,0.014665,cross_border,industry,...,Banking & Investment Services,Financials,Banking & Investment Services,asset_driven_op,0,0,0.060476,-0.010042,0.002116,0.040544
4,26/01/2005,public,0,no,full,Other,1,2.000163,cross_border,industry,...,Energy - Fossil Fuels,Energy,Energy - Fossil Fuels,asset_driven_op,0,0,-0.093760,-0.082288,-0.082288,-0.061740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,16/07/2021,others,9,no,full,Cash,1,0.004449,cross_border,not_related,...,Utilities,Energy,Energy - Fossil Fuels,divestiture,0,1,-0.015629,-0.045382,-0.025400,-0.054948
3021,19/07/2021,others,1,no,full,Other,1,0.104705,cross_border,industry,...,Software & IT Services,Technology,Software & IT Services,asset_driven_op,0,0,0.088395,0.122123,0.059973,0.066900
3022,19/07/2021,others,6,no,full,Other,1,0.270094,cross_border,industry,...,Software & IT Services,Technology,Software & IT Services,divestiture,0,0,0.068755,0.110112,0.102726,0.120972
3023,28/07/2021,others,21,no,full,Cash,1,0.232595,cross_border,not_related,...,Applied Resources,Consumer Cyclicals,Cyclical Consumer Services,divestiture,1,0,0.022186,0.041487,0.041487,0.016503


In [23]:
# Seed car_class_1 with the raw car_1 values; a later cell turns them into labels.
data['car_class_1'] = data['car_1']

In [24]:
# Seed car_class_3 with the raw car_3 values; a later cell turns them into labels.
data['car_class_3'] = data['car_3']

In [25]:
# Seed car_class_5 with the raw car_5 values; a later cell turns them into labels.
data['car_class_5'] = data['car_5']

In [26]:
# Seed car_class_10 with the raw car_10 values; a later cell turns them into labels.
data['car_class_10'] = data['car_10']

In [27]:
# Check that the four car_class_* columns were appended.
data

Unnamed: 0,announcement_date,target_status,acquisition_count,shares_at_announcement,shares_acquired,consideration_offered,bidder_count,rel_deal_value,cross_border,relatedness,...,a_fin_adv_count,t_fin_adv_count,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,04/01/2005,public,23,no,full,Cash,1,0.024922,cross_border,not_related,...,1,0,-0.009362,0.001308,0.018041,0.056428,-0.009362,0.001308,0.018041,0.056428
1,11/01/2005,public,18,no,full,Other,1,0.008012,cross_border,industry,...,0,0,-0.011249,0.004454,0.012696,0.044807,-0.011249,0.004454,0.012696,0.044807
2,12/01/2005,public,8,no,not_full,Cash,1,0.382792,national,industry,...,0,0,0.013061,0.056958,0.055917,0.016918,0.013061,0.056958,0.055917,0.016918
3,18/01/2005,public,40,no,full,Other,1,0.014665,cross_border,industry,...,0,0,0.060476,-0.010042,0.002116,0.040544,0.060476,-0.010042,0.002116,0.040544
4,26/01/2005,public,0,no,full,Other,1,2.000163,cross_border,industry,...,0,0,-0.093760,-0.082288,-0.082288,-0.061740,-0.093760,-0.082288,-0.082288,-0.061740
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,16/07/2021,others,9,no,full,Cash,1,0.004449,cross_border,not_related,...,0,1,-0.015629,-0.045382,-0.025400,-0.054948,-0.015629,-0.045382,-0.025400,-0.054948
3021,19/07/2021,others,1,no,full,Other,1,0.104705,cross_border,industry,...,0,0,0.088395,0.122123,0.059973,0.066900,0.088395,0.122123,0.059973,0.066900
3022,19/07/2021,others,6,no,full,Other,1,0.270094,cross_border,industry,...,0,0,0.068755,0.110112,0.102726,0.120972,0.068755,0.110112,0.102726,0.120972
3023,28/07/2021,others,21,no,full,Cash,1,0.232595,cross_border,not_related,...,1,0,0.022186,0.041487,0.041487,0.016503,0.022186,0.041487,0.041487,0.016503


In [28]:
def trans_class(x, threshold=0.01):
    """Bucket a numeric value into a three-way class label.

    Parameters
    ----------
    x : float
        Value to classify (in this notebook: one entry of a ``car_*`` column).
    threshold : float, default 0.01
        Half-width of the neutral band around zero. The default reproduces
        the fixed +/-0.01 band used so far.

    Returns
    -------
    str
        ``"positive"`` if ``x > threshold``, ``"negative"`` if
        ``x < -threshold``, otherwise ``"neutral"``. Values exactly on a band
        edge are neutral, and so is NaN because it compares false both ways.
    """
    if x > threshold:
        return "positive"
    if x < -threshold:
        return "negative"
    return "neutral"

In [29]:
# Derive each label column from the raw car_<h> column rather than from
# car_class_<h> itself. The result is the same on a first run (car_class_<h>
# starts as a copy of car_<h>), but the cell can now be re-run safely:
# re-applying trans_class to an already converted column would fail on the
# string labels, or silently mislabel the integer codes written by the
# encoding step.
horizons = [1, 3, 5, 10]
for horizon in horizons:
    data[f'car_class_{horizon}'] = data[f'car_{horizon}'].apply(trans_class)

In [30]:
# Check that the car_class_* columns now hold positive / neutral / negative labels.
data

Unnamed: 0,announcement_date,target_status,acquisition_count,shares_at_announcement,shares_acquired,consideration_offered,bidder_count,rel_deal_value,cross_border,relatedness,...,a_fin_adv_count,t_fin_adv_count,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,04/01/2005,public,23,no,full,Cash,1,0.024922,cross_border,not_related,...,1,0,-0.009362,0.001308,0.018041,0.056428,neutral,neutral,positive,positive
1,11/01/2005,public,18,no,full,Other,1,0.008012,cross_border,industry,...,0,0,-0.011249,0.004454,0.012696,0.044807,negative,neutral,positive,positive
2,12/01/2005,public,8,no,not_full,Cash,1,0.382792,national,industry,...,0,0,0.013061,0.056958,0.055917,0.016918,positive,positive,positive,positive
3,18/01/2005,public,40,no,full,Other,1,0.014665,cross_border,industry,...,0,0,0.060476,-0.010042,0.002116,0.040544,positive,negative,neutral,positive
4,26/01/2005,public,0,no,full,Other,1,2.000163,cross_border,industry,...,0,0,-0.093760,-0.082288,-0.082288,-0.061740,negative,negative,negative,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,16/07/2021,others,9,no,full,Cash,1,0.004449,cross_border,not_related,...,0,1,-0.015629,-0.045382,-0.025400,-0.054948,negative,negative,negative,negative
3021,19/07/2021,others,1,no,full,Other,1,0.104705,cross_border,industry,...,0,0,0.088395,0.122123,0.059973,0.066900,positive,positive,positive,positive
3022,19/07/2021,others,6,no,full,Other,1,0.270094,cross_border,industry,...,0,0,0.068755,0.110112,0.102726,0.120972,positive,positive,positive,positive
3023,28/07/2021,others,21,no,full,Cash,1,0.232595,cross_border,not_related,...,1,0,0.022186,0.041487,0.041487,0.016503,positive,positive,positive,positive


# Encoding the target

In [31]:
# Encoder used to turn the string class labels into integer codes.
labelencoder = LabelEncoder()


In [32]:
# Overwrite each label column with integer codes. LabelEncoder numbers the
# sorted labels, which here gives negative=0, neutral=1, positive=2.
# NOTE(review): the encoder is refit for every column, so the codes only line
# up across horizons while every column contains all three labels.
for class_column in ('car_class_1', 'car_class_3', 'car_class_5', 'car_class_10'):
    data[class_column] = labelencoder.fit_transform(data[class_column])

In [33]:
# Check that the car_class_* columns now hold integer codes.
data

Unnamed: 0,announcement_date,target_status,acquisition_count,shares_at_announcement,shares_acquired,consideration_offered,bidder_count,rel_deal_value,cross_border,relatedness,...,a_fin_adv_count,t_fin_adv_count,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,04/01/2005,public,23,no,full,Cash,1,0.024922,cross_border,not_related,...,1,0,-0.009362,0.001308,0.018041,0.056428,1,1,2,2
1,11/01/2005,public,18,no,full,Other,1,0.008012,cross_border,industry,...,0,0,-0.011249,0.004454,0.012696,0.044807,0,1,2,2
2,12/01/2005,public,8,no,not_full,Cash,1,0.382792,national,industry,...,0,0,0.013061,0.056958,0.055917,0.016918,2,2,2,2
3,18/01/2005,public,40,no,full,Other,1,0.014665,cross_border,industry,...,0,0,0.060476,-0.010042,0.002116,0.040544,2,0,1,2
4,26/01/2005,public,0,no,full,Other,1,2.000163,cross_border,industry,...,0,0,-0.093760,-0.082288,-0.082288,-0.061740,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3020,16/07/2021,others,9,no,full,Cash,1,0.004449,cross_border,not_related,...,0,1,-0.015629,-0.045382,-0.025400,-0.054948,0,0,0,0
3021,19/07/2021,others,1,no,full,Other,1,0.104705,cross_border,industry,...,0,0,0.088395,0.122123,0.059973,0.066900,2,2,2,2
3022,19/07/2021,others,6,no,full,Other,1,0.270094,cross_border,industry,...,0,0,0.068755,0.110112,0.102726,0.120972,2,2,2,2
3023,28/07/2021,others,21,no,full,Cash,1,0.232595,cross_border,not_related,...,1,0,0.022186,0.041487,0.041487,0.016503,2,2,2,2


In [34]:
# List the available columns before picking targets and features.
data.columns

Index(['announcement_date', 'target_status', 'acquisition_count',
       'shares_at_announcement', 'shares_acquired', 'consideration_offered',
       'bidder_count', 'rel_deal_value', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target',
       'business_sector_target', 'cluster_category', 'a_fin_adv_count',
       't_fin_adv_count', 'car_1', 'car_3', 'car_5', 'car_10', 'car_class_1',
       'car_class_3', 'car_class_5', 'car_class_10'],
      dtype='object')

In [35]:
# Classification target for the car_1 models (integer class codes).
y_1=data['car_class_1']

In [36]:
# Classification target for the car_3 models (integer class codes).
y_3=data['car_class_3']

In [37]:
# Classification target for the car_5 models (integer class codes).
y_5=data['car_class_5']

In [38]:
# Classification target for the car_10 models (integer class codes).
y_10=data['car_class_10']

In [39]:
# Model inputs. announcement_date and every car_* / car_class_* column are
# deliberately left out of the feature matrix.
feature_columns = [
    'target_status',
    'acquisition_count',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'bidder_count',
    'rel_deal_value',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
    'a_fin_adv_count',
    't_fin_adv_count',
]
X = data[feature_columns]

# Car_1

## Splitting of the dataset

In [40]:
# 70/30 train/test split for the car_1 target with a fixed seed.
# (train_test_split is already imported in the notebook's import cell.)
# NOTE(review): X_test / y_test are not evaluated in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_1,
    test_size=0.3,
    random_state=0,
)

## Pipeline and GridSearch

In [41]:
# One-hot encode categoricals; handle_unknown='ignore' encodes categories not
# seen during fit as all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [42]:
# Rescale numeric features with MinMaxScaler (default output range [0, 1]).
num_transformer= MinMaxScaler()

In [43]:
# Columns routed to the one-hot encoder.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [44]:
# Min-max scale the two listed numeric columns, one-hot encode the
# categoricals, and pass the remaining feature columns (bidder_count,
# a_fin_adv_count, t_fin_adv_count) through unchanged.
# NOTE(review): the passthrough counts reach the SGD model unscaled — confirm
# that this is intended.
num_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [45]:
# Chance-level reference: the "stratified" strategy draws predictions at
# random following the training class frequencies. Seeding those draws makes
# the baseline accuracy identical on every run.
baseline_model_str = DummyClassifier(strategy="stratified", random_state=0)

In [46]:
# Mean 5-fold cross-validated accuracy of the baseline on the training split.
baseline_scores = cross_val_score(
    baseline_model_str,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
baseline_scores.mean()

0.3549942405792331

## SGDClassifier

In [47]:
# SGDClassifier shuffles the training data between epochs; fixing the seed
# makes the cross-validated scores and the selected hyper-parameters
# reproducible across runs.
model = SGDClassifier(random_state=0)

In [48]:
# Chain preprocessing and classifier into one estimator, so that during
# cross-validation the scaler and encoder are fitted on the training folds only.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ]
)

In [49]:
# Search space for the 'model' step of the pipeline: 8 x 3 x 3 = 72 candidates.
# NOTE(review): newer scikit-learn releases renamed loss 'log' to 'log_loss' —
# confirm against the installed version.
# NOTE(review): n_jobs is a runtime setting rather than a tuning parameter, and
# the grid search itself already runs with n_jobs=-1.
grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1],
}

In [50]:
# Exhaustive search over `grid` with 5-fold cross-validation, ranked by
# accuracy and parallelised over all available cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [51]:
# Run the grid search on the car_1 training split; the trailing ';' suppresses
# the cell's output.
search.fit(X_train,y_train);

In [52]:
# Best mean cross-validated accuracy found by the search for the car_1 target.
search.best_score_

0.4224194268788641

# Car_3

## Splitting of the dataset

In [53]:
# 70/30 train/test split for the car_3 target with a fixed seed.
# (train_test_split is already imported in the notebook's import cell.)
# NOTE(review): X_test / y_test are not evaluated in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_3,
    test_size=0.3,
    random_state=0,
)

## Pipeline and GridSearch

In [54]:
# Fresh one-hot encoder for this section; handle_unknown='ignore' encodes
# categories not seen during fit as all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [55]:
# Fresh MinMaxScaler for this section (default output range [0, 1]).
num_transformer= MinMaxScaler()

In [56]:
# Columns routed to the one-hot encoder.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [57]:
# Min-max scale the two listed numeric columns, one-hot encode the
# categoricals, and pass the remaining feature columns (bidder_count,
# a_fin_adv_count, t_fin_adv_count) through unchanged.
# NOTE(review): the passthrough counts reach the SGD model unscaled — confirm
# that this is intended.
num_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [58]:
# Chance-level reference: the "stratified" strategy draws predictions at
# random following the training class frequencies. Seeding those draws makes
# the baseline accuracy identical on every run.
baseline_model_str = DummyClassifier(strategy="stratified", random_state=0)

In [59]:
# Mean 5-fold cross-validated accuracy of the baseline on the training split.
baseline_scores = cross_val_score(
    baseline_model_str,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
baseline_scores.mean()

0.37149791955617195

## SGDClassifier

In [60]:
# SGDClassifier shuffles the training data between epochs; fixing the seed
# makes the cross-validated scores and the selected hyper-parameters
# reproducible across runs.
model = SGDClassifier(random_state=0)

In [61]:
# Chain preprocessing and classifier into one estimator, so that during
# cross-validation the scaler and encoder are fitted on the training folds only.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ]
)

In [62]:
# Search space for the 'model' step of the pipeline: 8 x 3 x 3 = 72 candidates.
# NOTE(review): newer scikit-learn releases renamed loss 'log' to 'log_loss' —
# confirm against the installed version.
# NOTE(review): n_jobs is a runtime setting rather than a tuning parameter, and
# the grid search itself already runs with n_jobs=-1.
grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1],
}

In [63]:
# Exhaustive search over `grid` with 5-fold cross-validation, ranked by
# accuracy and parallelised over all available cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [64]:
# Run the grid search on the car_3 training split; the trailing ';' suppresses
# the cell's output.
search.fit(X_train,y_train);

In [65]:
# Best mean cross-validated accuracy found by the search for the car_3 target.
search.best_score_

0.44859658196008373

# Car_5

## Splitting of the dataset

In [66]:
# 70/30 train/test split for the car_5 target with a fixed seed.
# (train_test_split is already imported in the notebook's import cell.)
# NOTE(review): X_test / y_test are not evaluated in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_5,
    test_size=0.3,
    random_state=0,
)

## Pipeline and GridSearch

In [67]:
# Fresh one-hot encoder for this section; handle_unknown='ignore' encodes
# categories not seen during fit as all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [68]:
# Fresh MinMaxScaler for this section (default output range [0, 1]).
num_transformer= MinMaxScaler()

In [69]:
# Columns routed to the one-hot encoder.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [70]:
# Min-max scale the two listed numeric columns, one-hot encode the
# categoricals, and pass the remaining feature columns (bidder_count,
# a_fin_adv_count, t_fin_adv_count) through unchanged.
# NOTE(review): the passthrough counts reach the SGD model unscaled — confirm
# that this is intended.
num_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [71]:
# Chance-level reference: the "stratified" strategy draws predictions at
# random following the training class frequencies. Seeding those draws makes
# the baseline accuracy identical on every run.
baseline_model_str = DummyClassifier(strategy="stratified", random_state=0)

In [72]:
# Mean 5-fold cross-validated accuracy of the baseline on the training split.
baseline_scores = cross_val_score(
    baseline_model_str,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
baseline_scores.mean()

0.37195279625755184

## SGDClassifier

In [73]:
# SGDClassifier shuffles the training data between epochs; fixing the seed
# makes the cross-validated scores and the selected hyper-parameters
# reproducible across runs.
model = SGDClassifier(random_state=0)

In [74]:
# Chain preprocessing and classifier into one estimator, so that during
# cross-validation the scaler and encoder are fitted on the training folds only.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ]
)

In [75]:
# Search space for the 'model' step of the pipeline: 8 x 3 x 3 = 72 candidates.
# NOTE(review): newer scikit-learn releases renamed loss 'log' to 'log_loss' —
# confirm against the installed version.
# NOTE(review): n_jobs is a runtime setting rather than a tuning parameter, and
# the grid search itself already runs with n_jobs=-1.
grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1],
}

In [76]:
# Exhaustive search over `grid` with 5-fold cross-validation, ranked by
# accuracy and parallelised over all available cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [77]:
# Run the grid search on the car_5 training split; the trailing ';' suppresses
# the cell's output.
search.fit(X_train,y_train);

In [78]:
# Best mean cross-validated accuracy found by the search for the car_5 target.
search.best_score_

0.45975222736782717

# Car_10

## Splitting of the dataset

In [79]:
# 70/30 train/test split for the car_10 target with a fixed seed.
# (train_test_split is already imported in the notebook's import cell.)
# NOTE(review): X_test / y_test are not evaluated in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_10,
    test_size=0.3,
    random_state=0,
)

## Pipeline and GridSearch

In [80]:
# Fresh one-hot encoder for this section; handle_unknown='ignore' encodes
# categories not seen during fit as all zeros instead of raising.
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [81]:
# Fresh MinMaxScaler for this section (default output range [0, 1]).
num_transformer= MinMaxScaler()

In [82]:
# Columns routed to the one-hot encoder.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

In [83]:
# Min-max scale the two listed numeric columns, one-hot encode the
# categoricals, and pass the remaining feature columns (bidder_count,
# a_fin_adv_count, t_fin_adv_count) through unchanged.
# NOTE(review): the passthrough counts reach the SGD model unscaled — confirm
# that this is intended.
num_features = ['acquisition_count', 'rel_deal_value']
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Baseline

In [84]:
# Chance-level reference: the "stratified" strategy draws predictions at
# random following the training class frequencies. Seeding those draws makes
# the baseline accuracy identical on every run.
baseline_model_str = DummyClassifier(strategy="stratified", random_state=0)

In [85]:
# Mean 5-fold cross-validated accuracy of the baseline on the training split.
baseline_scores = cross_val_score(
    baseline_model_str,
    X_train,
    y_train,
    cv=5,
    scoring='accuracy',
)
baseline_scores.mean()

0.37538494087778274

## SGDClassifier

In [86]:
# SGDClassifier shuffles the training data between epochs; fixing the seed
# makes the cross-validated scores and the selected hyper-parameters
# reproducible across runs.
model = SGDClassifier(random_state=0)

In [87]:
# Chain preprocessing and classifier into one estimator, so that during
# cross-validation the scaler and encoder are fitted on the training folds only.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', model),
    ]
)

In [88]:
# Search space for the 'model' step of the pipeline: 8 x 3 x 3 = 72 candidates.
# NOTE(review): newer scikit-learn releases renamed loss 'log' to 'log_loss' —
# confirm against the installed version.
# NOTE(review): n_jobs is a runtime setting rather than a tuning parameter, and
# the grid search itself already runs with n_jobs=-1.
grid = {
    'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1],
}

In [89]:
# Exhaustive search over `grid` with 5-fold cross-validation, ranked by
# accuracy and parallelised over all available cores.
search = GridSearchCV(
    estimator=pipe,
    param_grid=grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [90]:
# Run the grid search on the car_10 training split; the trailing ';' suppresses
# the cell's output.
search.fit(X_train,y_train);

In [91]:
# Best mean cross-validated accuracy found by the search for the car_10 target.
search.best_score_

0.4767237123580714