In [233]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier

In [234]:
data = pd.read_csv('../../MA_PREDICTOR/data/ma_data_car_clean.csv')

In [235]:
data.drop('month', inplace=True, axis=1)

In [236]:
horizons = [1, 3, 5, 10]
for horizon in horizons:
    data = data[(abs(data[f'car_{horizon}']) <= 0.3)]

In [237]:
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,acquisition_count,bidder_count,cross_border,relatedness,economic_sector_ac,business_sector_ac,economic_sector_target,business_sector_target,car_1,car_3,car_5,car_10
0,Other,full,no,55101010,55301010,others,5,1,cross_border,economic_sector,Financials,Banking & Investment Services,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640
1,Other,full,no,55101010,55301010,others,6,1,cross_border,economic_sector,Financials,Banking & Investment Services,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640
2,Other,full,no,52102050,53205020,others,22,1,cross_border,not_related,Industrials,Industrial Goods,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150
3,Other,full,no,54201010,56201040,others,34,1,cross_border,not_related,Consumer Non-Cyclicals,Personal & Household Products & Services,Healthcare,Pharmaceuticals & Medical Research,0.007969,0.029287,0.009896,0.028176
4,Other,full,no,52102050,53205020,others,23,1,cross_border,not_related,Industrials,Industrial Goods,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15921,Cash,full,no,52102010,59103010,others,15,1,cross_border,not_related,Industrials,Industrial Goods,Utilities,Utilities,0.020108,0.052302,0.070522,0.093236
15922,Other,full,no,53203020,53205020,others,60,1,cross_border,business_sector,Consumer Cyclicals,Cyclical Consumer Products,Consumer Cyclicals,Cyclical Consumer Products,-0.040156,-0.034438,-0.058843,-0.028895
15923,Other,full,no,54301020,57201010,others,52,1,national,not_related,Consumer Non-Cyclicals,Food & Drug Retailing,Technology,Software & IT Services,-0.003545,-0.022961,-0.024630,-0.004876
15924,Other,full,no,55101010,52203030,others,6,1,cross_border,not_related,Financials,Banking & Investment Services,Industrials,Industrial & Commercial Services,-0.025992,-0.009654,-0.019414,-0.025331


In [238]:
data['car_class_1'] = data['car_1']

In [239]:
data['car_class_3'] = data['car_3']

In [240]:
data['car_class_5'] = data['car_5']

In [241]:
data['car_class_10'] = data['car_10']

In [242]:
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,acquisition_count,bidder_count,cross_border,relatedness,...,economic_sector_target,business_sector_target,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,Other,full,no,55101010,55301010,others,5,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,-0.002947,-0.000599,-0.019639,-0.007640
1,Other,full,no,55101010,55301010,others,6,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,-0.002947,-0.000599,-0.019639,-0.007640
2,Other,full,no,52102050,53205020,others,22,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,0.006169,0.000123,-0.020599,-0.001150
3,Other,full,no,54201010,56201040,others,34,1,cross_border,not_related,...,Healthcare,Pharmaceuticals & Medical Research,0.007969,0.029287,0.009896,0.028176,0.007969,0.029287,0.009896,0.028176
4,Other,full,no,52102050,53205020,others,23,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,0.006169,0.000123,-0.020599,-0.001150
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15921,Cash,full,no,52102010,59103010,others,15,1,cross_border,not_related,...,Utilities,Utilities,0.020108,0.052302,0.070522,0.093236,0.020108,0.052302,0.070522,0.093236
15922,Other,full,no,53203020,53205020,others,60,1,cross_border,business_sector,...,Consumer Cyclicals,Cyclical Consumer Products,-0.040156,-0.034438,-0.058843,-0.028895,-0.040156,-0.034438,-0.058843,-0.028895
15923,Other,full,no,54301020,57201010,others,52,1,national,not_related,...,Technology,Software & IT Services,-0.003545,-0.022961,-0.024630,-0.004876,-0.003545,-0.022961,-0.024630,-0.004876
15924,Other,full,no,55101010,52203030,others,6,1,cross_border,not_related,...,Industrials,Industrial & Commercial Services,-0.025992,-0.009654,-0.019414,-0.025331,-0.025992,-0.009654,-0.019414,-0.025331


In [243]:
def trans_class(x):
    if x > 0.01:
        return "positive"
    elif x < -0.01:
        return "negative"
    else:
        return "neutral"

In [246]:
horizons = [1, 3, 5, 10]
for horizon in horizons:
    data[f'car_class_{horizon}']=data[f'car_class_{horizon}'].apply(trans_class)

In [247]:
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,acquisition_count,bidder_count,cross_border,relatedness,...,economic_sector_target,business_sector_target,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,Other,full,no,55101010,55301010,others,5,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,neutral,neutral,negative,neutral
1,Other,full,no,55101010,55301010,others,6,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,neutral,neutral,negative,neutral
2,Other,full,no,52102050,53205020,others,22,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,neutral,neutral,negative,neutral
3,Other,full,no,54201010,56201040,others,34,1,cross_border,not_related,...,Healthcare,Pharmaceuticals & Medical Research,0.007969,0.029287,0.009896,0.028176,neutral,positive,neutral,positive
4,Other,full,no,52102050,53205020,others,23,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,neutral,neutral,negative,neutral
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15921,Cash,full,no,52102010,59103010,others,15,1,cross_border,not_related,...,Utilities,Utilities,0.020108,0.052302,0.070522,0.093236,positive,positive,positive,positive
15922,Other,full,no,53203020,53205020,others,60,1,cross_border,business_sector,...,Consumer Cyclicals,Cyclical Consumer Products,-0.040156,-0.034438,-0.058843,-0.028895,negative,negative,negative,negative
15923,Other,full,no,54301020,57201010,others,52,1,national,not_related,...,Technology,Software & IT Services,-0.003545,-0.022961,-0.024630,-0.004876,neutral,negative,negative,neutral
15924,Other,full,no,55101010,52203030,others,6,1,cross_border,not_related,...,Industrials,Industrial & Commercial Services,-0.025992,-0.009654,-0.019414,-0.025331,negative,neutral,negative,negative


# Encoding the target

In [248]:
# creating instance of labelencoder
labelencoder = LabelEncoder()


In [249]:
# Assigning numerical values and storing in another column
data['car_class_1'] = labelencoder.fit_transform(data['car_class_1'])
data['car_class_3'] = labelencoder.fit_transform(data['car_class_3'])
data['car_class_5'] = labelencoder.fit_transform(data['car_class_5'])
data['car_class_10'] = labelencoder.fit_transform(data['car_class_10'])

In [250]:
data

Unnamed: 0,consideration_offered,shares_acquired,shares_at_announcement,acquiror_code,target_code,target_status,acquisition_count,bidder_count,cross_border,relatedness,...,economic_sector_target,business_sector_target,car_1,car_3,car_5,car_10,car_class_1,car_class_3,car_class_5,car_class_10
0,Other,full,no,55101010,55301010,others,5,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,1,1,0,1
1,Other,full,no,55101010,55301010,others,6,1,cross_border,economic_sector,...,Financials,Insurance,-0.002947,-0.000599,-0.019639,-0.007640,1,1,0,1
2,Other,full,no,52102050,53205020,others,22,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,1,1,0,1
3,Other,full,no,54201010,56201040,others,34,1,cross_border,not_related,...,Healthcare,Pharmaceuticals & Medical Research,0.007969,0.029287,0.009896,0.028176,1,2,1,2
4,Other,full,no,52102050,53205020,others,23,1,cross_border,not_related,...,Consumer Cyclicals,Cyclical Consumer Products,0.006169,0.000123,-0.020599,-0.001150,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15921,Cash,full,no,52102010,59103010,others,15,1,cross_border,not_related,...,Utilities,Utilities,0.020108,0.052302,0.070522,0.093236,2,2,2,2
15922,Other,full,no,53203020,53205020,others,60,1,cross_border,business_sector,...,Consumer Cyclicals,Cyclical Consumer Products,-0.040156,-0.034438,-0.058843,-0.028895,0,0,0,0
15923,Other,full,no,54301020,57201010,others,52,1,national,not_related,...,Technology,Software & IT Services,-0.003545,-0.022961,-0.024630,-0.004876,1,0,0,1
15924,Other,full,no,55101010,52203030,others,6,1,cross_border,not_related,...,Industrials,Industrial & Commercial Services,-0.025992,-0.009654,-0.019414,-0.025331,0,1,0,0


In [251]:
data.columns

Index(['consideration_offered', 'shares_acquired', 'shares_at_announcement',
       'acquiror_code', 'target_code', 'target_status', 'acquisition_count',
       'bidder_count', 'cross_border', 'relatedness', 'economic_sector_ac',
       'business_sector_ac', 'economic_sector_target',
       'business_sector_target', 'car_1', 'car_3', 'car_5', 'car_10',
       'car_class_1', 'car_class_3', 'car_class_5', 'car_class_10'],
      dtype='object')

In [252]:
y_1=data['car_class_1']

In [253]:
y_3=data['car_class_3']

In [254]:
y_5=data['car_class_5']

In [255]:
y_10=data['car_class_10']

In [256]:
X=data[['consideration_offered', 'shares_acquired', 'shares_at_announcement',
       'acquiror_code', 'target_code', 'target_status', 'acquisition_count',
       'bidder_count', 'cross_border', 'relatedness', 'economic_sector_ac',
       'business_sector_ac', 'economic_sector_target',
       'business_sector_target']]

# Car_1

## Splitting of the dataset

In [257]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_1, test_size=0.3, random_state=0)

## Pipeline and GridSearch

In [258]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [259]:
num_transformer= MinMaxScaler()

In [260]:
cat_features=['consideration_offered', 'shares_acquired', 'shares_at_announcement', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target', 'target_status', 
       'business_sector_target']

In [261]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['acquisition_count']),
    ('cat_transformer', cat_transformer, cat_features)],
    remainder='passthrough')

## Baseline

In [262]:
baseline_model_str = DummyClassifier(strategy="stratified") 

In [263]:
cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean()

0.3422764227642276

## SGDClassifier

In [264]:
model= SGDClassifier()

In [265]:
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

In [266]:
grid = {
    'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1]
}

In [267]:
search = GridSearchCV(pipe, 
                      grid, 
                      scoring = 'accuracy',
                      cv = 5,
                      n_jobs=-1) 

In [268]:
# Fit data to Grid Search
search.fit(X_train,y_train);



In [269]:
# Best score 1
search.best_score_

0.35636856368563685

# Car_3

## Splitting of the dataset

In [270]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_3, test_size=0.3, random_state=0)

## Pipeline and GridSearch

In [271]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [272]:
num_transformer= MinMaxScaler()

In [273]:
cat_features=['consideration_offered', 'shares_acquired', 'shares_at_announcement', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target', 'target_status', 
       'business_sector_target']

In [274]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['acquisition_count']),
    ('cat_transformer', cat_transformer, cat_features)],
    remainder='passthrough')

## Baseline

In [275]:
baseline_model_str = DummyClassifier(strategy="stratified") 

In [276]:
cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean()

0.3438121047877145

## SGDClassifier

In [277]:
model= SGDClassifier()

In [278]:
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

In [279]:
grid = {
    'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1]
}

In [280]:
search = GridSearchCV(pipe, 
                      grid, 
                      scoring = 'accuracy',
                      cv = 5,
                      n_jobs=-1) 

In [281]:
# Fit data to Grid Search
search.fit(X_train,y_train);

In [282]:
# Best score 1
search.best_score_

0.39358626919602535

# Car_5

## Splitting of the dataset

In [283]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_5, test_size=0.3, random_state=0)

## Pipeline and GridSearch

In [284]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [285]:
num_transformer= MinMaxScaler()

In [286]:
cat_features=['consideration_offered', 'shares_acquired', 'shares_at_announcement', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target', 'target_status', 
       'business_sector_target']

In [287]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['acquisition_count']),
    ('cat_transformer', cat_transformer, cat_features)],
    remainder='passthrough')

## Baseline

In [288]:
baseline_model_str = DummyClassifier(strategy="stratified") 

In [289]:
cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean()

0.35618789521228544

## SGDClassifier

In [290]:
model= SGDClassifier()

In [291]:
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

In [292]:
grid = {
    'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1]
}

In [293]:
search = GridSearchCV(pipe, 
                      grid, 
                      scoring = 'accuracy',
                      cv = 5,
                      n_jobs=-1) 

In [294]:
# Fit data to Grid Search
search.fit(X_train,y_train);

In [295]:
# Best score 1
search.best_score_

0.4149051490514905

# Car_10

## Splitting of the dataset

In [296]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_10, test_size=0.3, random_state=0)

## Pipeline and GridSearch

In [297]:
cat_transformer = OneHotEncoder(handle_unknown='ignore')

In [298]:
num_transformer= MinMaxScaler()

In [299]:
cat_features=['consideration_offered', 'shares_acquired', 'shares_at_announcement', 'cross_border', 'relatedness',
       'economic_sector_ac', 'business_sector_ac', 'economic_sector_target', 'target_status', 
       'business_sector_target']

In [300]:
preprocessor = ColumnTransformer([
    ('num_transformer', num_transformer, ['acquisition_count']),
    ('cat_transformer', cat_transformer, cat_features)],
    remainder='passthrough')

## Baseline

In [301]:
baseline_model_str = DummyClassifier(strategy="stratified") 

In [302]:
cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean()

0.38193315266486005

## SGDClassifier

In [303]:
model= SGDClassifier()

In [304]:
pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

In [305]:
grid = {
    'model__alpha': [1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3], 
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
    'model__n_jobs': [-1]
}

In [306]:
search = GridSearchCV(pipe, 
                      grid, 
                      scoring = 'accuracy',
                      cv = 5,
                      n_jobs=-1) 

In [307]:
# Fit data to Grid Search
search.fit(X_train,y_train);



In [308]:
# Best score 1
search.best_score_

0.45718157181571806





