In [289]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, LabelEncoder
from sklearn.linear_model import SGDClassifier

import joblib

In [2]:
# Read the cleaned CSV and discard the 'month' column in a single chained
# expression, so the frame is never mutated in place.
data = (
    pd.read_csv('../../MA_PREDICTOR/data/ma_detailed_data_car_clean.csv')
    .drop(columns='month')
)

In [4]:
# Removing outliers: for each horizon in turn, keep only the rows whose
# car_<horizon> value has magnitude at most 0.3.
horizons = [1, 3, 5, 10]
for horizon in horizons:
    within_bounds = data[f'car_{horizon}'].abs() <= 0.3
    data = data[within_bounds]

In [11]:
# Classifying target

# Start each car_class_<horizon> column as a copy of the matching car_<horizon>
# column; the values are converted to class labels in a later step.
for horizon in (1, 3, 5, 10):
    data[f'car_class_{horizon}'] = data[f'car_{horizon}']

def trans_class(x, threshold=0.01):
    """Map a numeric value to a three-way class label.

    Parameters
    ----------
    x : float
        Value to classify.
    threshold : float, default 0.01
        Half-width of the neutral band around zero. The default reproduces
        the original hard-coded cut-offs of +0.01 / -0.01.

    Returns
    -------
    str
        "positive" if ``x > threshold``, "negative" if ``x < -threshold``,
        otherwise "neutral". Values exactly on a boundary are "neutral", and
        so is NaN, because both comparisons evaluate to False for it.
    """
    if x > threshold:
        return "positive"
    if x < -threshold:
        return "negative"
    return "neutral"
    
# Convert every car_class_<horizon> column from numeric values to the
# string labels produced by trans_class.
horizons = [1, 3, 5, 10]
for horizon in horizons:
    class_column = f'car_class_{horizon}'
    data[class_column] = data[class_column].apply(trans_class)

# Target preprocessing

In [14]:
# Creating instance of labelencoder
labelencoder = LabelEncoder()

# Overwrite each class column with its encoded labels. The single encoder is
# re-fitted on every column, so after the loop it holds the fit for
# 'car_class_10' only.
for class_column in ('car_class_1', 'car_class_3', 'car_class_5', 'car_class_10'):
    data[class_column] = labelencoder.fit_transform(data[class_column])

In [47]:
# One target vector per horizon, taken from the encoded class columns.
y_1, y_3, y_5, y_10 = (
    data[f'car_class_{h}'] for h in (1, 3, 5, 10)
)

In [19]:
# Columns used as model inputs, in the order they appear in X.
feature_columns = [
    'target_status',
    'acquisition_count',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'bidder_count',
    'rel_deal_value',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
    'a_fin_adv_count',
    't_fin_adv_count',
]

# Feature matrix shared by all four horizon models.
X = data[feature_columns]

# SGDClassifier

## Setup

In [79]:
# Defining the baseline
# The "stratified" strategy draws random predictions that follow the training
# class distribution. It is seeded here so the baseline accuracy printed in
# the cells below is the same on every run instead of fluctuating.
baseline_model_str = DummyClassifier(strategy="stratified", random_state=0)

In [80]:
# Defining features and transformation
# Categories unseen during fit are encoded as all-zero rows at transform time
# instead of raising (handle_unknown='ignore').
cat_transformer = OneHotEncoder(handle_unknown='ignore')
num_transformer = MinMaxScaler()

# Columns that are min-max scaled.
num_features = ['acquisition_count', 'rel_deal_value']

# Columns that are one-hot encoded.
cat_features = [
    'target_status',
    'shares_at_announcement',
    'shares_acquired',
    'consideration_offered',
    'cross_border',
    'relatedness',
    'economic_sector_ac',
    'business_sector_ac',
    'economic_sector_target',
    'business_sector_target',
    'cluster_category',
]

# Defining preprocessor
# Any column of the input that is in neither list is passed through unchanged
# (remainder='passthrough').
preprocessor = ColumnTransformer(
    transformers=[
        ('num_transformer', num_transformer, num_features),
        ('cat_transformer', cat_transformer, cat_features),
    ],
    remainder='passthrough',
)

## Car_1

In [295]:
# Hold out 30% of the rows with a fixed seed so the split is repeatable.
# NOTE: X_test / y_test are not used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_1, test_size=0.3, random_state=0)

# Baseline
print('Baseline:', cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean())

# Putting model into pipeline
# random_state is fixed so that the SGD fit - and therefore the selected
# hyper-parameters and the saved model - is reproducible across runs.
# n_jobs is a constant setting, not something to tune, so it is passed here
# rather than listed as a single-value entry in the search grid.
model = SGDClassifier(random_state=0, n_jobs=-1)

pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

# Setting up GridSearch
# NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed the
# old spelling; update this entry when upgrading scikit-learn.
grid = {
    'model__alpha': [0.1, 0.11, 0.115, 0.12, 0.125],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
}

search = GridSearchCV(pipe,
                      grid,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)

# Fit data to GridSearch and retrieve best score
search.fit(X_train, y_train)
print('Model score:', search.best_score_)

# Retrieving the best pipeline. GridSearchCV (refit=True by default) has
# already refitted it on the whole training split, so no further fit is needed.
best_pipe = search.best_estimator_

# Saving pipeline to disk
joblib.dump(best_pipe, '../../MA_PREDICTOR/models/SGDClassifier_car1')
print('Model uploaded to joblib.')

Baseline: 0.3385152448341522
Model score: 0.4185453348691789
Model uploaded to joblib.


## Car_3

In [296]:
# Hold out 30% of the rows with a fixed seed so the split is repeatable.
# NOTE: X_test / y_test are not used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_3, test_size=0.3, random_state=0)

# Baseline
print('Baseline:', cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean())

# Putting model into pipeline
# random_state is fixed so that the SGD fit - and therefore the selected
# hyper-parameters and the saved model - is reproducible across runs.
# n_jobs is a constant setting, not something to tune, so it is passed here
# rather than listed as a single-value entry in the search grid.
model = SGDClassifier(random_state=0, n_jobs=-1)

pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

# Setting up GridSearch
# NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed the
# old spelling; update this entry when upgrading scikit-learn.
grid = {
    'model__alpha': [0.1, 0.11, 0.115, 0.12, 0.125],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
}

search = GridSearchCV(pipe,
                      grid,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)

# Fit data to GridSearch and retrieve best score
search.fit(X_train, y_train)
print('Model score:', search.best_score_)

# Retrieving the best pipeline. GridSearchCV (refit=True by default) has
# already refitted it on the whole training split, so no further fit is needed.
best_pipe = search.best_estimator_

# Saving pipeline to disk
joblib.dump(best_pipe, '../../MA_PREDICTOR/models/SGDClassifier_car3')
print('Model uploaded to joblib.')

Baseline: 0.3564434989068855
Model score: 0.44762688356566915
Model uploaded to joblib.


## Car_5

In [297]:
# Hold out 30% of the rows with a fixed seed so the split is repeatable.
# NOTE: X_test / y_test are not used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_5, test_size=0.3, random_state=0)

# Baseline
print('Baseline:', cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean())

# Putting model into pipeline
# random_state is fixed so that the SGD fit - and therefore the selected
# hyper-parameters and the saved model - is reproducible across runs.
# n_jobs is a constant setting, not something to tune, so it is passed here
# rather than listed as a single-value entry in the search grid.
model = SGDClassifier(random_state=0, n_jobs=-1)

pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

# Setting up GridSearch
# NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed the
# old spelling; update this entry when upgrading scikit-learn.
grid = {
    'model__alpha': [0.1, 0.11, 0.115, 0.12, 0.125],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
}

search = GridSearchCV(pipe,
                      grid,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)

# Fit data to GridSearch and retrieve best score
search.fit(X_train, y_train)
print('Model score:', search.best_score_)

# Retrieving the best pipeline. GridSearchCV (refit=True by default) has
# already refitted it on the whole training split, so no further fit is needed.
best_pipe = search.best_estimator_

# Saving pipeline to disk
joblib.dump(best_pipe, '../../MA_PREDICTOR/models/SGDClassifier_car5')
print('Model uploaded to joblib.')

Baseline: 0.36325019393967894
Model score: 0.45537624297703294
Model uploaded to joblib.


## Car_10

In [298]:
# Hold out 30% of the rows with a fixed seed so the split is repeatable.
# NOTE: X_test / y_test are not used further in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_10, test_size=0.3, random_state=0)

# Baseline
print('Baseline:', cross_val_score(baseline_model_str, X_train, y_train, cv=5, scoring='accuracy').mean())

# Putting model into pipeline
# random_state is fixed so that the SGD fit - and therefore the selected
# hyper-parameters and the saved model - is reproducible across runs.
# n_jobs is a constant setting, not something to tune, so it is passed here
# rather than listed as a single-value entry in the search grid.
model = SGDClassifier(random_state=0, n_jobs=-1)

pipe = Pipeline([
    ('preprocessing', preprocessor),
    ('model', model)])

# Setting up GridSearch
# NOTE: scikit-learn 1.1 renamed loss='log' to 'log_loss' and 1.3 removed the
# old spelling; update this entry when upgrading scikit-learn.
grid = {
    'model__alpha': [0.1, 0.11, 0.115, 0.12, 0.125],
    'model__loss': ['hinge', 'log', 'modified_huber'],
    'model__penalty': ['l2', 'l1', 'elasticnet'],
}

search = GridSearchCV(pipe,
                      grid,
                      scoring='accuracy',
                      cv=5,
                      n_jobs=-1)

# Fit data to GridSearch and retrieve best score
search.fit(X_train, y_train)
print('Model score:', search.best_score_)

# Retrieving the best pipeline. GridSearchCV (refit=True by default) has
# already refitted it on the whole training split, so no further fit is needed.
best_pipe = search.best_estimator_

# Saving pipeline to disk
joblib.dump(best_pipe, '../../MA_PREDICTOR/models/SGDClassifier_car10')
print('Model uploaded to joblib.')

Baseline: 0.4010613789698865
Model score: 0.4801229460024919
Model uploaded to joblib.


In [288]:
# Testing the model
# Take the single-row slice X[300:301] as a smoke-test input.
sample_row = X[300:301]

# Reload the pipeline that the Car_10 cell saved and ask it for class
# probabilities on that row.
# NOTE(review): SGDClassifier exposes predict_proba only for the log and
# modified_huber losses, so this presumably fails if the grid search
# selected 'hinge' - confirm.
sgd = joblib.load('../../MA_PREDICTOR/models/SGDClassifier_car10')
sgd.predict_proba(sample_row)

array([[0.34506577, 0.13383038, 0.52110385]])

# Feature importance

In [None]:
# logistic regression for feature importance
# NOTE: this cell works on a synthetic dataset and rebinds X, y and model,
# replacing the feature matrix and estimator defined earlier in the notebook.
from matplotlib import pyplot
# These two names are not imported anywhere else in the notebook; without
# them the cell raises NameError on a fresh kernel.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
# define dataset
X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, random_state=1)
# define the model
model = LogisticRegression()
# fit the model
model.fit(X, y)
# get importance: the fitted coefficients, one per feature
importance = model.coef_[0]
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance (one bar per feature index)
pyplot.bar(list(range(len(importance))), importance)
pyplot.show()