In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn import svm

from tabulate import tabulate


import sys 
sys.path.append("/scratch/izar/kapps/DEX-Cyclic-Arbitrage/")
from config.get import cfg

# Loading the data
The predictions use the embeddings produced by the previously selected embedding model as features. The target variable is the boolean profitability of each cycle. Because the embeddings were shuffled when the embedding model's training data was split, each embedding must be re-matched with its target using `cycle_id`.

In [2]:
# Load the embeddings used as features.
X_train = np.load(cfg['files']['encoded_train_features'])
X_test  = np.load(cfg['files']['encoded_test_features'])

# Load the cycle ids saved with each split and wrap them in a one-column frame
# so they can be joined against the label table.
train_ids = pd.DataFrame({"cycle_id": np.load(cfg['files']['train_ids']).astype(int)})
test_ids  = pd.DataFrame({"cycle_id": np.load(cfg['files']['test_ids']).astype(int)})

target = pd.read_csv(cfg['files']['features'])

# `DataFrame.join(other, on=...)` matches the left column against the *index*
# of `other`. `read_csv` yields a positional index, so the labels must be keyed
# by `cycle_id` explicitly; otherwise ids would be matched to CSV row numbers.
if "cycle_id" in target.columns:
    profitability_by_id = target.set_index("cycle_id")["profitability"]
else:
    # No id column in the CSV: fall back to matching on the row label.
    profitability_by_id = target["profitability"]

# A duplicated id would make the join add rows and break the alignment with X.
if not profitability_by_id.index.is_unique:
    raise ValueError("`cycle_id` is not unique in the target table; cannot match labels")

y_train = train_ids.join(profitability_by_id, on="cycle_id")["profitability"]
y_test  = test_ids.join(profitability_by_id, on="cycle_id")["profitability"]

# Sanity checks: one label per embedding, and every id found in the target table.
assert len(y_train) == len(X_train) and len(y_test) == len(X_test), "features/labels length mismatch"
assert y_train.notna().all() and y_test.notna().all(), "some cycle ids have no profitability label"

In [3]:
# Mean of the training target; a value far from 0.5 indicates imbalanced classes.
positive_rate = y_train.mean()
print(positive_rate)

0.9444980177863496


# Rescale the features
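* Embeddings are standardized with a scaler fitted on the training set only; the same transformation is then applied to the test set.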

In [4]:
# Fit the scaler on the training features only, then apply that same
# transformation to both splits.
scaler = StandardScaler().fit(X_train)
tX_train, tX_test = (scaler.transform(split) for split in (X_train, X_test))


# Logistic regression

## Model creation

In [5]:
# Cross-validated logistic regression: 5 folds over 10 log-spaced values of C
# between 1e-4 and 1e4. Class weights are balanced because the target classes
# are imbalanced.
logistic_model = LogisticRegressionCV(
    Cs=np.logspace(start=-4, stop=4, num=10),
    cv=5,
    class_weight="balanced",
    max_iter=1000,
)

## Fitting the model

In [6]:
# Train on the rescaled training embeddings; `fit` returns the estimator,
# so its repr is shown as the cell output.
logistic_model.fit(X=tX_train, y=y_train)

LogisticRegressionCV(Cs=array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04]),
                     class_weight='balanced', cv=5, max_iter=1000)

## Model evaluation

In [5]:
def evaluate_model(model, X=None, y=None):
    """Print the confusion matrix and F1 score of a fitted binary classifier.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X : array-like, optional
        Features to predict on. Defaults to the notebook-level ``tX_test``.
    y : array-like, optional
        True labels matching ``X``. Defaults to the notebook-level ``y_test``.

    Returns
    -------
    None
        The confusion counts, a formatted table and the F1 score are printed.

    Notes
    -----
    The four-way unpacking of the confusion matrix assumes a binary problem
    in which both classes occur in the labels or the predictions.
    """
    # Fall back to the notebook's held-out split when no data is passed explicitly.
    if X is None:
        X = tX_test
    if y is None:
        y = y_test

    pred = model.predict(X)

    # For binary labels the flattened 2x2 matrix reads tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(y, pred).ravel()
    print(f"True neg : {tn} | False pos : {fp} | False neg : {fn} | True pos : {tp}")
    print(tabulate([['True (real)',tp, fn], ['False (Real)',fp, tn]], headers=['\\', 'True (pred)' ," False (pred)"], tablefmt='fancy_grid'))

    f1 = f1_score(y, pred)
    print(f"f1 score={f1:0.4f}")

In [8]:
# Report the held-out confusion matrix and F1 score for the logistic regression.
evaluate_model(model=logistic_model)

True neg : 112 | False pos : 104 | False neg : 1544 | True pos : 2241
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │          2241 │            1544 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │           104 │             112 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.7312


# SVM

## Model creation

In [6]:
# Hyper-parameter grid: three kernels x five log-spaced values of C in [1e-4, 1e4]
# (15 candidates in total).
svm_parameters = {'kernel': ('linear', 'rbf', 'poly'), 'C': np.logspace(-4, 4, 5)}

# The target classes are imbalanced, so weight them the same way as the
# logistic regression model does (class_weight="balanced"); otherwise the
# two models are not trained under comparable conditions.
svc = svm.SVC(class_weight="balanced")

# Exhaustive search over the grid with 2-fold cross-validation.
svm_model = GridSearchCV(svc, svm_parameters, verbose=1, cv=2)

## Fitting the model

In [None]:
# Run the grid search on the rescaled training embeddings.
svm_model.fit(X=tX_train, y=y_train)

Fitting 2 folds for each of 15 candidates, totalling 30 fits


## Model evaluation

In [None]:
# Report the held-out confusion matrix and F1 score for the grid-searched SVM.
evaluate_model(model=svm_model)