In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import tensorflow as tf
from tabulate import tabulate


import sys 
import os
sys.path.append('/'.join(os.getcwd().split('/')[:4]))
from config.get import cfg

# Loading the data
Predictions will use embeddings produced by the previously selected embeddings model as features. The target variable of the prediction is the boolean value corresponding to the profitability of cycles. The embedding was shuffled when splitting the embedding model training data. Thus, one needs to rematch each embedding with the corresponding target using `cycle_id`. 

In [62]:
# load the features
X_train = np.load(cfg['files']["liquid"]['encoded_train_features'])
X_test  = np.load(cfg['files']["liquid"]['encoded_test_features'])

fX_train = pd.read_csv(cfg['files']["liquid"]['additional_features_train'])
fX_test  = pd.read_csv(cfg['files']["liquid"]['additional_features_test'])

In [63]:
y_train = fX_train.profitability
y_test  = fX_test.profitability

In [65]:
print(y_train.mean()) # imbalanced classes

0.9453892668178382


In [66]:
print(y_test.mean()) # imbalanced classes

0.9538926681783825


# Rescale the features
* Embeddings are normalized

In [83]:
scaler = StandardScaler()
scaler.fit(X_train)
tX_train = scaler.transform(X_train)
tX_test  = scaler.transform(X_test)

# Transform train / test into Pandas

In [128]:
pX_train = pd.DataFrame(data=tX_train, columns=[str(c) for c in range(X_train.shape[1])])
pX_test  = pd.DataFrame(data=tX_test,  columns=[str(c) for c in range(X_train.shape[1])])

# Token one hot encoding

In [129]:
from sklearn.preprocessing import OneHotEncoder

class TokenEncoding:
    def __init__(self):
        self.one_enc = OneHotEncoder(sparse=False, handle_unknown = 'ignore') 
        self.token_columns = ['token1','token2', 'token3']

    def fit_tokens(self,data):
        unique_tokens = np.unique(pd.concat([data[token] for token in self.token_columns],axis=0))
        self.one_enc.fit(unique_tokens.reshape(-1, 1))
        return self

    def transform_tokens(self,data):
        # transform
        encode = lambda col: self.one_enc.transform(data[col].to_numpy().reshape(-1, 1))
        # encode and convert as dataframes
        encoded = [pd.DataFrame(encode(token)).add_prefix(f"token_{key}_") for key,token in enumerate(self.token_columns)]
        return pd.concat(encoded, axis='columns').astype('float32')
    

    def join_tokens(self,data, tokens):
        if ('token1' in data.columns):
            data = data.drop(columns=self.token_columns)
        return pd.concat([data,tokens], axis='columns')

In [130]:
tokenEnc = TokenEncoding()
tokenEnc.fit_tokens(fX_train)
fpX_train = tokenEnc.join_tokens(pX_train, tokenEnc.transform_tokens(fX_train))
fpX_test  = tokenEnc.join_tokens(pX_test, tokenEnc.transform_tokens(fX_test))

In [131]:
fpX_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,token_2_158,token_2_159,token_2_160,token_2_161,token_2_162,token_2_163,token_2_164,token_2_165,token_2_166,token_2_167
0,0.505291,0.579465,-0.637919,1.917729,-0.471626,-0.269707,-0.487739,0.380295,-0.388063,-0.292947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.862282,0.464563,-0.63386,2.143126,-0.487417,-0.31052,-0.494301,1.236678,-0.266514,-0.230577,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.573433,-0.535478,-0.576774,-0.489549,-0.454229,2.182124,0.244653,-0.415567,-0.304272,0.855866,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.643999,-0.5422,-0.564201,-0.490966,-0.448616,2.199132,0.246344,-0.418407,-0.323347,0.821669,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.517683,-0.483488,-0.372736,-0.488034,-0.442421,-0.195371,-0.407583,-0.498525,-0.31215,-0.622698,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Logistic regression

## model creation & fitting

In [132]:
def fit_logistic(X_train, y_train):
    logistic_model = LogisticRegressionCV(
        cv=5,
        Cs=np.logspace(-4,4,10),
        max_iter=5000,
        class_weight="balanced")
    logistic_model.fit(X_train, y_train)
    return logistic_model

In [133]:
logistic_model = fit_logistic(tX_train, y_train)

## Model evaluation

In [134]:
def print_confusion(tn, fp, fn, tp):
    print(f"True neg : {tn} | False pos : {fp} | False neg : {fn} | True pos : {tp}")
    print(tabulate([['True (real)',tp, fn], ['False (Real)',fp, tn]], headers=['\\', 'True (pred)' ," False (pred)"], tablefmt='fancy_grid'))
   
def evaluate_model(model,test_set=tX_test):

    pred = model.predict(test_set)>0.5
    tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
    print_confusion(tn, fp, fn, tp)
    f1 = f1_score(y_test,pred)
    print(f"f1 score={f1:0.4f}")

In [135]:
evaluate_model(logistic_model)

True neg : 24 | False pos : 37 | False neg : 472 | True pos : 790
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │           790 │             472 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │            37 │              24 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.7563


## Let's add the token encoding

In [136]:
logistic_model = fit_logistic(fpX_train, y_train)
evaluate_model(logistic_model, test_set=fpX_test)

True neg : 17 | False pos : 44 | False neg : 378 | True pos : 884
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │           884 │             378 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │            44 │              17 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.8073


# SVM

## Model creation

In [6]:
svm_parameters = {'kernel':('linear', 'rbf','poly'), 'C':np.logspace(-4,4,5)}
svc = svm.SVC()
svm_model = GridSearchCV(svc, svm_parameters,verbose=1,cv = 2)

## Fitting the model

In [None]:
svm_model.fit(tX_train, y_train)

Fitting 2 folds for each of 15 candidates, totalling 30 fits


## Model evaluation

In [None]:
evaluate_model(svm_model)

# Neural network

## Loading the data
This new prediction method works on the raw data (not embeddings)

In [7]:
# load the features
X_train_NN = np.load(cfg['files']['raw_train_features'])
X_test_NN  = np.load(cfg['files']['raw_test_features'])


X_train_NN = X_train_NN.reshape((len(X_train_NN),-1))
X_test_NN = X_test_NN.reshape((len(X_test_NN),-1))
print(X_train_NN.shape,X_test_NN.shape)

(9333, 3600) (4001, 3600)


## Scaling the features

In [58]:
NN_scaler = StandardScaler()
NN_scaler.fit(X_train_NN)
tX_train_NN = NN_scaler.transform(X_train_NN)
tX_test_NN  = NN_scaler.transform(X_test_NN)

## Model creation

In [8]:
d = X_train_NN.shape[1]

In [9]:
NN_model = tf.keras.Sequential()
NN_model.add(tf.keras.Input(shape=d))
NN_model.add(tf.keras.layers.Dense(100,activation="relu"))
NN_model.add(tf.keras.layers.Dense(50,activation="relu"))
NN_model.add(tf.keras.layers.Dense(1,activation="sigmoid"))

2021-12-30 13:36:40.054474: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-12-30 13:36:40.054528: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (izar): /proc/driver/nvidia/version does not exist
2021-12-30 13:36:40.055970: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               360100    
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 365,201
Trainable params: 365,201
Non-trainable params: 0
_________________________________________________________________


## Fitting the model

In [25]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)

TypeError: compute_class_weight() takes 1 positional argument but 3 were given

In [11]:
NN_model.compile(optimizer='sgd', loss='binary_crossentropy')
# This builds the model for the first time:
NN_model.fit(X_train_NN, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b8c27edd850>

In [24]:
evaluate_model(NN_model,test_set=X_test_NN)

True neg : 0 | False pos : 216 | False neg : 0 | True pos : 3785
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │          3785 │               0 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │           216 │               0 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.9723
