In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import GridSearchCV
from sklearn import svm
import tensorflow as tf
from tabulate import tabulate


import sys, os 
sys.path.append('/'.join(os.getcwd().split('/')[:4]))
from config.get import cfg

2022-01-11 15:04:13.533014: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.2


# Loading the data
Predictions will use embeddings produced by the previously selected embeddings model as features. The target variable of the prediction is the boolean value corresponding to the profitability of cycles. The embedding was shuffled when splitting the embedding model training data. Thus, one needs to rematch each embedding with the corresponding target using `cycle_id`. 

In [2]:
# load the features
X_train = np.load(cfg['files']['encoded_train_features'])
X_test  = np.load(cfg['files']['encoded_test_features'])

# load ids
train_ids = np.load(cfg['files']['train_ids']).astype(int)
test_ids = np.load(cfg['files']['test_ids']).astype(int)
train_ids = pd.DataFrame({"cycle_id":train_ids})
test_ids  = pd.DataFrame({"cycle_id":test_ids})


target = pd.read_csv(cfg['files']['features'])

y_train = train_ids.join(target,on="cycle_id",lsuffix="_").profitability
y_test = test_ids.join(target,on="cycle_id",lsuffix="_").profitability

KeyError: 'encoded_train_features'

In [3]:
print(y_train.mean()) # imbalanced classes

0.9444980177863496


# Rescale the features
* Embeddings are normalized

In [4]:
scaler = StandardScaler()
scaler.fit(X_train)
tX_train = scaler.transform(X_train)
tX_test  = scaler.transform(X_test)


# Logistic regression

## model creation

In [5]:
logistic_model = LogisticRegressionCV(cv=5,Cs=np.logspace(-4,4,10),class_weight="balanced",max_iter=1000)

## fitting the model

In [6]:
logistic_model.fit(tX_train, y_train)

LogisticRegressionCV(Cs=array([1.00000000e-04, 7.74263683e-04, 5.99484250e-03, 4.64158883e-02,
       3.59381366e-01, 2.78255940e+00, 2.15443469e+01, 1.66810054e+02,
       1.29154967e+03, 1.00000000e+04]),
                     class_weight='balanced', cv=5, max_iter=1000)

## Model evaluation

In [23]:
def print_confusion(tn, fp, fn, tp):
    print(f"True neg : {tn} | False pos : {fp} | False neg : {fn} | True pos : {tp}")
    print(tabulate([['True (real)',tp, fn], ['False (Real)',fp, tn]], headers=['\\', 'True (pred)' ," False (pred)"], tablefmt='fancy_grid'))
   
def evaluate_model(model,test_set=tX_test):

    pred = model.predict(test_set)>0.5
    tn, fp, fn, tp = confusion_matrix(y_test,pred).ravel()
    print_confusion(tn, fp, fn, tp)
    f1 = f1_score(y_test,pred)
    print(f"f1 score={f1:0.4f}")

In [8]:
evaluate_model(logistic_model)

True neg : 112 | False pos : 104 | False neg : 1544 | True pos : 2241
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │          2241 │            1544 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │           104 │             112 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.7312


# SVM

## Model creation

In [6]:
svm_parameters = {'kernel':('linear', 'rbf','poly'), 'C':np.logspace(-4,4,5)}
svc = svm.SVC()
svm_model = GridSearchCV(svc, svm_parameters,verbose=1,cv = 2)

## Fitting the model

In [None]:
svm_model.fit(tX_train, y_train)

Fitting 2 folds for each of 15 candidates, totalling 30 fits


## Model evaluation

In [None]:
evaluate_model(svm_model)

# Neural network

## Loading the data
This new prediction method works on the raw data (not embeddings)

In [7]:
# load the features
X_train_NN = np.load(cfg['files']['raw_train_features'])
X_test_NN  = np.load(cfg['files']['raw_test_features'])


X_train_NN = X_train_NN.reshape((len(X_train_NN),-1))
X_test_NN = X_test_NN.reshape((len(X_test_NN),-1))
print(X_train_NN.shape,X_test_NN.shape)

(9333, 3600) (4001, 3600)


## Scaling the features

In [58]:
NN_scaler = StandardScaler()
NN_scaler.fit(X_train_NN)
tX_train_NN = NN_scaler.transform(X_train_NN)
tX_test_NN  = NN_scaler.transform(X_test_NN)

## Model creation

In [8]:
d = X_train_NN.shape[1]

In [9]:
NN_model = tf.keras.Sequential()
NN_model.add(tf.keras.Input(shape=d))
NN_model.add(tf.keras.layers.Dense(100,activation="relu"))
NN_model.add(tf.keras.layers.Dense(50,activation="relu"))
NN_model.add(tf.keras.layers.Dense(1,activation="sigmoid"))

2021-12-30 13:36:40.054474: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-12-30 13:36:40.054528: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (izar): /proc/driver/nvidia/version does not exist
2021-12-30 13:36:40.055970: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
NN_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 100)               360100    
                                                                 
 dense_1 (Dense)             (None, 50)                5050      
                                                                 
 dense_2 (Dense)             (None, 1)                 51        
                                                                 
Total params: 365,201
Trainable params: 365,201
Non-trainable params: 0
_________________________________________________________________


## Fitting the model

In [25]:
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)

TypeError: compute_class_weight() takes 1 positional argument but 3 were given

In [11]:
NN_model.compile(optimizer='sgd', loss='binary_crossentropy')
# This builds the model for the first time:
NN_model.fit(X_train_NN, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2b8c27edd850>

In [24]:
evaluate_model(NN_model,test_set=X_test_NN)

True neg : 0 | False pos : 216 | False neg : 0 | True pos : 3785
╒══════════════╤═══════════════╤═════════════════╕
│ \            │   True (pred) │    False (pred) │
╞══════════════╪═══════════════╪═════════════════╡
│ True (real)  │          3785 │               0 │
├──────────────┼───────────────┼─────────────────┤
│ False (Real) │           216 │               0 │
╘══════════════╧═══════════════╧═════════════════╛
f1 score=0.9723
