
Commit

Fix load of models that depend on non-thread-safe dependencies
A problem was detected when loading TensorFlow models in different threads inside the same JVM. It happened after loading a TensorFlow model and then trying to import a new TensorFlow model. The cause was a dependency of TensorFlow (protobuf) being reloaded when it already existed in the JVM (loaded through the 1st thread).

The workaround for this problem was to convert the object that wraps Jep on a separate thread into a singleton. This way, the dependencies of the models are always imported in the same thread.
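For illustration, a minimal sketch of that idea with hypothetical names (JepSingleton, its executor field, and the eval helper are illustrative only, not the classes actually changed in this commit): a singleton owning a single-threaded executor funnels every Python call through one thread, so Jep and the Python imports it triggers (including protobuf) always live on the same thread.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import jep.Jep;

/**
 * Hypothetical sketch of a thread-confined Jep wrapper (illustrative only).
 * All Python code, and therefore all Python imports, runs on one executor thread.
 */
public final class JepSingleton {

    private static final JepSingleton INSTANCE = new JepSingleton();

    /** The single thread that owns the Jep interpreter for the whole JVM. */
    private final ExecutorService executor = Executors.newSingleThreadExecutor();

    /** The Jep interpreter, created lazily on the executor thread and reused. */
    private Jep jep;

    private JepSingleton() {
    }

    public static JepSingleton getInstance() {
        return INSTANCE;
    }

    /** Evaluates a Python snippet (e.g. "import tensorflow as tf") on the dedicated thread. */
    public void eval(final String pythonCode) throws Exception {
        this.executor.submit(() -> {
            if (this.jep == null) {
                this.jep = new Jep();
            }
            this.jep.eval(pythonCode);
            return null;
        }).get();
    }
}

With such a singleton in place, a second model load reuses the existing interpreter thread instead of importing protobuf again from a fresh one, which is the scenario exercised by the regression test below.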
Paulo Pereira committed Dec 7, 2018
1 parent 7f2d92f commit 01865e4
Showing 8 changed files with 528 additions and 10 deletions.
@@ -0,0 +1,49 @@
package com.feedzai.openml.python;

import com.feedzai.openml.data.schema.DatasetSchema;
import com.feedzai.openml.mocks.MockInstance;
import com.feedzai.openml.provider.exception.ModelLoadingException;
import com.feedzai.openml.util.algorithm.GenericAlgorithm;
import com.feedzai.openml.util.load.LoadSchemaUtils;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.ExpectedException;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.ThreadLocalRandom;

/**
 * Tests for the TensorFlow models.
 *
 * @author Paulo Pereira (paulo.pereira@feedzai.com)
 * @since 0.1.5
 */
public class TensorFlowModelTest {

    /**
     * Expected exception in tests.
     */
    @Rule
    public final ExpectedException exception = ExpectedException.none();

    /**
     * Regression test to check that loading two TensorFlow models works in the same JVM.
     */
    @Test
    public void testReloadTensorFlowModel() throws ModelLoadingException {
        final ClassificationPythonModelLoader modelLoader = new PythonModelProvider()
                .getModelCreator(GenericAlgorithm.GENERIC_CLASSIFICATION.getAlgorithmDescriptor().getAlgorithmName())
                .get();

        final Path path = Paths.get(this.getClass().getResource("/tensorflow_valid").getPath());
        final DatasetSchema datasetSchema = LoadSchemaUtils.datasetSchemaFromJson(path);

        // 1st load passes
        final ClassificationPythonModel classificationPythonModel = modelLoader.loadModel(path, datasetSchema);
        classificationPythonModel.classify(new MockInstance(datasetSchema, ThreadLocalRandom.current()));

        // 2nd load passes
        modelLoader.loadModel(path, datasetSchema).classify(new MockInstance(datasetSchema, ThreadLocalRandom.current()));
    }
}
Empty file.
@@ -0,0 +1,8 @@

class ClassifierBase(object):

    def getClassDistribution(self, instance):
        raise NotImplementedError("This must be implemented by a concrete adapter.")

    def classify(self, instance):
        raise NotImplementedError("This must be implemented by a concrete adapter.")
@@ -0,0 +1,99 @@
import sys, math
from ClassifierApi.classifier import ClassifierBase
import numpy as np
import random
import tensorflow as tf


class Classifier(ClassifierBase):

    def __init__(self):
        ClassifierBase.__init__(self)

        # Number of classes in the output distribution
        self.num_classes = 2

        # Model
        self.threshold = 0.5
        self.state_gru1 = np.zeros([1, 20])
        self.state_gru2 = np.zeros([1, 10])

        self.n_classified = 0
        self.n_scored = 0

        self.batch_header = ['pos_entry_capability_indexed', 'transaction_response_code_indexed',
                             'pin_entry_capability_indexed', 'merchant_country_name_indexed',
                             'verification_method_indexed', 'card_type_indexed', 'merchant_id_indexed',
                             'is_cnp_indexed', 'card_address_validation_code_indexed', 'merchant_state_indexed',
                             'processing_code_indexed', 'pos_info_indexed', 'cvv_validation_code_indexed',
                             'cvv2_validation_code_indexed', 'reversal_indicator_indexed',
                             'transaction_type_indexed', 'merchant_town_indexed', 'pos_type_indexed',
                             'numerical', 'merchant_name_indexed', 'merchant_country_indexed',
                             'terminal_type_indexed', 'mcc_indexed', 'terminal_id_indexed',
                             'terminal_authentication_indexed', 'split_indicator']
        self.header = ['account_balance_normalized', 'amount_normalized', 'diff_event_timestamp_group_by_client_id_normalized',
                       'sin_hour_of_day_event_timestamp_normalized', 'cos_hour_of_day_event_timestamp_normalized',
                       'sin_day_of_week_event_timestamp_normalized', 'cos_day_of_week_event_timestamp_normalized',
                       'sin_day_of_month_event_timestamp_normalized', 'cos_day_of_month_event_timestamp_normalized',
                       'event_timestamp_minus_account_open_date_normalized', 'event_timestamp_minus_card_exp_date_normalized',
                       'merchant_name_indexed', 'terminal_id_indexed', 'merchant_id_indexed', 'merchant_town_indexed',
                       'mcc_indexed', 'merchant_country_name_indexed', 'merchant_state_indexed', 'merchant_country_indexed',
                       'card_type_indexed', 'pos_info_indexed', 'transaction_type_indexed', 'reversal_indicator_indexed',
                       'card_address_validation_code_indexed', 'terminal_type_indexed', 'processing_code_indexed',
                       'pin_entry_capability_indexed', 'pos_entry_capability_indexed', 'pos_type_indexed',
                       'verification_method_indexed', 'transaction_response_code_indexed', 'terminal_authentication_indexed',
                       'cvv_validation_code_indexed', 'cvv2_validation_code_indexed', 'fraud_label_indexed', 'is_cnp_indexed',
                       'event_timestamp', 'client_id', 'account_iban', 'encrypted_pan', 'transaction_id', 'amount',
                       'is_cnp', 'ID']

        # Get lists of fields to parse.
        label_field = 'fraud_label_indexed'
        # Categorical features: all fields that were indexed except for the label.
        categorical_features = {field for field in self.header if field.endswith('_indexed') and field != 'fraud_label_indexed'}
        other_fields_to_keep = ['event_timestamp',
                                'client_id',
                                'account_iban',
                                'encrypted_pan',
                                'transaction_id',
                                'amount',
                                'is_cnp']
        # Numerical features: all fields except categorical features, label, other fields to keep, and newly created fields.
        self.numerical_features = (set(self.header)
                                   - categorical_features
                                   - {label_field}
                                   - set(other_fields_to_keep)
                                   - {'ID'})  # , 'split_indicator'

    def normalize(self, v):
        # Convert to a float array so this also works for plain Python lists
        # (e.g. the output of random.sample), which have no dtype attribute.
        v = np.asarray(v, dtype=float)
        norm = np.linalg.norm(v, ord=1)
        if norm == 0:
            norm = np.finfo(v.dtype).eps
        return v / norm

    def classify_instance(self, instance):
        score = self.getClassDistribution_instance(instance)[0]
        classification = int(score > self.threshold)

        self.n_classified += 1

        return classification

    def getClassDistribution_instance(self, instance):
        self.n_scored += 1

        # Mock scoring: a deterministic pseudo-random distribution seeded by the instance.
        random.seed(instance[0])
        d = self.normalize(random.sample(range(1, 100), self.num_classes))
        return d

    def validate(self, instances):
        for instance in instances:
            if not hasattr(instance, "__len__"):
                raise Exception('Instance must be an array!')

    def classify(self, instances):
        self.validate(instances)

        return [self.classify_instance(instance) for instance in instances]

    def getClassDistribution(self, instances):
        return [self.getClassDistribution_instance(instance) for instance in instances]
