In [None]:
import core.models as models
import core.utils as utils
import core.evaluate as evaluate
import core.predict as predict
import core.test_procedure as test_procedure

In [3]:
# Pretrained checkpoint identifiers for the model and for its tokenizer.
# Both currently point at the same identifier.
MODEL_CHECKPOINT = "Exscientia/IgBert"
TOKENIZER_CHECKPOINT = "Exscientia/IgBert"

# Value forwarded as `model_name` to the predict / evaluate / test_procedure calls.
MODEL_NAME = "average"

# Number of samples per batch.
BATCH_SIZE = 4

In [4]:
# Input / output locations
RESULTS_LOCATION = './results/germline_all_evaluation'  # Root directory; each step writes to its own sub-directory
MODEL_LOCATION = './weights/germline_all.pt'            # Weights of the fully trained model
DATASET_LOCATION = './datasets_old/classificator/germline_all/test/test.csv'  # Input of the predict / evaluate steps
TEST_DATASET_LOCATION = './datasets_old/test/test.csv'  # Input of the test procedure step

# Dataset parameters
shuffle = False           # Shuffle the dataset
frac = None               # Get only the specified fraction of the dataset, if None, take the whole
                          # dataset unless subsample is specified
subsample = 20            # Get a subsample of the specified size
batch_size = BATCH_SIZE   # Specifies the batch size; taken from the configuration cell above so
                          # that the value is defined in exactly one place
seed = 0                  # The seed in case shuffle, frac or subsample are specified
eval_batches = 1          # Number of batches to evaluate

### Predict
Compute the logits for the VH/VL pair and save the results. 

Takes as input the location of the fully trained model, the location of the dataset and the output location. 

The dataset must follow the following format: pair_id,sequence_heavy,sequence_light,label. The label is ignored, hence it can be kept fixed as zero. 

The results are saved in the following format: pair_id,logit_pos,logit_neg.

In [5]:
# Compute the logits for the configured dataset; results go to <RESULTS_LOCATION>/predict.
predict.predict(
    MODEL_LOCATION,
    DATASET_LOCATION,
    '{}/predict'.format(RESULTS_LOCATION),
    # Model / tokenizer selection
    model_name=MODEL_NAME,
    model_checkpoint=MODEL_CHECKPOINT,
    tokenizer_checkpoint=TOKENIZER_CHECKPOINT,
    # Dataset sampling and batching
    batch_size=batch_size,
    shuffle=shuffle,
    frac=frac,
    subsample=subsample,
    seed=seed,
    eval_batches=eval_batches,
    # Output handling (`out` is presumably a tag for the saved files -- confirm in core.predict)
    save=True,
    out='test',
)

Creating the directory ./results/germline_all_evaluation/predict
Results location: ./results/germline_all_evaluation/predict
Retrieving the dataset...
Reading ./datasets_old/classificator/germline_all/test/test.csv...
Dataset of size 739194
Sampled a subset of size 20
Load the model...
Retrieve the tokenizer...
Tokenizer checkpoint: Exscientia/IgBert
Start evaluating...
Number of batches: 5.
Select only the first 1 batches
Model: ClassificationFromAveraging
Device detected: cpu


100%|██████████| 1/1 [00:15<00:00, 15.54s/it]


### Evaluate
Evaluate a dataset of labeled VH/VL pairs and save the results.

Takes as input the location of the fully trained model, the location of the dataset and the output location. 

The dataset must follow the following format: pair_id,sequence_heavy,sequence_light,label.

Two different files are saved:
* batch specific metrics;
* classification results.

Classification results follow the format pair_id,prediction,label where prediction is the label prediction of the model.

In [6]:
# Evaluate the labeled dataset; results go to <RESULTS_LOCATION>/evaluate.
evaluate.evaluate(
    MODEL_LOCATION,
    DATASET_LOCATION,
    '{}/evaluate'.format(RESULTS_LOCATION),
    # Model / tokenizer selection
    model_name=MODEL_NAME,
    model_checkpoint=MODEL_CHECKPOINT,
    tokenizer_checkpoint=TOKENIZER_CHECKPOINT,
    # Dataset sampling and batching
    batch_size=batch_size,
    shuffle=shuffle,
    frac=frac,
    subsample=subsample,
    seed=seed,
    eval_batches=eval_batches,
    # Output handling (`out` is presumably a tag for the saved files -- confirm in core.evaluate)
    save=True,
    out='test',
)

Results location: ./results/germline_all_evaluation/evaluate
Retrieving the dataset...
Reading ./datasets_old/classificator/germline_all/test/test.csv...
Dataset of size 739194
Sampled a subset of size 20
Load the model...
Retrieve the tokenizer...
Tokenizer checkpoint: Exscientia/IgBert
Start evaluating...
Number of batches: 5.
Select only the first 1 batches
Model: ClassificationFromAveraging
Device detected: cpu


100%|██████████| 1/1 [00:12<00:00, 12.52s/it]


### Test procedure

In [5]:
# Run the test procedure on the dedicated test dataset; results go to
# <RESULTS_LOCATION>/test_procedure.
test_procedure.test_procedure(
    MODEL_LOCATION,
    TEST_DATASET_LOCATION,
    '{}/test_procedure'.format(RESULTS_LOCATION),
    # Model / tokenizer selection
    model_name=MODEL_NAME,
    model_checkpoint=MODEL_CHECKPOINT,
    tokenizer_checkpoint=TOKENIZER_CHECKPOINT,
    # Dataset sampling and batching
    batch_size=batch_size,
    shuffle=shuffle,
    frac=frac,
    subsample=subsample,
    seed=seed,
    eval_batches=eval_batches,
    # Output handling (`out` is presumably a tag for the saved files -- confirm in core.test_procedure)
    save=True,
    out='test_procedure',
)

Results location: ./results/germline_all_evaluation/test_procedure
Retrieving the dataset...
Reading ./datasets_old/test/test.csv...
Dataset of size 369597
Sampled a subset of size 20
Load the model...
Retrieve the tokenizer...
Tokenizer checkpoint: Exscientia/IgBert
Start evaluating...
Number of batches: 5.
Select only the first 1 batches
Model: ClassificationFromAveraging
Device detected: cpu


100%|██████████| 1/1 [00:25<00:00, 25.62s/it]

tensor([1095819, 1145084,  329745,  708218, 1095819, 1145084,  329745,  708218])
tensor([   934, 380061,   1546,  26773,  33113,    201, 663155, 559159])
tensor([ 4.9152,  3.1437,  3.8594,  3.3688, -0.7527,  1.1462, -0.7049, -0.2468])
tensor([1, 1, 1, 1, 0, 0, 0, 0])



