## Imports

In [1]:
import os
import sys

from biocppi_utils import copy_predictions_to_predictions_with_header, load_groundTruth_from_predictions
from biocppi_extraction import biocppi_extraction

## Instantiate the model with custom parameters
### Note: these parameters are chosen to keep the runtime low, for demonstration purposes

In [2]:
# Path to the word-embeddings file (PubMed-w2v.txt). The default below is the
# original author's machine-specific location; set the BIOCPPI_EMBEDDINGS_PATH
# environment variable to point at your own copy instead of editing this cell.
embeddings_path = os.environ.get(
    'BIOCPPI_EMBEDDINGS_PATH',
    '/home/kcrouch/USC/csci_548/project/groupedProject/bioNER_refactor/biocppi_extraction/embeddings/PubMed-w2v.txt',
)

# Keyword arguments passed to biocppi_extraction() when the model is instantiated.
# Note: if num_it_per_ckpt > num_iterations, then num_it_per_ckpt will be set to
# half of num_iterations.
test_params = {
    'num_ensembles': 2,
    'num_iterations': 1000,
    'num_it_per_ckpt': 100,
}

## Set up the file information for the dataset you want to train and test on

In [3]:
# Demonstrate on a small version of CoNLL-2003.
dataset_name = 'CoNLL_2003'

# Directory containing train.txt, dev.txt and test.txt. The default is the
# original author's machine-specific sample directory; set the
# BIOCPPI_DATASET_DIR environment variable to use a different location.
dataset_dir = os.environ.get('BIOCPPI_DATASET_DIR', '/home/kcrouch/smol_datasets/conll/')  # smol sample

# os.path.join keeps the paths correct whether or not dataset_dir ends with a separator.
raw_data_train_file = os.path.join(dataset_dir, 'train.txt')
raw_data_dev_file = os.path.join(dataset_dir, 'dev.txt')
raw_data_test_file = os.path.join(dataset_dir, 'test.txt')

# Mapping handed to read_dataset(): one entry per split, each naming its raw data file.
file_dict = {
    'train': {'data': raw_data_train_file},
    'dev': {'data': raw_data_dev_file},
    'test': {'data': raw_data_test_file},
}

## Instantiate the model!

In [4]:
# Build the extractor: the embeddings file is given explicitly and the
# remaining settings are unpacked from test_params as keyword arguments.
biocppi = biocppi_extraction(
    embeddings_path=embeddings_path,
    **test_params
)

## Read in the dataset

In [5]:
# Data is read, converted, and written to files in the proper location expected by train.
data = biocppi.read_dataset(
    file_dict,
    dataset_name,
)

## Train the model using the training data we just read in

In [6]:
# Test passing actual data to train() [empty also works].
data_train = data['train']

biocppi.train(data_train)

# Marker so the end of the (long) training log is easy to spot.
print('DONE TRAIN')

Running BiLSTM model
Will train 2 total models [num_ensembles]
train.txt
Loaded 170 instances from data set
Saved vocab to corpus_train/word_vocab.ner.txt
Loading embeddings.. Organizing embeddings..

Loading dataset..
Loaded 170 instances with a vocab size of 1081 from train.txt


 done f
74s
Vectorizing embeddings 33/1081   Vectorizing embeddings 66/1081   Vectorizing embeddings 99/1081   Vectorizing embeddings 132/1081   Vectorizing embeddings 165/1081   Vectorizing embeddings 198/1081   Vectorizing embeddings 231/1081   Vectorizing embeddings 264/1081   Vectorizing embeddings 297/1081   Vectorizing embeddings 330/1081   Vectorizing embeddings 363/1081   Vectorizing embeddings 396/1081   Vectorizing embeddings 429/1081   Vectorizing embeddings 462/1081   Vectorizing embeddings 495/1081   Vectorizing embeddings 528/1081   Vectorizing embeddings 561/1081   Vectorizing embeddings 594/1081   Vectorizing embeddings 627/1081   Vectorizing embeddings 660/1081   Vectorizing embeddings 693/1081   Vectorizing embeddings 726/1081   Vectorizing embeddings 759/1081   Vectorizing embeddings 792/1081   Vectorizing embeddings 825/1081   Vectorizing embeddings 858/1081   Vectorizing embeddings 891/1081   Vectorizing embeddings 924/1081   Loaded 954/

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Training on 68, tuning on 17
Target labels: ['B-MISC', 'I-MISC', 'O']
68/17 in training/validation set
Using batch_size of 16
ckpt 0 bsize=16 loss 0.00537135824561 fit 100.00% val 94.49%/88.89%/96.00%[1m10s] *
ckpt 1 bsize=16 loss 0.000595263554715 fit 100.00% val 93.39%/84.21%/96.00%[1m6s] 
ckpt 2 bsize=16 loss 0.000265994487563 fit 100.00% val 94.12%/87.27%/96.00%[1m8s] 
ckpt 3 bsize=16 loss 0.000150912877871 fit 100.00% val 94.12%/87.27%/96.00%[1m8s] 
ckpt 4 bsize=16 loss 0.000100554221717 fit 100.00% val 95.70%/87.50%/98.00%[1m5s] *
ckpt 5 bsize=16 loss 7.19405870768e-05 fit 100.00% val 95.70%/87.50%/98.00%[1m5s] *
ckpt 6 bsize=16 loss 5.52921883354e-05 fit 100.00% val 95.70%/87.50%/98.00%[1m6s] *
ckpt 7 bsize=16 loss 4.50886582257e-05 fit 100.00% val 95.70%/87.50%/98.00%[1m37s] *
ckpt 8 bsize=16 loss 3.65110463463e-05 fit 100.00% val 95.70%/87.50%/98.00%[1m18s] *
ckpt 9 bsize=16 loss 3.06697911583e-05 fit 100.00% val 95.70%/87.50%/98.00%[1m4s] *
Fitted to model from chkpt 9 with s

## Now, make predictions on the test set!

In [7]:
# Test passing actual data to predict() [empty also works].
data_test = data['test']

predictions = biocppi.predict(data_test)

# Marker so the end of the prediction step is easy to spot.
print('DONE PREDICT')

DONE PREDICT


`predict()` generates a `predictions.txt` file. This file is loaded by `evaluate()` to calculate our precision, recall, and F1 scores! The results here will not be very accurate; for better results, try training on a full dataset with a larger number of ensembles and a higher number of iterations ;)
## So, let's evaluate!

In [8]:
# Passing None for the predictions and groundTruth parameters allows evaluate()
# to just read in the information from predictions.txt.
evaluation_results = biocppi.evaluate(
    None,
    None,
)

tp 16.0
fp 2.0
fn 48.0
final evaluation scores: precision=0.888888888889, recall=0.25, f1=0.390243902439


In [9]:
# Show the evaluation results tuple returned by evaluate(); per the values
# printed by the previous cell, the order is (precision, recall, f1).
evaluation_results

(0.8888888888888888, 0.25, 0.3902439024390244)