In [1]:
import os, pickle
import numpy as np
import pandas as pd
from tqdm import tqdm
from itertools import compress

import fasttext

import torch
from torch import nn
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [3]:
# Move into the project checkout: the cells below import from `engine.*` and
# read 'datasets/adam/valid_adam.txt' via a relative path.
# Guarded so that re-running this cell (when the kernel is already inside the
# checkout) does not try to descend a second time and fail.
if os.path.basename(os.getcwd()) != 'erdos2021-project':
    os.chdir('erdos2021-project')
print(os.getcwd())

/Users/szabo.48/Desktop/Erdos_proj/erdos2021-project


In [12]:
from engine.utils.LSTM_loader import ADAM_DF
from engine.utils.LSTM_loader import LSTM, LSTM_SA, FastTextTokenizer, EmbeddingsDataset, load_dataframes, get_LSTM_tokenizer
from engine.utils import load_LSTM_pretrained
from engine.wrappers.wrapper import Wrapper

In [5]:
# Input locations, all relative to the current working directory.
# NOTE(review): judging by the names these are the training-data directory,
# the ADAM table, and a fastText binary — confirm against get_LSTM_tokenizer.
data_dir = '../data/medal-emnlp/pretrain_subset'
adam_path = 'datasets/adam/valid_adam.txt'
emb_path = '../crawl-300d-2M-subword/crawl-300d-2M-subword.bin'

# Build the tokenizer from the three paths above (project helper).
tokenizer = get_LSTM_tokenizer(data_dir, adam_path, emb_path)

Data loaded
Loading word index from cache... Done.


100%|██████████| 3188033/3188033 [00:38<00:00, 83507.77it/s]


Tokenizer Built


In [6]:
# Read the training split and the tab-separated ADAM table.
train = pd.read_csv(os.path.join(data_dir, 'train.csv'), engine='c')
adam_df = pd.read_csv(adam_path, sep='\t')

# Give every distinct expansion an integer id, in order of first appearance.
unique_labels = adam_df['EXPANSION'].unique()
label_to_ix = dict(zip(unique_labels, range(len(unique_labels))))

# Attach the integer id of each row's LABEL; a label that is missing from
# `label_to_ix` raises KeyError.
train['LABEL_NUM'] = train['LABEL'].apply(label_to_ix.__getitem__)

In [7]:
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Wrap the training frame in the project dataset, handing it the tokenizer
# and the selected device.
train_data = EmbeddingsDataset(train, tokenizer=tokenizer, device=DEVICE)

# Test the LSTM models loaded via `torch.hub.load`

## 1. BiLSTM
## 2. LSTM + SAA

In [9]:
# Smoke test: run the `LSTM` object imported from engine.utils.LSTM_loader on
# one training example and print its 20 highest-scoring expansions next to
# the true label.
net = LSTM
net.eval()

index = 23450
with torch.no_grad():
    idx = torch.tensor([index])
    sents, locs, labels = train_data[idx]
    outputs = net(sents, locs)

# torch.topk returns a (values, indices) named tuple.
pLabels = torch.topk(outputs, 20)

# Decode class ids through `unique_labels`, the array `label_to_ix` was built
# from, rather than through row positions of `adam_df`; the two only agree
# when EXPANSION contains no duplicates.
# NOTE(review): assumes the dataset/model class ids follow `label_to_ix` — confirm.
expansion_names = pd.Series(unique_labels, name='EXPANSION')

# .cpu() before .numpy(): DEVICE may be "cuda", and Tensor.numpy() raises on
# CUDA tensors. On CPU tensors .cpu() is a no-op.
print('Predicted values: ')
print(expansion_names.iloc[pLabels.indices.cpu().numpy()[0]])

print('Actual: ')
print(expansion_names.iloc[labels.cpu().numpy()])
print('Actual: ')
print(train["LABEL"][index])

Predicted values: 
11547         leukocyte migration inhibition
6754            diaphragmatic electromyogram
12586                  myofibrillar myopathy
492              atypical ductal hyperplasia
4151                   creatinephosphokinase
4686                         cytolytic tcell
13155                       matching pursuit
15821                    photon flux density
10982                          ketoreductase
11821                      heat labile toxin
6575                         electric fields
7519                     friedreichs disease
2718                computeraided diagnostic
93                      min walking distance
5534                daltons lymphoma ascites
7975                            furazolidone
6947                  endocochlear potential
3105                         cervical cancer
13261                     mandelate racemase
16585    slow waves or pacesetter potentials
Name: EXPANSION, dtype: object
Actual: 
6613    enteroglucagon
Name: EXPANSION, d

  sents = torch.tensor(sents, dtype=torch.float32).to(self.device)


In [10]:
# Smoke test: run the `LSTM_SA` object imported from engine.utils.LSTM_loader
# on one training example and print its 20 highest-scoring expansions next to
# the true label.
net = LSTM_SA
net.eval()

index = 23450
with torch.no_grad():
    idx = torch.tensor([index])
    sents, locs, labels = train_data[idx]
    outputs = net(sents, locs)

# torch.topk returns a (values, indices) named tuple.
pLabels = torch.topk(outputs, 20)

# Decode class ids through `unique_labels`, the array `label_to_ix` was built
# from, rather than through row positions of `adam_df`; the two only agree
# when EXPANSION contains no duplicates.
# NOTE(review): assumes the dataset/model class ids follow `label_to_ix` — confirm.
expansion_names = pd.Series(unique_labels, name='EXPANSION')

# .cpu() before .numpy(): DEVICE may be "cuda", and Tensor.numpy() raises on
# CUDA tensors. On CPU tensors .cpu() is a no-op.
print('Predicted values: ')
print(expansion_names.iloc[pLabels.indices.cpu().numpy()[0]])

print('Actual: ')
print(expansion_names.iloc[labels.cpu().numpy()])
print('Actual: ')
print(train["LABEL"][index])

Predicted values: 
4242                                c peptide
7364                           acid synthesis
10444        individual placement and support
5486        diisothiocyanostilbenedisulfonate
3870             chronic myelogenous leukemia
9166                          hypoxic hypoxia
11998                        aerobic capacity
4016                     congenital nystagmus
13378                   minimum spanning tree
6112                   diffuse leiomyomatosis
11798                   longterm facilitation
11479                     inhibitory activity
14981            premenstrual assessment form
3054     sulfated cholecystokinin octapeptide
13774                        neuroborreliosis
11757                    laminar shear stress
428                        activated charcoal
14317                 neurosecretory granules
4789        common variable immune deficiency
3755        cutaneous lymphocyteassociated ag
Name: EXPANSION, dtype: object
Actual: 
6613    enteroglucago

# Test loading the pretrained models from `load_LSTM_pretrained`

## 1. BiLSTM
## 2. LSTM + SAA

In [13]:
# Smoke test: obtain a model from `load_LSTM_pretrained.lstm()`, run it on one
# training example and print its 20 highest-scoring expansions next to the
# true label.
net = load_LSTM_pretrained.lstm()
net.eval()

index = 23450
with torch.no_grad():
    idx = torch.tensor([index])
    sents, locs, labels = train_data[idx]
    outputs = net(sents, locs)

# torch.topk returns a (values, indices) named tuple.
pLabels = torch.topk(outputs, 20)

# Decode class ids through `unique_labels`, the array `label_to_ix` was built
# from, rather than through row positions of `adam_df`; the two only agree
# when EXPANSION contains no duplicates.
# NOTE(review): assumes the dataset/model class ids follow `label_to_ix` — confirm.
expansion_names = pd.Series(unique_labels, name='EXPANSION')

# .cpu() before .numpy(): DEVICE may be "cuda", and Tensor.numpy() raises on
# CUDA tensors. On CPU tensors .cpu() is a no-op.
print('Predicted values: ')
print(expansion_names.iloc[pLabels.indices.cpu().numpy()[0]])

print('Actual: ')
print(expansion_names.iloc[labels.cpu().numpy()])
print('Actual: ')
print(train["LABEL"][index])

Predicted values: 
16418                                   net photosynthesis
5358                                    dental fear survey
9051                                hydroalcoholic extract
16050    procollagen type i carboxyterminal extension p...
13007                                     mononuclear cell
5076                       developmental apraxia of speech
11473                                           lefthanded
11464                                      lungheart ratio
5258                       developmental defects of enamel
8308                                   rat pituitary tumor
3886                               caudal mediastinal node
14989                                  periaqueductal grey
5712                                     dinitrophenylated
12213                              mast cell degranulating
6052                                driving while impaired
11318                                 laser doppler imager
15906                            poro

  sents = torch.tensor(sents, dtype=torch.float32).to(self.device)
