# Grab Pubmed Abstracts

## Functions

In [1]:
from Bio import Entrez
import json
from bs4 import BeautifulSoup as bs
import lxml
import json
import numpy as np
from tqdm import tqdm

In [2]:
def entrez_search_pubmed(query,records_per_query=10,email="XXX@YYY.com",retMax=100):
    from Bio import Entrez
    Entrez.email = email
    # Search
    handle = Entrez.esearch(db="pubmed",term=query, idtype="acc", retMax=retMax)
    record = Entrez.read(handle)
    handle.close()
    return record

In [3]:
def flatten_abstract(abstract_xml):
    abstract = ''
    for abstractText in abstract_xml.find_all('abstracttext'):
        if abstractText.get('label') != None:
            abstract = abstract + " " + abstractText.get('label') + ": "
        abstract = abstract + abstractText.text
    return abstract

In [4]:
def entrez_fetch_abstracts(uid,email):
    from Bio import Entrez
    from bs4 import BeautifulSoup as bs
    Entrez.email = email
    handle = Entrez.efetch(db="pubmed", id=uid, rettype='Medline', retmode='xml')
    result = handle.readlines()
    result = b"".join(result)
    bs_content = bs(result, "lxml")
    abstracts = bs_content.find_all('abstract')
    handle.close()
    # Abstract
    results = [ flatten_abstract(abstract) for abstract in abstracts]
    return results

In [5]:
def entrez_construct_abstract_dict(uids,email):
        results = entrez_fetch_abstracts( uids, email )
        suitable = ['Yes', 'No']
        suitable = np.random.choice(suitable, len(results), p=[0.7, 0.3])
        return [{'text':b, 'sentiment':a} for a,b in zip(suitable, results)]

## Query

In [6]:
query_body={ "query": "muscle hypertrophy", 'email' : "XXX@YYY.com" }
## Query search
results_query = entrez_search_pubmed(query = query_body['query'], email = query_body['email'], retMax=200)

In [7]:
uids = results_query['IdList']
len(uids)

200

In [8]:
## Abstract
results_abstracts = entrez_construct_abstract_dict(uids,"XXX@YYY.com")

In [9]:
len(results_abstracts)

200

In [10]:
results_abstracts[0]

{'text': ' BACKGROUND: Cross-education of strength refers to the strength gain that is transferred to the contralateral limb after a unilateral training program. HYPOTHESIS: Unilateral eccentric training using different muscle contraction times would improve the structural and functional properties of the untrained contralateral limb. STUDY DESIGN: Randomized controlled trial. LEVEL OF EVIDENCE: Level 2. METHODS: Thirty-six participants were randomized into a control group, experimental group 1 (EG6s; eccentric contraction runtime = 6 seconds) and experimental group 2 (EG3s; eccentric contraction runtime = 3 seconds). The thickness and elastographic index of the patellar tendon (PT), lean mass and fat percentage of the thigh, contractile properties of the vastus lateralis (VL), as well as isometric, concentric, and eccentric knee extensor peak torques, and eccentric single-leg decline squat (SLDSe) 1 repetition maximum (1-RM) were measured after 6 weeks of SLDSe training (3 times per w

In [16]:
train_rows = int(np.floor(len(results_abstracts)*0.9))
train_data = results_abstracts[:train_rows]
test_data = results_abstracts[(train_rows+1):]
print(len(train_data))
print(len(test_data))

180
19


# BERT Pubmed Classification

In [17]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

### Seed

In [18]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

### Mapping sentences with their Labels...

In [19]:
train_texts, train_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['text'], d['sentiment']), test_data)))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

(180, 180, 19, 19)

#### visualizing one of the sentences from train set

In [20]:
train_texts[0]

' BACKGROUND: Cross-education of strength refers to the strength gain that is transferred to the contralateral limb after a unilateral training program. HYPOTHESIS: Unilateral eccentric training using different muscle contraction times would improve the structural and functional properties of the untrained contralateral limb. STUDY DESIGN: Randomized controlled trial. LEVEL OF EVIDENCE: Level 2. METHODS: Thirty-six participants were randomized into a control group, experimental group 1 (EG6s; eccentric contraction runtime = 6 seconds) and experimental group 2 (EG3s; eccentric contraction runtime = 3 seconds). The thickness and elastographic index of the patellar tendon (PT), lean mass and fat percentage of the thigh, contractile properties of the vastus lateralis (VL), as well as isometric, concentric, and eccentric knee extensor peak torques, and eccentric single-leg decline squat (SLDSe) 1 repetition maximum (1-RM) were measured after 6 weeks of SLDSe training (3 times per week, 80% 

In [21]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

## Preparing Token embeddings...

In [22]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))

len(train_tokens), len(test_tokens)

(180, 19)

In [23]:
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

((180, 512), (19, 512))

In [24]:
train_y = np.array(train_labels) == 'Yes'
test_y = np.array(test_labels) == 'Yes'
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((180,), (19,), 0.7611111111111111, 0.6842105263157895)

### Masking few random IDs

In [25]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

# BERT

In [26]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [27]:
# ensuring that the model runs on GPU, not on CPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [28]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [29]:
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda()     # running BERT on CUDA_GPU

In [30]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [31]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 512]), torch.Size([3, 512, 768]), torch.Size([3, 768]))

In [32]:
y = bert_clf(x)
y.cpu().detach().numpy()        # kinda Garbage Collector to free up used and cache space

array([[0.3978583 ],
       [0.3872686 ],
       [0.41915417]], dtype=float32)

In [33]:
# Cross- checking CUDA GPU Memory to ensure GPU memory is not overflowing.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'6697.349632M'

In [34]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()     # Clearing Cache space for fresh Model run
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

# Fine Tune BERT

In [35]:
# Setting hyper-parameters

BATCH_SIZE = 4
EPOCHS = 10

In [36]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'439.065088M'

In [37]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [38]:
param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [39]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [40]:
torch.cuda.empty_cache()   # Clearing Cache space for a fresh Model run

In [41]:
for epoch_num in tqdm( range(EPOCHS) ):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()
        
        
        bert_clf.zero_grad()
        batch_loss.backward()
        

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        
        clear_output(wait=True)

100%|██████████| 10/10 [03:53<00:00, 23.31s/it]


In [42]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in tqdm( enumerate(test_dataloader) ):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])


# Example

In [44]:
bert_predicted[0]

True

In [45]:
all_logits[0]

0.9385141

In [46]:
test_data[0]

{'text': 'Free radicals, or reactive oxygen species, have been implicated as one of the primary causes of myocardial pathologies elicited by chronic diseases and age. The imbalance between pro-oxidants and antioxidants, termed "oxidative stress", involves several pathological changes in mouse hearts, including hypertrophy and cardiac dysfunction. However, the molecular mechanisms and adaptations of the hearts in mice lacking cytoplasmic superoxide dismutase (Sod1KO) have not been investigated. We used echocardiography to characterize cardiac function and morphology in vivo. Protein expression and enzyme activity of Sod1KO were confirmed by targeted mass spectrometry and activity gel. The heart weights of the Sod1KO mice were significantly increased compared with their wildtype peers. The increase in heart weights was accompanied by concentric hypertrophy, posterior wall thickness of the left ventricles (LV), and reduced LV volume. Activated downstream pathways in Sod1KO hearts included