# Parse XML

In [2]:
# Imports
import pandas as pd

import matplotlib.pyplot as plt

import numpy as np

from xml.dom import minidom

In [29]:
#Function to parse xml
def _first_style_text(node):
    """Return the text of the first <style> element under ``node``, or '' if there is none."""
    if node is None:
        return ''
    styles = node.getElementsByTagName('style')
    if not styles or styles[0].firstChild is None:
        return ''
    value = styles[0].firstChild.nodeValue
    return value if value is not None else ''


def parseXML(filename, isInclude):
    """Parse an XML export of bibliographic records into parallel lists.

    Exactly one entry is produced per <record> in each returned list, so the
    four lists always have the same length and index ``i`` always refers to the
    same record. A record that lacks a title, abstract or keywords contributes
    an empty string for that field instead of being skipped; skipping would
    shift the lists against each other and make ``zip`` pair fields from
    different records.

    Only the first <titles>/<title>, <abstract> and <keywords> element of a
    record is read, and only the text of the first <style> element inside each.

    Parameters
    ----------
    filename : str or file object
        Passed straight to ``xml.dom.minidom.parse``.
    isInclude : any
        Label stored once per record in the returned ``tags`` list.

    Returns
    -------
    tuple of four lists
        ``(titles, abstracts, keywords, tags)``; the keywords of a record are
        joined into one string with ``'; '``.
    """
    titles = []
    abstracts = []
    keywords = []
    tags = []

    xmldoc = minidom.parse(filename)
    for record in xmldoc.getElementsByTagName('record'):
        # Title: <titles><title><style>text</style></title></titles>
        title_containers = record.getElementsByTagName('titles')
        title_nodes = (title_containers[0].getElementsByTagName('title')
                       if title_containers else [])
        titles.append(_first_style_text(title_nodes[0]) if title_nodes else '')

        # Abstract: <abstract><style>text</style></abstract>
        abstract_nodes = record.getElementsByTagName('abstract')
        abstracts.append(_first_style_text(abstract_nodes[0]) if abstract_nodes else '')

        # Keywords: <keywords><keyword><style>text</style></keyword>...</keywords>
        keyword_containers = record.getElementsByTagName('keywords')
        keyword_nodes = (keyword_containers[0].getElementsByTagName('keyword')
                         if keyword_containers else [])
        keyword_texts = [_first_style_text(keyword_node) for keyword_node in keyword_nodes]
        # Drop keywords that carried no text so the joined string has no empty slots.
        keywords.append('; '.join(text for text in keyword_texts if text))

        # One tag per record, regardless of which fields were present.
        tags.append(isInclude)

    return titles, abstracts, keywords, tags

In [30]:
# Parse both training sets: included studies are labelled 1, excluded studies 0.
ti, ai, ki, tagsi = parseXML('./uti/utiTrainInclude.xml', 1)
te, ae, ke, tagse = parseXML('./uti/utiTrainExclude.xml', 0)

# Combine labels and text fields row-wise, included records first. The label
# lists are the `tagsi` / `tagse` unpacked above; no other tag variables are
# defined in this notebook, so any other name fails on a fresh kernel.
df = pd.DataFrame(list(zip(tagsi + tagse, ti + te, ai + ae, ki + ke)),
               columns =['code', 'title', 'abstract', 'keywords'])
df.head()

Unnamed: 0,code,title,abstract,keywords
0,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy..."
1,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy..."
2,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...
3,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...
4,1,Probiotics feeding in prevention of urinary tr...,BACKGROUND: It has been suggested that probiot...,Adolescent; Adult; *Drinking; Female; Humans; ...


# Sci-BERT

In [34]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np

# Checkpoint name shared by the model and its tokenizer.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"

# output_hidden_states=True is requested here because get_embedding reads the
# per-layer hidden states from the model output.
scibert_model = BertModel.from_pretrained(scibert_checkpoint, output_hidden_states=True)
scibert_tokenizer = BertTokenizer.from_pretrained(scibert_checkpoint)

# Report the concrete classes that were loaded.
for label, loaded in (('scibert_tokenizer is type:', scibert_tokenizer),
                      ('    scibert_model is type:', scibert_model)):
    print(label, type(loaded))

scibert_tokenizer is type: <class 'transformers.tokenization_bert.BertTokenizer'>
    scibert_model is type: <class 'transformers.modeling_bert.BertModel'>


# Calculate Embeddings

In [35]:
def get_embedding(model, tokenizer, text):
    """Embed ``text`` as the mean token vector of the second-to-last hidden layer.

    Parameters
    ----------
    model
        Callable with an ``eval()`` method. Element 2 of its output must be
        the sequence of per-layer hidden states (the notebook loads the model
        with ``output_hidden_states=True``).
    tokenizer
        Object exposing ``encode_plus``; its result must contain ``'input_ids'``.
    text : str
        Text to embed; it is truncated to at most 512 tokens.

    Returns
    -------
    numpy.ndarray
        Average over the token axis of the second-to-last layer for the first
        (and only) sequence in the batch.
    """
    # Tokenize with special tokens ([CLS] and [SEP]) and return PyTorch tensors.
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    )
    token_ids = encoding['input_ids']

    # Inference only: evaluation mode, and no gradient tracking during the pass.
    model.eval()
    with torch.no_grad():
        all_layers = model(token_ids)[2]

    # Second-to-last layer, first sequence of the batch: one vector per token.
    penultimate_tokens = all_layers[-2][0]

    # Mean over tokens, converted to a NumPy array.
    return penultimate_tokens.mean(dim=0).detach().numpy()

from IPython.display import clear_output
import timeit
def calculate_embeddings(df, field):
    """Embed every value of ``df[field]`` and store the vectors in a new column.

    Each value is passed to ``get_embedding`` together with the notebook-level
    ``scibert_model`` and ``scibert_tokenizer``. After every item the cell
    output is cleared and two lines are printed: ``<done> <total>``, then the
    estimated total run time in minutes (or "Calculating..." while less than
    1% of the items are done).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is modified in place.
    field : str
        Name of the column to embed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with an added ``field + '-scibert'`` column
        holding one embedding per row.
    """
    sentences = df[field].tolist()
    total = len(sentences)
    vectors = []

    started = timeit.default_timer()
    for done, sentence in enumerate(sentences, start=1):
        # Replace the previous progress lines instead of stacking new ones.
        clear_output(wait=True)
        vectors.append(get_embedding(scibert_model, scibert_tokenizer, sentence))

        if done / total * 100 >= 1:
            # Extrapolate elapsed seconds to the full run, expressed in minutes.
            elapsed = timeit.default_timer() - started
            expected_time = np.round(elapsed / (done / total) / 60, 2)
        else:
            expected_time = "Calculating..."

        print(done, total)
        print(expected_time)

    # Append to dataframe
    df[field + '-scibert'] = vectors
    return df

## Title

In [36]:
# Embed every title; calculate_embeddings adds a 'title-scibert' column to df.
df = calculate_embeddings(df, 'title')
df.head()

1084 1084
0.97


Unnamed: 0,code,title,abstract,keywords,title-scibert
0,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131..."
1,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131..."
2,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823..."
3,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823..."
4,1,Probiotics feeding in prevention of urinary tr...,BACKGROUND: It has been suggested that probiot...,Adolescent; Adult; *Drinking; Female; Humans; ...,"[-0.5211807, -0.7154993, -0.045323715, 0.25110..."


## Abstract

In [37]:
# Embed every abstract; calculate_embeddings adds an 'abstract-scibert' column to df.
df = calculate_embeddings(df, 'abstract')
df.head()

1084 1084
8.16


Unnamed: 0,code,title,abstract,keywords,title-scibert,abstract-scibert
0,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131...","[-0.07536177, -0.06736588, -0.37270182, 0.5410..."
1,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131...","[-0.07536177, -0.06736588, -0.37270182, 0.5410..."
2,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823...","[-0.049732774, 0.14015731, -0.08440088, 0.2876..."
3,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823...","[-0.049732774, 0.14015731, -0.08440088, 0.2876..."
4,1,Probiotics feeding in prevention of urinary tr...,BACKGROUND: It has been suggested that probiot...,Adolescent; Adult; *Drinking; Female; Humans; ...,"[-0.5211807, -0.7154993, -0.045323715, 0.25110...","[0.22030136, -0.32066062, -0.17968939, 0.64076..."


## Keywords

In [38]:
# Embed each record's joined keyword string; adds a 'keywords-scibert' column to df.
df = calculate_embeddings(df, 'keywords')
df.head()

1084 1084
1.91


Unnamed: 0,code,title,abstract,keywords,title-scibert,abstract-scibert,keywords-scibert
0,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131...","[-0.07536177, -0.06736588, -0.37270182, 0.5410...","[0.7376937, 0.29904118, -0.85561955, 1.0003155..."
1,1,D-mannose: a promising support for acute urina...,OBJECTIVE: Urinary tract infections still repr...,"Adolescent; Adult; Aged; Aged, 80 and over; Cy...","[-0.8316783, -0.2997632, -0.78207463, -0.12131...","[-0.07536177, -0.06736588, -0.37270182, 0.5410...","[0.7376937, 0.29904118, -0.85561955, 1.0003155..."
2,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823...","[-0.049732774, 0.14015731, -0.08440088, 0.2876...","[0.92701435, -0.33541802, -1.171323, 0.8342975..."
3,1,Reducing consultations for symptoms of cystiti...,This study aims to evaluate the effects of a h...,Bacterial Infections/*prevention & control; *D...,"[-0.4130542, -0.054540712, 0.16070063, -0.2823...","[-0.049732774, 0.14015731, -0.08440088, 0.2876...","[0.92701435, -0.33541802, -1.171323, 0.8342975..."
4,1,Probiotics feeding in prevention of urinary tr...,BACKGROUND: It has been suggested that probiot...,Adolescent; Adult; *Drinking; Female; Humans; ...,"[-0.5211807, -0.7154993, -0.045323715, 0.25110...","[0.22030136, -0.32066062, -0.17968939, 0.64076...","[0.9114151, -0.18335994, -1.2328147, 0.7949868..."


In [39]:
# Save the DataFrame (labels, text fields and the embedding columns added above) as a pickle file
df.to_pickle("./uti/utiScreeningTitleAbstractKeywords.pkl")