# Data preprocessing

The purpose of this notebook is to preprocess all the BioC JSON data obtained from `pokay_processor.ipynb` so that it can be used to train the downstream BERT models. This involves some minor regex and date filtering, as well as text extraction and formatting the data into dataframes.

In [5]:
import re
import json
import random
import copy
from bioc import biocjson
import pandas as pd
import pypdf
import os

In [6]:
# Regular expressions used to spot mutation mentions in paper text.
# The filtering cell below applies every one of them with re.IGNORECASE.

# One-letter amino-acid change, e.g. "D614G" or "H69del":
#   group 1 = reference residue
#   group 2 = position (first digit is 1-9, so no leading zero)
#   group 3 = "del", or a residue that differs from group 1 (enforced by the (?!\1) lookahead)
one_letter_aa_change = r'\b([ARNDCQEGHILKMFPSTWYV])([1-9]+\d*)(del|(?!\1)[ARNDCQEGHILKMFPSTWYV])\b'
# Three-letter amino-acid change, e.g. "ASP614GLY" or "HIS69DEL".
# NOTE: the capture group inside the negative lookahead can never take part in a
# successful match, so re.findall reports an empty string in that position.
three_letter_aa_change = r'\b((?:ALA|ARG|ASN|ASP|CYS|GLN|GLU|GLY|HIS|ILE|LEU|LYS|MET|PHE|PRO|SER|THR|TRP|TYR|VAL))([1-9]+\d*)(?!(\1))(ALA|ARG|ASN|ASP|CYS|DEL|GLN|GLU|GLY|HIS|ILE|LEU|LYS|MET|PHE|PRO|SER|THR|TRP|TYR|VAL)\b'
# Nucleotide change written as g.<ref><position><alt>, e.g. "g.A23403G"
genome_change = r'\bg\.[ATGCU][1-9]+\d*[ATGCU]\b'
# Nucleotide change written as g.<position><ref>><alt>, e.g. "g.23403A>G"
genome_change_alt =  r'\bg\.[1-9]+\d*[ATGCU]\>[ATGCU]\b'

# Load data
Load BioC JSON dictionaries from `pokay_processor.ipynb`

In [7]:
def check_dictionary(d):
    """Print the number of entries in ``d`` and list the unfinished ones.

    An entry is reported when its value is ``None`` or the string
    ``"converting"``; every other entry is silent.
    """
    print(f"size: {len(d)}")

    for key, value in d.items():
        if value is None:
            print("None: " + key)
        elif value == "converting":
            print("Converting: " + key)

In [8]:
# Load litcovid data
# This was downloaded from https://ftp.ncbi.nlm.nih.gov/pub/lu/LitCovid/
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default encoding (which is not UTF-8 everywhere).
with open('../data/pokay/litcovid2BioCJSON', encoding='utf-8') as f:
    litcovid_data = json.load(f)

In [9]:
# Load all pokay data: the BioC JSON dictionary written by pokay_processor.ipynb
with open('../data/pokay/processed/data_bioc.txt') as file:
    pokay_data = json.load(file)

In [10]:
# Report how many Pokay entries were loaded and which of them are None or still "converting"
check_dictionary(pokay_data)

size: 316
None: https://doi.org/10.1016/S1473-3099
None: https://doi.org/10.1016/s1473-3099
None: https://doi.org/10.1002/jmv.26997
None: https://doi.org/10.1080/23744235.2021.1977382
None: https://doi.org/10.1002/jmv.27247
None: https://doi.org/10.1016/S0140-6736
None: https://doi.org/10.1073/pnas.1707304114
None: https://doi.org/10.21203/rs.3.rs-318392/v1
None: https://www.researchgate.net/publication/348943694_The_mutation_P681H_in_the_B117_variant_of_SARS-CoV-2_probably_enhances_viral_entry_and_replication
None: https://observablehq.com/@aglucaci/sc2-omicron
None: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/961042/S1095_NERVTAG_update_note_on_B.1.1.7_severity_20210211.pdf
None: https://doi.org/10.47326/ocsat.dashboard.2021.1.0
None: https://www.covid19genomics.dk/2021-05-08_data-overview.html#b1525
None: https://drive.google.com/file/d/1CuxmNYj5cpIuxWXhjjVmuDqntxXwlfXQ/view
None: https://www.moh.gov.sg/news-highlights/details/3-ne

In [36]:
# Load data_keys: for every paper key, the strings that are later joined into its BART summary
with open("../data/pokay/data_keys.txt") as file:
    pokay_keys = json.load(file)

In [12]:
# check_dictionary(pokay_keys)

# Basic filters
Perform basic filtering on the BioC JSON from the LitCovid database. The papers that pass these filters are used as negative examples in downstream training.

In [None]:
# Filter using Regex to identify mutations: keep every LitCovid paper whose
# text mentions at least one mutation.

# Compile once; all patterns are matched case-insensitively.
mutation_patterns = [
    re.compile(pattern, re.IGNORECASE)
    for pattern in (one_letter_aa_change, three_letter_aa_change, genome_change, genome_change_alt)
]

filtered_papers = []

# NOTE(review): index 1 of the loaded JSON is presumably the list of documents — confirm.
for paper in litcovid_data[1]:
    passages = paper.get("passages") if isinstance(paper, dict) else None
    if not passages:
        # Entry without passages: nothing to search
        continue

    section_texts = [
        section["text"]
        for section in passages
        if isinstance(section, dict) and isinstance(section.get("text"), str)
    ]

    # Join with a newline. Gluing sections together directly fuses the last
    # word of one section with the first word of the next, which hides a
    # mutation at a section boundary from the \b anchors (and can fabricate a
    # match that spans two sections).
    text = "\n".join(section_texts)

    # Only the existence of a match matters, so stop at the first hit
    if any(pattern.search(text) for pattern in mutation_patterns):
        filtered_papers.append(paper)

with open('../data/pokay/filtered_papers.txt', 'w') as file:
    json.dump(filtered_papers, file)

In [20]:
# Load filtered papers (the list saved by the regex filtering cell)
with open('../data/pokay/filtered_papers.txt') as file:
    filtered_papers = json.load(file)

In [21]:
# Remove papers that are in pokay database

_DOI_URL_PREFIXES = (
    "https://doi.org/",
    "http://doi.org/",
    "https://dx.doi.org/",
    "http://dx.doi.org/",
    "doi:",
)


def _normalize_doi(value):
    """Reduce a DOI or DOI URL to a bare, lower-case DOI.

    Returns None when ``value`` is not a non-empty string. DOI names are
    case-insensitive, hence the lower-casing.
    """
    if not isinstance(value, str):
        return None

    doi = value.strip().lower()
    for prefix in _DOI_URL_PREFIXES:
        if doi.startswith(prefix):
            doi = doi[len(prefix):]
            break

    return doi or None


# pokay_data is keyed by full URLs ("https://doi.org/10.1002/..."), while the
# BioC infons presumably carry a bare DOI, so a direct `doi in pokay_data`
# lookup would not match. Both sides are normalized so the comparison works
# whichever form either side uses.
pokay_dois = {_normalize_doi(key) for key in pokay_data} - {None}


def related_paper(paper):
    """Return True when the paper's DOI belongs to a paper in the Pokay database.

    The DOI is read from the infons of the first passage; a paper without one
    is treated as unrelated.
    """
    try:
        doi = paper["passages"][0]['infons']['article-id_doi']
    except (KeyError, IndexError, TypeError):
        return False

    return _normalize_doi(doi) in pokay_dois


filtered_papers_copy = [x for x in filtered_papers if not related_paper(x)]

In [22]:
# Filter by date. Keep papers published up to and including 2021
# (the `after_date_filtered_papers` cell keeps everything later than 2021).

before_date_filtered_papers = []

for paper in filtered_papers_copy:
    try:
        year = int(paper["year"])
    except (KeyError, TypeError, ValueError):
        # No usable publication year: the paper goes into neither date split
        continue

    if year <= 2021:
        before_date_filtered_papers.append(paper)

In [23]:
# Filter by date. Keep papers published after 2021, i.e. from 2022 onwards
# (the `before_date_filtered_papers` cell keeps 2021 and earlier).

after_date_filtered_papers = []

for paper in filtered_papers_copy:
    try:
        year = int(paper["year"])
    except (KeyError, TypeError, ValueError):
        # No usable publication year: the paper goes into neither date split
        continue

    if year > 2021:
        after_date_filtered_papers.append(paper)

# Break into subtasks
Helper functions for preprocessing

In [24]:
# Function to grab subsample from data
def subset_sample(original, n):
    """Split a list into a reproducible random sample and the remainder.

    ``original`` is deep-copied and shuffled with a fixed seed (42), so the
    input is never modified and the same input always yields the same split.

    Args:
        original: list of items to draw from.
        n: number of items to draw.

    Returns:
        ``(remaining, sub)``: ``sub`` holds the ``n`` sampled items (taken from
        the end of the shuffled copy, last item first) and ``remaining`` holds
        the rest of the shuffled copy.

    Raises:
        IndexError: if ``n`` is larger than ``len(original)``.
    """
    if n > len(original):
        # Fail before paying for the deep copy, and say what went wrong
        raise IndexError(f"cannot sample {n} items from a list of {len(original)}")

    shuffled = copy.deepcopy(original)
    # A private generator gives the same permutation as random.seed(42) followed
    # by random.shuffle, without resetting the global random state for everyone else.
    random.Random(42).shuffle(shuffled)

    cut = len(shuffled) - max(n, 0)
    sub = shuffled[cut:][::-1]

    return shuffled[:cut], sub

In [25]:
# Function to un-nest data. Example JSON file will contain Key1: {Key 2: {Key3: Val}}
def extract_nested_elements(input_string):
    """Return every top-level ``{...}`` element found in ``input_string``.

    Elements are located by brace matching, so nested objects stay inside
    their outermost element. Two things are ignored so that the matching
    cannot be thrown off:

    * braces inside a double-quoted JSON string (with backslash escapes) of an
      element, e.g. passage text such as ``"a } b"``;
    * a closing brace that has no matching opening brace.

    Args:
        input_string: text containing zero or more JSON objects.

    Returns:
        List of substrings, in order of appearance, each spanning one
        top-level element including its outer braces.
    """
    elements = []
    start = 0
    depth = 0
    in_string = False
    escaped = False

    for i, char in enumerate(input_string):
        if in_string:
            # Inside a JSON string only the closing quote matters
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
            continue

        if char == '"':
            # Quotes outside any element are plain text, not string delimiters
            if depth > 0:
                in_string = True
        elif char == '{':
            if depth == 0:
                start = i
            depth += 1
        elif char == '}' and depth > 0:
            depth -= 1
            if depth == 0:
                elements.append(input_string[start:i + 1])

    return elements

In [26]:
# Extract text from LitCovid BioC documents; output is a dictionary keyed by PMID
def litcovid_text_extract(data):
    """Flatten LitCovid BioC documents into ``{pmid: text}``.

    The text of each passage is followed by ". " when it ends with a letter or
    digit and by " " otherwise, so passages do not run into each other.
    Passages without text are skipped. Documents lacking "passages" or "pmid"
    are left out of the result.

    Args:
        data: iterable of BioC document dictionaries.

    Returns:
        Dictionary mapping each document's pmid to its concatenated text
        (an empty string when no passage carries text).
    """
    out = {}

    for paper in data:
        try:
            passages = paper["passages"]
            pmid = paper["pmid"]
        except (KeyError, TypeError):
            continue

        pieces = []
        for section in passages:
            try:
                section_text = section["text"]
            except (KeyError, TypeError):
                continue

            # Skipping empty sections also guards the [-1] lookup below, which
            # would raise on an empty string
            if not isinstance(section_text, str) or not section_text:
                continue

            pieces.append(section_text)
            pieces.append(". " if section_text[-1].isalnum() else " ")

        out[pmid] = "".join(pieces)

    return out

In [27]:
# Grab file name from DOI
def get_file_name(key):
    """Derive a file-name stem from a paper key.

    For a ``https://doi.org/...`` key the stem is the part after ``doi.org/``;
    for any other key it is the part after ``https://``. In the stem every
    "." becomes "-" and every "/" becomes "_". No extension is appended.
    """
    is_doi_url = re.search(r'https:\/\/doi\.org\/[\w/.-]+', key) is not None
    separator = 'doi.org/' if is_doi_url else 'https://'

    stem = key.split(separator)[-1]

    return stem.replace(".", "-").replace("/", "_")

In [28]:
# text extract of JSON from pubtator API
def pubtator_extract(paper):
    """Extract the plain text of a paper stored as PubTator BioC JSON.

    The first and last characters of ``paper`` are dropped, the top-level
    ``{...}`` elements are located, and the last one is parsed as a BioC
    collection. The text of each passage is followed by ". " when it ends with
    a letter or digit and by " " otherwise; passages without text are skipped.

    Returns:
        The concatenated text, or None when the input cannot be parsed or
        contains no text.
    """
    try:
        # NOTE(review): the stripped characters are presumably a wrapping pair
        # of quotes/brackets around the payload — confirm against pokay_processor.
        bioc_list = extract_nested_elements(paper[1:-1])
        bioc_collection = biocjson.loads(bioc_list[-1])
    except Exception:
        return None

    pieces = []
    for document in bioc_collection.documents:
        for passage in document.passages:
            passage_text = getattr(passage, "text", None)

            # Skipping empty passages also guards the [-1] lookup below, which
            # would raise on an empty string and discard the whole paper
            if not isinstance(passage_text, str) or not passage_text:
                continue

            pieces.append(passage_text)
            pieces.append(". " if passage_text[-1].isalnum() else " ")

    return "".join(pieces) or None

In [29]:
# text extract of JSON from conversions of JATS XML
def jats_extract(paper):
    """Extract the plain text of a paper stored as BioC JSON converted from JATS XML.

    Parsing is attempted first on ``paper`` without its first and last
    characters and, if that fails, on ``paper`` as given. The text of each
    passage is followed by ". " when it ends with a letter or digit and by " "
    otherwise; passages without text are skipped.

    Returns:
        The concatenated text, or None when neither form can be parsed or the
        collection contains no text.
    """
    try:
        bioc_collection = biocjson.loads(paper[1:-1])
    except Exception:
        try:
            bioc_collection = biocjson.loads(paper)
        except Exception:
            return None

    pieces = []
    for document in bioc_collection.documents:
        for passage in document.passages:
            passage_text = getattr(passage, "text", None)

            # Skipping empty passages also guards the [-1] lookup below, which
            # would raise on an empty string and discard the whole paper
            if not isinstance(passage_text, str) or not passage_text:
                continue

            pieces.append(passage_text)
            pieces.append(". " if passage_text[-1].isalnum() else " ")

    return "".join(pieces) or None

In [30]:
# text extract of JSON from conversions of PDF
def pdf_extract(data):
    """Extract the plain text of a paper stored as BioC JSON converted from a PDF.

    ``data`` is parsed as a BioC collection. The text of each passage is
    followed by ". " when it ends with a letter or digit and by " " otherwise;
    passages without text are skipped.

    Returns:
        The concatenated text, or None when ``data`` cannot be parsed or the
        collection contains no text.
    """
    try:
        # Parse the argument itself; reading a global named `paper` here would
        # either fail or silently parse whatever a previous cell left behind.
        bioc_collection = biocjson.loads(data)
    except Exception:
        return None

    pieces = []
    for document in bioc_collection.documents:
        for passage in document.passages:
            passage_text = getattr(passage, "text", None)

            # Skipping empty passages also guards the [-1] lookup below, which
            # would raise on an empty string
            if not isinstance(passage_text, str) or not passage_text:
                continue

            pieces.append(passage_text)
            pieces.append(". " if passage_text[-1].isalnum() else " ")

    return "".join(pieces) or None

In [43]:
# Extract pokay text from each individual pokay paper
def pokay_text_extract(paper, key=None):
    """Extract the plain text of a single Pokay paper.

    The function has no side effects on notebook state: it only returns the
    text and never appends to any list defined in another cell.

    Args:
        paper: BioC JSON string of the paper, or None when no conversion exists.
        key: optional paper key (URL). Only used when ``paper`` is None, to
            locate an unconverted PDF under ../data/raw/pdf/unconverted/.

    Returns:
        * ``paper`` given: the text from the first extractor that succeeds
          (PubTator, then JATS, then PDF-converted BioC), or None if all fail.
        * ``paper`` is None: the text read from the matching PDF, or "" when
          no ``key`` is given or the PDF does not exist.
    """
    if paper is not None:
        for extractor in (pubtator_extract, jats_extract, pdf_extract):
            try:
                text = extractor(paper)
            except Exception:
                # Wrong format for this extractor: try the next one
                continue

            if text is not None:
                return text

        return None

    if key is None:
        return ""

    file = "../data/raw/pdf/unconverted/" + get_file_name(key) + ".pdf"
    if not os.path.exists(file):
        return ""

    print(file)
    reader = pypdf.PdfReader(file)

    # extract_text() is guarded with `or ""` in case a page yields nothing
    return "".join(page.extract_text() or "" for page in reader.pages)

In [44]:
# Final function to handle all cases for pokay data
def pokay_extract(data):
    """Extract the plain text of every Pokay paper that yields any.

    For an entry with BioC JSON, the PubTator, JATS and PDF-converted
    extractors are tried in that order and the first non-None result is kept.
    For an entry whose value is None, the text is read from the matching PDF
    under ../data/raw/pdf/unconverted/ if that file exists. Entries that yield
    no text are skipped.

    Args:
        data: dictionary mapping paper keys (URLs) to BioC JSON strings or None.

    Returns:
        List of extracted texts, in the iteration order of ``data``.
    """
    pokay_text = []

    for key, paper in data.items():
        if paper is not None:
            for extractor in (pubtator_extract, jats_extract, pdf_extract):
                try:
                    text = extractor(paper)
                except Exception:
                    # Wrong format for this extractor: try the next one.
                    # (Exception, not a bare except, so Ctrl-C still interrupts.)
                    continue

                if text is not None:
                    pokay_text.append(text)
                    break

            continue

        # No BioC conversion for this key: fall back to the raw PDF, if present
        file = "../data/raw/pdf/unconverted/" + get_file_name(key) + ".pdf"
        if not os.path.exists(file):
            continue

        print(file)
        reader = pypdf.PdfReader(file)
        text = "".join(page.extract_text() or "" for page in reader.pages)

        if text != "":
            pokay_text.append(text)

    return pokay_text

# Create datasets for BERT model

In [40]:
# Create training dataset for initial training
litcovid, train_data = subset_sample(before_date_filtered_papers, 309) # Subsample negative examples for training data
train_data_text = litcovid_text_extract(train_data) # Extract text from BioC JSON
pokay_text = pokay_extract(pokay_data) # Extract text from BioC JSON of Pokay (positive examples)

# Create dataframe
# litcovid_text_extract returns {pmid: text}. Handing that dict to
# pd.DataFrame(..., columns=["text"]) treats the pmids as column labels and
# selects the (missing) "text" column, i.e. an empty frame with no negative
# examples at all. Build the column from the texts instead.
df = pd.DataFrame({"text": list(train_data_text.values())})
df["label"] = 0

df_2 = pd.DataFrame(pokay_text, columns=["text"])
df_2["label"] = 1

# ignore_index gives the combined frame one unique running index
df = pd.concat([df, df_2], ignore_index=True)

# Save dataset
df.to_csv("../data/pipeline_data/paper_flagging_data/bert_dataset.csv")

In [41]:
# Create retraining dataset
# Drawn from `litcovid`, the papers left over after the training sample was taken
papers, retrain_data = subset_sample(litcovid, 500)
retrain_data_text = litcovid_text_extract(retrain_data)

# Create dataframe
# retrain_data_text is {pmid: text}; passing the dict itself together with
# columns=["text"] would give an empty frame, so build the column from the texts.
df = pd.DataFrame({"text": list(retrain_data_text.values())})

# Save chunks dataset -> this dataset will need to be manually annotated using the notebook dataset_labeller.ipynb
df.to_csv('../data/pipeline_data/paper_flagging_data/chunks_dataset.csv')

In [42]:
# Create dataset with 500 papers after 2021 (example of data we scrape in the future)
new_papers, data = subset_sample(after_date_filtered_papers, 500)
data_text = litcovid_text_extract(data)

# Create dataframe
# data_text is {pmid: text}; passing the dict itself together with
# columns=["text"] would give an empty frame, so build the column from the texts.
df = pd.DataFrame({"text": list(data_text.values())})

# Save dataset
df.to_csv("../data/pipeline_data/paper_flagging_data/new_papers_dataset.csv")

# Create dataset for BART

In [45]:
# One summary string per paper: the entries of pokay_keys joined by newlines
combined_pokay_keys = {key: "\n".join(value) for key, value in pokay_keys.items()}

# Input text
df = pd.DataFrame.from_dict(pokay_data, orient='index', columns=["text"])

# Summary output
df_2 = pd.DataFrame.from_dict(combined_pokay_keys, orient='index', columns=['summary'])

# Pair every paper with its summary (inner join on the paper key) and drop
# papers that have no BioC JSON or no summary
pokay = pd.merge(df, df_2, left_index=True, right_index=True)
pokay = pokay.dropna()

pokay["text"] = pokay["text"].map(pokay_text_extract)

# Extraction can fail for a paper; a row without input text is useless for
# training, so drop it rather than write an empty cell to the CSV
pokay = pokay[pokay["text"].notna() & (pokay["text"] != "")]

pokay.to_csv('../data/pipeline_data/paper_flagging_data/BART_dataset.csv')