In [1]:
import re
import json
import random
import copy
from bioc import biocjson
import pandas as pd
import pypdf
import os

In [2]:
# Regular expressions for mutation mentions.
# The patterns are written in upper case; the (currently commented-out) regex filter
# cell further down applies them with re.IGNORECASE.
#
# One-letter amino-acid change such as "D614G" or "H69del": reference residue, position
# (digits that do not start with 0), then "del" or a residue that differs from the
# reference (enforced by the (?!\1) negative lookahead). Three capture groups.
one_letter_aa_change = r'\b([ARNDCQEGHILKMFPSTWYV])([1-9]+\d*)(del|(?!\1)[ARNDCQEGHILKMFPSTWYV])\b'
# Three-letter amino-acid change such as "ASP614GLY" or "HIS69DEL". Same layout as above,
# but note the extra capture group inside the lookahead, (?!(\1)): it never participates
# in a successful match, so re.findall yields four-item tuples whose third item is ''.
three_letter_aa_change = r'\b((?:ALA|ARG|ASN|ASP|CYS|GLN|GLU|GLY|HIS|ILE|LEU|LYS|MET|PHE|PRO|SER|THR|TRP|TYR|VAL))([1-9]+\d*)(?!(\1))(ALA|ARG|ASN|ASP|CYS|DEL|GLN|GLU|GLY|HIS|ILE|LEU|LYS|MET|PHE|PRO|SER|THR|TRP|TYR|VAL)\b'
# Nucleotide change written "g.<ref><pos><alt>", e.g. "g.A23403G". Unlike the amino-acid
# patterns, nothing here forces <ref> and <alt> to differ. No capture groups.
genome_change = r'\bg\.[ATGCU][1-9]+\d*[ATGCU]\b'
# Nucleotide change written "g.<pos><ref>><alt>", e.g. "g.23403A>G" ("\>" is a literal ">").
genome_change_alt =  r'\bg\.[1-9]+\d*[ATGCU]\>[ATGCU]\b'

In [3]:
def check_dictionary(d):
    """Print a short health report for a key -> payload dictionary.

    Prints the number of entries, then one line per key whose value is
    ``None`` and one line per key whose value is the string "converting".

    Args:
        d: dictionary with string keys.
    """
    print("size: " + str(len(d)))

    for key, value in d.items():
        if value is None:
            print("None: " + key)
        elif value == "converting":
            print("Converting: " + key)

In [4]:
def get_file_name(key):
    """Turn a source URL into a file-name stem (no extension is appended).

    For a ``https://doi.org/...`` URL the stem is whatever follows the last
    ``doi.org/``; for any other key it is whatever follows the last
    ``https://``. Every "." is then replaced with "-" and every "/" with "_".
    All other characters are left untouched.

    Args:
        key: URL string.

    Returns:
        The derived file-name stem.
    """
    doi_pattern = r'https:\/\/doi\.org\/[\w/.-]+'

    if re.search(doi_pattern, key) is None:
        stem = key.split('https://')[-1]
    else:
        stem = key.split('doi.org/')[-1]

    # "." -> "-" first, then "/" -> "_".
    return stem.replace(".", "-").replace("/", "_")

# Load data

In [7]:
# Load litcovid data (parsed as JSON).
# NOTE(review): absolute, user-specific path - this cell only runs where that home
# directory exists; consider a configurable data directory like the relative
# '../data/...' paths used by the other load cells.
with open('/home/david.yang1/autolit/litcovid/data/litcovid2BioCJSON') as f:
    litcovid_data = json.load(f)

In [13]:
# Load the complete pokay dictionary (entries with a None value are kept here).
with open('../data/processed/pokay/data_bioc.txt') as file:
    pokay_data = json.load(file)

In [9]:
# Load the pokay publication records, keeping only entries whose value is not None.
with open('../data/processed/pokay/publication_bioc.txt') as file:
    publication_bioc = {
        key: value for key, value in json.load(file).items() if value is not None
    }

In [10]:
# Load the pokay "publication unk" records, keeping only entries whose value is not None.
with open('../data/processed/pokay/publication_unk_bioc.txt') as file:
    publication_unk_bioc = {
        key: value for key, value in json.load(file).items() if value is not None
    }

In [11]:
# Load the pokay rxiv records, keeping only entries whose value is not None.
with open('../data/processed/pokay/rxiv_bioc.txt') as file:
    rxiv_bioc = {
        key: value for key, value in json.load(file).items() if value is not None
    }

In [12]:
# Load the pokay rxiv "unknown" records, keeping only entries whose value is not None.
with open('../data/processed/pokay/rxiv_unk_bioc.txt') as file:
    rxiv_unk_bioc = {
        key: value for key, value in json.load(file).items() if value is not None
    }

In [14]:
# Load the pokay grey-literature records, keeping only entries whose value is not None.
with open('../data/processed/pokay/grey_bioc.txt') as file:
    grey_bioc = {
        key: value for key, value in json.load(file).items() if value is not None
    }

In [15]:
# Print the size of the pokay dictionary and list the keys whose value is None or "converting".
check_dictionary(pokay_data)

size: 316
None: https://doi.org/10.1016/S1473-3099
None: https://doi.org/10.1016/s1473-3099
None: https://doi.org/10.1002/jmv.26997
None: https://doi.org/10.1080/23744235.2021.1977382
None: https://doi.org/10.1002/jmv.27247
None: https://doi.org/10.1016/S0140-6736
None: https://doi.org/10.1073/pnas.1707304114
None: https://doi.org/10.21203/rs.3.rs-318392/v1
None: https://www.researchgate.net/publication/348943694_The_mutation_P681H_in_the_B117_variant_of_SARS-CoV-2_probably_enhances_viral_entry_and_replication
None: https://observablehq.com/@aglucaci/sc2-omicron
None: https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/961042/S1095_NERVTAG_update_note_on_B.1.1.7_severity_20210211.pdf
None: https://doi.org/10.47326/ocsat.dashboard.2021.1.0
None: https://www.covid19genomics.dk/2021-05-08_data-overview.html#b1525
None: https://drive.google.com/file/d/1CuxmNYj5cpIuxWXhjjVmuDqntxXwlfXQ/view
None: https://www.moh.gov.sg/news-highlights/details/3-ne

# Basic filters

In [None]:
# Filter by REGEX (disabled - every line below is commented out).
# Presumably kept as a record of how the filtered paper list was produced; the next cell
# loads a saved result instead of recomputing it.
# NOTE(review): this code writes 'filtered_papers.txt' to the working directory, whereas
# the loader reads '../data/processed/pokay/filtered_papers.txt' - confirm the file
# location before re-enabling.
# count = 0
# filtered_papers = []

# for paper in litcovid_data[1]:
    
#     try:
#         passage = paper["passages"]
#     except:
#         continue
    
#     text = ""

#     for section in passage:
#         # print(" ")
#         # print(section) 
#         try:
#             text += section['text']
#         except:
#             pass

#     mutations = []
#     mutations += ["".join(x) for x in re.findall(one_letter_aa_change, text, re.IGNORECASE)]
#     mutations += ["".join(x) for x in re.findall(three_letter_aa_change, text, re.IGNORECASE)]
#     mutations += re.findall(genome_change, text, re.IGNORECASE)
#     mutations += re.findall(genome_change_alt, text, re.IGNORECASE)
#     mutations = set(mutations)

#     if len(mutations) > 0:
#         filtered_papers.append(paper)

# with open('filtered_papers.txt', 'w') as file:
#      file.write(json.dumps(filtered_papers))

In [8]:
# Load the saved list of filtered papers.
with open('../data/processed/pokay/filtered_papers.txt') as file:
    filtered_papers = json.load(file)

In [16]:
# Remove papers that are in pokay database

def related_paper(paper, known_dois=None):
    """Return True when the paper's DOI is already present in the pokay data.

    The DOI is read from ``paper["passages"][0]["infons"]["article-id_doi"]``.
    The pokay keys printed by ``check_dictionary`` are full URLs
    (``https://doi.org/<doi>``), so the lookup tries both the DOI exactly as
    stored in the paper and its ``https://doi.org/`` URL form.
    NOTE(review): the infons field presumably holds a bare DOI such as
    "10.1002/jmv.26997" - confirm against the litcovid data. The comparison is
    case-sensitive.

    Args:
        paper: BioC-style dict for one paper.
        known_dois: container of known DOI keys; defaults to the notebook-level
            ``pokay_data`` dictionary.

    Returns:
        True if the DOI (in either form) is in ``known_dois``; False otherwise,
        including when the paper has no readable DOI.
    """
    if known_dois is None:
        known_dois = pokay_data

    try:
        doi = paper["passages"][0]['infons']['article-id_doi']
    except (KeyError, IndexError, TypeError):
        # Missing passages / infons / DOI field: treat as "not in pokay".
        return False

    if not isinstance(doi, str):
        return False

    return (doi in known_dois) or (("https://doi.org/" + doi) in known_dois)

# Keep only the papers for which related_paper() is False. Despite the "_copy" suffix
# this is a new, filtered list rather than a copy of filtered_papers.
filtered_papers_copy = [x for x in filtered_papers if not related_paper(x)]

In [17]:
# Filter by date: keep papers whose "year" is 2021 or earlier.
# Papers with a missing or non-numeric "year" are skipped.

before_date_filtered_papers = []

for paper in filtered_papers_copy:
    try:
        year = int(paper["year"])
    except (KeyError, TypeError, ValueError):
        # No usable year on this record.
        continue

    if year <= 2021:
        before_date_filtered_papers.append(paper)

In [18]:
# Filter by date: keep papers whose "year" is later than 2021.
# Papers with a missing or non-numeric "year" are skipped.

after_date_filtered_papers = []

for paper in filtered_papers_copy:
    try:
        year = int(paper["year"])
    except (KeyError, TypeError, ValueError):
        # No usable year on this record.
        continue

    if year > 2021:
        after_date_filtered_papers.append(paper)

# Sample from dataset to make initial training data

In [19]:
# Function to grab subsample from data

def subset_sample(original, n, seed=42):
    """Split a list into a reproducible random sample and the remainder.

    A deep copy of ``original`` is shuffled with a private random generator
    seeded by ``seed``, then ``n`` items are popped off the end. The input list
    is not modified and the process-wide ``random`` state is left untouched.
    With the default seed the result is the same as seeding the global
    generator with 42 and calling ``random.shuffle``.

    Args:
        original: list to sample from.
        n: number of items to draw; a value <= 0 draws nothing.
        seed: seed for the private generator (default 42).

    Returns:
        ``(remaining, sample)`` - the shuffled leftovers and the ``n`` drawn items.

    Raises:
        IndexError: if ``n`` is larger than ``len(original)``.
    """
    pool = copy.deepcopy(original)
    # A dedicated generator keeps the sampling reproducible without reseeding
    # the global RNG that other code may share.
    random.Random(seed).shuffle(pool)

    sample = [pool.pop() for _ in range(n)]
    return pool, sample

# Extract text portions from paper

In [21]:
# Extract from training data (litcovid portion)
def litcovid_text_extract(data):
    """Concatenate the passage texts of each paper into one string per paper.

    Papers without a readable "passages" entry are skipped entirely (they
    produce no output item). Within a paper, passages that lack a "text" entry
    or whose "text" is not a string are ignored; a paper with no usable
    passages yields an empty string.

    Args:
        data: iterable of BioC-style paper dicts.

    Returns:
        List of strings, one per paper that has a "passages" entry.
    """
    out = []

    for paper in data:
        try:
            passages = paper["passages"]
        except (KeyError, TypeError):
            continue

        pieces = []
        for section in passages:
            try:
                piece = section['text']
            except (KeyError, TypeError):
                continue
            if isinstance(piece, str):
                pieces.append(piece)

        # Join once instead of growing a string passage by passage.
        out.append("".join(pieces))

    return out

In [22]:
# Function to un-nest data

def extract_nested_elements(input_string):
    """Return every top-level ``{...}`` substring of ``input_string``.

    The string is scanned once while tracking brace depth. A substring is
    emitted each time the depth returns to zero after an opening brace that was
    seen at depth zero. Braces are counted wherever they occur (including inside
    quoted text), and an unmatched "}" drives the depth negative.

    Args:
        input_string: text to scan.

    Returns:
        List of substrings, each starting with "{" and ending with its matching "}".
    """
    found = []
    depth = 0
    opened_at = 0
    capturing = False

    for pos, ch in enumerate(input_string):
        if ch == '{':
            if depth == 0:
                opened_at, capturing = pos, True
            depth += 1
            continue

        if ch != '}':
            continue

        depth -= 1
        if capturing and depth == 0:
            found.append(input_string[opened_at:pos + 1])
            capturing = False

    return found

## Break into subtasks

In [23]:
# Pubtator
def pubtator_extract(paper):
    """Extract the concatenated passage text from a pubtator-style payload.

    The first and last characters of ``paper`` are discarded (presumably a
    wrapping pair of brackets or quotes - TODO confirm), the remainder is split
    into top-level ``{...}`` elements, and only the LAST element is parsed with
    ``biocjson.loads``.

    Args:
        paper: serialized payload (string).

    Returns:
        All passage texts joined together, or None when parsing fails or no
        text is found.
    """
    trimmed = paper[1:-1]

    try:
        bioc_collection = biocjson.loads(extract_nested_elements(trimmed)[-1])
    except Exception:
        # Best effort: no elements found or not parseable as BioC JSON.
        return None

    text = ""
    for document in bioc_collection.documents:
        for passage in document.passages:
            try:
                text += passage.text
            except (AttributeError, TypeError):
                # Passage without usable text: show it and carry on.
                print(passage)

    return text or None

In [24]:
# JATS
def jats_extract(paper):
    """Extract the concatenated passage text from a JATS-derived BioC payload.

    Parsing is attempted twice with ``biocjson.loads``: first on ``paper`` with
    its first and last characters removed, then on ``paper`` unchanged.

    Args:
        paper: serialized payload (string).

    Returns:
        All passage texts joined together, or None when both parse attempts
        fail or no text is found.
    """
    try:
        bioc_collection = biocjson.loads(paper[1:-1])
    except Exception:
        try:
            bioc_collection = biocjson.loads(paper)
        except Exception:
            return None

    text = ""
    for document in bioc_collection.documents:
        for passage in document.passages:
            try:
                text += passage.text
            except (AttributeError, TypeError):
                # Passage without usable text: skip it.
                continue

    return text or None

In [25]:
# PDF
def pdf_extract(data):
    """Extract the concatenated passage text from a PDF-derived BioC payload.

    ``data`` is parsed as-is with ``biocjson.loads`` (no trimming).

    Args:
        data: serialized payload (string).

    Returns:
        All passage texts joined together, or None when parsing fails or no
        text is found.
    """
    try:
        # Parse the argument itself; reading a notebook-level `paper` variable
        # here would only work by accident of the calling loop's naming.
        bioc_collection = biocjson.loads(data)
    except Exception:
        return None

    text = ""
    for document in bioc_collection.documents:
        for passage in document.passages:
            try:
                text += passage.text
            except (AttributeError, TypeError):
                # Passage without usable text: skip it.
                continue

    return text or None

In [26]:
# Build pokay_text: one text string per pokay entry that yields any text.
# Entries with a payload are run through the three extractors in order and the first
# non-None result is kept. Entries without a payload fall back to a local PDF, if present.
# NOTE(review): absolute, user-specific directory - adjust when running elsewhere.
UNCONVERTED_PDF_DIR = "/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/"

pokay_text = []

for key in pokay_data:
    paper = pokay_data[key]

    if paper is not None:
        for extract in (pubtator_extract, jats_extract, pdf_extract):
            try:
                text = extract(paper)
            except Exception:
                # This extractor could not handle the payload; try the next one.
                continue

            if text is not None:
                pokay_text.append(text)
                break
        continue

    # No payload for this key: look for a PDF named after the key.
    file = UNCONVERTED_PDF_DIR + get_file_name(key) + ".pdf"
    if not os.path.exists(file):
        continue

    print(file)
    reader = pypdf.PdfReader(file)
    text = "".join(page.extract_text() for page in reader.pages)

    if text != "":
        pokay_text.append(text)

/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/10-21203_rs-3-rs-318392_v1.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/www-researchgate-net_publication_348943694_The_mutation_P681H_in_the_B117_variant_of_SARS-CoV-2_probably_enhances_viral_entry_and_replication.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/observablehq-com_@aglucaci_sc2-omicron.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/assets-publishing-service-gov-uk_government_uploads_system_uploads_attachment_data_file_961042_S1095_NERVTAG_update_note_on_B-1-1-7_severity_20210211-pdf.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/10-47326_ocsat-dashboard-2021-1-0.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/www-covid19genomics-dk_2021-05-08_data-overview-html#b1525.pdf
/home/david.yang1/autolit/viriation/data/raw/pdf/unconverted/drive-google-com_file_d_1CuxmNYj5cpIuxWXhjjVmuDqntxXwlfXQ_view.pdf
/home/david.yang1/autolit/v

# Create train, evaluation, and test dataset

In [28]:
# Create training dataset for initial training.
# 309 papers are drawn from the <= 2021 pool (presumably chosen to balance the number of
# pokay texts - TODO confirm); `litcovid` keeps the remainder for the retraining cell.
litcovid, train_data = subset_sample(before_date_filtered_papers, 309)
train_data_text = litcovid_text_extract(train_data)

# Create dataframe: sampled litcovid papers are labelled 0.
df = pd.DataFrame(train_data_text, columns=["text"])
df["label"] = 0

# Pokay texts are labelled 1.
df_2 = pd.DataFrame(pokay_text, columns=["text"])
df_2["label"] = 1

# Each frame keeps its own index, so index values repeat in the combined frame
# (and in the index column of the CSV).
df = pd.concat([df, df_2])

# Save dataset (written to the current working directory).
df.to_csv("bert_dataset.csv")

In [29]:
# Create retraining dataset: draw 500 papers from `litcovid`, the remainder left over
# after the training sample was taken; `papers` receives what is left after this draw.
papers, retrain_data = subset_sample(litcovid, 500)
retrain_data_text = litcovid_text_extract(retrain_data)

# Create dataframe (text column only, no label).
df = pd.DataFrame(retrain_data_text, columns=["text"])

# Save dataset (written to the current working directory).
df.to_csv('chunks_dataset.csv')

In [30]:
# Create dataset with 500 papers after 2021; `new_papers` receives the papers not drawn.
new_papers, data = subset_sample(after_date_filtered_papers, 500)
data_text = litcovid_text_extract(data)

# Create dataframe (text column only, no label).
df = pd.DataFrame(data_text, columns=["text"])

# Save dataset (written to the current working directory).
df.to_csv("new_papers_dataset.csv")