## Processing of PubMed Open Access Case Reports

Case reports in the form of .xml files are cleaned using the text-processing helper functions, filtered by text length and for English-language text, and stored as a dictionary with PMIDs as keys and text as values.

In [None]:
import re, os, html, pickle, unicodedata, sys
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import numpy as np
from langdetect import detect

import spacy
from spacy.lang.en import English

## text processing helper functions

In [None]:
# helper functions for text pre-processing

# Pull the contents within the <body> tag
def get_body_and_abst_from_xml(raw_xml):
    """Return the abstract and body XML of an article as one string.

    Every ``<abstract ...>...</abstract>`` and ``<body>...</body>`` element
    found in *raw_xml* is extracted (tags included).  When more than one
    abstract or more than one body is present, only those that ``detect``
    classifies as English ('en') are kept, because some documents carry
    copies of the case report in several languages.

    Parameters:
        raw_xml: the full XML text of one article.

    Returns:
        The kept abstract elements followed by the kept body elements,
        separated by single spaces; '' when neither element is found.
    """
    # re.DOTALL is required so that '.' also matches across line breaks
    body_xml = re.findall('<body>.*?</body>', raw_xml, re.DOTALL)
    abstract_xml = re.findall('<abstract.*?>.*?</abstract>', raw_xml, re.DOTALL)

    # Only run language detection when there is a choice to make.
    if len(body_xml) > 1:
        body_xml = [i for i in body_xml if detect(i) == 'en']
    if len(abstract_xml) > 1:
        abstract_xml = [i for i in abstract_xml if detect(i) == 'en']

    # Join everything with a space.  Concatenating the two joined strings
    # directly put '</abstract>' right against '<body>', so once the tags were
    # stripped the last word of the abstract fused with the first body word.
    return ' '.join(abstract_xml + body_xml)
    
    
    
    ## this could be added
    # article_title_xml = re.findall('<article-title>.*?</article-title>', raw_xml, re.DOTALL)
    
    # need the period to allow for sentence tokenization
    # return(' '.join(abstract_xml) + ' '.join(body_xml) + '. '.join(article_title_xml))
    
    

# Removes all tags of the form <x> from the text without any replacement
def strip_xml_tags(raw_xml):
    """Delete every ``<...>`` tag from *raw_xml* and return what is left.

    Only the tags themselves are removed; the text between an opening and a
    closing tag is kept.  The pattern does not use re.DOTALL, so a tag whose
    angle brackets are on different lines is not matched.
    """
    return re.compile(r'<.*?>').sub('', raw_xml)

# thankfully Mike Becker found this SO post:
# https://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
# Fix representation of unicode characters
def fix_unicode_chars(s):
    """Replace HTML/XML character references in *s* with the characters they name.

    Every span that starts with '&' and runs to the next ';' on the same line
    is passed through ``html.unescape``; spans that are not valid references
    come back unchanged.

    Parameters:
        s: text that may contain references such as '&amp;' or '&#x3b1;'.

    Returns:
        The text with the references decoded.
    """
    # A callable replacement is used on purpose: the matched text must not be
    # re-interpreted as a regex pattern (it may contain '(', '.', '[' ...) and
    # the decoded text must not be interpreted as a replacement template
    # (it may be a backslash).  Both cases used to raise re.error.
    return re.sub(r'&.*?;', lambda match: html.unescape(match.group(0)), s)

# strip newline characters
# Removes all newline and tab characters and extra whitespace
def strip_breaks_and_tabs(s):
    """Turn newlines and tabs into spaces, then collapse whitespace runs.

    Any run of two or more whitespace characters is replaced by one space.
    """
    without_breaks = re.sub(r'[\n\t]', ' ', s)
    return re.sub(r'\s\s+', ' ', without_breaks)



# remove tags of the form <tag/> that appear in body and break further processing
def strip_misplaced_tags(s, tag):
    """Remove every attribute-less self-closing ``<tag/>`` occurrence from *s*.

    *tag* is inserted into the regular expression as-is.
    """
    self_closing = ''.join(('<', tag, '/>'))
    return re.sub(self_closing, '', s)


# remove contents between a particular tag (don't call this directly -- see below)
# Removes an opening tag, its ending, and everything between
def strip_tag_and_content(s, tag):
    """Remove each ``<tag ...>...</tag>`` element from *s*, content included.

    One whitespace character directly before the element and one comma
    directly after it are removed as well.

    Parameters:
        s: XML text.
        tag: element name; it is inserted into the regular expression as-is.

    Returns:
        *s* without the matched elements.
    """
    # The opening tag must be exactly `tag`, optionally followed by attributes:
    #   * '(?:\s[^>]*)?' stops 'sup' from matching '<supplementary-material>'
    #     or 'table' from matching '<table-wrap>', which used to delete
    #     everything up to the next unrelated closing tag;
    #   * '(?<!/)' rejects self-closing tags such as '<xref rid="r1"/>', which
    #     have no closing tag of their own to pair with.
    reg = re.compile(
        r'\s?<' + tag + r'(?:\s[^>]*)?(?<!/)>.*?</' + tag + r'>,?',
        re.DOTALL,  # element content may span several lines
    )
    return reg.sub('', s)

# keep list of tags to strip here and do it
# Removes anything not in the body of the text, e.g. figure captions, section headers, etc
def strip_non_body_text(s):
    """Drop elements that are not running text (titles, labels, captions, ...).

    For each tag name, in the listed order, the bare self-closing form is
    removed with ``strip_misplaced_tags`` and then the element together with
    its content is removed with ``strip_tag_and_content``.
    """
    non_body_tags = (
        'title', 'label', 'caption', 'table', 'table-wrap-foot',
        'sup', 'xref', 'disp-formula', 'inline-formula',
    )
    for tag_name in non_body_tags:
        s = strip_tag_and_content(strip_misplaced_tags(s, tag_name), tag_name)
    return s

# sections that have titles and content in p's
def strip_section_by_title(s, title):
    """Remove the ``<sec>`` element whose first child is ``<title>title</title>``.

    The match runs from the opening ``<sec>`` tag up to the first following
    ``</sec>``.

    Parameters:
        s: XML text.
        title: exact title text; it is inserted into the regular expression
            as-is, so it must match the case of *s*.

    Returns:
        *s* without the matched sections.
    """
    # '<sec(?:\s[^>]*)?>' also accepts attributes (e.g. <sec id="s5">) and
    # '\s*' accepts any amount of whitespace, including none, before <title>.
    # The earlier pattern demanded a bare '<sec>' followed by a newline and
    # indentation, so it missed attributed or single-line sections.
    reg = re.compile(
        r'<sec(?:\s[^>]*)?>\s*<title>' + title + r'</title>.*?</sec>',
        re.DOTALL,  # section content spans several lines
    )
    return reg.sub('', s)

# now strip the sections
def strip_trailing_content(s):
    """Remove administrative sections (funding, consent, ...) from *s*.

    Each listed title is handed to ``strip_section_by_title`` in turn; the
    titles are lower-case.
    """
    section_titles = (
        "competing interests", "authors' contributions", "pre-publication history",
        "acknowledgements", "acknowledgments", "consent", "funding", "funding sources",
        "founding sources", "conflict of interests",
    )
    for section_title in section_titles:
        s = strip_section_by_title(s, section_title)
    return s

# unicode normalize
# A unicode normalize
def unicode_norm(s):
    """Return *s* in Unicode normalization form NFKD."""
    return unicodedata.normalize('NFKD', s)

# replace years with "year"
# Replace four-digit years with the word year
def proc_years(s):
    """Replace four-digit years starting with 19 or 20 by the word 'year'.

    An optional trailing 's' (as in '1990s') is replaced together with the
    digits.  The number must stand between word boundaries.
    """
    year_pattern = re.compile(r'\b(19|20)\d{2}s?\b')
    return year_pattern.sub('year', s)

# strip citations in square brackets and parentheses
# strip citations in square brackets and parentheses
# ''' Also remove the space that comes before it'''-- the \s at the beginning of the reg
def strip_citation_numbers(s):
    """Remove numeric citation markers such as ' [1, 2]' or ' (3–5)' from *s*.

    The whitespace character in front of the marker is removed with it.
    Brackets or parentheses that are empty, or hold only a single comma,
    hyphen, en dash or whitespace character, are removed as well (these are
    left behind when xrefs are stripped).

    Parameters:
        s: text with tags already removed.

    Returns:
        *s* without the citation markers.
    """
    # The hyphen is escaped inside the character classes.  Unescaped, ',-–'
    # is a range from ',' to the en dash that contains digits, letters and
    # '.', so ordinary parentheticals like ' (3 d)' or ' (a)' were deleted.
    s = re.sub(r'\s[\[\(](\d{1,3}\s?[,\-–]?\s?)+[\]\)]', '', s)
    # also remove any remaining empty brackets or parentheses (xrefs...)
    s = re.sub(r'\s(\[[,\-–\s]?\]|\([,\-–\s]?\))', '', s)
    return s

# remove figure and label references
# Remove references to Table 2 or (Fig 1) or (Figure 2)
# ''' Also remove the space that comes before it''' -- the \s at the beginning of the reg
def strip_fig_table_refs(s):
    """Remove references such as ' table 2', ' (fig. 1)' or ' (figure 2a)'.

    The whitespace character in front of the reference is removed with it.
    The keyword on its own (' table', ' figure', ' fig.') is still removed
    when no number follows.  Matching is case-sensitive and the keywords are
    lower-case.

    Parameters:
        s: text with tags already removed.

    Returns:
        *s* without the figure/table references.
    """
    reg = re.compile(
        # keyword, optional plural; '\.' is a literal dot; the lookahead
        # keeps words like 'tablet' or 'figured' from being cut apart
        r'\s\(?(?:tables?|figures?|figs?\.?)(?![A-Za-z])'
        # optional number of any length with one optional panel letter
        # ('12', '2a'); the lookahead keeps '5mg' from being split
        r'(?:\s?\d+[A-Za-z]?(?![A-Za-z0-9]))?'
        r'\)?'
    )
    return reg.sub('', s)

# convert real numbers to real_number
def proc_real_numbers(s):
    """Replace every decimal number (digits, a dot, digits) by 'real_number'."""
    decimal_pattern = re.compile(r'\d+\.\d+')
    return decimal_pattern.sub('real_number', s)

# convert remaining integers to int
# Convert integers to "integer"
def proc_integers(s):
    """Replace digit runs that start at a word boundary by the word 'integer'.

    When the digits are directly followed by a word character, that one
    character is replaced together with them (so '5mg' becomes 'integerg').
    Digits in the middle of a word are left alone.
    """
    integer_pattern = re.compile(r'\b\d+(\b|\w)')
    return integer_pattern.sub('integer', s)

def remove_url(s):
    """Remove http:// and https:// URLs from anywhere in *s*.

    One whitespace character in front of the URL is removed with it.
    Punctuation that trails the URL (for example the full stop ending a
    sentence, or a closing parenthesis) is left in place.

    Parameters:
        s: plain text.

    Returns:
        *s* without the URLs.
    """
    # The previous pattern was anchored with '^' and ended in '.*', so it
    # only saw URLs at the start of a line and then deleted the rest of that
    # line.  On text already collapsed to one line that meant: URLs inside
    # the text were kept, and a leading URL erased the whole document.
    #
    # '\S+?' grows the URL lazily until only optional closing punctuation
    # remains before the next whitespace or the end of the string.
    reg = r'\s?https?://\S+?(?=[.,;:!?)\]]*(?:\s|$))'
    return re.sub(reg, '', s)

# define aggregate function to do all of the above
def preprocess_text(s):
    """Run the complete cleaning pipeline on the raw XML of one article.

    The stages run strictly in the order listed below; later stages rely on
    the output of earlier ones, so the order must not be changed.
    """
    pipeline = (
        get_body_and_abst_from_xml,   # keep abstract + body only
        fix_unicode_chars,            # decode character references
        unicode_norm,                 # NFKD normalisation
        str.lower,                    # everything after this sees lower case
        strip_trailing_content,       # funding, consent, ... sections
        strip_non_body_text,          # titles, captions, tables, xrefs, ...
        strip_xml_tags,               # remaining tags
        strip_breaks_and_tabs,        # whitespace clean-up
        strip_fig_table_refs,
        strip_citation_numbers,
        proc_years,                   # years -> 'year'
        remove_url,
        proc_real_numbers,            # decimals -> 'real_number'
        proc_integers,                # remaining integers -> 'integer'
    )
    for stage in pipeline:
        s = stage(s)
    return s

## process case report xml files

In [None]:
# open and process all files
# consider doing this in parallel with joblib if it is very slow (about 4 minutes is not too bad)
# Read every case-report file, clean it with preprocess_text, and keep the
# English ones in dict_text (file name without extension -> cleaned text).
dict_text = {}  # this data structure will be used for analysis
data_dir = '/Users/zacharyflamholz/Desktop/oa_file_downloads/download_11_1_18/'
# skip hidden files such as .DS_Store
flist = [fname for fname in os.listdir(data_dir) if not fname.startswith('.')]

no_text_reports = []      # files whose cleaned text is empty
no_english_reports = []   # files not detected as English (or not detectable)

for i, fname in enumerate(flist):

    # progress indicator; '\r' rewrites the same console line
    if (i + 1) % 100 == 0:
        sys.stdout.write('Completed: %d of %d reports \r' % (i+1, len(flist)))

    # 'with' closes the file even if reading raises
    with open(os.path.join(data_dir, fname), 'r', encoding='utf-8') as f:
        txt = f.read()
    processed_text = preprocess_text(txt)

    if len(processed_text) < 1:
        no_text_reports.append(fname)
        continue

    try:
        is_english = detect(processed_text) == 'en'
    except Exception:
        # NOTE(review): detect() is expected to raise when it cannot find any
        # language features in the text; such files count as non-English.
        is_english = False

    if is_english:
        # drop the file extension to obtain the dictionary key
        dict_text[os.path.splitext(fname)[0]] = processed_text
    else:
        # Texts detected as another language are recorded too.  Previously
        # only detection failures were, so the "no english text" count
        # left out every document that was actually non-English.
        no_english_reports.append(fname)

## analysis of the processed files

In [None]:
# Report how many files were excluded and for which reason.
for label, reports in (
    ('number of documents with no text:', no_text_reports),
    ('number of documents with no english text:', no_english_reports),
):
    print(label, len(reports))

# analysis of spacy and nltk

## saving to dictionary

In [None]:
# build the spaCy tokenizer

# https://github.com/explosion/spaCy/issues/453
# https://github.com/explosion/spaCy/issues/1854

def use_spacy(s):
    """Tokenize *s* with the module-level ``nlp`` object and return the token texts as a list."""
    return [token.text for token in nlp(s)]

import time

# Time the tokenization of the whole corpus.
start_time = time.time()

# Load the "en" pipeline with parser, tagger and ner disabled.
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])

# document key -> list of token strings
tokenized_dict = {}
for pmid, text in dict_text.items():
    tokenized_dict[pmid] = use_spacy(text)

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# let's look at how much text we got
# Histogram of the number of tokens per document.
list_doc_lengths = [len(t) for t in tokenized_dict.values()]
plt.hist(list_doc_lengths, bins = 30)
plt.xlabel('words')
plt.ylabel('count')
# title typo fixed ("Distrbution" -> "Distribution")
plt.title('Distribution of word count for ' + str(len(dict_text)) + ' items.')
plt.show()

In [None]:
# do we need to filter by some minimum size?
# Minimum number of tokens a document needs in order to be kept.
word_count_threshold = 100

# How many documents fall below the threshold, and how large is the corpus?
short_doc_count = sum(1 for n_tokens in list_doc_lengths if n_tokens < word_count_threshold)
print('There are', short_doc_count, 'documents with less than', word_count_threshold, 'words.')
print('In total the word/token count is:', sum(list_doc_lengths))

In [None]:
start_time = time.time()

# Keep only the documents with at least word_count_threshold tokens.
dict_text_final = {}        # key -> cleaned text
tokenized_dict_final = {}   # key -> list of token strings

for k, v in dict_text.items():
    # Reuse the tokens already computed for tokenized_dict instead of running
    # the tokenizer over every document a second time; tokenize here only
    # when a document has no entry yet.  The token lists are shared with
    # tokenized_dict, not copied.
    tokenized_doc = tokenized_dict[k] if k in tokenized_dict else use_spacy(v)
    if len(tokenized_doc) >= word_count_threshold:
        dict_text_final[k] = v
        tokenized_dict_final[k] = tokenized_doc


print("--- %s seconds ---" % (time.time() - start_time))

## summary and export

In [None]:
# Summary statistics of the filtered corpus (token counts per document).
list_doc_lengths_final = [len(tokens) for tokens in tokenized_dict_final.values()]

print('Documents:', len(dict_text_final))
print('Total tokens/Words:', sum(list_doc_lengths_final))
print('Median document token count:', np.median(list_doc_lengths_final))

# interquartile range: 25th and 75th percentile
lower_quartile = np.percentile(list_doc_lengths_final, 25)
upper_quartile = np.percentile(list_doc_lengths_final, 75)
print('IQR of document token count:', lower_quartile, '-', upper_quartile)

shortest = min(list_doc_lengths_final)
longest = max(list_doc_lengths_final)
print('Range of token count: ', shortest, '-', longest)

In [None]:
# save all processed text to file
# full texts as dict
# 'with' closes each file once the dump is complete; passing open(...)
# directly to pickle.dump left the handles open.
with open("dict_text_OA_CR_Full_Text.pkl", "wb") as full_text_file:
    pickle.dump(dict_text_final, full_text_file)
# tokenized texts as dict
with open("dict_text_OA_CR_Tokenized.pkl", "wb") as tokenized_file:
    pickle.dump(tokenized_dict_final, tokenized_file)