In [45]:
import glob, re, sys, os, ssl, unicodedata, itertools, lxml, bs4, requests, multiprocessing
import pandas as pd
from nltk.tokenize import sent_tokenize
from dataclasses import dataclass
from pathlib import Path
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from multiprocessing.pool import Pool
from grobid_client_python import grobid_client as grobid
from bs4 import BeautifulSoup


##########
# This script allows for conversion of pdfs to an excel spreadsheet, parsing out text by keywords from each file
# Written/modified by Corinn Small
#########


def convert_text(inpath,outpath):
    
    '''
    converts text from pdf to tei.xml using grobid web service api, remember computer has to be connected to the server: cd grobid-0.6.1/ -> ./gradlew run
    input: path to papers
    output: tei.xml file per pdf
    '''

    print('Converting text...')
    print('')
    
    
    client = grobid.grobid_client(config_path="./grobid_client_python/config.json")
    client.process("processFulltextDocument", inpath, outpath)
    
    print('Done')

def read_tei(tei_file):
    '''
    Reads in an xml file and returns a beautifulsoup object
    
    '''
    with open(tei_file, 'r') as tei:
        soup = BeautifulSoup(tei, 'html.parser')
        return soup
    raise RuntimeError('Cannot generate a soup from the input')
    

def elem_to_text(elem, default='NA'):
    '''
    Returns element if it exists, if not returns NA
    
    '''
    if elem:
        return elem.getText()
    else:
        return default
    

#create class for storing pdf info

class TEIFile(object):
    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)  #creates soup object
        self._text = None
        self._title = ''  
        self._abstract = ''
        self._keytext = {}
        self._doublecheck = None
        
    @property
    def doi(self, id_='DOI'):
        '''
        retrieve id
        '''
    
        idno_elem = self.soup.find('idno', type='DOI')
        if not idno_elem:
            return 'no id'
        else:
            return idno_elem.getText()
    
    @property
    def title(self):
        '''
        retrieve title
        '''
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title
    
    @property
    def abstract(self):
        '''
        retrieve abstract
        '''
        
        if not self._abstract:
            abstract = self.soup.abstract.getText(separator=' ', strip=True)
            self._abstract = abstract
        return self._abstract
      
    @property
    def authors(self):
        '''
        retrieve authors
        '''
        authors_in_header = self.soup.analytic.find_all('author')

        result = []
        
        @dataclass
        class Person:
            firstname: str
            middlename: str
            surname: str
                
        for author in authors_in_header:
            persname = author.persname
            if not persname:
                continue
            firstname = elem_to_text(persname.find("forename", type="first"))
            middlename = elem_to_text(persname.find("forename", type="middle"))
            surname = elem_to_text(persname.surname)
            person = Person(surname, firstname, middlename)
            result.append(person)
        return result
    
    @property
    def text(self):
        '''
        retrieves text
        returns dictionary by subsection
        '''
        #print(self.soup.prettify())
        print(self.filename.split('/')[-1])
        if not self._text:
            divs_text = {}
            
            for div in self.soup.body.find_all('div'): 
                
                if not div.get('type'):  # div is neither an appendix nor references, just plain text.
                    heads = div.find_all('head')
                    
                    if not heads:
                        self._doublecheck = True
                        div_text = []
                        p_text = div.get_text(separator=' ', strip=True)
                        div_text.append(p_text)
                        divs_text['body'] = div_text
                        
                    else:
                        sect = []
                        
                        for head in heads:
                            head_text = head.get_text(separator=' ', strip=True).lower()
                            #print(head_text)
                            
                            if head_text.find('fig') == -1:  #exclude figure 
                                print(head_text)
                                
                                for p in div.find_all('p'):
                                    #print(p)
                                    sect_text = p.get_text(separator=' ', strip=True)
                                    sect.append(sect_text)
                                    divs_text[head.get_text()] = sect
                    
            self._text = divs_text
        
        return self._text
    
    
    def keytext(self, keywords):
        '''
        retrieves sentences by keyword
        returns dictionary
        '''
        
        
        for keyword in keywords:
            self._keytext[keyword] = []
            
        
        for k,v in self._text.items():  #for subsection and text 
                for keyword in keywords:  #search thru sentences 
                    sub_dict = {}
                    section = k.lower()
                    sub_dict[section] = []  #create list for each subsection in dictionary
                    
                    for i in v:  #for paragraph in text
                        sentences = sent_tokenize(i)  #get list of sentences
                        sentences = [s.lower() for s in sentences]
                    
                        for sentence in sentences:
                            result = re.findall('\\b' + keyword + '\\b', sentence)  #find keyword in sentence

                            if len(result) > 0:  #if keyword exists, 
                                sub_dict[section].append(sentence)  #add sentence to subsection list

                            else:  
                                pass
                    
                    #print(sub_dict)
                    self._keytext[keyword].append(sub_dict)  #adds each subsection to keyword dictionary
                        
        return self._keytext                   

In [10]:
#get input from user

cases = 'single'

path = '/Users/corinnsmall/Documents/BPDCN/bpdcn_papers/' + cases +'_case_papers/'
papers = '*.xml'
keywords_ = 'morphologies'
output = 'output_' + cases + '_cases_' + keywords_

associated_diseases_keywords = ['acute myeloid leukemia', 'AML', 'acute lymphoblastic leukemia', 'ALL', 'leukemia', 'non-hodgkin lymphoma', 'Hodgkin lymphoma',
                                'lymphoma','myelogenous leukemia', 'multiple myeloma', 'myeloma', 'chronic myelogenous leukemia', 'CML', 'chronic lymphocytic leukemia', 
                                'CLL', 'carcinoma','Pleuropulmonary blastoma', 'blastoma', 'neuroblastoma', 'melanoma', 'sarcoma', 'skin cancer', 'hairy cell leukemia',
                                'ependymoma', 'chordoma','bone cancer', 'bladder', 'AIDS-related lymphoma', 'thyroid cancer', 'colon cancer', 'rectal cancer',
                                'prostate cancer', 'chronic myeloid leukemia', 'myeloproliferative', 'myelodysplastic', 'mast cell', 'mastocytosis', 'lymphoblastic', 
                                'follicular lymphoma', 'marginal zone lymphoma', 'langerhan', 'polycythemia vera', 'essential thrombocythemia', 'myelofibrosis', 
                                'mycosis fungoides', 'sezary', 'burkitt', 'cmml', 'chronic myelomonocytic leukemia']

morphology_keywords = ['Vacuoles','Vacuolated','Microvacuoles','Lymphoid','Eccentrically','Eccentric','Prominent nucleoli', 'Small nucleoli','Large nucleoli',
                       'Medium nucleoli','azurophilic','Blast','Blastoid','Agranular','Basophilic','Eosinophilic','Perivascular','Periadnexal','Pseudopodia','Hairy','Rosary beads',
                       'Large nucleolus','Monoblastic','Monocytic','Histiocytic','Histiocytoid','Small granules','Large granules','Granulated','Granular','Condensed chromatin',
                       'Dispersed chromatin','Fine chromatin','Pale cytoplasm','Poorly differentiated', 'Large sized','Medium sized','Small sized','Plasmablast', 
                       'Plasmacytoid','plasmacytic','Immature','lymphoblast']

keylist = morphology_keywords

In [3]:
#convert pdfs to xml

outpath = path + 'xml_output/'
convert_text(path,outpath)

NameError: name 'paper_folder' is not defined

In [46]:
#get .xml files from path, convert pdf to TEIFILE object
#get dictionary of keywords and sentences
#create formatted lists for creation of dataframe

files = glob.glob(path + 'xml_output/' + papers)
files_list = []

for file in files:
    #print(file)
    file_list = []
    f = TEIFile(file)
    name = f.filename.split('/')[-1].split('.')[0]
    f.text
    '''f.keytext(keylist)
    
    for k,v in f._keytext.items():  
        for d in v:
            for i,j in d.items():
                
                if len(j) == 0 and f._doublecheck == None:
                    file_list = [name, f.filename, f.doi, 'NA', k, i, 'NA']
        
                elif len(j) == 0 and f._doublecheck == True:
                    file_list = [name, f.filename, f.doi, f._doublecheck, k, i, 'NA']
                    
                elif len(i.split(' ')) > 10:  #check for whether section title has more than 10 words, and flags it if so
                    f._doublecheck = True
                    file_list = [name, f.filename, f.doi, f._doublecheck, k, i, 'NA']

                elif f._doublecheck == None:
                    file_list = [name, f.filename, f.doi, 'NA', k, i, j]   

                else:
                    file_list = [name, f.filename, f.doi, f._doublecheck, k, i, j]

                files_list.append(file_list)'''

munoz_2011.tei.xml
[]
['Sixty-six-year-old female presented with left forearm soft tissue swelling for 2 months. An ultrasound of her left forearm showed a 6.5 cm 3 1 cm 3 3.6 cm soft tissue mass, which was also confirmed by magnetic resonance imaging (Panel A and B). A routine screening mammogram revealed a new 4 mm mass in her left breast. Subsequent left breast needle biopsy and left forearm fine needle aspiration favored an undifferentiated malignant neoplasm of hematolymphoid origin (Panel D) with positive CD4 (Panel E) and positive CD56 (Panel F) markers compatible with blastic plasmacytoid dendritic cell neoplasm (BPDCN). Positron emission tomography (PET) scan showed increased uptake in the left forearm (Panel C), left breast, bilateral pleura, liver, spleen, portocaval lymph nodes, and omental caking. A bone marrow biopsy was negative for malignancy, and shortly thereafter the patient underwent induction therapy with cytarabine and idarubicin. Following progression of metastat

UnboundLocalError: local variable 'sect' referenced before assignment

In [22]:
#create dataframe and write to excel file

df = pd.DataFrame(files_list, columns = ['pdf_name','pdf_location','doi', 'doublecheck paper?','keywords', 'section', 'text'])
                
try:
    with open(path + output + '.xlsx', 'wb') as out:
        df.to_excel(out)
        
    if out.closed:
        print('Data Ahoy!')

except IOError:
        print('I/O error')

Data Ahoy!
