In [1]:
import glob, re, sys, os, ssl, unicodedata, itertools, lxml, bs4, requests, multiprocessing
import pandas as pd
from nltk.tokenize import sent_tokenize
from dataclasses import dataclass
from pathlib import Path
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from multiprocessing.pool import Pool
#from grobid_client_python import grobid_client as grobid
from bs4 import BeautifulSoup
import boto3


##########
# This script allows for conversion of pdfs to an excel spreadsheet, parsing out text by keywords from each file
# Written/modified by Corinn Small, corinn.small@ucsf.edu
#########


def convert_text(inpath, outpath, bucket=None, name=None, poll_seconds=5):
    
    '''
    runs AWS Textract on a document stored in an S3 bucket and prints every detected
    line of text together with its confidence score
    
    NOTE(review): this is an in-progress replacement for the earlier grobid based
    conversion (kept commented out at the bottom of the function). It does not read
    from inpath or write tei.xml files to outpath yet; both arguments are only
    accepted so that existing callers keep working.
    
    make sure your aws role has the correct permissions to run textract, see
    https://docs.aws.amazon.com/textract/latest/dg/api-async-roles.html to configure
    
    input: inpath - path to papers (currently unused)
           outpath - output directory (currently unused)
           bucket - name of the S3 bucket holding the document, defaults to the global BUCKET
           name - S3 object name of the document, defaults to the global NAME
           poll_seconds - seconds to wait between status checks of the asynchronous job
    output: None, detected lines are printed
    raises: RuntimeError if the asynchronous text detection job fails
    '''
    import time  # local import: only needed for polling the asynchronous job

    print('Converting text...')
    print('')
    
    # fall back to the module-level names the original code referenced
    if bucket is None:
        bucket = BUCKET
    if name is None:
        name = NAME
    
    client = boto3.client('textract')
    
    # an S3 object is described by its bucket and its name
    s3_obj = {"Bucket": bucket, "Name": name}

    # asynchronous text detection (meant for multipage documents):
    # boto3 methods are snake_case; starting the job only returns a job id...
    job = client.start_document_text_detection(DocumentLocation = {'S3Object': s3_obj})
    job_id = job['JobId']

    # ...and the results are fetched with that id once the job has finished
    response = client.get_document_text_detection(JobId = job_id)
    while response['JobStatus'] == 'IN_PROGRESS':
        time.sleep(poll_seconds)
        response = client.get_document_text_detection(JobId = job_id)

    if response['JobStatus'] == 'FAILED':
        raise RuntimeError('Textract text detection failed: {}'.format(response.get('StatusMessage', 'no status message')))
    # NOTE(review): only the first page of asynchronous results is fetched here and
    # nothing is done with it yet; follow response['NextToken'] to read the remaining blocks

    # synchronous analysis: FeatureTypes is a required argument, the response holds a list of block objects
    response2 = client.analyze_document(Document = {'S3Object': s3_obj}, FeatureTypes = ['TABLES'])

    for b in response2['Blocks']:
        if b['BlockType'] == 'LINE':
            # print each detected line with its confidence, it's important to check the confidence scores!
            print("{}\t{}".format(b['Text'], b['Confidence']))

    #client = grobid.grobid_client(config_path="./grobid_client_python/config.json")
    #client.process("processFulltextDocument", inpath, outpath)
    
    print('Done')

def read_tei(tei_file):
    '''
    Reads in an xml file and returns a beautifulsoup object
    
    input: path to a tei.xml file
    output: BeautifulSoup object parsed with the 'html.parser' backend
    raises: OSError if the file cannot be opened
    '''
    # NOTE(review): TEIFile looks tags up by lower-case names (e.g. persname), which
    # presumably relies on this parser; confirm before switching to an xml parser.
    # The encoding is passed explicitly so parsing does not depend on the platform
    # default (assumes the tei.xml files are utf-8 - TODO confirm).
    with open(tei_file, 'r', encoding='utf-8') as tei:
        return BeautifulSoup(tei, 'html.parser')
    

def elem_to_text(elem, default='NA'):
    '''
    Returns the text of an element if the element is truthy, otherwise returns default
    
    input: elem - object exposing getText() (or None), default - fallback value
    output: elem.getText() or default
    '''
    return elem.getText() if elem else default
    

#create class for storing pdf info

class TEIFile(object):
    '''
    Wraps one tei.xml file: exposes its DOI, title, abstract and authors, its body
    text grouped by subsection, and the sentences that mention given keywords.
    '''

    @dataclass
    class Person:
        firstname: str
        middlename: str
        surname: str

    def __init__(self, filename):
        self.filename = filename
        self.soup = read_tei(filename)  #creates soup object
        self._text = None  #cache for the text property, None until first computed
        self._title = ''
        self._abstract = ''
        self._keytext = {}  #keyword -> list of {subsection: [sentences]}
        self._check = None  #set to True when the file should be checked by hand
        
    @staticmethod
    def _paragraphs(node):
        '''
        returns the text of every <p> found under node, one string per paragraph
        '''
        return [p.get_text(separator=' ', strip=True) for p in node.find_all('p')]

    @staticmethod
    def _ask_yes_no(prompt):
        '''
        asks the user the given question until the answer is y or n (case-insensitive)
        returns 'y' or 'n'
        '''
        ans = input(prompt).strip().lower()
        while ans not in ('y', 'n'):
            print('Y or N only please!')
            ans = input(prompt).strip().lower()  #repeat the same question, not a different one
        return ans

    @property
    def doi(self, id_='DOI'):
        '''
        retrieve id
        returns the text of the <idno> element whose type is id_, or 'no id' if there is none
        (accessed as a property, so id_ always takes its default)
        '''
    
        idno_elem = self.soup.find('idno', type=id_)
        if idno_elem is None:
            return 'no id'
        return idno_elem.getText()
    
    @property
    def title(self):
        '''
        retrieve title (cached after the first access)
        '''
        if not self._title:
            self._title = self.soup.title.getText()
        return self._title
    
    @property
    def abstract(self):
        '''
        retrieve abstract (cached after the first access)
        '''
        
        if not self._abstract:
            self._abstract = self.soup.abstract.getText(separator=' ', strip=True)
        return self._abstract
      
    @property
    def authors(self):
        '''
        retrieve authors
        returns a list of Person objects, authors without a persname element are skipped,
        missing name parts are 'NA'
        '''
        result = []
                
        for author in self.soup.analytic.find_all('author'):
            persname = author.persname
            if not persname:
                continue
            # keyword arguments so each name part lands in the field of the same name
            person = self.Person(
                firstname=elem_to_text(persname.find("forename", type="first")),
                middlename=elem_to_text(persname.find("forename", type="middle")),
                surname=elem_to_text(persname.surname),
            )
            result.append(person)
        return result
    
    @property
    def text(self):
        '''
        retrieves text
        returns dictionary by subsection: lower-cased subsection title -> list of paragraph strings
        
        text from divs without any subsection title is collected under 'body'
        
        prints the file name on every access; the first access may also ask the user
        (via input) about subsection titles longer than 7 words. If the user marks a
        title as nonsense, or if the body has no divs at all, self._check is set to True.
        '''
        print(self.filename.split('/')[-1])
        print('')
        
        if self._text is None:  #compute only once, even if the result turns out to be empty
            paper_text = {}  
                
            divs = self.soup.body.find_all('div')
                
            for div in divs:   
                if div.get('type'):  #div has a type attribute (treated as appendix/reference), skip it
                    continue
                
                paragraphs = self._paragraphs(div)
                heads = div.find_all('head')  #find all subsections
                
                if not heads:  #if there aren't any subsections
                    if paragraphs:
                        #add to 'body' instead of replacing what earlier divs contributed
                        paper_text.setdefault('body', []).extend(paragraphs)
                    continue
                
                #otherwise for each subsection create a new list with corresponding text
                for head in heads:
                    sect_ = []
                    head_text = head.get_text(separator=' ', strip=True).lower()
                    
                    if len(head_text.split(' ')) > 7:  #if the head title is > 7 words long (arbitrary #) ask the user to clarify its validity
                        print('subtitle sentence: ', head_text)
                        
                        if self._ask_yes_no('Is this subsection a full sentence? y/n') == 'y':  #if the title is a sentence
                            print('Including subsection title in searchable text, but still include it as a separate subsection...')
                            print('')
                            sect_.append(head_text)  #include it in the text
                        
                        elif self._ask_yes_no('Does this subsection make sense? y/n') == 'y':  #if the title makes sense treat it as a regular subsection
                            print('Valid subsection...')
                            print('')
                        
                        else:  #if the title doesn't make sense, mark self._check True so the user knows to check it later
                            print('Flagging nonsense...')
                            print()
                            self._check = True
                    
                    #in every case the subsection's text is kept for keyword searching
                    sect_.extend(paragraphs)
                    paper_text[head_text] = sect_ 
                    
                    #for figure descriptions....(WORK IN PROGRESS)
                    if head_text.find('fig') != -1:  #if figure is treated as its own subsection, also store its text under 'figure'
                        print('Figure treated as subsection: ', head_text)
                        if paragraphs:
                            paper_text['figure'] = list(sect_)  #copy, without adding the paragraphs a second time
                    
                    # NOTE(review): runs once per subsection title; if the parser lower-cases tag
                    # names, 'figDesc' will not match anything - confirm before relying on it
                    for fig in self.soup.body.find_all('figure'):   #for figure sections find the figure descriptions and print it
                        for f in fig.find_all('figDesc'):
                            print('Figure Description: ',f)
                
            #if paper is short, all the text might show up in the abstract section of the xml file instead of div
            if not divs:
                self._check = True
                
                sect = self._paragraphs(self.soup.abstract)
                if not sect:  #abstract has no paragraphs, take its text as a whole
                    sect = [self.soup.abstract.get_text(separator=' ', strip=True)]

                paper_text['body'] = sect   #add all sentences to one section titled 'body'
                
            self._text = paper_text
        return self._text
    
    def keytext(self, keywords):
        '''
        retrieves sentences by keyword
        returns dictionary: keyword -> list with one {subsection: [sentences]} dict per subsection
        
        sentences are lower-cased before searching and keywords are matched as whole
        words, case-insensitively and literally (regex characters in a keyword are escaped)
        '''
        #make sure the text has been extracted, without printing the file name again if it already was
        sections = self._text if self._text is not None else self.text
        
        for keyword in keywords:
            self._keytext[keyword] = []
        
        #compile each keyword once; a list keeps the order (and any duplicates) of keywords
        patterns = [(keyword, re.compile('\\b' + re.escape(keyword.lower()) + '\\b')) for keyword in keywords]
        
        for k,v in sections.items():  #for subsection and text 
            section = k.lower()
            #split every paragraph into sentences once per subsection instead of once per keyword
            sentences = [s.lower() for i in v for s in sent_tokenize(i)]
            
            for keyword, pattern in patterns:
                sub_dict = {section: [sentence for sentence in sentences if pattern.search(sentence)]}
                self._keytext[keyword].append(sub_dict)  #adds each subsection to keyword dictionary
                        
        return self._keytext

In [25]:
#get input from user

cases = 'summary'
path = '/Users/corinnsmall/Documents/BPDCN/bpdcn_papers/' + cases +'_case_papers/'
key = 'wgs'
output = 'output_' + cases + '_cases_' + key

associated_diseases_keywords = ['acute myeloid leukemia', 'AML', 'acute lymphoblastic leukemia', 'ALL', 'leukemia', 'non-hodgkin lymphoma', 'Hodgkin lymphoma',
                                'lymphoma','myelogenous leukemia', 'multiple myeloma', 'myeloma', 'chronic myelogenous leukemia', 'CML', 'chronic lymphocytic leukemia', 
                                'CLL', 'carcinoma','Pleuropulmonary blastoma', 'blastoma', 'neuroblastoma', 'melanoma', 'sarcoma', 'skin cancer', 'hairy cell leukemia',
                                'ependymoma', 'chordoma','bone cancer', 'bladder', 'AIDS-related lymphoma', 'thyroid cancer', 'colon cancer', 'rectal cancer',
                                'prostate cancer', 'chronic myeloid leukemia', 'myeloproliferative', 'myelodysplastic', 'mast cell', 'mastocytosis', 'lymphoblastic', 
                                'follicular lymphoma', 'marginal zone lymphoma', 'langerhan', 'polycythemia vera', 'essential thrombocythemia', 'myelofibrosis', 
                                'mycosis fungoides', 'sezary', 'burkitt', 'cmml', 'chronic myelomonocytic leukemia']

morphology_keywords = ['Vacuoles','Vacuolated','Microvacuoles','Lymphoid','Eccentrically','Eccentric','Prominent nucleoli', 'Small nucleoli','Large nucleoli',
                       'Medium nucleoli','azurophilic','Blast','Blastoid','Agranular','Basophilic','Eosinophilic','Perivascular','Periadnexal','Pseudopodia','Hairy','Rosary beads',
                       'Large nucleolus','Monoblastic','Monocytic','Histiocytic','Histiocytoid','Small granules','Large granules','Granulated','Granular','Condensed chromatin',
                       'Dispersed chromatin','Fine chromatin','Pale cytoplasm','Poorly differentiated', 'Large sized','Medium sized','Small sized','Plasmablast', 
                       'Plasmacytoid','plasmacytic','Immature','lymphoblast']

nuclei_keywords = ['convoluted nuclei','convoluted','convolutions','nuclear folds','folds','membrane irregularities','slightly irregular','round nuclei','oval nuclei','vesicular chromatin','cleaved nuclei']


gene_keywords = ['ABL1','AKT1','ALK','APC','ARF','ARID1A','ASH1L','ASXL1','ASXL3','ATM','ATR','AXIN2','BRAF','BCORL1','CAL-1','CCND3','CEBPA','CHD8','CHP2','CDH1','CDKN2A',
                 'CDKN1B','CDKN2B','CIC','CREBBP','CSF1R','CTNNB1','CPS1','CROCC','CXCR4','DAXX','DIP2A','DNMT3A','EGFR','EGR1','EP300','ERBB2','ERBB4','ERCC4','ETV6','EYA2',
                 'EZH2','FBXW7','FGFR1','FGFR2','FGFR3','FLT3','FLT3-ITD','FLT3-other','GNA11','GNAS','GNAQ','GPR160','HES6','HNF1A','HOXB9','HRAS','IDH1','IDH2','IKZF1',
                 'IKZF2','IKZF3','IVL','JAK2','JAK3','KDM40','KDR','KIT','cKIT','KRAS','MAD1L1','MAPK1','MCL1','MET','MLH1','MLL','MLL2','MLL3','MPL','MSH6','MYB','MYBL1',
                 'MYC','MYST3','MYST4','NF1','NOTCH1','NPM1','NR3C1','NRAS','PALB2','PARK2','PBRM1','PDGFRA','PHF2','PHF6','PIK3CA','PMDC05','PLCXD3','PLP1','PTEN','PTPN11',
                 'PTPN23','PVT1','RAD52','RANBP2','RAS','RB1','RET','RFPL1','RHOA','RUNX1','RUNX2','SARDH','SIGLEC6','SLC25A10','SMAD4','SMARCB1','SMARCD1','SMO','SRC','SRCAP',
                 'SRSF2','STK3','STK11','SUPT3H','SUZ12','TCF3','TCL1A','TEL','TET2','TRMT61B','TP53','U2AF1','UBE2G2','VHL','WNT3','WNT7B','WNT10A','WT1','ZEB2','ZRSR2']

wgs_keywords = ['whole exome sequencing', 'whole genome sequencing', 'wes', 'wgs', 'sequencing', 'exome', 'whole genome', 'whole-genome', 'whole-exome', 'genome-wide']




keylist = [i.lower() for i in wgs_keywords]  #make all keywords lower case

In [16]:
#convert pdfs to xml, need to connect to grobid api to use convert_text()

#separate name for the pdf folder: reassigning path here would silently change
#which folder the later cell globs for xml files when the notebook is run top to bottom
pdf_path = '/Users/corinnsmall/Documents/BPDCN/bpdcn_papers/'
outpath = pdf_path + 'xml_output/'
convert_text(pdf_path,outpath)

Converting text...

GROBID server does not appear up and running, the connection to the server failed


NameError: name 'exit' is not defined

In [26]:
#get .xml files from path, convert pdf to TEIFILE object
#get dictionary of keywords and sentences
#create formatted lists for creation of dataframe
#each row is [name, file path, doi, check flag, keyword, subsection, sentences]


files = glob.glob(path + 'xml_output/' + '*.xml')
#files = glob.glob('/Users/corinnsmall/Documents/BPDCN/bpdcn_papers/example/output/*.xml')

files_list = []


for file in files:
    f = TEIFile(file)
    name = f.filename.split('/')[-1].split('.')[0]
    doi = f.doi  #look the doi up once per file instead of once per row
    f.text  #accessed for its side effects: prints the file name and extracts (and caches) the text
    print('-----------------------------------------------------------')
    print('')
    
    keytext = f.keytext(keylist)
    check = 'NA' if f._check is None else f._check  #'NA' unless the file was flagged for a manual check
    
    for k,v in keytext.items():  #keyword and its list of subsection dictionaries
        for d in v:
            for i,j in d.items():  #subsection and matching sentences
                #report 'NA' when no sentence matched, whether or not the file is flagged
                hits = j if len(j) > 0 else 'NA'
                files_list.append([name, f.filename, doi, check, k, i, hits])
   

laribi_2014.tei.xml

-----------------------------------------------------------

leclerc_2017.tei.xml

-----------------------------------------------------------

hamadeh_2019.tei.xml

-----------------------------------------------------------

martinmartin_2015.tei.xml

subtitle sentence:  immunophenotypic subgroups of blastic plasmacytoid dendritic cell neoplasms


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  clinical and laboratory features of the distinct maturation-associated subgroups of bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

jaye_2006.tei.xml

subtitle sentence:  bdca-2 is expressed on a subset of cd4 þ cd56 þ hematodermic neoplasms


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

moreno_2013.tei.xml

subtitle sentence:  immunohistochemical expression in ffpe samples and image capture


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  spib protein expression in major mature b-and t-cell lymphoma subtypes


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  spib protein expression in bpdc neoplasms and their potential mimics


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

rizvi_2012.tei.xml

subtitle sentence:  expression of the cd2ap adaptor molecule in normal, reactive and neoplastic human tissue


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

lee_2019.tei.xml

-----------------------------------------------------------

wang_2020.tei.xml

subtitle sentence:  differential immunophenotypic characteristics of bpdcn and reactive pdcs


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  establishment and validation of a flow cytometry assay for minimal residual disease


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  flow cytometry versus immunohistochemistry in the assessment of minimal residual disease


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

tsunoda_2012.tei.xml

-----------------------------------------------------------

sapienza_2014.tei.xml

subtitle sentence:  bpdcns mirror myeloid resting pdcs and are distinct from other acute leukemias


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  bpdcns differ from non-neoplastic pdcs for several specific genes and cellular programs


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  nf-kb pathway is a candidate therapeutic target in bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

facchetti_2008.tei.xml

-----------------------------------------------------------

martin_2006.tei.xml

-----------------------------------------------------------

martin_2016.tei.xml

-----------------------------------------------------------

nomburg_2020.tei.xml

-----------------------------------------------------------

benet_2011.tei.xml

subtitle sentence:  de novo mlcs show a specific histologic phenotype


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  does the immunohistochemical panel allow diagnosis of mlc and distinction of types of myeloid disorders?


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  chronic mpss cannot be distinguished from other underlying myeloid disorders


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

feuillard_2002.tei.xml

subtitle sentence:  immunophenotypic profile of cd4 ؉ cd56 ؉ malignant cells


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  immunophenotypic characterization of cd4 ؉ cd56 ؉ tumor cells


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

sun_2018.tei.xml

-----------------------------------------------------------

petrella_2010.tei.xml

-----------------------------------------------------------

sukswai_2019.tei.xml

subtitle sentence:  staining characteristics of tcf4/cd123 dual-color immunostain in bpdcn cases


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  comparison of tcf4/cd123 dual-color immunohistochemistry staining to multiparameter flow cytometry in bone marrow samples


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  staining characteristics of tcf4/cd123 dual-color immunohistochemistry in non-bpdcn tissues


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

ferreira_2016.tei.xml

-----------------------------------------------------------

tang_2018.tei.xml

subtitle sentence:  conventional chromosomal analysis and fish analysis on bone marrow samples


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

saczonek_2018.tei.xml

-----------------------------------------------------------

yu_2015.tei.xml

subtitle sentence:  cd56 + dcs clustered together with mdcs but not pdcs by transcriptomic analysis


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  cd56 + dcs are functionally analogous to mdcs


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  bpdcn is closely related to cd56 + dcs, but not pdcs by global gene expression profiling


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  facs analysis of dc subsets in human blood


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

child_2003.tei.xml

subtitle sentence:  analysis of t-cell receptor c gene and immunoglobulin heavy chain gene rearrangements


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

ohe_2018.tei.xml

-----------------------------------------------------------

tsagarakis_2010.tei.xml

-----------------------------------------------------------

hashikawa_2011.tei.xml

-----------------------------------------------------------

murashige_2005.tei.xml

-----------------------------------------------------------

bruggen_2020.tei.xml

-----------------------------------------------------------

roosweil_2013.tei.xml

-----------------------------------------------------------

deng_2017.tei.xml

-----------------------------------------------------------

petrella_2002.tei.xml

subtitle sentence:  npm is a rare population of cd56+ cells related to plasmacytoid monocytes


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

mori_2001.tei.xml

-----------------------------------------------------------

pemmaraju_2019_nejm.tei.xml

-----------------------------------------------------------

johnson_2016.tei.xml

subtitle sentence:  comparison of mnda expression with other markers of myelomonocytic differentiation in emls


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  mnda expression in cd4+/cd56+ and cd4+/ cd56à emls


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

dabaja_2017.tei.xml

subtitle sentence:  study objectives, definition of endpoints, and statistical methodology


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  engraftment kinetics, gvhd (acute and chronic), nrm and relapse/progression


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

emadali_2016.tei.xml

subtitle sentence:  glucocorticoid response element and e2f reporter assays and rna interference


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  results 5q anomalies in bpdcn confer adverse clinical outcome


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  targeting of nr3c1 by 5q anomalies in bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  loss-of-ezh2 function is a hallmark of 5q alterations that target nr3c1 in bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

massone_2004.tei.xml

subtitle sentence:  cutaneous medium/large pleomorphic t-cell lymphoma, not otherwise specified


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

marmouset_2019.tei.xml

-----------------------------------------------------------

pagano_2013.tei.xml

subtitle sentence:  © f e r r a t a s t o r t i f o u n d a t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

gilis_2012.tei.xml

-----------------------------------------------------------

park_2014.tei.xml

-----------------------------------------------------------

tzankov_2017.tei.xml

subtitle sentence:  plasmacytoid dendritic cell proliferations and mature plasmacytoid dendritic cell neoplasm in the bone marrow


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  blastic plasmacytoid dendritic cell neoplasms in the bone marrow


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

fujii_2020.tei.xml

-----------------------------------------------------------

bekkenk_2004.tei.xml

subtitle sentence:  comparison between blastic nk cutaneous cd561 myeloid leukemia


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

marafioti_2008.tei.xml

subtitle sentence:  detection of novel markers associated with normal pdcs


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

boddu_2018.tei.xml

-----------------------------------------------------------

dorfman_2010.tei.xml

-----------------------------------------------------------

karube_2003.tei.xml

subtitle sentence:  no. classification cd2 cd3 cd4 cd7 cd8 cd13 cd20 cd33 cd123 cd34 cd56 cd68 mpo tdt eb-ish


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

sangle_2014.tei.xml

-----------------------------------------------------------

griffin_2017.tei.xml

subtitle sentence:  and b.3). leucostasis complicating a compromised cerebrovascular circulation may have been the patho-biology behind clinical presentation.


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

lozzi_2006.tei.xml

-----------------------------------------------------------

julia_2014.tei.xml

-----------------------------------------------------------

gruson_2013.tei.xml

subtitle sentence:  l-asparaginase with methotrexate and dexamethasone is an effective treatment combination in blastic plasmacytoid dendritic cell neoplasm


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

jen_2019.tei.xml

-----------------------------------------------------------

dietrich_2011.tei.xml

-----------------------------------------------------------

heinicke_2015.tei.xml

-----------------------------------------------------------

jardin_2011.tei.xml

subtitle sentence:  tet2 and tp53 mutations are frequently observed in blastic plasmacytoid dendritic cell neoplasm


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

fiandrino_2013.tei.xml

subtitle sentence:  g . fi a n d r i n o m. a r r a 1 r. r i b o


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

urosevic_2005.tei.xml

-----------------------------------------------------------

khwaja_2016.tei.xml

-----------------------------------------------------------

deotare_2015.tei.xml

-----------------------------------------------------------

luicioni_2012_blood.tei.xml

-----------------------------------------------------------

nguyen_2015.tei.xml

-----------------------------------------------------------

xu_2016.tei.xml

subtitle sentence:  evaluation and scoring of ihc staining of pd-l1 and pd-l2


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  expression of pd-l1 and pd-l2 in histiocytes and dendritic cell subsets


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  pd-1 ligand expression in non-neoplastic histiocytic and dendritic cell disorders


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  pd-1 ligand expression in malignant histiocytic and dendritic cell neoplasms


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  comparison of pd-l1 and pd-l2 expression between hs, ids, and fds


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

betrian_2017.tei.xml

-----------------------------------------------------------

ottou_2019.tei.xml

-----------------------------------------------------------

ottou_2009.tei.xml

subtitle sentence:  bdca-2 expression is strongly associated with a pdcl phenotype


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  cd135 and dc maturation marker expression does not distinguish pdcl from other leukaemia


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

shapiro_2003.tei.xml

-----------------------------------------------------------

hamada_2014.tei.xml

-----------------------------------------------------------

vitte_2012.tei.xml

-----------------------------------------------------------

suzuki_2017_leu.tei.xml

-----------------------------------------------------------

sweet_2020.tei.xml

-----------------------------------------------------------

jacob_2003.tei.xml

subtitle sentence:  characterization of the malignant cell cd4 + 56 + lin − malignant cells arise from pdc


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  © f e r r a t a s t o r t i f o u n d a t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  tumor cell affiliation to the pdc lineage relies on immunophenotypic criteria


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  © f e r r a t a s t o r t i f o u n d a t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  cd4 + cd56 + lin − pdc are arrested at an early stage of maturation


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  © f e r r a t a s t o r t i f o u n d a t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  which origin for malignant cd4 + cd56 + lin − dc: lymphoid or myeloid?


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

Figure treated as subsection:  histologic findings (figure 4)
subtitle sentence:  © f e r r a t a s t o r t i f o u n d a t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

lucioni_2011.tei.xml

-----------------------------------------------------------

kawamata_2005.tei.xml

subtitle sentence:  m u m u m u m u m u d a u d i n o r m a l p a t i e n t 3 p a t i e n t 4 p a t i e n t 5


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

dijkman_2007.tei.xml

subtitle sentence:  novel membranebound markers for cd4 ؉ cd56 ؉ hn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  detailed analysis of chromosomal regions with recurrent deletion in cd4 ؉ cd56 ؉ hn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  confirmation of microarray expression data by real-time pcr


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

magro_2009.tei.xml

subtitle sentence:  cd4 1 cd56 1 malignancy associated with t-cell clonality (nk t-cell lymphoma, anaplastic large cell lymphoma, and mycosis fungoides) nk t-cell lymphoma (cases 1 and 2)


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

bulbul_2018.tei.xml

-----------------------------------------------------------

kleppe_2016.tei.xml

-----------------------------------------------------------

ulrickson_2017.tei.xml

subtitle sentence:  autoimmune hemolytic anemia in a young man with acute hepatitis e infection


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

safaei_2019.tei.xml

-----------------------------------------------------------

menezes_2014.tei.xml

-----------------------------------------------------------

martinez_2014.tei.xml

-----------------------------------------------------------

jegalian_2010.tei.xml

subtitle sentence:  clinical features of the national cancer institute cases and previously published cases


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  patient n. cd4 cd56 tdt cd123 cd303* cd7 cd3 cd68 cd117 mpo other


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

gera_2014.tei.xml

-----------------------------------------------------------

ascani_2008.tei.xml

-----------------------------------------------------------

rakozy_2001.tei.xml

-----------------------------------------------------------

pilichowska_2007.tei.xml

-----------------------------------------------------------

delettre_2012.tei.xml

subtitle sentence:  immunostaining and flow cytometry analysis on normal peripheral blood and acute leukemia cell samples


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  expression of tcl1 and ilt7 in normal blood cells


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  tcl1 expression on plasmacytoid denditic cell leukemia and other acute leukemia


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

stenzinger_2014.tei.xml

-----------------------------------------------------------

testa_2014.tei.xml

subtitle sentence:  cd123 expression on human hematopoietic stem/ progenitor cells


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

agha_2019.tei.xml

subtitle sentence:  case records of the massachusetts general hospital (case


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  hospital of the university of pennsylvania philadelphia, pa


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

kim_2015.tei.xml

-----------------------------------------------------------

kim_2005.tei.xml

-----------------------------------------------------------

rauh_2012.tei.xml

-----------------------------------------------------------

pemmaraju_2019.tei.xml

subtitle sentence:  case records of the massachusetts general hospital (case


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  hospital of the university of pennsylvania philadelphia, pa


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

ji_2009.tei.xml

-----------------------------------------------------------

bayerl_2002.tei.xml

subtitle sentence:  a b s t r a c t only a few blastic natural killer (nk) cell leukemias and lymphomas have been reported. as such, the clinicopathologic spectrum of this disease is incompletely understood.


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

demiroz_2020.tei.xml

-----------------------------------------------------------

sapienza_2019.tei.xml

subtitle sentence:  more bioinformatics details are provided in the online supplementary appendix and online


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  whole-exome sequencing reveals the epigenetic program dysregulation as the main theme of the blastic plasmacytoid dendritic cell neoplasm mutational landscape


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  blastic plasmacytoid dendritic cell neoplasm transcriptome profiling confirms the dysregulation of epigenetic programs


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  genome-wide chip-sequencing substantiates epigenetic dysregulation of cell cycle program in blastic plasmacytoid dendritic cell neoplasms


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  in vivo blastic plasmacytoid dendritic cell neoplasm modeling demonstrates combined epigenetic therapy as effective in controlling disease progression


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

lezama_2019.tei.xml

-----------------------------------------------------------

eros_2009.tei.xml

-----------------------------------------------------------

subramanian_2018.tei.xml

-----------------------------------------------------------

cronin2012.tei.xml

-----------------------------------------------------------

herling_2003.tei.xml

subtitle sentence:  cd123 and tcl1 expression is characteristic of cd4 ؉ cd56 ؉ bts


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  table 1. tcl1 expression as detected by immunostaining and western blot


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  tcl1 is highly expressed in cd123 ؉ lymph node plasmacytoid dendritic cells (dc2s)


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  tcl1 is negative in true nk-cell lymphomas, mature t-cell malignancies, and myeloid leukemias


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  tcl1 expression does not correlate with levels of pakt


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

oshimi_2013.tei.xml

subtitle sentence:  clinical and laboratory features of patients with definite diagnosis


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  comparison of precursor nk-cell all and blastic nk-cell lymphoma


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  is aggressive nk-cell leukemia/lymphoma a leukemic phase of extranasal nk-cell lymphoma?


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

salva_2014.tei.xml

-----------------------------------------------------------

yamada_2000.tei.xml

-----------------------------------------------------------

hwang_2013.tei.xml

subtitle sentence:  p a t i e n t s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  i m m u n o h i s t o c h e m i s t r y ( i h c )


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

borchiellini_2013.tei.xml

-----------------------------------------------------------

alfayez_2019.tei.xml

subtitle sentence:  investigation into possible mechanisms of resistance in bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

cernan_2020.tei.xml

subtitle sentence:  pt. common bpdcn markers myeloid, b and t-lymphoid lineage specific markers


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

aung_2019.tei.xml

subtitle sentence:  evaluation of pd1 and pd-l1 expression by immunohistochemistry


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

torres_2019.tei.xml

subtitle sentence:  | detection of genomic rearrangements and fusion transcripts


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  the output was subjected to a wilcoxon rank test and a kolmogorov-


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  | validation of structural genomic alterations and small-scale mutations


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  | structural alterations involving ikzf1 are recurrent in bpdcn


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

suzuki_2017.tei.xml

subtitle sentence:  p a t i e n t s a m p l e s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  h i s t o p a t h o g y a n d i m m u n o p h e n o t y p i c a l a n a l y s i s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  r e s p o n s e t o t r e a t m e n t


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  s t a t i s t i c a l a n a l y s i s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  p a t i e n t c h a r a c t e r i s t i c s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  h i s t o p a t h o l o g i c a l , i m m u n o p h e n o t y p i c a l a n d k a r y o t y p i c a l f e a t u r e s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  d u a l p r i m a r y m a l i g n a n c i e s a n d o t h e r d i s e a s e


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  c o m p a r i s o n o f t d t -p o s i t i v e a n d t d t -n e g a t i v e c u t a n e o u s b p d c n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  c o m p a r i s o n o f n o d u l a r a n d d i s s e m i n a t e d t y p e s o f c u t a n e o u s b p d c n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

agha_2018.tei.xml

-----------------------------------------------------------

wiesner_2010.tei.xml

subtitle sentence:  loss of chromosome region 12p13 and the weak expression of p27 kip1


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  loss of chromosome 9 and no expression of p16 ink4a


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

kazakov_2003.tei.xml

-----------------------------------------------------------

julia_2013.tei.xml

-----------------------------------------------------------

arora_2013.tei.xml

-----------------------------------------------------------

paluri_2015.tei.xml

-----------------------------------------------------------

sakamoto_2018.tei.xml

subtitle sentence:  recurrent 8q24 rearrangement and myc expression, and the association with cytomorphology


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  gene expression patterns of myc + bpdcn and myc


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------

lezama_2018.tei.xml

subtitle sentence:  c a s e s e l e c t i o n


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  c y t o g e n e t i c a n a l y s i s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  l i t e r a t u r e r e v i e w


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  c a s e r e p o r t s


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

subtitle sentence:  r e v i e w o f c a s e s p u b l i s h e d i n t h e l i t e r a t u r e


Is this subsection a full sentence? y/n y


Including subsection title in searchable text, but still include it as a separate subsection...

-----------------------------------------------------------



In [24]:
# Create the results DataFrame and write it to an Excel workbook.
#
# Uses three names defined in earlier cells:
#   files_list - row data for the seven columns listed below
#   path, output - concatenated as-is (no separator is inserted between them)
#                  and suffixed with '.xlsx' to form the target file name

df = pd.DataFrame(files_list, columns = ['pdf_name','pdf_location','doi', 'check_paper_content','keywords', 'section', 'text'])

excel_file = path + output + '.xlsx'

try:
    # The with-block closes the handle when it exits, so reaching the line
    # after it already means the workbook was written and the file closed;
    # a separate `out.closed` check would always be true.
    with open(excel_file, 'wb') as out:
        df.to_excel(out)

    print('Data Ahoy!')

except IOError as err:
    # Say which file failed and why instead of a bare 'I/O error', so a bad
    # directory, a permissions problem or a full disk can be told apart.
    print(f'I/O error while writing {excel_file}: {err}')

Data Ahoy!
