# 2.  Extract text
Read the data from the XML and TXT files, build a data structure to work with (a list of `ScienceDoc` objects), and look for relevant section headings, extracting the text of those sections along the way.

In [1]:
import os
import pandas as pd
import xml.etree.ElementTree as ET

# own files
from pysci import docutils as du
from pysci import geoparse as gp

In [2]:
# Notebook configuration.
# Directory that is walked recursively for the article files.
articles_dir = "pdfs"
# Corpus label handed to every ScienceDoc that gets created.
corpus_name = "test-corpus"
# Destination file for the pickled list of ScienceDocs.
path_to_pickle = "science_articles.pkl"

In [3]:
# Build one ScienceDoc per PDF found under articles_dir, attach the sibling
# TXT / XML files (same base name, same directory) when they exist, and store
# the methods-section titles and text taken from whichever source is usable:
# XML is preferred, TXT is the fallback.
documents = []  # list of ScienceDocs
min_characters = 100  # XML methods text shorter than this is not trusted

count_pdf_files = 0
count_txt_files = 0
count_xml_files = 0

for root, _dirs, files in os.walk(articles_dir):
    # Only PDFs drive the loop; the TXT/XML siblings are looked up by base name.
    pdf_files = [f for f in files if os.path.splitext(f)[1] == du.PDF_extension]
    for filename_pdf in pdf_files:
        count_pdf_files += 1
        filename_raw = du.remove_extension(filename_pdf)
        # create ScienceDoc object
        scidoc = du.ScienceDoc(corpus_name=corpus_name, file_name=filename_raw)
        documents.append(scidoc)

        ### PROCESS TXT ###
        filename_txt = filename_raw + du.TXT_extension
        filepath_txt = os.path.join(root, filename_txt)
        if os.path.isfile(filepath_txt):
            count_txt_files += 1
            scidoc.has_text = True
            with open(filepath_txt, 'r', encoding='utf-8') as f:
                scidoc.raw_contents = f.read()

            ### Detect methods sections and extract relevant text from TXT
            # here using regular expression for Orchards corpus, TXT
            section_titles_txt, relevant_text_txt = gp.extract_methods_text(
                scidoc.raw_contents, re_to_match=gp.RE_ORCHARDS_METHODS_TEXT)
            print("Section titles text:")
            print(section_titles_txt)  # section titles in a list
            print("Relevant content text, length: %s" %len(relevant_text_txt))  # one single string
        else:
            print("No txt document for %s" %filename_raw)
            section_titles_txt = []
            relevant_text_txt = ''

        ### PROCESS XML ###
        filename_xml = filename_raw + du.XML_extension
        filepath_xml = os.path.join(root, filename_xml)
        if os.path.isfile(filepath_xml):
            count_xml_files += 1
            # defaults to False in class
            scidoc.has_xml = True
            xml_root = ET.parse(filepath_xml).getroot()
            scidoc.xml_root = xml_root
            # Extract info directly from XML
            scidoc.title = du.get_article_title(xml_root)
            scidoc.year = du.get_publication_year(xml_root)
            scidoc.journal = du.get_journal_title(xml_root)
            scidoc.xml_contents = du.extract_content_text(xml_root)
            scidoc.authors, scidoc.affiliations = du.get_article_authors_affiliations(xml_root)
            scidoc.countries = du.get_affiliation_countries(xml_root)

            ### Detect methods sections and extract relevant text from XML
            # here using regular expression for Orchards corpus, XML
            # each item is indexed as (section title, section text)
            methods_content_xml = gp.extract_methods_xml(scidoc.xml_root, re_to_match=gp.RE_ORCHARDS_METHODS_HEADINGS)
            section_titles_xml = [item[0] for item in methods_content_xml]  # a list
            relevant_text_list_xml = [item[1] for item in methods_content_xml]  # a list
            relevant_text_xml = '\n\n'.join(relevant_text_list_xml)  # one single string
            print("Section titles xml:")
            print(section_titles_xml)
            print("Relevant content xml, length: %s" %len(relevant_text_xml))
        else:
            print("No xml document for %s" %filename_raw)
            section_titles_xml = []
            relevant_text_xml = ''

        ### Continue with XML or TXT? ###
        # use XML content unless:
        #   - we found no relevant headings in XML, or
        #   - we have insufficient content in the XML text portions
        use_xml = bool(section_titles_xml) and len(relevant_text_xml) >= min_characters

        print("Using XML: %s" %use_xml)
        scidoc.use_xml = use_xml
        if use_xml:
            scidoc.methods_sections = section_titles_xml
            scidoc.relevant_text = relevant_text_xml
        else:
            scidoc.methods_sections = section_titles_txt
            scidoc.relevant_text = relevant_text_txt

print("We have %s ScienceDocs." %len(documents))
print("We have %s pdf documents." %count_pdf_files)
print("We have %s txt documents." %count_txt_files)
print("We have %s xml documents." %count_xml_files)

Section titles text:
['Materials and methods', 'Study area and site selection']
Relevant content text, length: 80
Section titles xml:
['Materials and methods', 'Study area and site selection']
Relevant content xml, length: 1382
Using XML: True
Section titles text:
['Material and Methods']
Relevant content text, length: 522
Section titles xml:
['Material and Methods']
Relevant content xml, length: 0
Using XML: False
We have 2 ScienceDocs.
We have 2 pdf documents.
We have 2 txt documents.
We have 2 xml documents.


In [4]:
# Serialize the list of ScienceDocs built above to the file named by
# path_to_pickle, for ease of use in the next step.
# NOTE(review): presumably backed by Python's pickle module — if so, the file
# should only ever be loaded from a trusted location; confirm in docutils.
du.pickle_data(documents, path_to_pickle)

pickled data at science_articles.pkl


True