# Process Publishing Raw Data to Document

In [1]:
import os
import networkx as nx
import xml.etree.ElementTree as ET
import requests
import json
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [5]:
# Locate the project root. The notebook's parent directory is used as-is when
# its path already mentions the project folder; otherwise the project folder
# is assumed to sit directly underneath it.
PARENT_PATH = Path.cwd().parent
if 'publishingchatgptpocweb' not in str(PARENT_PATH):
    PARENT_PATH = PARENT_PATH.joinpath('publishingchatgptpocweb')

# Top-level folders of the project.
DATA_DIRECTORY = PARENT_PATH.joinpath('data')
STATIC_DIRECTORY = PARENT_PATH.joinpath('static')

# Input (raw JATS XML) and output (generated text documents) locations.
JATS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('raw')
ARTICLES_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'articles')
CONCEPTS_DATA_DIRECTORY_PATH = DATA_DIRECTORY.joinpath('processed', 'concepts')
# SPARQL template that is filled in with a keyword for each concept lookup.
SPAQRQL_QUERY_FILE_PATH = STATIC_DIRECTORY.joinpath('sparql', 'sparql_query_template.sparql')
# Names of concepts for which the lookup returned no results are appended here.
CONCEPTS_NOT_FOUND_FILE_PATH = DATA_DIRECTORY.joinpath('processed', 'CONCEPTS_DETAILS_NOT_FOUND.txt')

In [6]:
def parse_element(element, file, indent_level=0):
    """Recursively write a plain-text summary of a JATS XML element tree.

    Recognised tags (article ids, title, abstract, publication date, ISBN,
    publisher, CABI subject codes, authors and CABI keywords) are written to
    ``file`` as indented, labelled lines. Every child element is then visited
    with the same ``indent_level``, so unrecognised wrapper tags are simply
    traversed.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to process.
        file: A writable text stream (anything with a ``write`` method).
        indent_level: Base indentation level for lines produced here.
    """
    indent_unit = 4  # Number of spaces for each indentation level

    def write_line(content, indent_level=0):
        file.write(' ' * (indent_level * indent_unit) + content + '\n')

    if element.tag == "article":
        write_line("Article:", indent_level)

    elif element.tag == "article-id":
        attrib = element.attrib.get('pub-id-type')
        # element.text is None for an empty tag such as <article-id/>;
        # treat that as "no identifier" instead of raising AttributeError.
        text = (element.text or '').strip()
        if text:
            if attrib == 'CABI-pan':
                write_line(f"PAN: {text}", indent_level+1)
            elif attrib == 'doi':
                write_line(f"DOI: {text}", indent_level+1)
                write_line(f"Article Link/URL/Source: {'https://www.cabidigitallibrary.org/doi/' + text}", indent_level+1)

    elif element.tag == "title-group":
        article_title = element.find('article-title')
        if article_title is not None:
            # itertext() also collects text nested in inline markup.
            title_text = ''.join(article_title.itertext()).strip()
            if title_text:
                write_line(f"Title: {title_text}", indent_level+1)

    elif element.tag == "abstract":
        # Only the first <p> of the abstract is used as the summary.
        abstract_summary = element.find('p')
        if abstract_summary is not None:
            summary_text = ''.join(abstract_summary.itertext()).strip()
            if summary_text:
                write_line(f"Abstract Summary: {summary_text}", indent_level+1)

    elif element.tag == "pub-date":
        write_line("Publishing Date:", indent_level+1)
        for child in element:
            if child.text:
                write_line(f"{child.tag.capitalize()}: {child.text.strip()}", indent_level + 2)

    elif element.tag == "isbn" and element.text:
        write_line("ISBN:", indent_level+1)
        write_line(element.text.strip(), indent_level + 2)

    elif element.tag == "publisher-name" and element.text:
        write_line("Publisher Name:", indent_level+1)
        write_line(element.text.strip(), indent_level + 2)

    elif element.tag == "publisher-loc":
        write_line("Publisher Location:", indent_level+1)
        for child in element:
            if child.text:
                write_line(f"{child.tag.capitalize()}: {child.text.strip()}", indent_level + 2)

    elif element.tag == "subj-group" and element.attrib.get('subj-group-type') == "cabi-codes":
        write_line("Subjects or Categories:", indent_level+1)
        for compound_subject in element.findall('compound-subject'):
            labels = [label.text for label in compound_subject.findall('compound-subject-part[@content-type="label"]') if label.text]
            for label in labels:
                write_line(f"- {label.strip()}", indent_level + 2)

    elif element.tag == "person-group" and element.attrib.get('person-group-type') == "author":
        author_names = []
        for child in element:
            # Name parts are joined in the order they appear in the XML.
            name_parts = [sub_child.text for sub_child in child if sub_child.tag in ['given-names', 'surname'] and sub_child.text]
            if name_parts:
                author_names.append(' '.join(name_parts))
        if author_names:
            write_line("Authors:", indent_level+1)
            for name in author_names:
                write_line(f"- {name}", indent_level + 2)

    elif element.tag == "kwd-group" and element.attrib.get('kwd-group-type') == "CABI-keyword":
        write_line("Keywords/Thesaurus Concepts:", indent_level+1)

        # Maps the ``vocab`` attribute of a <kwd> to the heading it is listed under.
        vocab_map = {
            'preferredTerm': "Preferred Terms (Non-geographic, Non-organism contents from thesaurus):",
            'organismTerm': "Organism Terms (Organism names from thesaurus):",
            'geographicTerm': "Geographic Terms (Geographic Tags indicating content about a place):"
        }
        terms = {key: [] for key in vocab_map}

        for kwd in element.findall('kwd'):
            vocab = kwd.attrib.get('vocab')
            if vocab and vocab in vocab_map and kwd.text:
                terms[vocab].append(kwd.text.strip())

        for vocab, description in vocab_map.items():
            if terms[vocab]:
                write_line(description, indent_level + 2)
                for term in terms[vocab]:
                    write_line(f"- {term}", indent_level + 3)

    # Descend into every child, whether or not this element was recognised.
    for child in element:
        parse_element(child, file, indent_level)


def convert_jats_to_texts():
    """Convert every JATS ``.xml`` file under the raw data directory to text.

    Walks ``JATS_DATA_DIRECTORY_PATH`` recursively and, for each XML file,
    writes ``<same base name>.txt`` into ``ARTICLES_DATA_DIRECTORY_PATH`` using
    ``parse_element``. Output files are placed flat in that directory, so two
    XML files with the same name in different sub-folders write to the same
    output path.

    Raises:
        xml.etree.ElementTree.ParseError: If an XML file is not well-formed.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(ARTICLES_DATA_DIRECTORY_PATH, exist_ok=True)

    for root_dir, _sub_dirs, files in os.walk(JATS_DATA_DIRECTORY_PATH):
        for file_name in files:
            if not file_name.endswith('.xml'):
                continue

            xml_file_path = os.path.join(root_dir, file_name)
            root = ET.parse(xml_file_path).getroot()

            # Swap only the trailing extension; str.replace would also rewrite
            # any earlier '.xml' occurring inside the file name.
            output_file_name = file_name[:-len('.xml')] + '.txt'
            output_text_file_path = os.path.join(ARTICLES_DATA_DIRECTORY_PATH, output_file_name)
            with open(output_text_file_path, 'w', encoding='utf-8') as f:
                parse_element(root, f)


def fetch_concept_hierarchy(concept, query_template):
    """Query the CABI thesaurus SPARQL endpoint for one concept.

    Args:
        concept: Keyword substituted for ``{keyword}`` in ``query_template``.
        query_template: SPARQL query text containing a ``{keyword}``
            ``str.format`` placeholder.

    Returns:
        The decoded JSON response when it contains at least one entry under
        ``results.bindings``; ``None`` when there are no bindings or the
        response body is not valid JSON.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the server does not answer within the timeout.
    """
    url = "https://id.cabi.org/PoolParty/sparql/cabt"
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    # NOTE(review): the keyword is inserted verbatim; a concept containing the
    # template's quote character would break the query - confirm against the
    # template before adding escaping here.
    query = query_template.format(keyword=concept)
    data = {'query': query, 'content-type': 'application/json-ld'}
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.post(url, headers=headers, data=data, timeout=30)

    try:
        response_json = response.json()
    except ValueError:
        # ValueError is the common base of the JSON decode errors that
        # response.json() can raise, whichever JSON backend requests uses.
        print(f"Failed to decode JSON. Response text: {response.text}")
        return None

    if not response_json.get('results', {}).get('bindings'):
        return None
    return response_json
        
def extract_terms_from_xml(xml_file_path):
    """Return the CABI thesaurus keywords listed in a JATS XML file.

    Looks at the first ``<kwd-group kwd-group-type="CABI-keyword">`` in the
    document and collects the text of its ``<kwd>`` children whose ``vocab``
    attribute is ``preferredTerm``, ``organismTerm`` or ``geographicTerm``.

    Args:
        xml_file_path: Path (or file object) accepted by ``ET.parse``.

    Returns:
        A list of keyword strings in document order, stripped of surrounding
        whitespace. Empty ``<kwd>`` elements are skipped; duplicates are kept.
    """
    root = ET.parse(xml_file_path).getroot()

    accepted_vocabs = {'preferredTerm', 'organismTerm', 'geographicTerm'}

    kwd_group = root.find(".//kwd-group[@kwd-group-type='CABI-keyword']")
    if kwd_group is None:
        return []

    filtered_terms = []
    for kwd in kwd_group.findall('kwd'):
        if kwd.get('vocab') not in accepted_vocabs:
            continue
        # kwd.text is None for an empty <kwd/>; callers treat every entry as a
        # string, so drop empty values and normalise whitespace here.
        term = (kwd.text or '').strip()
        if term:
            filtered_terms.append(term)

    return filtered_terms

def write_concept_to_file(output_text_file_path, concept_dict):
    """Write a concept dictionary to a text file as an indented outline.

    Args:
        output_text_file_path: Destination path; the file is overwritten.
        concept_dict: Mapping of section title to a mapping of sub-section
            title to either a dict of fields, a list of such dicts, or
            ``None``.

    Each list item is written under a ``Concept:`` heading. Sub-sections whose
    value is ``None`` (for example a concept without a broader concept) are
    omitted instead of raising ``AttributeError``.
    """
    with open(output_text_file_path, 'w', encoding='utf-8') as file:
        for key, value in concept_dict.items():
            file.write(f"{key}:\n")
            for subkey, subvalue in value.items():
                if subvalue is None:
                    # Nothing known for this sub-section; leave it out.
                    continue
                file.write(f"    {subkey}:\n")
                if isinstance(subvalue, list):
                    for item in subvalue:
                        file.write("        Concept:\n")
                        for k, v in item.items():
                            file.write(f"            {k}: {v}\n")
                else:
                    for k, v in subvalue.items():
                        file.write(f"        {k}: {v}\n")


def build_concept_dict(concept_name, concept_data):
    """Build the nested concept dictionary from a SPARQL JSON result.

    Args:
        concept_name: Name of the concept that was looked up.
        concept_data: Decoded SPARQL response; rows are read from
            ``concept_data['results']['bindings']``.

    Returns:
        ``{"Thesaurus Concept": {...}}`` with the keys ``"Concept"`` and
        ``"Broader Concept"`` (each a ``{"name", "uri"}`` dict or ``None``)
        and ``"Narrower Concepts"`` / ``"Related Concepts"`` (lists of
        ``{"name", "uri"}`` dicts without duplicates, in order of first
        appearance). When several rows carry a concept or broader concept,
        the last row wins.
    """
    concept = None
    broader = None
    # Dicts keyed by (name, uri) de-duplicate while keeping insertion order,
    # so the output is the same on every run (a set would not guarantee that).
    narrower = {}
    related = {}

    for binding in concept_data.get('results', {}).get('bindings', []):
        concept_uri = binding.get('concept', {}).get('value')
        broader_name = binding.get('broaderLabel', {}).get('value')
        broader_uri = binding.get('broaderConcept', {}).get('value')
        narrower_name = binding.get('narrowerLabel', {}).get('value')
        narrower_uri = binding.get('narrowerConcept', {}).get('value')
        related_name = binding.get('relatedLabel', {}).get('value')
        related_uri = binding.get('relatedConcept', {}).get('value')

        # The main concept keeps the caller-supplied name and the row's URI.
        if concept_name:
            concept = {"name": concept_name, "uri": concept_uri}

        if broader_name and broader_uri:
            broader = {"name": broader_name, "uri": broader_uri}

        if narrower_name and narrower_uri:
            narrower.setdefault(
                (narrower_name, narrower_uri),
                {"name": narrower_name, "uri": narrower_uri},
            )

        if related_name and related_uri:
            related.setdefault(
                (related_name, related_uri),
                {"name": related_name, "uri": related_uri},
            )

    return {
        "Thesaurus Concept": {
            "Concept": concept,
            "Broader Concept": broader,
            "Narrower Concepts": list(narrower.values()),
            "Related Concepts": list(related.values()),
        }
    }


def convert_concepts_to_texts():
    """Create one text document per unique thesaurus concept in the JATS data.

    Collects the keywords of every ``.xml`` file under
    ``JATS_DATA_DIRECTORY_PATH``, looks each unique keyword up with
    ``fetch_concept_hierarchy`` and writes the result to
    ``CONCEPTS_DATA_DIRECTORY_PATH/<concept>.txt``. Keywords for which the
    lookup returns nothing are appended, one per line, to
    ``CONCEPTS_NOT_FOUND_FILE_PATH``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(CONCEPTS_DATA_DIRECTORY_PATH, exist_ok=True)

    # The template is flattened to a single line before keyword substitution.
    with open(str(SPAQRQL_QUERY_FILE_PATH), 'r', encoding='utf-8') as file:
        query_template = file.read().replace('\n', ' ').strip()

    concepts_set = set()

    # Extract all the unique concepts/keywords of interest from the JATS directory XMLs
    for root_dir, _sub_dirs, files in os.walk(JATS_DATA_DIRECTORY_PATH):
        for file_name in files:
            if file_name.endswith('.xml'):
                xml_file_path = os.path.join(root_dir, file_name)
                # Drop None/empty keywords: they cannot be queried or used as
                # a file name.
                concepts_set.update(
                    concept for concept in extract_terms_from_xml(xml_file_path) if concept
                )

    # Create a document for each concept; sorted for a reproducible run order.
    for concept_name in sorted(concepts_set):
        # Path separators inside a concept name would otherwise point the
        # output into a (usually non-existent) sub-directory.
        concept_file_name = concept_name
        for unsafe_char in (' ', '/', '\\'):
            concept_file_name = concept_file_name.replace(unsafe_char, '_')
        output_text_file_path = os.path.join(CONCEPTS_DATA_DIRECTORY_PATH, f'{concept_file_name}.txt')

        concept_data = fetch_concept_hierarchy(concept_name, query_template)
        if concept_data is not None:
            concept_dict = build_concept_dict(concept_name, concept_data)
            write_concept_to_file(output_text_file_path, concept_dict)
        else:
            with open(str(CONCEPTS_NOT_FOUND_FILE_PATH), 'a', encoding='utf-8') as file:
                file.write(concept_name + '\n')

In [8]:
# Run both conversion steps: first the article text files generated from the
# raw JATS XML, then the per-concept text files (this step queries the remote
# SPARQL endpoint once per unique concept).
convert_jats_to_texts()
convert_concepts_to_texts()

In [None]:
# xml_file_path = os.path.join(JATS_DATA_DIRECTORY_PATH, '20203180098','20203180098.xml')
# # tree = ET.parse(xm_file_path)
# # root = tree.getroot()

# # output_text_file_path = os.path.join(JATS_DATA_DIRECTORY_PATH, '20203180098', '20203180098.txt')
# # with open(output_text_file_path, 'w', encoding='utf-8') as f:
# #     parse_element(root, f)

# concepts = extract_terms_from_xml(xml_file_path)
# concepts