# Parsing

## Imports

In [6]:

import pprint
import csv
import numpy as np
import xml.etree.ElementTree as ET
import spacy
from spacy.lang.en import English
import geonamescache
from rapidfuzz.distance import Levenshtein

## Parsing

In [3]:
tree = ET.parse("./input_data/pubmed_data.xml")
root = tree.getroot()

print(root[:20])

[<Element 'PubmedArticle' at 0x146747790>, <Element 'PubmedArticle' at 0x146769f30>, <Element 'PubmedArticle' at 0x146776750>, <Element 'PubmedArticle' at 0x1467818a0>, <Element 'PubmedArticle' at 0x146789670>, <Element 'PubmedArticle' at 0x146795c10>, <Element 'PubmedArticle' at 0x1467a5940>, <Element 'PubmedArticle' at 0x1467acae0>, <Element 'PubmedArticle' at 0x1467be7a0>, <Element 'PubmedArticle' at 0x1467d4ae0>, <Element 'PubmedArticle' at 0x146916bb0>, <Element 'PubmedArticle' at 0x146921440>, <Element 'PubmedArticle' at 0x146939a80>, <Element 'PubmedArticle' at 0x146946b10>, <Element 'PubmedArticle' at 0x146951d50>, <Element 'PubmedArticle' at 0x14695b4c0>, <Element 'PubmedArticle' at 0x146972610>, <Element 'PubmedArticle' at 0x14697eca0>, <Element 'PubmedArticle' at 0x146999120>, <Element 'PubmedArticle' at 0x1469bec50>]


## Exploration

### How many articles are there in the dataset?

In [3]:
def count_articles(root_element: ET.Element) -> int:
    """Returns the number of articles in the dataset."""
    # article_count = len([child.tag for child in root]) counts two extra because...?
    article_count = len(root_element.findall("./PubmedArticle"))
    return article_count

print(count_articles(root))

3549


### For a given article, extract the following fields:
- Title
- Year
- PMID
- Keywords (potentially a list)
- MESH descriptor identifiers (a string with seven characters, beginning with a D and followed by six numbers, e.g. D000368; potentially a list)

In [4]:
def extract_article_details(article_element: ET.Element) -> dict:
    """
    Extracts certain fields for a given article.
    Returns them in a dictionary.
    """
    title = article_element.find('.//MedlineCitation//ArticleTitle').text
    pmid = article_element.find('.//MedlineCitation//PMID').text
    year = article_element.find('.//MedlineCitation//Year').text

    keywords = article_element.findall(
        './/MedlineCitation/KeywordList/Keyword')
    keywords_list = [keyword.text for keyword in keywords]

    mesh_headings = article_element.findall(
        './/MedlineCitation/MeshHeadingList/MeshHeading')
    mesh_descriptor_ids = [mesh_heading.find(
        "./DescriptorName").attrib['UI'] for mesh_heading in mesh_headings]

    extracted_fields = {'title': title, 'year': year, 'pmid': pmid}
    # Check if keywords_list is not empty before updating the dictionary
    if keywords_list:
        extracted_fields['keywords_list'] = keywords_list
    # Check if mesh_descriptor_ids is not empty before updating the dictionary
    if mesh_descriptor_ids:
        extracted_fields['mesh_ids'] = mesh_descriptor_ids
    return extracted_fields


article23 = extract_article_details(root[23])
pprint.PrettyPrinter(sort_dicts=False).pprint(article23)

{'title': '[Clinical value of myositis antibodies in patients with connective '
          'tissue disease-associated interstitial lung diseases].',
 'year': '2019',
 'pmid': '31594111',
 'keywords_list': ['Anti-synthetase antibody',
                   'Connective tissue disease',
                   'Lung disease, interstitial',
                   'Polymyositis/dermatomyositis (PM/DM)'],
 'mesh_ids': ['D000328',
              'D000368',
              'D001323',
              'D001324',
              'D002681',
              'D003240',
              'D003882',
              'D005260',
              'D006801',
              'D017563',
              'D008297',
              'D008875',
              'D009220',
              'D017285',
              'D012698']}


### For a given author, extract the following fields:
- First name
- Last name
- Initials
- GRID identifier
- Affiliations (potentially a list)

In [4]:
def extract_author_details(author_element: ET.Element) -> dict:
    """
    Extracts certain fields for a given author.
    Returns them in a dictionary.
    """
    author_details = {}

    for field in ['ForeName', 'LastName', 'Initials']:
        field_element = author_element.find(f'.//{field}')
        if field_element is not None:
            author_details.update({field: field_element.text})

    if author_element.find('.//Identifier[@Source="GRID"]') is not None:
        grid_identifier = author_element.find(
            './/Identifier[@Source="GRID"]').text
        author_details.update({'identity': grid_identifier})

    # an author can have multiple AffiliationInfos
    if author_element.findall('.//AffiliationInfo/Affiliation') is not None:
        affiliation_list = [affiliation.text for affiliation in author_element.findall(
            './/AffiliationInfo/Affiliation')]
        author_details.update({'affiliations': affiliation_list})

    return author_details


author0 = root[208].find(".//AuthorList").findall(".//Author")[0]
print(author0.find('.//Identifier[@Source="GRID"]').text)

pprint.PrettyPrinter(sort_dicts=False).pprint(
    extract_author_details(author0))

grid.5395.a
Affiliation National Research Council - Clinical Institute of Physiology, Pisa, Italy.
Affiliation 4Rheumatology Unit, Department of Clinical and Experimental Medicine, University of Pisa, Pisa, Italy.


In [10]:
def extract_author_details(author_element: ET.Element) -> dict:
    forename = author_element.find('.//ForeName')
    forename = forename.text if forename is not None else np.nan

    lastname = author_element.find('.//LastName')
    lastname = lastname.text if lastname is not None else np.nan

    initials = author_element.find('.//Initials')
    initials = initials.text if initials is not None else np.nan

    reported_identifier = author_element.find('.//Identifier[@Source="GRID"]')
    reported_identifier = reported_identifier.text if reported_identifier is not None else np.nan

    return {
        'forename': forename,
        'lastname': lastname,
        'initials': initials,
        'reported_identity': reported_identifier}

author0 = root[208].find(".//AuthorList").findall(".//Author")[0]
pprint.PrettyPrinter(sort_dicts=False).pprint(
    extract_author_details(author0))

{'forename': 'Antonella',
 'lastname': 'Cecchettini',
 'initials': 'A',
 'reported_identity': 'grid.5395.a'}


In [12]:
def extract_author_details(author_element: ET.Element) -> dict:
    forename = author_element.find('.//ForeName')
    if forename is not None:
        forename = forename.text

    lastname = author_element.find('.//LastName')
    if lastname is not None:
        lastname = lastname.text

    initials = author_element.find('.//Initials')
    if initials is not None:
        initials = initials.text

    reported_identifier = author_element.find('.//Identifier[@Source="GRID"]')
    if reported_identifier is not None:
        reported_identifier = reported_identifier.text

    return {
        'forename': forename,
        'lastname': lastname,
        'initials': initials,
        'reported_identity': reported_identifier}

author0 = root[208].find(".//AuthorList").findall(".//Author")[0]
pprint.PrettyPrinter(sort_dicts=False).pprint(
    extract_author_details(author0))

{'forename': 'Antonella',
 'lastname': 'Cecchettini',
 'initials': 'A',
 'reported_identity': 'grid.5395.a'}


In [8]:

author0 = root[208].find(".//AuthorList").findall(".//Author")[0]
print(author0.find('.//Identifier[@Source="GRID"]').text)

    # Print elements obtained using .iter()
for elem in author0.iter('.//AffiliationInfo/Affiliation'):
    print(elem.tag, elem.text)

# Print elements obtained using .iterfind()
for elem in author0.iterfind('.//AffiliationInfo/Affiliation'):
        print(elem.tag, elem.text)

grid.5395.a
Affiliation National Research Council - Clinical Institute of Physiology, Pisa, Italy.
Affiliation 4Rheumatology Unit, Department of Clinical and Experimental Medicine, University of Pisa, Pisa, Italy.


In [6]:
# gets nested dictionary for countries
gc = geonamescache.GeonamesCache().get_countries()
# converts to list
gc_list = [gc.get(value) for value in gc]

gc_countries = []
for dict in gc_list:
    gc_countries.append(dict.get('name'))

print(gc_countries)

['Andorra', 'United Arab Emirates', 'Afghanistan', 'Antigua and Barbuda', 'Anguilla', 'Albania', 'Armenia', 'Angola', 'Antarctica', 'Argentina', 'American Samoa', 'Austria', 'Australia', 'Aruba', 'Aland Islands', 'Azerbaijan', 'Bosnia and Herzegovina', 'Barbados', 'Bangladesh', 'Belgium', 'Burkina Faso', 'Bulgaria', 'Bahrain', 'Burundi', 'Benin', 'Saint Barthelemy', 'Bermuda', 'Brunei', 'Bolivia', 'Bonaire, Saint Eustatius and Saba ', 'Brazil', 'Bahamas', 'Bhutan', 'Bouvet Island', 'Botswana', 'Belarus', 'Belize', 'Canada', 'Cocos Islands', 'Democratic Republic of the Congo', 'Central African Republic', 'Republic of the Congo', 'Switzerland', 'Ivory Coast', 'Cook Islands', 'Chile', 'Cameroon', 'China', 'Colombia', 'Costa Rica', 'Cuba', 'Cabo Verde', 'Curacao', 'Christmas Island', 'Cyprus', 'Czechia', 'Germany', 'Djibouti', 'Denmark', 'Dominica', 'Dominican Republic', 'Algeria', 'Ecuador', 'Estonia', 'Egypt', 'Western Sahara', 'Eritrea', 'Spain', 'Ethiopia', 'Finland', 'Fiji', 'Falkland

In [7]:
affiliation_elements = root.findall('.//AffiliationInfo/Affiliation')
affiliations = [affiliation.text for affiliation in affiliation_elements]

nlp = spacy.load("en_core_web_sm")
countries = []

for affiliation_text in affiliations[:100]:
    doc = nlp(affiliation_text)
    for entity in doc.ents:
        if entity.label_ == "GPE" and entity.text in gc_countries:
            countries.append(entity.text)

print(countries)

['Greece', 'Norway', 'Norway', 'Norway', 'Norway', 'Norway', 'Norway', 'Norway', 'Norway', 'China', 'China', 'China', 'China', 'China', 'China', 'China', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Pakistan', 'Pakistan', 'Pakistan', 'Pakistan', 'France', 'France', 'Spain', 'France', 'Germany', 'Italy', 'Norway', 'Norway', 'Norway', 'France', 'Serbia', 'Japan', 'France', 'Spain', 'France', 'Japan', 'France', 'France', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Italy', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Germany', 'Colombia', 'Germany', 'Germany', 'Germany', 'Germany', 'Sweden', 'Sweden', 'Sweden', 'United States', 'United States']


## Extracting institution for each affiliation

In [27]:
def extract_institution_from_affiliation_text(affiliation_text: str) -> str:
    """
    Takes in a single affiliation.
    Returns a country if its name is contained within the affiliation text.
    """
    doc = nlp(affiliation_text)

    entities = []
    for entity in doc.ents:
        if entity.label_ == "ORG":
            entities.append(entity.text)
    return entities

In [28]:
affiliation_elements = root.findall('.//AffiliationInfo/Affiliation')
affiliations = [affiliation.text for affiliation in affiliation_elements]

nlp = spacy.load("en_core_web_sm")
institutions = []

for affiliation_text in affiliations[:100]:
    if extract_institution_from_affiliation_text(affiliation_text):
        institution = extract_institution_from_affiliation_text(
            affiliation_text)
        institutions.append(institution)
        

print(institutions)
    

[['Department of Pathophysiology', 'School of Medicine', 'Kapodistrian University of Athens'], ['Department of Rheumatology', 'Amsterdam Rheumatology & Immunology Center'], ['Department of Radiology and Nuclear Medicine'], ['Department of Pulmonary Medicine', 'Amsterdam Cardiovascular Sciences'], ['Department of Radiology and Nuclear Medicine'], ['Department of Radiology and Nuclear Medicine'], ['Department of Rheumatology', 'Amsterdam Rheumatology & Immunology Center'], ['Department of Rheumatology', 'Amsterdam Rheumatology & Immunology Center'], ['Fuenlabrada'], ['Department of Ophthalmology', 'Østfold Hospital'], ['Clinical Immunology Unit', 'Department of Internal Medicine', 'Stavanger University Hospital'], ['Department of Ophthalmology, Haukeland University Hospital', 'Department of Clinical Medicine', 'University of Bergen'], ['Broegelmann Research Laboratory', 'Department of Clinical Science, University of Bergen', 'Department of Rheumatology, Haukeland University Hospital'], [

## Fuzzy matching on institution name


In [42]:
def load_grid_institutes() -> dict:
    """
    Loads the GRID institute data as a list of dictionaries.
    """
    with open("./input_data/grid_data/grid_institutes.csv", encoding="utf_8") as file:
        grid_institute_dicts = csv.DictReader(file)
        return [institute_dict for institute_dict in grid_institute_dicts]

load_grid_institutes()[:10]

[{'grid_id': 'grid.1001.0',
  'name': 'Australian National University',
  'wikipedia_url': 'http://en.wikipedia.org/wiki/Australian_National_University',
  'email_address': '',
  'established': '1946'},
 {'grid_id': 'grid.1002.3',
  'name': 'Monash University',
  'wikipedia_url': 'http://en.wikipedia.org/wiki/Monash_University',
  'email_address': '',
  'established': '1958'},
 {'grid_id': 'grid.1003.2',
  'name': 'University of Queensland',
  'wikipedia_url': 'http://en.wikipedia.org/wiki/University_of_Queensland',
  'email_address': '',
  'established': '1909'},
 {'grid_id': 'grid.1004.5',
  'name': 'Macquarie University',
  'wikipedia_url': 'http://en.wikipedia.org/wiki/Macquarie_University',
  'email_address': '',
  'established': '1964'},
 {'grid_id': 'grid.1005.4',
  'name': 'UNSW Sydney',
  'wikipedia_url': 'http://en.wikipedia.org/wiki/University_of_New_South_Wales',
  'email_address': '',
  'established': '1949'},
 {'grid_id': 'grid.1006.7',
  'name': 'Newcastle University',
 

In [47]:
def match_institution_details_on_name(institution_elements: list[str]) -> str:
    """
    Takes in an institution's elements as input.
    Attempts to fuzzy match elements with grid data institutes by similarity.
    Returns details of matched institute.
    """
    id = "GRID ID not found."
    name = "Institution name not found."
    
    if institution_elements:
        grid_institute_data = load_grid_institutes()
        
        for element in institution_elements:

            for grid_institute in grid_institute_data:
                if Levenshtein.normalized_similarity(grid_institute["name"], element) > 0.9:
                    # can have multipl mnatches here
                    # need to modify to collect as a list and return highest similarity match
                    id = grid_institute["grid_id"]
                    name = grid_institute["name"]
                    break

    return {'grid_id': id, 'name': name}

In [46]:
match = match_institution_details_on_name(['Department of Pathophysiology', 'School of Medicine', 'Monah University'])
print(match)

{'grid_id': 'grid.1002.3', 'name': 'Monash University'}


## Fuzzy matching-based extraction of institution for each affiliation

In [48]:
affiliation_elements = root.findall('.//AffiliationInfo/Affiliation')
affiliations = [affiliation.text for affiliation in affiliation_elements]

nlp = spacy.load("en_core_web_sm")
institutions = []

for affiliation_text in affiliations[:100]:
    if extract_institution_from_affiliation_text(affiliation_text):
        institution = extract_institution_from_affiliation_text(
            affiliation_text)
        grid_institution = match_institution_details_on_name(institution)['name']
        institutions.append(grid_institution)

print(institutions)
    

['Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Institution name not found.', 'Stavanger University Hospital', 'University of Bergen', 'Institution name not found.', 'Stavanger University Hospital', 'Wuhan University', 'Wuhan University', 'Wuhan University', 'Wuhan University', 'Institution name not found.', 'Wuhan University', 'Wuhan University', 'Institution name not found.', 'Institution name not found.', 'University of Udine', 'Institution name not found.', "University of L'Aquila", 'University of Perugia', 'Institution name not found.', 'Shifa International Hospital', 'Shifa International Hospital', 'Shifa International Hospital', 'Shifa International Hospital', 'Shifa International Hospital', 'Institution name not found.', 'Institution name not found.