## This notebook processes the full dataset

## Imports

In [247]:
import pandas as pd
import xml.etree.ElementTree as ET
import re
# pip install spacy
# python -m spacy download en_core_web_sm
import spacy
import pycountry
from rapidfuzz import process, fuzz
from rapidfuzz.distance import Levenshtein
import rapidfuzz

## Read File

In [None]:
# Parse the PubMed XML export from disk. `root` is the element whose
# `PubmedArticle` children are walked by the extraction loop further down.
tree = ET.parse('../raw_data/pubmed_result_sjogren.xml')
root = tree.getroot()

# Create base dataframe

In [249]:
def parse_article_metadata(article):
    """Return the PMID, title and publication year of one article element.

    Args:
        article: ``xml.etree.ElementTree.Element`` holding one article record.

    Returns:
        tuple[str, str, str]: ``(pmid, title, year)``. Any field that is
        missing from the XML (or present but empty) is returned as ``""``.
    """
    pmid_value = article.findtext(".//PMID", default="")
    year_value = article.findtext(".//PubDate/Year", default="")

    # Titles may contain inline markup (<i>, <sub>, <sup>, ...). `.text` only
    # yields the characters before the first child element, so gather every
    # text fragment of the element instead.
    title_element = article.find(".//ArticleTitle")
    title_value = (
        "".join(title_element.itertext()) if title_element is not None else ""
    )

    return pmid_value, title_value, year_value

In [250]:
def parse_author_info(author):
    """Return the author's name as ``"LastName ForeName"``.

    Args:
        author: ``xml.etree.ElementTree.Element`` for one ``Author`` entry.

    Returns:
        str: Last name followed by fore name, separated by one space. If only
        one of the two is available that part alone is returned; if neither is
        available the result is ``""``.
    """
    name_parts = []
    for tag in ("LastName", "ForeName"):
        # findtext() gives None for a missing element and "" for an element
        # without text; skipping both keeps the literal "None" out of the name.
        part = author.findtext(tag)
        if part:
            name_parts.append(part)
    return " ".join(name_parts)

In [251]:
def get_email(affiliation_text):
    """Extract the first e-mail address found in an affiliation string.

    Args:
        affiliation_text: Free-text affiliation, or ``None``.

    Returns:
        str: The first address found, or ``""`` when there is none (including
        when the input is ``None`` or empty).
    """
    if not affiliation_text:
        return ""

    # Local part: word characters plus ".", "+" and "-".
    # Domain: starts and ends with a word character so that a sentence-ending
    # period after the address is not captured; "." and "-" are allowed inside.
    email_address = re.search(r"[\w.+-]+@\w[\w.-]*\w", affiliation_text)
    return email_address.group() if email_address else ""

In [252]:
def get_zipcode(affiliation_text):
    """Extract the first postal code found in an affiliation string.

    Three formats are recognised, tried as alternatives of one pattern:
    UK-style postcodes (e.g. ``SW1A 1AA``), five-digit codes with an optional
    ``-NNNN`` suffix (e.g. ``02115`` or ``02115-1234``) and Canadian-style
    codes (e.g. ``K1A 0B1``).

    Args:
        affiliation_text: Free-text affiliation, or ``None``.

    Returns:
        str: The first matching code, or ``""`` when nothing matches
        (including when the input is ``None`` or empty).
    """
    if not affiliation_text:
        return ""

    # Word boundaries stop the pattern from matching a slice of a longer token,
    # e.g. the first five digits of a phone number.
    zip_code = re.search(
        r"\b[A-Za-z]{1,2}\d[A-Za-z\d]? ?\d[A-Za-z]{2}\b"
        r"|\b\d{5}(?:-\d{4})?\b"
        r"|\b[A-Z]\d[A-Z] \d[A-Z]\d\b",
        affiliation_text)
    return zip_code.group() if zip_code else ""

In [253]:

def get_keywords(article):
    """Collect the text of every ``KeywordList/Keyword`` element of an article.

    Returns a list with one entry per keyword element, in document order. When
    the article has no keyword elements the result is ``[""]``.
    """
    keyword_nodes = article.findall(".//KeywordList/Keyword")
    if not keyword_nodes:
        return [""]

    keyword_texts = []
    for node in keyword_nodes:
        keyword_texts.append(node.text)
    return keyword_texts

In [254]:
def get_mesh_identifiers(article):
    """Collect the ``UI`` attribute of every MeSH ``DescriptorName`` element.

    Returns a list with one entry per descriptor, in document order (``None``
    for a descriptor that has no ``UI`` attribute). When the article has no
    descriptors the result is ``[""]``.
    """
    descriptor_nodes = article.findall(
        ".//MeshHeadingList/MeshHeading/DescriptorName")
    if not descriptor_nodes:
        return [""]

    identifiers = []
    for node in descriptor_nodes:
        identifiers.append(node.get("UI"))
    return identifiers

In [None]:
pmid, title, author_name, affiliation_name, year = [], [], [], [], []
mesh_identifiers, keywords, email, zipcode = [], [], [], []

# One output row per (article, author, affiliation) combination. Authors that
# list no affiliation contribute no rows.
for article in root.findall("PubmedArticle"):
    # Article-level values are the same for every row of this article, so they
    # are computed once here instead of once per affiliation.
    pmid_value, title_value, year_value = parse_article_metadata(article)
    article_keywords = get_keywords(article)
    article_mesh_identifiers = get_mesh_identifiers(article)

    for author in article.findall(".//AuthorList/Author"):
        author_full_name = parse_author_info(author)

        for affiliation in author.findall("AffiliationInfo/Affiliation"):
            # An empty <Affiliation/> element has `.text` of None; normalise
            # it so the regex helpers always receive a string.
            affiliation_text = affiliation.text or ""

            pmid.append(pmid_value)
            title.append(title_value)
            year.append(year_value)

            author_name.append(author_full_name)
            affiliation_name.append(affiliation_text)

            email.append(get_email(affiliation_text))
            zipcode.append(get_zipcode(affiliation_text))

            keywords.append(article_keywords)
            mesh_identifiers.append(article_mesh_identifiers)

pubmed_df = pd.DataFrame({
    "Article PMID": pmid,
    "Article title": title,
    "Article keywords": keywords,
    "Article MESH identifiers": mesh_identifiers,
    "Article Year": year,
    "Author full name": author_name,
    "Author email": email,
    "Affiliation name": affiliation_name,
    "Affiliation zipcode": zipcode
})

pubmed_df.to_csv("../cleaned_data/pubmed_output.csv", index=False)

#took 4 secs

# Challenge 3: NLP

In [256]:
# Load English tokenizer, tagger, parser and NER.
# The pipeline is loaded once at module level; extract_entities() calls it
# for each affiliation string.
nlp = spacy.load("en_core_web_sm")


**Extract country and institution name from affiliation**

In [257]:
def extract_entities(affiliation_text):
    """Run the spaCy pipeline on one affiliation string and return the
    named entities (``doc.ents``) it found."""
    return nlp(affiliation_text).ents


# NOTE(review): `affiliation_text` here is the variable left over from the last
# iteration of the XML extraction loop, so this line only works if that cell has
# run and found at least one affiliation. The resulting `entities` value does
# not appear to be used by any later cell in this notebook.
entities = extract_entities(affiliation_text)


def extract_gpe_entities(entities):
    """Collect the distinct texts of all entities labelled ``GPE``.

    The result is a set: duplicates collapse and no ordering is retained, so
    walking the entities in reverse does not influence the returned value.
    """
    gpe_texts = set()
    for ent in reversed(entities):
        if ent.label_ == "GPE":
            gpe_texts.add(ent.text)
    return gpe_texts


def extract_org_entities(entities):
    """Collect the distinct texts of all entities labelled ``ORG``.

    The result is a set: duplicates collapse and no ordering is retained.
    """
    org_texts = set()
    for ent in entities:
        if ent.label_ == "ORG":
            org_texts.add(ent.text)
    return org_texts

In [258]:
valid_countries = {country.name for country in pycountry.countries}


def extract_country(gpe_entities):
    """Return a valid country name found among the GPE texts of one affiliation.

    Args:
        gpe_entities: Collection of GPE entity strings, e.g.
            ``{'Amsterdam', 'The Netherlands'}``. May be empty or ``None``.

    Returns:
        str | None: The matching entry of ``valid_countries``, or ``None`` when
        no candidate is a known country name. A candidate written with a
        leading article is returned without it.
    """
    if not gpe_entities:
        return None

    # Sets iterate in arbitrary order; sorting makes the chosen country
    # reproducible when an affiliation mentions more than one.
    for gpe in sorted(gpe_entities):
        if gpe in valid_countries:
            return gpe

        # Retry without a leading article, e.g. "The Netherlands" -> "Netherlands".
        without_article = re.sub(
            r"^the\s+", "", gpe.strip(), flags=re.IGNORECASE)
        if without_article in valid_countries:
            return without_article

    return None

#no for loop took longer and the next gpe with list comp took 15 mins

In [259]:
def get_institution_name(org_entities):
    """Pick the first organisation text that looks like an institution.

    An organisation qualifies when it contains one of the marker words below.
    Returns that organisation string, or ``None`` when none qualifies.
    """
    institution_markers = ("University", "Institute",
                           "College", "Hospital", "Center")

    return next(
        (org for org in org_entities
         if any(marker in org for marker in institution_markers)),
        None,
    )

**Extracting country from affiliation name**
- Multiple tokens from extracting GPE(countries, cities, names)
- Some have no country names but have city names
- Some countries have "The" in front
- Madrid didn't work: it is a city, and `extract_country` only returns names that appear in `pycountry.countries`
- Some have repeated names

**Extracting institution name from affiliation name**
- multiple tokens 
- Looking at the institutes.csv, I should get the ones with university in them?

## Challenge 4: RapidFuzz

Assumptions:
- Threshold similarity score of 90% considered a match
- matching based on names only 
- Case and minor differences: case sensitivity and small variations are left to RapidFuzz's default handling

In [None]:
# Reload the base table written by the extraction step, plus the institutes
# reference table whose `name` and `grid_id` columns are used for matching below.
pubmed_df = pd.read_csv("../cleaned_data/pubmed_output.csv")
grid_df = pd.read_csv("../raw_data/institutes.csv")

In [261]:
# Institution name -> GRID id lookup, built from the one column that is needed.
grid_map = grid_df.set_index('name')['grid_id'].to_dict()

# Memo of organisation text -> extractOne() result (or None). The same
# organisation strings recur across many rows, and each lookup scans every
# GRID name. Re-running this cell resets the memo together with grid_map.
_grid_match_cache = {}


def _lookup_grid_match(org):
    """Return the cached RapidFuzz match for one organisation string."""
    if org not in _grid_match_cache:
        _grid_match_cache[org] = process.extractOne(
            org, grid_map.keys(),
            scorer=Levenshtein.normalized_similarity, score_cutoff=0.9)
    return _grid_match_cache[org]


def match_org_to_grid(org_entities):
    """Match the organisations of one affiliation to a GRID institution.

    Each organisation string is compared with every GRID name using normalised
    Levenshtein similarity; candidates scoring below 0.9 are discarded by
    ``score_cutoff``. Among the organisations that do match, the one with the
    highest score wins.

    Args:
        org_entities: Collection of organisation strings for one affiliation.

    Returns:
        tuple: ``(grid_name, grid_id)`` of the best match, or ``(None, None)``
        when no organisation reaches the threshold.
    """
    best_match = None
    best_match_id = None
    best_score = 0.0

    # Sorted so that ties between equally good matches resolve the same way on
    # every run (sets iterate in arbitrary order).
    for org in sorted(org_entities):
        match = _lookup_grid_match(org)
        if match and match[1] > best_score:
            best_match, best_score = match[0], match[1]
            best_match_id = grid_map[best_match]

    return (best_match, best_match_id)

In [None]:
# Entity extraction needs strings. Missing affiliations (NaN after the CSV
# round-trip) become "" rather than the literal text "nan".
pubmed_df["Affiliation name"] = (
    pubmed_df["Affiliation name"].fillna("").astype(str))

# Intermediate results are kept in standalone Series instead of helper columns,
# so spaCy objects and raw entity sets never end up in the saved CSV.
affiliation_entities = pubmed_df["Affiliation name"].apply(extract_entities)
gpe_by_row = affiliation_entities.apply(extract_gpe_entities)
org_by_row = affiliation_entities.apply(extract_org_entities)

pubmed_df["Country"] = gpe_by_row.apply(extract_country)

grid = org_by_row.apply(match_org_to_grid)

# Build the two GRID columns on pubmed_df's own index so rows stay aligned, and
# name the columns explicitly so an empty frame still gets both of them.
grid_columns = ["Institution GRID name", "Institution GRID id"]
pubmed_df[grid_columns] = pd.DataFrame(
    grid.to_list(), index=pubmed_df.index, columns=grid_columns)

pubmed_df.to_csv("../cleaned_data/pubmed_output.csv", index=False)

In [None]:
# # Create and save new columns (affiliation country and affiliation institution name) to CSV

# countries = []
# institution_names = []
# matched_institutions_grid = []
# matched_grid_ids = []

# for article in root.findall("PubmedArticle"):
#     for author in article.findall(".//AuthorList/Author"):
#         for affiliation in author.findall("AffiliationInfo/Affiliation"):
#             affiliation_text = affiliation.text
#             gpe_entities, org_entities = extract_entities(affiliation_text)

#             #challenge 3
#             country = extract_country(gpe_entities)
#             # institution_name = get_institution_name(org_entities)

#             #challenge 4 
#             grid_name, grid_id = match_org_to_grid(org_entities)

#             countries.append(country)
#             # institution_names.append(institution_name)
#             matched_institutions_grid.append(grid_name)
#             matched_grid_ids.append(grid_id)

# pubmed_df["Affiliation country"] = countries
# # pubmed_df["Affiliation institution name"] = institution_names
# pubmed_df["Affiliation institution name (GRID)"] = matched_institutions_grid
# pubmed_df["Affiliation institution GRID ID"] = matched_grid_ids

# pubmed_df.to_csv("../cleaned_data/pubmed_output.csv", index=False)

# # 2 mins
# # 15m 54

# #Levenshtein 0.8 -> 8m 53s
# # Levenshtein 0.9/no processor/ next(country) -> 6m 2.7s
# # dictionary ORG - 5m 45s
# # 5.27