# Extracting data from scientific articles (xmls) and disambiguating authors by grouping

## Author: Mitodru Niyogi

In [1]:
import itertools as it
import json
import os
import re
#import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd

from src.preprocess import clean_text, preprocess_dataset
from src.preprocess import extract_article_title_xml, extract_all_articles, extract_bibliography_of_article, standardize_author_names
from src.preprocess import extract_ref_authors, extract_all_articles_authors, extract_all_cited_authors_of_paper
from src.preprocess import create_articles_authors_info_dict, create_dataset_csv, print_sort_alphabetic


# Directory containing the A++ (XML) article files, given as a relative path.
# Keep the trailing slash: a later cell builds the CSV path by plain string
# concatenation (dir_path + 'authors.csv').
# Template for pointing the notebook at a different corpus:
#dir_path = '/path/to/folder/containing/xml/files'
dir_path = '../test-articles/'

### Extract a canonical alphabetical list of all the article titles.

In [2]:
# Collect the article titles from the files found under dir_path.
titles = extract_all_articles(dir_path)

In [3]:
# Print the titles with the 'Title' label; the output below is a numbered,
# alphabetically ordered listing.
print_sort_alphabetic(titles, 'Title')

Title: 1 Adaptation options in agriculture to climate change: a typology
Title: 2 Agrobacterial Transformation of Uninjured Plants
Title: 3 An overview of the recent advances in spray-drying
Title: 4 Application of a quasi-median network analysis for the visualization of character conflicts to a population sample of mitochondrial DNA control region sequences from southern Germany (Ulm)
Title: 5 Assessing the invasive potential of Eucalyptus globulus in Australia: quantification of wildling establishment from plantations
Title: 6 Climate Change and Forest Communities: Prospects for Building Institutional Adaptive Capacity in the Congo Basin Forests
Title: 7 Climate change adaptation planning in remote, resource-dependent communities: an Arctic example
Title: 8 Climate change, food security, and livelihoods in sub-Saharan Africa
Title: 9 Community-based vulnerability assessment of Tuktoyaktuk, NWT, Canada to environmental and socio-economic changes
Title: 10 Die Wirkung von Polymerzusätz

## Extract a canonical alphabetical list of all article authors.

In [4]:
# Collect the authors of the articles found under dir_path.
authors = extract_all_articles_authors(dir_path)

In [None]:
# Print the article authors with the 'Author' label, using the same helper
# that produced the title listing above.
print_sort_alphabetic(authors, 'Author')

## Extract a canonical alphabetical list of all the authors of cited articles.

In [6]:
# extract_ref_authors returns a nested list (structure as described by the
# notebook author; the helper itself is defined in src.preprocess):
#   outer index -> paper
#   next index  -> citation within that paper's bibliography
#   inner index -> author of that citation
ref_authors_all = extract_ref_authors(dir_path)

In [7]:
# All cited authors for each article: one entry per paper, which is flattened
# across papers in the next cell.
papers_bib = extract_all_cited_authors_of_paper(ref_authors_all)

In [8]:
# Flatten the per-paper collections of cited authors into a single stream and
# sort it; sorted() consumes the iterator directly and returns a new list.
consolidated_all_ref_authors_sorted = sorted(it.chain.from_iterable(papers_bib))

In [None]:
# Display the consolidated, sorted list of cited authors as the cell's output
# (a bare last expression is rendered by the notebook).
consolidated_all_ref_authors_sorted

### Disambiguating authors by grouping

We disambiguate authors by extracting more information about each of them, because, as we already saw, many authors share the same name. Here, I have created a dictionary keyed by author name. Each value is a list of tuples, one per publication, holding the article ID, article title, journal title, journal subject, and affiliation. This extra information lets us tell apart authors with the same name.

In [10]:
# Build the lookup used for disambiguating authors by grouping: author name ->
# list of publication records (the JSON dump in the next cell shows the layout).
authors_dict = create_articles_authors_info_dict(dir_path)

In [17]:
# Pretty-print the author dictionary as JSON. This cell needs `json` from the
# imports cell at the top of the notebook; without that import it raises
# NameError on a fresh kernel. ensure_ascii=False keeps non-ASCII characters
# in names and affiliations readable instead of \u-escaping them.
print(json.dumps(authors_dict, indent=4, ensure_ascii=False))

{
    "Patrick T. Ronaldson": [
        [
            "10.1023/B:PHAM.0000026433.27773.47",
            "Involvement of P-Glycoprotein in the Transport of Saquinavir and Indinavir in Rat Brain Microvessel Endothelial and Microglia Cell Lines",
            "Pharmaceutical Research",
            "Biomedicine",
            "Department of Pharmaceutical Sciences, Leslie Dan Faculty of Pharmacy, University of Toronto"
        ]
    ],
    "Gloria Lee": [
        [
            "10.1023/B:PHAM.0000026433.27773.47",
            "Involvement of P-Glycoprotein in the Transport of Saquinavir and Indinavir in Rat Brain Microvessel Endothelial and Microglia Cell Lines",
            "Pharmaceutical Research",
            "Biomedicine",
            "Department of Pharmaceutical Sciences, Leslie Dan Faculty of Pharmacy, University of Toronto"
        ],
        [
            "10.1023/B:PHAM.0000036905.82914.8e",
            "Functional Expression and Localization of P-glycoprotein in the Central Nervo

Here we can see that there are two different people named Gloria Lee, with different affiliations but working in the same research field. There are 7 authors with the name 'Barry Smit'.

In [176]:
# Create the CSV file with the authors' information from authors_dict.
# Presumably this writes 'authors.csv' into dir_path, since that is the file
# the next cell reads back - confirm against src.preprocess.
create_dataset_csv(authors_dict, dir_path)

In [22]:
# Load the authors CSV back into a DataFrame for inspection. The path is built
# by string concatenation, so dir_path must end with a path separator.
df = pd.read_csv(dir_path+'authors.csv')

In [24]:
# Display the DataFrame; pandas elides the middle rows in the rendered output.
df

Unnamed: 0,Author,ArticleID,Article Title,Journal Title,Journal Subject,Affiliations
0,Patrick T. Ronaldson,10.1023/B:PHAM.0000026433.27773.47,Involvement of P-Glycoprotein in the Transport...,Pharmaceutical Research,Biomedicine,"Department of Pharmaceutical Sciences, Leslie ..."
1,Gloria Lee,10.1023/B:PHAM.0000026433.27773.47,Involvement of P-Glycoprotein in the Transport...,Pharmaceutical Research,Biomedicine,"Department of Pharmaceutical Sciences, Leslie ..."
2,Gloria Lee,10.1023/B:PHAM.0000036905.82914.8e,Functional Expression and Localization of P-gl...,Pharmaceutical Research,Biomedicine,Laboratoire de Biophysique des Matériaux Alime...
3,Shannon Dallas,10.1023/B:PHAM.0000026433.27773.47,Involvement of P-Glycoprotein in the Transport...,Pharmaceutical Research,Biomedicine,"Department of Pharmaceutical Sciences, Leslie ..."
4,Reina Bendayan,10.1023/B:PHAM.0000026433.27773.47,Involvement of P-Glycoprotein in the Transport...,Pharmaceutical Research,Biomedicine,"Department of Pharmaceutical Sciences, Leslie ..."
...,...,...,...,...,...,...
132,Amanda Caron,10.1007/s10113-012-0297-2,"Climate change adaptation planning in remote, ...",Regional Environmental Change,Environment,"Department of Geography, McGill University"
133,Bill Patrick Kudlak,10.1007/s10113-012-0297-2,"Climate change adaptation planning in remote, ...",Regional Environmental Change,Environment,Community of Paulatuk
134,Yanfeng Geng,10.1007/s10626-013-0176-0,Multi-intersection Traffic Light Control with ...,Discrete Event Dynamic Systems,Mathematics,Division of Systems Engineering and Center for...
135,Christos G. Cassandras,10.1007/s10626-013-0176-0,Multi-intersection Traffic Light Control with ...,Discrete Event Dynamic Systems,Mathematics,Division of Systems Engineering and Center for...
