In [1]:
import requests
import pandas as pd
import yaml
import xml.etree.ElementTree as et
from lxml import etree
import xml.dom.minidom as md

In [None]:
# First, create the latest CSV file by running extract_from_xml_to_csv.py

In [3]:
# get all the authors and their institutions
#
# For every DOI listed in G16.csv ... G23.csv, look the paper up on OpenAlex,
# then look up each of its authors, and write one row per (paper, author) to
# author_detail.csv with the columns doi, Name, Name_Variants, Institution.
paper_url='https://api.openalex.org/works/https://doi.org/{DOI}'
author_url = "https://api.openalex.org/authors/{ID}"
openalex_prefix = "https://openalex.org/"
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

years = range(16,24)
author_ids = []
author_rows = []
author_cache = {}  # OpenAlex author id -> author JSON, so each author is fetched once

for year in years:
    # dropna(): a missing DOI is read as NaN (a float) and cannot be put in the URL.
    for doi in pd.read_csv("G"+str(year)+".csv")["doi"].dropna():
        response = requests.get(url = paper_url.replace("{DOI}", doi), timeout=request_timeout)
        if response.status_code != 200:
            # Skip only this paper; the remaining DOIs of the year are still processed.
            print("not response", doi)
            continue
        for author in response.json()['authorships']:
            author_id = author['author']['id'].replace(openalex_prefix, "")
            author_ids.append(author_id)

            if author_id not in author_cache:
                author_response = requests.get(url = author_url.replace("{ID}", author_id), timeout=request_timeout)
                if author_response.status_code != 200:
                    print("not response", author_id)
                    continue
                author_cache[author_id] = author_response.json()
            r = author_cache[author_id]

            # Prefer the single-institution field this notebook was written against;
            # fall back to the first entry of the list-valued field when it is absent or empty.
            institution = r.get('last_known_institution')
            if not institution:
                known_institutions = r.get('last_known_institutions') or []
                institution = known_institutions[0] if known_institutions else None

            author_rows.append({
                'doi': doi,
                'Name': r['display_name'],
                'Name_Variants': r['display_name_alternatives'],
                'Institution': institution['display_name'] if institution else '',
            })
    print(year)

# Build the frame once instead of growing it row by row.
results = pd.DataFrame(author_rows, columns=['doi','Name','Name_Variants','Institution'])
results.to_csv('author_detail.csv', index=False)

16
17
18
19
20
21
22
23


In [3]:
# Build inst_code.csv from the freshly generated author_detail.csv.
# Institutions are ordered by how many author rows mention them (most frequent
# first) and numbered consecutively starting at 1.

author_detail = pd.read_csv('author_detail.csv')
sorted_institution = author_detail['Institution'].value_counts().index

institution_codes = pd.DataFrame(
    {
        'Institution': sorted_institution,
        'code': range(1, len(sorted_institution) + 1),
    }
)
institution_codes.to_csv("inst_code.csv", index=False)
# Manual follow-up: change the position of Uwaterloo and Usaskatchewan by hand.

In [4]:
# Attach an institution code to every record of author_detail.csv.
# Rows without an institution get '0000'; all others get the zero-padded code
# listed for their institution in inst_code.csv.
author_detail = pd.read_csv('author_detail.csv')
inst_code = pd.read_csv('inst_code.csv')

codes = []
for institution in author_detail['Institution']:
    if pd.isna(institution):
        codes.append('0000')
    elif institution:
        # .item() requires exactly one matching row in inst_code.csv.
        matching_code = inst_code.loc[inst_code['Institution'] == institution, 'code']
        codes.append('{:04d}'.format(matching_code.item()))
    else:
        codes.append('')

author_detail['code'] = codes
author_detail.to_csv('author_detail.csv', index=False)

# The resulting file was then edited manually (duplicates removed, ...).

In [5]:
# update xml files with unified authors (name variants corrected)
# replace the output files with the files in data/xml/ directory
#
# For each year, every <paper> whose DOI has rows in author_detail.csv gets its
# <author> elements replaced by the names from that file. The pretty-printed
# result is written to G<yy>.xml in the current working directory.

author_detail = pd.read_csv('author_detail.csv')
xml_folder = '../data/xml/'
years = range(2016,2024)

# Author names per DOI, in file order, computed once instead of filtering the
# whole frame for every paper. Rows without a name are ignored.
names_by_doi = (
    author_detail.dropna(subset=['Name'])
    .groupby('doi', sort=False)['Name']
    .apply(list)
    .to_dict()
)

for year in years:
    file_name = "G"+str(year)[-2:]+".xml"
    tree = et.parse(xml_folder+file_name)
    root = tree.getroot()
    for paper in root.findall(".//paper"):
        doi_element = paper.find('.//doi')
        if doi_element is None:
            # No DOI to match on: leave the paper untouched.
            continue
        names = names_by_doi.get(doi_element.text, [])
        if not names:
            # Nothing to replace the authors with: keep the existing ones
            # rather than leaving the paper without any author.
            continue

        for author in paper.findall(".//author"):
            paper.remove(author)

        # Insert in reverse at index 1 so the authors end up in their
        # original order, right after the paper's first child element.
        for full_name in reversed(names):
            # rpartition splits on the last space; a single-token name yields
            # an empty first name instead of raising IndexError.
            first_name, _, last_name = full_name.rpartition(' ')
            element = et.Element("author")
            et.SubElement(element, "first").text = first_name
            et.SubElement(element, "last").text = last_name
            paper.insert(1,element)

    # Strip the existing whitespace-only text nodes with lxml, then let minidom
    # re-indent the document, all in memory before the single write.
    compact_xml = etree.tostring(etree.XML(et.tostring(root, encoding="UTF-8", xml_declaration=True), parser=etree.XMLParser(remove_blank_text=True)))
    xml_pretty_str = md.parseString(compact_xml).toprettyxml(encoding='UTF-8').decode()
    with open(file_name, "w", encoding="utf-8") as f:
        f.write(xml_pretty_str)

In [28]:
# update name_variants.yaml
# replace the output file with the file in data/yaml/ directory
#
# One entry per distinct author name in author_detail.csv (the last row wins
# when a name occurs several times): the canonical first/last name, an id of
# the form First-Last with dots removed, and the institution code as comment.

author_detail = pd.read_csv('author_detail.csv')

dict_file = []

unique_authors = author_detail[['Name', 'code']].drop_duplicates(subset=['Name'], keep='last')
for name, code in zip(unique_authors['Name'], unique_authors['code']):
    # rpartition splits on the last space; a single-token name yields an
    # empty first name instead of raising IndexError.
    first_name, _, last_name = name.rpartition(' ')
    if first_name:
        author_id = first_name.replace('.', '')+'-'+last_name.replace('.', '')
    else:
        author_id = last_name.replace('.', '')

    entry = {'canonical' : {'first': first_name, 'last': last_name}, 'id': author_id, 'comment':'uni{:04d}'.format(code)}
    if name == 'Arash Rafat':
        # Spot check of one known author.
        print(entry)
    dict_file.append(entry)


with open(r'name_variants.yaml', 'w', encoding='utf-8') as file:
    yaml.dump(dict_file, file, default_flow_style=None)

{'canonical': {'first': 'Arash', 'last': 'Rafat'}, 'id': 'Arash-Rafat', 'comment': 'uni0006'}


In [7]:
# Build the Hugo template line that maps the first 50 university codes to
# their names. Paste it into the second line of hugo\layouts\index.html

inst_code = pd.read_csv('inst_code.csv')

top_institutions = inst_code.iloc[:50]
dict_arguments = [
    ' "' + 'uni{:04d}'.format(code) + '" "' + institution + '"'
    for code, institution in zip(top_institutions['code'], top_institutions['Institution'])
]
out_str = '{{ $universities := dict' + ''.join(dict_arguments) + '}}'

out_str

'{{ $universities := dict "uni0001" "University of Saskatchewan" "uni0002" "University of Waterloo" "uni0003" "Global Institute for Water Security" "uni0004" "McMaster University" "uni0005" "Environment and Climate Change Canada" "uni0006" "Wilfrid Laurier University" "uni0007" "University of Calgary" "uni0008" "University of Guelph" "uni0009" "Canmore Museum and Geoscience Centre" "uni0010" "Australian National University" "uni0011" "University of Montreal" "uni0012" "University of British Columbia" "uni0013" "Northern Arizona University" "uni0014" "Natural Resources Canada" "uni0015" "Universität Innsbruck" "uni0016" "Woods Hole Research Center" "uni0017" "University of Alberta" "uni0018" "Finnish Meteorological Institute" "uni0019" "University of Quebec at Montreal" "uni0020" "University of Manitoba" "uni0021" "University of Alaska Fairbanks" "uni0022" "University of Toronto" "uni0023" "Jet Propulsion Lab" "uni0024" "University of Arizona" "uni0025" "Lawrence Berkeley National Labor