# Get all names from the institutional repository

In [29]:
from bs4 import BeautifulSoup
import requests
import time
import pandas as pd

In [53]:
# Build the list of browse-by-author result pages to download. Every URL asks
# for 10000 rows per page, so the offset advances in steps of 10000 and stops
# before reaching 32001 (offsets 0, 10000, 20000 and 30000).
url_head = "https://drum.lib.umd.edu/browse?rpp=10000&sort_by=-1&type=author&offset="
url_tail = "&etal=-1&order=ASC"
page_list = [url_head + str(offset) + url_tail for offset in range(0, 32001, 10000)]

In [55]:
# Download every browse page and collect one [name, number of works] pair per
# table cell. The parsing below expects cell text of the form "Author name [12]".
listall = []
for n, page_url in enumerate(page_list):
    page = requests.get(page_url)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    for cell in soup.find_all("td"):
        # Split on the LAST "[" so a name that itself contains "[" stays intact.
        name, bracket, count = cell.text.rpartition("[")
        if not bracket:
            # No "[" in this cell means there is no work count to extract;
            # skip it rather than crash with an IndexError.
            continue
        # Trim whitespace before dropping the closing "]" so trailing
        # whitespace cannot leave the bracket attached to the count.
        listall.append([name.strip(), count.strip().rstrip("]")])
    # Throttle: pause 6 seconds after the pages at index 0, 10, 20, ...
    if n % 10 == 0:
        time.sleep(6)

In [60]:
# Turn the scraped [name, count] pairs into a two-column table and save it.
# index=False keeps the row index out of the CSV file.
column_names = ['Name', 'Num of works']
my_df = pd.DataFrame(data=listall)
my_df.columns = column_names
my_df.to_csv(path_or_buf='all_name.csv', index=False)

# 10 authors with the highest number of works

In [61]:
# The scraped counts are text, so sorting them directly orders them character
# by character ("99" ahead of "265"). Convert them to numbers first (values
# that cannot be parsed become NaN and sort last), then show only the ten
# largest instead of the whole table.
works_as_number = pd.to_numeric(my_df['Num of works'], errors='coerce')
(
    my_df
    .assign(**{'Num of works': works_as_number})
    .sort_values(by='Num of works', ascending=False)
    .head(10)
)

In [34]:
# Load the author table used for the rest of the analysis.
# NOTE(review): no cell shown here writes this file - judging by its name it is
# presumably all_name.csv re-saved with integer work counts; confirm its origin.
int_counts_csv = "all_name_int_work.csv"
df = pd.read_csv(int_counts_csv)

In [64]:
# Rank authors by number of works, largest first, and show the first ten rows.
df_by_works = df.sort_values(by='Num of works', ascending=False)
df_by_works.head(10)

Unnamed: 0,Name,Num of works
23137,Program on International Policy Attitudes (PIPA),265
1653,"Baras, John S.",233
26421,"Shneiderman, Ben",187
29486,UNSPECIFIED,155
18084,"Makowski, Armand M.",88
15589,"Krishnaprasad, Perinkulam S.",85
22774,"Plaisant, Catherine",72
27694,"Stewart, G. W.",69
11288,"Hanna, William John",68
9969,"Goeringer, Paul",65


# 100 authors with the highest number of works

Use 100 authors as an example to show how to reconcile name authorities with VIAF, LCNAF, Wikidata, and ORCID. In a real project, one needs to normalize and clean up the data first.

In [69]:
# Keep the 100 rows with the largest work counts as the worked example.
ranked_by_works = df.sort_values(by='Num of works', ascending=False)
df100 = ranked_by_works.iloc[:100]

In [72]:
# Pull the 'Name' column out as a plain Python list, preserving row order.
name100 = df100['Name'].tolist()

Remove 'UNSPECIFIED' from the list and change 'Program on International Policy Attitudes (PIPA)' to 'Program on International Policy Attitudes'.

In [76]:
# Clean the example list in place: drop the placeholder author and store the
# PIPA programme without its "(PIPA)" suffix (the renamed entry goes to the end
# of the list). The membership checks make the cell safe to re-run: a bare
# list.remove() raises ValueError once the entry is already gone.
if 'UNSPECIFIED' in name100:
    name100.remove('UNSPECIFIED')
if 'Program on International Policy Attitudes (PIPA)' in name100:
    name100.remove('Program on International Policy Attitudes (PIPA)')
    name100.append("Program on International Policy Attitudes")

### Using VIAF

VIAF Auto Suggest API (Suggest Authority Terms based on a text passed in a query): 

http://www.viaf.org/viaf/AutoSuggest?query=[query string]

For example, we want to get the VIAF ID for Baras, John S.:

http://www.viaf.org/viaf/AutoSuggest?query=John S. Baras 

We get the response:

{
  "query": "john s. baras",
  "result": [
    {
      "term": "John S Baras",
      "displayForm": "John S Baras",
      "nametype": "personal",
      "lc": "n90614736",
      "viafid": "35102971",
      "score": "1170",
      "recordID": "35102971"
    }
  ]
}

In [98]:
# Look up candidate VIAF IDs for every name with the VIAF AutoSuggest API.
# Result shape: d[name] == {'viaf': [id, ...]}, or {} when no ID was returned.
VIAF_AUTOSUGGEST_URL = 'http://www.viaf.org/viaf/AutoSuggest'

d = {}
for name in name100:
    d[name] = {}
    if "," in name:
        # "Last, First" -> "First Last": query with the name in natural order.
        parts = name.split(",")
        search_term = parts[1].strip() + " " + parts[0].strip()
    else:
        search_term = name
    # Pass the term via `params` so requests percent-encodes it; gluing it onto
    # the URL would cut the query short at characters such as "&" or "#".
    results = requests.get(VIAF_AUTOSUGGEST_URL, params={'query': search_term})
    # Decode the JSON body once; a null or missing 'result' means no matches.
    suggestions = results.json().get('result') or []
    viaf = []
    for suggestion in suggestions:
        # Keep each ID once, in the order the API returned it.
        if 'viafid' in suggestion and suggestion['viafid'] not in viaf:
            viaf.append(suggestion['viafid'])
    if viaf:
        d[name]['viaf'] = viaf

In [97]:
# Display the lookup results: each name maps to {'viaf': [ids]}, or to {} when
# no VIAF ID was found for it.
d

{'Baras, John S.': {'viaf': ['35102971']},
 'Shneiderman, Ben': {'viaf': ['108743798',
   '870156991005861180004',
   '310043169',
   '6386161098976929640005',
   '309041480',
   '12147270675035700007']},
 'Makowski, Armand M.': {'viaf': ['26622521']},
 'Krishnaprasad, Perinkulam S.': {},
 'Plaisant, Catherine': {'viaf': ['63676352']},
 'Stewart, G. W.': {'viaf': ['54221844', '80256344', '77349889']},
 'Hanna, William John': {'viaf': ['116078242']},
 'Goeringer, Paul': {},
 'Abed, Eyad H.': {'viaf': ['60187023']},
 'Saltz, Joel': {'viaf': ['274045353', '20950897', '306854581', '307030621']},
 'Tchangalova, Nedelina': {},
 'Tits, A.L.': {},
 'Qu, Gang': {'viaf': ['267319545', '310711540', '304988298', '313489789']},
 'Liu, K.J. Ray': {'viaf': ['12547689']},
 'Baykoucheva, Svetla': {'viaf': ['317277903']},
 'Basili, Victor R.': {'viaf': ['20402112',
   '6582147484343349360007',
   '9670154076015211860009',
   '3495147484329549360006']},
 'Geraniotis, Evaggelos A.': {},
 'Harhalakis, Geor