In [1]:
import sys, os, json, time
from collections import Counter

In [2]:
import pandas as pd
import numpy as np

In [3]:
with open('secret.conf', 'r') as f:
    for line in f:
        line = line.strip().split(' ')
        if line[0] == 'export':
            equal_sign_position = line[1].find('=')
            k = line[1][:equal_sign_position]
            v = line[1][equal_sign_position+1:].strip('"')
            os.environ[k] = v
            

In [4]:
from ms_academic import MicrosoftAcademic

In [5]:
api = MicrosoftAcademic()

In [6]:
api.attributes = 'Id,AA.AuId,AA.AuN,AA.AfN,AA.AfId'
api.query_threshold = 9e6

In [7]:
with open('coauthorship_largest_cc.json', 'r') as f:
    j = json.load(f)

In [8]:
x = j['nodes'][5]
x['author_name']

u"Fr\\'ed\\'eric Gilbert"

In [10]:
an = x['author_name_detex'].lower()
r = api.interpret_and_evaluate(an)

In [11]:
print(an)
from unidecode import unidecode
print(unidecode(an))

frédéric gilbert
frederic gilbert


In [12]:
an = unidecode(an)
r = api.interpret_and_evaluate(an)

In [13]:
from fuzzywuzzy import process
l_AuN = []
for i, aa in enumerate(r[1]['AA']):
    print(i, aa['AuN'])
    l_AuN.append(aa['AuN'])
process.extractOne(an, l_AuN)

(0, u'frederic gilbert')
(1, u'alexander r harris')
(2, u'robert m i kapsa')


(u'frederic gilbert', 100)

In [14]:
from fuzzywuzzy import fuzz
for i, aa in enumerate(r[1]['AA']):
    print(fuzz.ratio(aa['AuN'], an))

100
35
31


In [15]:
fuzz.ratio('don gilbert', an)

67

In [21]:
def query_author_affil(author_name, author_id, authors):
    found = False
    r = api.interpret_and_evaluate(author_name)
    if author_name not in authors:
        authors[author_name] = {
            'author_id': author_id,
            'author_id_mag': Counter(), 
            'affil_name': Counter(), 
            'affil_id': Counter()
        }
    for item in r:
        for aa in item['AA']:
            #if aa['AuN'] == author_name:
            if fuzz.ratio(aa['AuN'], author_name) > 85:
                found = True
                author_id_mag = aa.get('AuId')
                affil_name = aa.get('AfN')
                affil_id = aa.get('AfId')
                authors[author_name]['author_id_mag'][author_id_mag] += 1
                authors[author_name]['affil_name'][affil_name] += 1
                authors[author_name]['affil_id'][affil_id] += 1
    return found

start = time.time()
authors = {}
none_found = []
errors = []
i = 0
sleep_dur = 10  # 10 seconds
sleep_interval = 10  # every 10 queries
for node in j['nodes']:
    #author_name = node['author_name'].lower()
    author_name = node['author_name_detex'].lower()
    author_name = unidecode(author_name)
    author_id = node['id']
    found = False
    try:
        found = query_author_affil(author_name, author_id, authors)
    except Exception as e:
        errors.append( (i, author_name, e) )
        # try one more time
        print("error. sleeping {} seconds".format(sleep_dur))
        time.sleep(sleep_dur)
        try:
            found = query_author_affil(author_name, author_id, authors)
        except Exception as e:
            errors.append( (i, author_name, e) )
    if not found:
        none_found.append(author_name)
    i += 1
    if i % sleep_interval == 0:
        print("{} queries in {:.1f} seconds. sleeping {} seconds".format(i, time.time()-start, sleep_dur))
        time.sleep(sleep_dur)

10 queries in 4.2 seconds. sleeping 10 seconds
20 queries in 18.5 seconds. sleeping 10 seconds
30 queries in 32.7 seconds. sleeping 10 seconds
40 queries in 45.6 seconds. sleeping 10 seconds
50 queries in 60.5 seconds. sleeping 10 seconds
60 queries in 75.6 seconds. sleeping 10 seconds
70 queries in 88.4 seconds. sleeping 10 seconds
80 queries in 105.6 seconds. sleeping 10 seconds
90 queries in 118.6 seconds. sleeping 10 seconds
100 queries in 131.7 seconds. sleeping 10 seconds
110 queries in 147.4 seconds. sleeping 10 seconds
120 queries in 163.4 seconds. sleeping 10 seconds
130 queries in 177.1 seconds. sleeping 10 seconds
140 queries in 191.7 seconds. sleeping 10 seconds
150 queries in 204.7 seconds. sleeping 10 seconds
160 queries in 217.4 seconds. sleeping 10 seconds
170 queries in 232.1 seconds. sleeping 10 seconds
180 queries in 245.4 seconds. sleeping 10 seconds
190 queries in 260.5 seconds. sleeping 10 seconds
200 queries in 274.0 seconds. sleeping 10 seconds
210 queries in 28

In [22]:
len(authors)

236

In [23]:
len(none_found)

40

In [24]:
errors

[]

In [25]:
import cPickle as pickle
with open('author_affil_largest_cc_fuzz_mag-2.pickle', 'wb') as outf:
    pickle.dump(authors, outf)

In [26]:
rows = []
for node in j['nodes']:
    cl_bottom = node['cl_bottom']
    cl_top = node['cl_top']
    author_name = node['author_name']
    cl_depth = len(cl_bottom.split(':'))
    if author_name.lower() in authors:
        affil_name = authors[author_name.lower()]['affil_name']
        if affil_name:
            affil_name = affil_name.most_common()[0][0]
        else:
            affil_name = ''
        affil_id = authors[author_name.lower()]['affil_id']
        if affil_id:
            affil_id = affil_id.most_common()[0][0]
        else:
            affil_id = ''
        rows.append( (author_name, cl_bottom, cl_top, cl_depth, affil_name, affil_id) )
df = pd.DataFrame(rows, columns='author_name cl_bottom cl_top cl_depth affil_name affil_id'.split())

In [27]:
df[df.cl_depth>1].sort_values(['cl_depth', 'cl_bottom'], ascending=False)

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
8,Takafumi Saito,2:1:2:2,2,4,yamagata university,112524849
71,Jun Sese,2:1:2:2,2,4,university of tokyo,74801974
93,Takayuki Itoh,2:1:2:2,2,4,ochanomizu university,26120043
143,Aika Terada,2:1:2:2,2,4,tokyo institute of technology,114531698
160,Rina Nakazawa,2:1:2:2,2,4,ochanomizu university,26120043
17,Michael Ogawa,2:1:2:1,2,4,university of california davis,84218800
69,Chris Muelder,2:1:2:1,2,4,university of california davis,84218800
75,Kwan-Liu Ma,2:1:2:1,2,4,,
91,Diane Felmlee,2:1:2:1,2,4,university of california davis,84218800
113,Robert Faris,2:1:2:1,2,4,university of california davis,84218800


In [28]:
df[df.author_name.isin(['Aaron Barsky', 'Tamara Munzner'])]

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
7,Aaron Barsky,2:1:1:4,2,4,university of british columbia,141945490
156,Tamara Munzner,2:1:1:2,2,4,university of british columbia,141945490


In [29]:
df[df.author_name.str.contains('Katy')]

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
217,Katy Borner,2:2:1,2,3,indiana university,592451


In [66]:
errors

[]

In [67]:
df.affil_name.value_counts()

                                                                    37
university of stuttgart                                              8
l abri                                                               7
graz university of technology                                        6
monash university                                                    6
university of maryland college park                                  5
university of arizona                                                5
university of trier                                                  5
university of california davis                                       5
french institute for research in computer science and automation     5
boston university                                                    5
european bioinformatics institute                                    5
centre national de la recherche scientifique                         5
kaiserslautern university of technology                              4
micros

In [68]:
df.affil_id.value_counts()

              37
100066346      8
872287936      7
4092182        6
138006243      5
84218800       5
1303153112     5
56590836       5
66946132       5
89864525       5
1290206253     5
1326498283     5
1341412227     4
141945490      4
111088046      4
153267046      3
121883995      3
315704651      3
1294671590     3
83019370       3
99065089       3
200769079      3
102322142      2
223464139      2
26120043       2
114531698      2
20231570       2
1314251682     2
2250955327     2
161046081      2
              ..
98358874       1
98677209       1
55176711       1
130750295      1
17974374       1
74973139       1
47720641       1
1312106833     1
133960621      1
55732556       1
204983213      1
154570441      1
78570951       1
114027177      1
63772739       1
102149020      1
90610280       1
208215962      1
200777214      1
78577930       1
74801974       1
97565354       1
30147112       1
112524849      1
188538660      1
27804330       1
22299242       1
219193219     

In [77]:
gb = df.groupby('affil_id')
gbnu = gb.affil_name.nunique()

In [80]:
df[df.affil_id.isin(gbnu[gbnu!=1].index)]

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
2,Adam Perer,4:2,4,2,ibm,1341412227
6,Nan Cao,3:1,3,2,ibm,1341412227
7,Aaron Barsky,2:1:1:4,2,4,university of british columbia,141945490
40,Jun Zhu,3:5,3,2,tsinghua university,99065089
45,Baining Guo,3:5,3,2,microsoft,1290206253
50,Tim Dwyer,1:2,1,2,monash university,1290206253
55,Lei Shi,3:1,3,2,electronics and telecommunications research in...,99065089
76,Gonzalo Ramos,1:6,1,2,microsoft,1290206253
80,Mary Czerwinski,1:6,1,2,microsoft,1290206253
85,Jennifer Gardy,2:1:1:4,2,4,university of british columbia,141945490
