In [1]:
import sys, os, json, time
from collections import Counter

In [2]:
import pandas as pd
import numpy as np

In [3]:
with open('secret.conf', 'r') as f:
    for line in f:
        line = line.strip().split(' ')
        if line[0] == 'export':
            equal_sign_position = line[1].find('=')
            k = line[1][:equal_sign_position]
            v = line[1][equal_sign_position+1:].strip('"')
            os.environ[k] = v
            

In [10]:
from ms_academic import MicrosoftAcademic

In [11]:
api = MicrosoftAcademic()

In [12]:
api.attributes = 'Id,AA.AuId,AA.AuN,AA.AfN,AA.AfId'
api.query_threshold = 9e6

In [13]:
with open('coauthorship_largest_cc.json', 'r') as f:
    j = json.load(f)

In [36]:
x = j['nodes'][5]
x['author_name']

u"Fr\\'ed\\'eric Gilbert"

In [37]:
an = x['author_name_detex'].lower()
r = api.interpret_and_evaluate(an)

In [43]:
print(an)
from unidecode import unidecode
print(unidecode(an))

frédéric gilbert
frederic gilbert


In [44]:
an = unidecode(an)
r = api.interpret_and_evaluate(an)

In [49]:
from fuzzywuzzy import process
l_AuN = []
for i, aa in enumerate(r[1]['AA']):
    print(i, aa['AuN'])
    l_AuN.append(aa['AuN'])
process.extractOne(an, l_AuN)

(0, u'frederic gilbert')
(1, u'alexander r harris')
(2, u'robert m i kapsa')


(u'frederic gilbert', 100)

In [50]:
from fuzzywuzzy import fuzz
for i, aa in enumerate(r[1]['AA']):
    print(fuzz.ratio(aa['AuN'], an))

100
35
31


In [53]:
fuzz.ratio('don gilbert', an)

67

In [None]:
def query_author_affil(author_name, authors):
    found = False
    r = api.interpret_and_evaluate(author_name)
    if author_name not in authors:
        authors[author_name] = {'author_id': Counter(), 'affil_name': Counter(), 'affil_id': Counter()}
    for item in r:
        for aa in item['AA']:
            #if aa['AuN'] == author_name:
            if fuzz.ratio(aa['AuN'], author_name) > 85:
                found = True
                author_id = aa.get('AuId')
                affil_name = aa.get('AfN')
                affil_id = aa.get('AfId')
                authors[author_name]['author_id'][author_id] += 1
                authors[author_name]['affil_name'][affil_name] += 1
                authors[author_name]['affil_id'][affil_id] += 1
    return found

start = time.time()
authors = {}
none_found = []
errors = []
i = 0
sleep_dur = 10  # 10 seconds
sleep_interval = 10  # every 10 queries
for node in j['nodes']:
    #author_name = node['author_name'].lower()
    author_name = node['author_name_detex'].lower()
    author_name = unidecode(author_name)
    found = False
    try:
        found = query_author_affil(author_name, authors)
    except Exception as e:
        errors.append( (i, author_name, e) )
        # try one more time
        print("error. sleeping {} seconds".format(sleep_dur))
        time.sleep(sleep_dur)
        try:
            found = query_author_affil(author_name, authors)
        except Exception as e:
            errors.append( (i, author_name, e) )
    if not found:
        none_found.append(author_name)
    i += 1
    if i % sleep_interval == 0:
        print("{} queries in {:.1f} seconds. sleeping {} seconds".format(i, time.time()-start, sleep_dur))
        time.sleep(sleep_dur)

10 queries in 4.5 seconds. sleeping 10 seconds


In [20]:
len(authors)

456

In [21]:
len(none_found)

156

In [22]:
errors

[(218, u'kim marriott', KeyError('expr'))]

In [23]:
import cPickle as pickle
with open('author_affil_mag-2.pickle', 'wb') as outf:
    pickle.dump(authors, outf)

In [24]:
rows = []
for node in j['nodes']:
    cl_bottom = node['cl_bottom']
    cl_top = node['cl_top']
    author_name = node['author_name']
    cl_depth = len(cl_bottom.split(':'))
    if author_name.lower() in authors:
        affil_name = authors[author_name.lower()]['affil_name']
        if affil_name:
            affil_name = affil_name.most_common()[0][0]
        else:
            affil_name = ''
        affil_id = authors[author_name.lower()]['affil_id']
        if affil_id:
            affil_id = affil_id.most_common()[0][0]
        else:
            affil_id = ''
        rows.append( (author_name, cl_bottom, cl_top, cl_depth, affil_name, affil_id) )
df = pd.DataFrame(rows, columns='author_name cl_bottom cl_top cl_depth affil_name affil_id'.split())

In [25]:
df[df.cl_depth>1].sort_values(['cl_depth', 'cl_bottom'], ascending=False)

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
20,Takafumi Saito,2:1:2:2,2,4,yamagata university,112524849
126,Jun Sese,2:1:2:2,2,4,university of tokyo,74801974
164,Takayuki Itoh,2:1:2:2,2,4,ibm,1341412227
260,Aika Terada,2:1:2:2,2,4,tokyo institute of technology,114531698
299,Rina Nakazawa,2:1:2:2,2,4,ochanomizu university,26120043
36,Michael Ogawa,2:1:2:1,2,4,university of california davis,84218800
120,Chris Muelder,2:1:2:1,2,4,university of california davis,84218800
130,Kwan-Liu Ma,2:1:2:1,2,4,,
162,Diane Felmlee,2:1:2:1,2,4,,
201,Robert Faris,2:1:2:1,2,4,university of california davis,84218800


In [47]:
df[df.author_name.isin(['Aaron Barsky', 'Tamara Munzner'])]

Unnamed: 0,author_name,cl_bottom,cl_top,cl_depth,affil_name,affil_id
22,Aaron Barsky,2:1:1:4,2,4,university of british columbia,141945490
305,Tamara Munzner,2:1:1:2,2,4,university of british columbia,141945490


In [48]:
errors

[(9, u'amyra meidiana', KeyError('expr')),
 (298, u'yaniv frishman', KeyError('expr'))]

In [49]:
df.affil_name.value_counts()

                                                                    152
university of stuttgart                                               9
l abri                                                                7
monash university                                                     6
french institute for research in computer science and automation      6
centre national de la recherche scientifique                          5
university of tubingen                                                5
university of trier                                                   5
university of california davis                                        5
university of calgary                                                 5
bielefeld university                                                  5
university of maryland college park                                   5
graz university of technology                                         5
university of perugia                                           

In [50]:
df.affil_id.value_counts()

              152
100066346       9
872287936       7
1326498283      6
1290206253      6
56590836        6
89864525        5
20121455        5
1341412227      5
168635309       5
84218800        5
8087733         5
27483092        5
4092182         5
66946132        5
130701444       4
153267046       4
138006243       4
189712700       4
129604602       4
121934539       3
121883995       3
202697423       3
111088046       3
149899117       3
83019370        3
1303153112      3
1325886976      3
200769079       3
168864056       3
             ... 
55176711        1
44265643        1
114027177       1
149704539       1
1324840837      1
202367325       1
52357470        1
1304085615      1
17974374        1
87048295        1
90610280        1
130769515       1
219193219       1
160606119       1
139264467       1
895027328       1
70931966        1
1324220072      1
135310074       1
165779595       1
97018004        1
32394136        1
198357145       1
208215962       1
102149020 