In wikipedia-with-filtering.ipynb, I recorded whether a name led to a disambiguation page (in the 'dbig' column), but didn't investigate further, and treated it as them not having an article (`wiki_bool = False`). In this notebook, I attempt to dig deeper into those cases, by following the links listed in the disambig page for their name, to see if *any* of those articles are likely to be a match (again, based on matching keywords related to their field).

NB: This relies on some local modifications I made to my installed version of Python's wikipedia library (https://pypi.org/project/wikipedia/) which:
* Fixed https://github.com/goldsmith/Wikipedia/issues/79 (sometimes incorrect links given for dab entries)
* Causes `DisambiguationError.options` to ignore links found under a "See also" section heading.

In [1]:
import pandas as pd

df = pd.read_csv('gscholar_complete_v3.csv', index_col=0)

In [2]:
pd.set_option('max_colwidth', 800)

In [3]:
def wikify_name(name):
    name = name.replace('#', '')
    return name.title()

# The (presumed) title of the person's wikipedia article. If we find them by following a dab link,
# we'll update this column to match the disambiguated title (e.g. "Jane Smith (physicist)")
df['wiki_name'] = df.name.apply(wikify_name)

In [5]:
mini = df[df.dbig == True].sample(5, random_state=1)
mini

Unnamed: 0,h-index,h5-index,i10-index,i10-5-index,institution,name,url,n citations,n5 citations,field,gender,gender_prob,count_name,any_wiki_exists,dbig,firstline,wiki_bool,wiki_length,wiki_name
9033,8,8,8,8,Elon University,Benjamin Evans,https://scholar.google.com//citations?user=46Pb6pYAAAAJ&hl=en,890,712,1,male,1.0,1475.0,True,True,,False,0.0,Benjamin Evans
10222,62,49,97,78,Harvard University,Thomas Kane,https://scholar.google.com//citations?user=LHHh_88AAAAJ&hl=en,17921,9501,2,male,1.0,3753.0,True,True,,False,0.0,Thomas Kane
5199,23,16,31,21,Dartmouth College,Hans Mueller,https://scholar.google.com//citations?user=LbjfVjIAAAAJ&hl=en,2405,1055,1,male,0.99,431.0,True,True,,False,0.0,Hans Mueller
21098,8,7,7,5,Michigan State University,Christopher Long,https://scholar.google.com//citations?user=hZwN2WQAAAAJ&hl=en,289,182,3,male,1.0,2339.0,True,True,,False,0.0,Christopher Long
15648,18,14,26,22,London School of Economics and Political Science,Richard Bradley,https://scholar.google.com//citations?user=L6_MRGoAAAAJ&hl=en,735,480,2,male,1.0,4381.0,True,True,,False,0.0,Richard Bradley


In [6]:
import wikipedia as wp

# Copied from wikipedia.ipynb
def matches_field(page, field):
    keywds = {
        1: {'physics', 'physicist'}, # TODO: deal with false positive match on metaphysics
        2: {'economics', 'economist'},
        3: {'philosophy', 'philosopher'},
    }
    # I think it's actually better to use the introduction ('summary') rather than the full article
    # content for this purpose. If they are an economist or physicist or whatever, it'll definitely
    # be mentioned in the lead. Looking any deeper is just inviting false positives.
    content = page.summary.lower()
    for kw in keywds[field]:
        if kw in content:
            return True
        # Check cats too as a safety-measure (e.g. found an example where someone was described only as
        # a 'cosmologist' in the lead, but belonged to physicist categories)
        for cat in page.categories:
            if kw in cat:
                return True
    
    return False

def disambig_row(row):
    if not row.dbig:
        return row
    name = row.wiki_name
    try:
        # auto_suggest=False will still follow redirects, but won't fall back to first search result if
        # no page exists
        pg = wp.page(name, auto_suggest=False)
        print("WARNING: Expected disambig error for name {}, but didn't get one.".format(name))
    except wp.exceptions.DisambiguationError as e:
        for option in e.options:
            # Sometimes these are listed under a 'see also' section of a dab page. Annoyingly, they're included.
            if 'disambiguation' in option:
                continue
            try:
                # Weird that I need to disable auto_suggest again even when using an exact title match, but
                # apparently I do.
                pg = wp.page(option, auto_suggest=False)
            except wp.exceptions.DisambiguationError as e2:
                print("Double-dab for option {} of name {}".format(option, name))
                continue
            if matches_field(pg, row.field):
                if row.wiki_bool:
                    print("WARNING: name {} matches multiple referents: {}, {}".format(
                        name, row.wiki_name, option
                    ))
                row.wiki_bool = True
                row.wiki_name = option
                row.wiki_length = len(pg.content)
                row.firstline = pg.summary[:160]
    except Exception as e:
        print("Unexpected exception", type(e))
        print(e)
        return row
    return row

def disambig_row2(row):
    """disambig_row with pokemon error handling"""
    try:
        return disambig_row(row)
    except Exception as e:
        print("Unexpected exception for name={}, ".format(row['name']), type(e))
        print(e)
        return row

In [9]:
df2 = df.apply(disambig_row2, axis=1)

Double-dab for option Thomas Hartmann of name Thomas Hartman
Unexpected exception for name=Robert Fisher,  <class 'requests.exceptions.ConnectionError'>
HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?prop=extracts&explaintext=&exintro=&titles=Rob+Fisher+%28motorcyclist%29&format=json&action=query (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f31ba6ca208>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
Unexpected exception <class 'requests.exceptions.ConnectionError'>
HTTPConnectionPool(host='en.wikipedia.org', port=80): Max retries exceeded with url: /w/api.php?prop=info%7Cpageprops&action=query&inprop=url&redirects=&titles=Feng+Gao&format=json&ppprop=disambiguation (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7f31ba828668>: Failed to establish a new connection: [Errno 101] Network is unreachable',))
Unexpected exception for name=Jian Zhang,  

In [11]:
df2.wiki_bool.sum()

1397

In [12]:
df2.to_csv('gscholar_complete_v4.csv')

In [None]:
assert False

In [None]:
mini = df[df.dbig == True].sample(15, random_state=1)
dmini = mini.apply(disambig_row, axis=1)

cols = ['name', 'field', 'wiki_bool', 'wiki_name', 'url']
dmini[cols]

In [None]:
cols = ['name', 'field', 'wiki_bool', 'wiki_name', 'url']
dmini[cols]

In [None]:
dmini