Like wikipedia.ipynb, but add a filtering step where we check the content of the putative wiki page for keywords related to the researcher's field.

False-false positive examples:
* Leonard Feldman (stub, only describes as engineer)
* David McClelland (physicist) (primary topic for his name is a psychologist)
* Shaul Mukamel (wiki calls him chemist, not physicist)
* Micha Tomkiewicz (lead just calls her scientist, cats include 'physical chemist')

In [10]:
import pandas as pd
import urllib
import requests
import bs4
import csv

In [11]:
scientists = []
#with open('gscholar/files/profData_withGender_physics_economics_philo.csv', "r") as f:
with open('gscholar_newfields_readyforwiki.csv', "r") as f:
    reader = csv.reader(f)
    # Skip header|
    next(reader)

    # get the link loaded in the csv
    for row in reader:
        name = row[11]
        field = row[3]
        sc = (name, field)
        scientists.append(sc)

# should be 23179
len(scientists)

102621

In [28]:
#biology         chemistry   label%3Aphysics label%3Asociology       linguistics       mathematics          medicine 
#            21126             20845              4198              3212              3165             20221             10286 
#       philosophy          politics        psychology 
#             4501              6925              8142 

import wikipedia as wp

def matches_field(page, field):
    """Return whether the subject of the given Wiki page is likely to be a member of the given
    field according to some keyword-matching heuristics.
    """
    keywds = {
        'biology':{'biology', 'biologist'},
        'chemistry':{'chemistry','chemist'},
        'mathematics':{'mathematics','mathematician'},
        'medicine':{'medicine','MD'},
        'politics':{'politics','political scientist'},
        'psychology':{'psychology','psychologist'},
        'label%3Aphysics': {'physics', 'physicist'}, # TODO: deal with false positive match on metaphysics
        'philosophy': {'philosophy', 'philosopher'},
        'label%3Asociology':{'sociology','sociologist'},
        'linguistics':{'linguistics','linguist'}
    }
    # I think it's actually better to use the introduction ('summary') rather than the full article
    # content for this purpose. If they are an economist or physicist or whatever, it'll definitely
    # be mentioned in the lead. Looking any deeper is just inviting false positives. (e.g. some footballer
    # who studied physics, or a politician whose mom was an economist, or whatever)
    content = page.summary.lower()
    for kw in keywds[field]:
        if kw in content:
            return True
        # Check cats too as a safety-measure (e.g. found an example where someone was described only as
        # a 'cosmologist' in the lead, but belonged to physicist categories)
        for cat in page.categories:
            if kw in cat:
                return True
    
    return False
        

def fetch_wiki_data(name, field):
    """Given the name of a scientist and their field (1=physics etc.), return
    a row of data about that scientist, stored in a dictionary mapping column names
    to values.
    """
    res = dict(name=name, any_wiki_exists=False, wiki_exists=False, wiki_length=0, 
               dbig=False, firstline='',
              )
    if '#' in name:
        # Turns out this breaks the wiki API
        print("Uh oh...", name)
        name = name.replace('#', '')
    try:
        # auto_suggest=False will still follow redirects, but won't fall back to first search result if
        # no page exists
        pg = wp.page(name.title(), auto_suggest=False)
    except wp.exceptions.DisambiguationError as disambig:
        res['any_wiki_exists'] = True
        res['dbig'] = True
    except wp.exceptions.PageError as e:
        # No page for this person's name. Keep the default column values.
        pass
    else:
        res['any_wiki_exists'] = True
        res['firstline'] = pg.summary
        res['wiki_exists'] = matches_field(pg, field)
        res['wiki_length'] = len(pg.content)
    return res

In [19]:
print(
    fetch_wiki_data('Stephen Hawking', 'label%3Aphysics'),
    fetch_wiki_data('Stephen Hawking', 'psychology'), # no field match
    fetch_wiki_data('Rakesh Agrawal', 'label%3Aphysics'), # disambig
    fetch_wiki_data('Rakesh Agrawalasdf', 'label%3Aphysics'), # nopage
    sep='\n',
)



  lis = BeautifulSoup(html).find_all('li')


{'name': 'Stephen Hawking', 'any_wiki_exists': True, 'wiki_exists': True, 'wiki_length': 59865, 'dbig': False, 'firstline': 'Stephen William Hawking  (8 January 1942 – 14 March 2018) was an English theoretical physicist, cosmologist, and author who was director of research at the Centre for Theoretical Cosmology at the University of Cambridge at the time of his death. He was the Lucasian Professor of Mathematics at the University of Cambridge between 1979 and 2009.\nHis scientific works included a collaboration with Roger Penrose on gravitational singularity theorems in the framework of general relativity and the theoretical prediction that black holes emit radiation, often called Hawking radiation. Hawking was the first to set out a theory of cosmology explained by a union of the general theory of relativity and quantum mechanics. He was a vigorous supporter of the many-worlds interpretation of quantum mechanics.Hawking achieved commercial success with several works of popular science

In [29]:
print(
    scientists[:10],
    sep='\n',
)

[('Karl Marx', 'label%3Asociology'), ('Karl Marx', 'label%3Asociology'), ('Erving Goffman', 'label%3Asociology'), ('Erving Goffman', 'label%3Asociology'), ('Gary Becker', 'label%3Asociology'), ('Gary Becker', 'label%3Asociology'), ('Robert Merton', 'label%3Asociology'), ('Robert Merton', 'label%3Asociology'), ('Theodor Adorno', 'label%3Asociology'), ('Theodor Adorno', 'label%3Asociology')]


In [33]:
lim = 150000

rows = []
for name, field in scientists[:lim]:
    rows.append(fetch_wiki_data(name, field))
    
df = pd.DataFrame(rows)



  lis = BeautifulSoup(html).find_all('li')


KeyError: 'fullurl'

In [27]:
df.head()

Unnamed: 0,any_wiki_exists,dbig,firstline,name,wiki_exists,wiki_length
0,True,False,Karl Marx (German: [ˈkaɐ̯l ˈmaɐ̯ks]; 5 May 181...,Karl Marx,True,80353
1,True,False,Karl Marx (German: [ˈkaɐ̯l ˈmaɐ̯ks]; 5 May 181...,Karl Marx,True,80353
2,True,False,Erving Goffman (11 June 1922 – 19 November 198...,Erving Goffman,True,37800
3,True,False,Erving Goffman (11 June 1922 – 19 November 198...,Erving Goffman,True,37800
4,True,False,"Gary Stanley Becker (; December 2, 1930 – May ...",Gary Becker,True,19847
5,True,False,"Gary Stanley Becker (; December 2, 1930 – May ...",Gary Becker,True,19847
6,True,True,,Robert Merton,False,0
7,True,True,,Robert Merton,False,0
8,True,False,Theodor W. Adorno (; German: [ʔaˈdɔɐ̯no]; born...,Theodor Adorno,True,68289
9,True,False,Theodor W. Adorno (; German: [ʔaˈdɔɐ̯no]; born...,Theodor Adorno,True,68289


In [62]:
print(
    len(df),
    df.any_wiki_exists.sum(),
    df.wiki_exists.sum(),
    df.dbig.sum(),
    sep='\n',
)

23178
3093
1210
696


In [None]:
# For consistency with previously used column headings in wikipedia.ipynb
df = df.rename(columns = {'wiki_exists': 'wiki_bool'})

In [98]:
# Index for people who have a wiki article under their name, but the person described
# seemingly doesn't match their field
fpos = (df.any_wiki_exists) & (~df.dbig) & (~df.wiki_bool)
pd.set_option('max_colwidth', 800)
df[fpos].sample(30, random_state=1)

Unnamed: 0,any_wiki_exists,dbig,firstline,name,wiki_bool,wiki_length
9210,True,False,"David Aaron Kessler (born May 13, 1951) is an American pediatrician, lawyer, author, and administrator (both academic and governmental). He was the Commissione",David A. Kessler,False,6179
5583,True,False,"Jairo Velasco Jr. (born 21 January 1974) is a former Spanish professional tennis player. His father, Jairo Velasco Sr., was a Colombian tennis player, who immig",Jairo Velasco Jr.,False,1418
10808,True,False,Gordon Munro (17 December 1893 – 16 April 1951) was an Australian rules footballer who played with St Kilda in the Victorian Football League (VFL).,Gordon Munro,False,274
15856,True,False,Michael Jakobsen (born 2 January 1986) is a Danish professional footballer who plays as a defender for Adelaide United in the A-League. He was named 2002 Danish,Michael Jakobsen,False,5351
9641,True,False,"Zhang Jin (born 19 May 1974), also known as Max Zhang, is a Chinese actor and a former wushu athlete who won the Best Supporting Actor at the 33rd Hong Kong Fil",Jin Zhang,False,762
18835,True,False,Alok Kumar Shakya (Hindi: आलोक कुमार शाक्य) is an Indian politician and a member of the Sixteenth Legislative Assembly of Uttar Pradesh in India. He represents,Alok Kumar,False,1452
2741,True,False,"Dominic Ryan (born 28 March 1990) is an Irish former professional rugby union player who last played for Leicester Tigers. He was a back row player, playing at",Dominic Ryan,False,1160
21702,True,False,"There have been 43 executions in North Carolina, under the current statute, since it was adopted in 1977. All were for murder and were performed using lethal in",Kenneth Boyd,False,511
3628,True,False,"Pascal Simon (born September 27, 1956) is a retired French road racing cyclist. A native of Mesnil St. Loup, he was a professional cyclist from 1979 to 1991. Pa",Pascal Simon,False,864
1483,True,False,"Chris O'Dea is a documentary filmmaker with a focus on new media and global perspectives. He is a Master of Fine Arts graduate of the UCLA School of Theater, F",Chris ODea,False,892


In [73]:
df.to_csv('wiki_data_23K_v3.csv')

In [74]:
gscholar_data = pd.read_csv('gscholar/files/profData_withGenderCount_v3_physics_economics_philo.csv').iloc[:,3:]

In [75]:
gscholar_complete = gscholar_data.merge(df, on = 'name', how = 'left')

In [76]:
gscholar_complete.head()

Unnamed: 0,h-index,h5-index,i10-index,i10-5-index,institution,name,url,n citations,n5 citations,field,gender,gender_prob,count_name,any_wiki_exists,dbig,firstline,wiki_bool,wiki_length
0,117,83,309,245,University of Vienna,Georg Kresse,https://scholar.google.com//citations?user=Pn8ouvAAAAAJ&hl=en,206933,121867,1,male,1.0,64.0,False,False,,False,0.0
1,117,72,471,338,Rice University,Gustavo E. Scuseria,https://scholar.google.com//citations?user=6ZiRSwQAAAAJ&hl=en,216643,105708,1,male,1.0,1161.0,False,False,,False,0.0
2,198,115,1018,554,University of California Santa Barbara,Alan Heeger,https://scholar.google.com//citations?user=tvQY9iIAAAAJ&hl=en,202047,71412,1,male,1.0,2079.0,True,False,"Alan Jay Heeger (born January 22, 1936) is an American physicist, academic and Nobel Prize laureate in chemistry.",True,3013.0
3,176,140,1433,1052,"Royal Holloway, University of London",Glen Cowan,https://scholar.google.com//citations?user=ljQwt8QAAAAJ&hl=en,243320,134033,1,male,0.98,409.0,True,False,"Glen Cowan is a professor of Particle Physics at Royal Holloway, University of London. He has made a considerable contribution to the ATLAS experiment at the La",True,973.0
4,197,162,1551,1234,University of Kansas,Alice Bean,https://scholar.google.com//citations?user=gVCA0GoAAAAJ&hl=en,197821,128791,1,female,1.0,1414.0,False,False,,False,0.0


In [77]:
#gscholar_complete.to_csv('gscholar_complete_v2.csv')
gscholar_complete.to_csv('gscholar_complete_v3.csv')

In [79]:
dummy = gscholar_complete.copy()
dummy['wiki_bool'] = dummy['any_wiki_exists']
dummy.to_csv('gscholar_complete_v7.csv')

In [80]:
# Version where target variable (wiki_bool) is set to True if dbig=True. Upper bound on effect of dbig.
db_pass = gscholar_complete.copy()
db_pass['wiki_bool'] = db_pass['wiki_bool'] | db_pass['dbig']
db_pass.to_csv('gscholar_complete_v8.csv')

In [92]:
db_pass.head(1500).tail(20).head(3)

Unnamed: 0,h-index,h5-index,i10-index,i10-5-index,institution,name,url,n citations,n5 citations,field,gender,gender_prob,count_name,any_wiki_exists,dbig,firstline,wiki_bool,wiki_length
1480,55,40,154,95,Aarhus University,Philip Hofmann,https://scholar.google.com//citations?user=m4xCMsAAAAAJ&hl=en,10869,6549,1,male,1.0,1097.0,True,True,,True,0.0
1481,40,17,83,31,,Poul Olesen,https://scholar.google.com//citations?user=rda2ozIAAAAJ&hl=en,10867,1762,1,male,1.0,30.0,False,False,,False,0.0
1482,43,23,112,51,,Elizabeth Simmons,https://scholar.google.com//citations?user=qyvbiOQAAAAJ&hl=en,10864,2943,1,female,1.0,3508.0,False,False,,False,0.0


In [93]:
gscholar_complete.head(1500).tail(20).head(3)

Unnamed: 0,h-index,h5-index,i10-index,i10-5-index,institution,name,url,n citations,n5 citations,field,gender,gender_prob,count_name,any_wiki_exists,dbig,firstline,wiki_bool,wiki_length
1480,55,40,154,95,Aarhus University,Philip Hofmann,https://scholar.google.com//citations?user=m4xCMsAAAAAJ&hl=en,10869,6549,1,male,1.0,1097.0,True,True,,False,0.0
1481,40,17,83,31,,Poul Olesen,https://scholar.google.com//citations?user=rda2ozIAAAAJ&hl=en,10867,1762,1,male,1.0,30.0,False,False,,False,0.0
1482,43,23,112,51,,Elizabeth Simmons,https://scholar.google.com//citations?user=qyvbiOQAAAAJ&hl=en,10864,2943,1,female,1.0,3508.0,False,False,,False,0.0
