In [1]:
from requests import get
from bs4 import BeautifulSoup
from re import compile
from pandas import DataFrame
from gensim.utils import simple_preprocess
import wikipedia

In [2]:
# Set the base URL for the Wikipedia pages
base_url = 'https://en.wikipedia.org'
links_url = 'https://en.wikipedia.org/wiki/List_of_computer_scientists'

In [3]:
# make the request to the server
request = get(links_url)

# get the html content
response = BeautifulSoup(request.content, 'html')

In [4]:
content = response.find("div", {"class": "mw-parser-output"}).find_all("ul")

In [5]:
scientists = {}
for header in content:
    for scientist in header.find_all("li"):
        link = scientist.find("a", attrs={'href': compile("^/wiki/")})
        if link: scientists[link.text] = link.get("href")

In [6]:
# scientist_wiki_links = {}
# for s, l in zip(scientists[:-11], links[:-11]):
#     scientist_wiki_links[s] = base_url + l

In [14]:
scientists

{'Atta ur Rehman Khan': '/wiki/Atta_ur_Rehman_Khan',
 'Wil van der Aalst': '/wiki/Wil_van_der_Aalst',
 'Scott Aaronson': '/wiki/Scott_Aaronson',
 'Rediet Abebe': '/wiki/Rediet_Abebe',
 'Hal Abelson': '/wiki/Hal_Abelson',
 'Serge Abiteboul': '/wiki/Serge_Abiteboul',
 'Samson Abramsky': '/wiki/Samson_Abramsky',
 'Leonard Adleman': '/wiki/Leonard_Adleman',
 'Manindra Agrawal': '/wiki/Manindra_Agrawal',
 'Luis von Ahn': '/wiki/Luis_von_Ahn',
 'Alfred Aho': '/wiki/Alfred_Aho',
 'Frances E. Allen': '/wiki/Frances_E._Allen',
 'Gene Amdahl': '/wiki/Gene_Amdahl',
 'David P. Anderson': '/wiki/David_P._Anderson',
 'Lisa Anthony': '/wiki/Lisa_Anthony',
 'Andrew Appel': '/wiki/Andrew_Appel',
 'Cecilia R. Aragon': '/wiki/Cecilia_R._Aragon',
 'Bruce Arden': '/wiki/Bruce_Arden',
 'Sanjeev Arora': '/wiki/Sanjeev_Arora',
 'Winifred "Tim" Alice Asprey': '/wiki/Winifred_Asprey',
 'John Vincent Atanasoff': '/wiki/John_Vincent_Atanasoff',
 'Shakuntala Atre': '/wiki/Shakuntala_Atre',
 'Charles Babbage': '/wi

In [15]:
def search_wiki_awards(name):
    try:
        # Search for the Wikipedia page of the person
        page = wikipedia.page(name)
    except (wikipedia.exceptions.PageError, wikipedia.exceptions.DisambiguationError):
        try:
        # Search for the Wikipedia page of the person
            page = wikipedia.page('/wiki/'+name)
        except:
            return 0
        # Get the content of the page
    content = page.content
        
    try:
        # Find the section on education
        awards_text = content.split("== Awards ==")[1].split("==")[0]

        return len([s for s in awards_text.splitlines() if len(s) > 3])
    except IndexError:
        return 0

In [16]:
data = []
i = 1

for scientist, page_url in scientists.items():
    # if i == 10: break
    i += 1
    
    # helper variables
    education = ''
    awards = 0
    
    # make the request to the server
    request = get(base_url + page_url)

    # get the html content
    response = BeautifulSoup(request.content, 'html.parser')

    # main content
    content = response.find("div", {"class": "mw-parser-output"})

    # table info of scientist
    table = response.find("table", {"class": "infobox biography vcard"})
        
    # get headers and paragraphs
    header = content.find_all("h2")
    par = content.find_all("p")
    
    # if table exists
    if table:
        # for each row of the body table
        for row in table.find_all("tr"):
            # if there is a Awards header
            if row.find("th", {"class": "infobox-label"}, text=compile("Awards")):
                # get number  of awards
                awards = len(row.find_all("li"))
                
    if not awards:        
        awards = search_wiki_awards(scientist)
    
    ids = [
            "Education", "Education_and_early_life", "Education_and_career", "Early_life_and_education",
            "Early_life", "Life", "Life_and_career", "Life_and_work", "Biography", "Career"
          ]
    
    done = False
    # for each header and paragraph
    for h, p in zip(header, par):
        # for each possible span id
        for _id in ids:
            # if header contains one span with _id
            if h.find("span", {"id": _id}):
                education = p.text # get info
                done = True
                break
        if done: break
            
    # append final data
    data += [(scientist, awards, " ".join(simple_preprocess(education)))]

In [17]:
df = DataFrame(data, columns=['Name', 'Awards', 'Education'])

In [18]:
df

Unnamed: 0,Name,Awards,Education
0,Atta ur Rehman Khan,8,khan was bright sparks scholar and received hi...
1,Wil van der Aalst,0,willibrordus martinus pancratius van der aalst...
2,Scott Aaronson,4,scott joel aaronson born may is an american th...
3,Rediet Abebe,3,abebe research develops mathematical and compu...
4,Hal Abelson,1,he directed the first implementation of the la...
...,...,...,...
676,List of programmers,0,
677,List of programming language researchers,0,
678,List of Russian IT developers,0,
679,List of Slovenian computer scientists,0,


In [19]:
df = df[df['Education']  != '']

In [20]:
df

Unnamed: 0,Name,Awards,Education
0,Atta ur Rehman Khan,8,khan was bright sparks scholar and received hi...
1,Wil van der Aalst,0,willibrordus martinus pancratius van der aalst...
2,Scott Aaronson,4,scott joel aaronson born may is an american th...
3,Rediet Abebe,3,abebe research develops mathematical and compu...
4,Hal Abelson,1,he directed the first implementation of the la...
...,...,...,...
660,Edward Yourdon,0,yourdon obtained his in applied mathematics fr...
661,Moti Yung,6,yung earned his phd from columbia university i...
663,Hans Zantema,0,born in goingarijp the netherlands zantema rec...
665,Stanley Zdonik,0,stanley zdonik zəˈdɒnɪk zə don ik is computer ...


In [21]:
df.to_csv('List_of_computer_scientists.csv', index=False)