In [9]:
import requests
import bs4
from re import compile
from pandas import DataFrame, read_excel

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_computer_scientists'

In [3]:
# make the request to the server
request = requests.get(url)

# get the html content
soup = bs4.BeautifulSoup(request.content, 'html')

In [4]:
content = soup.find("div", {"class": "mw-parser-output"}).find_all("ul")
content[1:3]

[<ul><li><a href="/wiki/Atta_ur_Rehman_Khan" title="Atta ur Rehman Khan">Atta ur Rehman Khan</a> – <a class="mw-redirect" href="/wiki/Mobile_Cloud_Computing" title="Mobile Cloud Computing">Mobile Cloud Computing</a>, <a class="mw-redirect" href="/wiki/Cybersecurity" title="Cybersecurity">Cybersecurity</a>, <a class="mw-redirect" href="/wiki/IoT" title="IoT">IoT</a></li>
 <li><a href="/wiki/Wil_van_der_Aalst" title="Wil van der Aalst">Wil van der Aalst</a> – <a href="/wiki/Business_process_management" title="Business process management">business process management</a>, <a href="/wiki/Process_mining" title="Process mining">process mining</a>, <a class="mw-redirect" href="/wiki/Petri_nets" title="Petri nets">Petri nets</a></li>
 <li><a href="/wiki/Scott_Aaronson" title="Scott Aaronson">Scott Aaronson</a> – <a href="/wiki/Quantum_computing" title="Quantum computing">quantum computing</a> and <a href="/wiki/Computational_complexity_theory" title="Computational complexity theory">complexity 

In [5]:
scientists = []
links = []
for header in content:
    for scientist in header.find_all("li"):
        link = scientist.find("a", attrs={'href': compile("^/wiki/")})
        if link: 
            scientists += [link.text]
            links += [link.get("href")]

In [6]:
scientist_wiki_links = {}
for s, l in zip(scientists[:-11], links[:-11]):
    scientist_wiki_links[s] = 'https://en.wikipedia.org/' + l

In [7]:
scientist_wiki_links

{'Atta ur Rehman Khan': 'https://en.wikipedia.org//wiki/Atta_ur_Rehman_Khan',
 'Wil van der Aalst': 'https://en.wikipedia.org//wiki/Wil_van_der_Aalst',
 'Scott Aaronson': 'https://en.wikipedia.org//wiki/Scott_Aaronson',
 'Rediet Abebe': 'https://en.wikipedia.org//wiki/Rediet_Abebe',
 'Hal Abelson': 'https://en.wikipedia.org//wiki/Hal_Abelson',
 'Serge Abiteboul': 'https://en.wikipedia.org//wiki/Serge_Abiteboul',
 'Samson Abramsky': 'https://en.wikipedia.org//wiki/Samson_Abramsky',
 'Leonard Adleman': 'https://en.wikipedia.org//wiki/Leonard_Adleman',
 'Manindra Agrawal': 'https://en.wikipedia.org//wiki/Manindra_Agrawal',
 'Luis von Ahn': 'https://en.wikipedia.org//wiki/Luis_von_Ahn',
 'Alfred Aho': 'https://en.wikipedia.org//wiki/Alfred_Aho',
 'Frances E. Allen': 'https://en.wikipedia.org//wiki/Frances_E._Allen',
 'Gene Amdahl': 'https://en.wikipedia.org//wiki/Gene_Amdahl',
 'David P. Anderson': 'https://en.wikipedia.org//wiki/David_P._Anderson',
 'Lisa Anthony': 'https://en.wikipedia.o

In [97]:
data = []
i = 1
for scientist, link in scientist_wiki_links.items():

    i += 1
    # helper variables
    education = ''
    awards = 0
    
    # make the request to the server
    request = requests.get(link)

    # get the html content
    soup = bs4.BeautifulSoup(request.content, 'html')
    
    # content
    content = soup.find("div", {"class": "mw-parser-output"})
    
    # table info of scientist
    table = soup.find("table", {"class": "infobox biography vcard"})

    # get headers and paragraphs
    header = content.find_all("h2")
    par = content.find_all("p")
    
    # if table exists
    if table:
        # for each row of the body table
        for row in table.find_all("tr"):
            # if there is a Awards header
            if row.find("th", {"class": "infobox-label"}, text=compile("Awards")):
                # get number  of awards
                awards = len(row.find_all("li"))

    # for each header and paragraph
    for h, p in zip(header, par):
        # if header contains one of the above title
        if h.find("span", {"id": "Education"}):
            # get info
            education = p.text
        elif h.find("span", {"id": "Early_life_and_education"}):
            # get info
            education = p.text
        elif h.find("span", {"id": "Early_life"}):
            education = p.text
        elif h.find("span", {"id": "Biography"}):
            education = p.text
    
    # append final data
    data += [(scientist, awards, education)]

16
41
44
50
52
53
60
67
71
74
89
92
96
98
100
103
105
108
114
131
134
140
146
147
149
168
170
177
182
188
207
212
216
220
225
230
232
240
241
251
257
258
264
272
277
304
309
312
327
328
338
340
342
352
364
371
400
412
418
420
422
427
439
441
452
453
463
464
476
479
489
501
502
506
509
517
523
526
533
544
547
549
554
558
561
574
582
592
594
595
597
599
612
616
622
634
650
665


In [98]:
df = DataFrame(data, columns=['Name', 'Awards', 'Education'])

In [109]:
df.to_excel('List_of_computer_scientists.xlsx', index=False)

In [102]:
df.to_csv('List_of_computer_scientists.csv', index=False, header=True, line_terminator='\n')

In [108]:
df[df['Education'] == '']

Unnamed: 0,Name,Awards,Education
8,Manindra Agrawal,0,
9,Luis von Ahn,3,
10,Alfred Aho,7,
12,Gene Amdahl,0,
17,Bruce Arden,0,
...,...,...,...
662,Lotfi Zadeh,0,
664,Arif Zaman,0,
666,Hussein Zedan,0,
667,Shlomo Zilberstein,0,
