In [1]:
#-pip install requests beautifulsoup4 pandas spacy
#python -m spacy download en_core_web_sm


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy

# Load the spacy model for text vectorization
nlp = spacy.load('en_core_web_sm')

# The main list of computer scientists' Wikipedia URL
main_page_url = 'https://en.wikipedia.org/wiki/List_of_computer_scientists'

# Function to get the list of scientists' Wikipedia page URLs
def get_scientists_urls(main_page_url):
    response = requests.get(main_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    list_items = soup.select('.mw-parser-output > ul > li')
    base_url = 'https://en.wikipedia.org'
    scientist_urls = [base_url + item.find('a')['href'] for item in list_items if item.find('a')]
    return scientist_urls

# Function to extract data from individual Wikipedia page
def extract_data_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the name considering the presence of parenthesis
    title = soup.find('h1', {'id': 'firstHeading'}).text
    # Check if the title contains parentheses and extract accordingly
    if '(' in title:
        name = title.split('(')[0].strip().split(' ')[-1]  # Get last word before parenthesis
    else:
        name = title.split(' ')[-1]  # Get last word of the title if no parenthesis

  # Find the 'Awards' row in the infobox (WORKS)
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox:
        awards_row = infobox.find('th', string='Awards')
        if awards_row:
            awards_list = awards_row.find_next_sibling('td')
            if awards_list:
                awards_count = len(awards_list.find_all('li'))
            else:
                awards_count = 0
        else:
            awards_count = 0
    else:
        awards_count = 0

    
    # Find the 'Alma mater' row in the infobox
    infobox = soup.find('table', {'class': 'infobox'})
    education_vector = []  # Initialize an empty list to store 'Alma mater' names
    if infobox:
        # Find 'th' elements with 'infobox-label', then iterate to match 'Alma mater' with non-breaking spaces
        for th in infobox.find_all('th', {'class': 'infobox-label'}):
            # Use .get_text() and replace to handle non-breaking spaces and compare
            if 'Alma mater' in th.get_text().replace(u'\xa0', u' '):
                # If found, get the next sibling 'td' element containing the universities
                alma_mater_data = th.find_next_sibling('td')
                if alma_mater_data:
                    # Get all anchor tags within the 'Alma mater' data cell
                    alma_mater_links = alma_mater_data.find_all('a')
                    # Extract the text from each anchor tag and add it to the education_vector list
                    education_vector = [link.get_text() for link in alma_mater_links if link.get_text().strip()]
                break  # Stop the loop after finding the 'Alma mater' row
    if not education_vector:
          for th in infobox.find_all('th', {'class': 'infobox-label'}):
            # Use .get_text() and replace to handle non-breaking spaces and compare
            if 'Education' in th.get_text().replace(u'\xa0', u' '):
                # If found, get the next sibling 'td' element containing the universities
                alma_mater_data = th.find_next_sibling('td')
                if alma_mater_data:
                    # Get all anchor tags within the 'Alma mater' data cell
                    alma_mater_links = alma_mater_data.find_all('a')
                    # Extract the text from each anchor tag and add it to the education_vector list
                    education_vector = [link.get_text() for link in alma_mater_links if link.get_text().strip()]
                break  # Stop the loop after finding the 'Alma mater' row
    if not education_vector:
        education_section = soup.find('span', {'class': 'mw-headline', 'id': 'Education'})
        if education_section:
            # Get the container of the education section which is usually a preceding sibling of h2 containing the 'Education' span
            edu_container = education_section.find_parent('h2').find_next_sibling(lambda tag: tag.name in ["ul", "p", "div"])
            if edu_container:
                university_links = edu_container.find_all('a', string=lambda text: 'University' in text)
                if university_links:
                    # Add the text of the first valid 'University' link
                    education_vector.append(university_links[0].get_text())
    
    if not education_vector:
        education_section = soup.find('span', {'class': 'mw-headline', 'id': 'Biography'})
        if education_section:
            # Get the container of the education section which is usually a preceding sibling of h2 containing the 'Education' span
            edu_container = education_section.find_parent('h2').find_next_sibling(lambda tag: tag.name in ["ul", "p", "div"])
            if edu_container:
                university_links = edu_container.find_all('a', string=lambda text: 'University' in text)
                if university_links:
                    # Add the text of the first valid 'University' link
                    education_vector.append(university_links[0].get_text())
    

    
    
    return {
        'Surname': name,
        '#Awards': awards_count,
        'Education': education_vector  # Convert numpy array to list for easier handling
    }

# Get the list of individual Wikipedia URLs for the scientists
scientists_urls = get_scientists_urls(main_page_url)

# List to hold the data
data = []

# Iterate over the URLs and extract data
for url in scientists_urls[:]:  # Limiting to first 10 for demonstration
    try:
        scientist_data = extract_data_from_page(url)
        data.append(scientist_data)
        print(f"Data extracted for {scientist_data['Surname']}")
    except Exception as e:
        print(f"Failed to extract data for URL {url}: {e}")

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Optionally, save the DataFrame to a CSV file
df.to_csv('computer_scientists_data.csv', index=False)


Data extracted for Khan
Data extracted for Aalst
Data extracted for Aaronson
Data extracted for Abebe
Data extracted for Abelson
Data extracted for Abiteboul
Data extracted for Abramsky
Data extracted for Adleman
Data extracted for Agrawal
Data extracted for Ahn
Data extracted for Aho
Data extracted for Allen
Data extracted for Amdahl
Data extracted for Anderson
Failed to extract data for URL https://en.wikipedia.org/wiki/Lisa_Anthony: 'NoneType' object has no attribute 'find_all'
Data extracted for Appel
Data extracted for Aragon
Data extracted for Arden
Data extracted for Jones
Data extracted for Arora
Data extracted for Asprey
Data extracted for Atanasoff
Data extracted for Atre
Data extracted for Babbage
Data extracted for Bachman
Data extracted for Backhouse
Data extracted for Backus
Data extracted for Bacon
Data extracted for Bader
Data extracted for Bahl
Data extracted for Barr
Data extracted for Bartik
Data extracted for Barto
Data extracted for Bauer
Data extracted for Bayer
D

In [7]:
df.head(30)

Unnamed: 0,Surname,#Awards,Education
0,Khan,0,"[University of Malaya, COMSATS University]"
1,Aalst,0,[]
2,Aaronson,4,"[Cornell University, University of California,..."
3,Abebe,3,"[Cornell University, University of Cambridge, ..."
4,Abelson,1,"[Princeton University, Massachusetts Institute..."
5,Abiteboul,4,[University of Southern California]
6,Abramsky,4,"[King's College, Cambridge, Queen Mary Univers..."
7,Adleman,0,"[University of California, Berkeley]"
8,Agrawal,0,[Indian Institute of Technology Kanpur]
9,Ahn,3,"[Duke University, BS, Carnegie Mellon University]"


In [5]:
# Get the list of individual Wikipedia URLs for the scientists
scientists_urls = get_scientists_urls(main_page_url)

# For demonstration print out the first few URLs
scientists_urls[:10]  

['https://en.wikipedia.org/wiki/Atta_ur_Rehman_Khan',
 'https://en.wikipedia.org/wiki/Wil_van_der_Aalst',
 'https://en.wikipedia.org/wiki/Scott_Aaronson',
 'https://en.wikipedia.org/wiki/Rediet_Abebe',
 'https://en.wikipedia.org/wiki/Hal_Abelson',
 'https://en.wikipedia.org/wiki/Serge_Abiteboul',
 'https://en.wikipedia.org/wiki/Samson_Abramsky',
 'https://en.wikipedia.org/wiki/Leonard_Adleman',
 'https://en.wikipedia.org/wiki/Manindra_Agrawal',
 'https://en.wikipedia.org/wiki/Luis_von_Ahn']