In [1]:
#-pip install requests beautifulsoup4 pandas spacy
#python -m spacy download en_core_web_sm


In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import spacy

# Load the spacy model for text vectorization
nlp = spacy.load('en_core_web_sm')

# The main list of computer scientists' Wikipedia URL
main_page_url = 'https://en.wikipedia.org/wiki/List_of_computer_scientists'

# Function to get the list of scientists' Wikipedia page URLs
def get_scientists_urls(main_page_url):
    response = requests.get(main_page_url)
    soup = BeautifulSoup(response.text, 'html.parser')
    list_items = soup.select('.mw-parser-output > ul > li')
    base_url = 'https://en.wikipedia.org'
    scientist_urls = [base_url + item.find('a')['href'] for item in list_items if item.find('a')]
    return scientist_urls

# Function to extract data from individual Wikipedia page
def extract_data_from_page(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Assume the name is the title
    title = soup.find('h1', {'id': 'firstHeading'}).text
    name = title

  # Find the 'Awards' row in the infobox (WORKS)
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox:
        awards_row = infobox.find('th', string='Awards')
        if awards_row:
            awards_list = awards_row.find_next_sibling('td')
            if awards_list:
                awards_count = len(awards_list.find_all('li'))
            else:
                awards_count = 0
        else:
            awards_count = 0
    else:
        awards_count = 0

    
    # Vectorization of the education text
    # Assume the education information might be in a paragraph that mentions 'education'
    education_vector = None
    for paragraph in soup.find_all('p'):
        if 'education' in paragraph.text.lower():
            education_vector = nlp(paragraph.text).vector
            break

    # If education text is not found, use a default zero vector
    if education_vector is None:
        education_vector = [0] * len(nlp(' ').vector)

    return {
        'URL': url,
        'Name': name,
        '#Awards': awards_count,
        'Education_Vector': education_vector.tolist()  # Convert numpy array to list for easier handling
    }

# Get the list of individual Wikipedia URLs for the scientists
scientists_urls = get_scientists_urls(main_page_url)

# List to hold the data
data = []

# Iterate over the URLs and extract data
for url in scientists_urls[:10]:  # Limiting to first 10 for demonstration
    try:
        scientist_data = extract_data_from_page(url)
        data.append(scientist_data)
        print(f"Data extracted for {scientist_data['Name']}")
    except Exception as e:
        print(f"Failed to extract data for URL {url}: {e}")

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df.head())

# Optionally, save the DataFrame to a CSV file
df.to_csv('computer_scientists_data.csv', index=False)


Failed to extract data for URL https://en.wikipedia.org/wiki/Atta_ur_Rehman_Khan: 'list' object has no attribute 'tolist'
Failed to extract data for URL https://en.wikipedia.org/wiki/Wil_van_der_Aalst: 'list' object has no attribute 'tolist'
Data extracted for Scott Aaronson
Failed to extract data for URL https://en.wikipedia.org/wiki/Rediet_Abebe: 'list' object has no attribute 'tolist'
Data extracted for Hal Abelson
Failed to extract data for URL https://en.wikipedia.org/wiki/Serge_Abiteboul: 'list' object has no attribute 'tolist'
Failed to extract data for URL https://en.wikipedia.org/wiki/Samson_Abramsky: 'list' object has no attribute 'tolist'
Failed to extract data for URL https://en.wikipedia.org/wiki/Leonard_Adleman: 'list' object has no attribute 'tolist'
Failed to extract data for URL https://en.wikipedia.org/wiki/Manindra_Agrawal: 'list' object has no attribute 'tolist'
Data extracted for Luis von Ahn
                                            URL            Name  #Awards 

In [3]:
# Get the list of individual Wikipedia URLs for the scientists
scientists_urls = get_scientists_urls(main_page_url)

# For demonstration purposes, let's print out the first few URLs
scientists_urls[:]  # Print the first 10 URLs

['https://en.wikipedia.org/wiki/Atta_ur_Rehman_Khan',
 'https://en.wikipedia.org/wiki/Wil_van_der_Aalst',
 'https://en.wikipedia.org/wiki/Scott_Aaronson',
 'https://en.wikipedia.org/wiki/Rediet_Abebe',
 'https://en.wikipedia.org/wiki/Hal_Abelson',
 'https://en.wikipedia.org/wiki/Serge_Abiteboul',
 'https://en.wikipedia.org/wiki/Samson_Abramsky',
 'https://en.wikipedia.org/wiki/Leonard_Adleman',
 'https://en.wikipedia.org/wiki/Manindra_Agrawal',
 'https://en.wikipedia.org/wiki/Luis_von_Ahn',
 'https://en.wikipedia.org/wiki/Alfred_Aho',
 'https://en.wikipedia.org/wiki/Frances_E._Allen',
 'https://en.wikipedia.org/wiki/Gene_Amdahl',
 'https://en.wikipedia.org/wiki/David_P._Anderson',
 'https://en.wikipedia.org/wiki/Lisa_Anthony',
 'https://en.wikipedia.org/wiki/Andrew_Appel',
 'https://en.wikipedia.org/wiki/Cecilia_R._Aragon',
 'https://en.wikipedia.org/wiki/Bruce_Arden',
 'https://en.wikipedia.org/wiki/Angie_Jones',
 'https://en.wikipedia.org/wiki/Sanjeev_Arora',
 'https://en.wikipedia.o