In [1]:
# Day 22 of 30 Days of Python Exercise
# Scrape the following website and store the data as a JSON file (url = 'http://www.bu.edu/president/boston-university-facts-stats/').

import requests
from bs4 import BeautifulSoup
import json

# Scrape the Boston University facts page into a nested dict
# ({category: {label: value}}) and write it to facts_data.json.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

# Get data from the URL. Without a timeout, requests waits indefinitely on a
# stalled connection and the cell never finishes.
response = requests.get(url, timeout=30)

# Get all contents of the URL
content = response.content

# Create BeautifulSoup object
soup = BeautifulSoup(content, 'html.parser')

# find() returns None when no matching <section> exists
facts_section = soup.find('section', {'class': 'facts-categories'})

if response.status_code != 200:
    # Report HTTP failures explicitly instead of mislabelling an error page
    # as "section not found".
    print("Error: Unable to fetch the webpage. Status code:", response.status_code)
elif facts_section is not None:
    data = {}

    # Each 'facts-wrapper' element holds one category: an <h5> heading plus
    # a list of label/value items.
    for wrapper in facts_section.find_all(class_='facts-wrapper'):
        heading = wrapper.find('h5')
        if heading is None:
            # A wrapper without a heading has no category name to key on.
            continue
        category = heading.text.strip()
        values = {}

        # Find all list items within the current wrapper
        for item in wrapper.find_all('li', class_='list-item'):
            label_tag = item.find('p', class_='text')
            value_tag = item.find('span', class_='value')
            if label_tag is None or value_tag is None:
                # Skip incomplete items rather than crashing on None.text.
                continue
            values[label_tag.text.strip()] = value_tag.text.strip()

        data[category] = values

    # Store the data as a JSON file. ensure_ascii=False keeps non-ASCII
    # characters readable in the UTF-8 file instead of \uXXXX escapes.
    output_file = 'facts_data.json'
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    print(f'Data has been scraped and stored in {output_file}')
else:
    print('Unable to find the facts-stats section on the webpage.')


Data has been scraped and stored in facts_data.json


In [5]:
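# Extract the table at this URL (https://archive.ics.uci.edu/ml/datasets.php) and convert it to a JSON file.
# NOTE: the code below fetches https://archive.ics.uci.edu/dataset/2/adult rather than the URL named above.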

from bs4 import BeautifulSoup
import json

# Scrape the first <table> on the UCI "Adult" dataset page into a list of
# {header: cell} dicts and write it to adult_dataset.json.
url = "https://archive.ics.uci.edu/dataset/2/adult"

# The timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Checking (status code 200 means successful)
if response.status_code == 200:
    # Parsing the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Finding the table on the page; find() returns None when there is none
    table = soup.find('table')

    if table is None:
        # Without this guard the next step would raise AttributeError on None.
        print("Error: No table found on the webpage.")
    else:
        # Extraction of data from the table
        data = []
        headers = [header.text.strip() for header in table.find_all('th')]
        rows = table.find_all('tr')[1:]  # Skip the header row

        for row in rows:
            row_data = [cell.text.strip() for cell in row.find_all('td')]
            # zip() stops at the shorter sequence, so ragged rows are truncated
            data.append(dict(zip(headers, row_data)))

        # Storing the data as a JSON file
        output_file = 'adult_dataset.json'
        with open(output_file, 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=4)

        print(f"Data has been scraped and stored as {output_file}.")
else:
    print("Error: Unable to fetch the webpage. Status code:", response.status_code)

Data has been scraped and stored as adult_dataset.json.


In [9]:
# Scrape the presidents table and store the data as JSON (https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States). The table is not very structured and the scraping may take a very long time.

import requests
from bs4 import BeautifulSoup
import json

# Scrape the first 'wikitable' on the Wikipedia list of U.S. presidents into a
# list of per-row dicts and write it to presidents_data.json.
url = 'https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States'

# Get data from the URL. The timeout prevents the cell from hanging forever
# on a stalled connection.
response = requests.get(url, timeout=30)

# Create BeautifulSoup object
soup = BeautifulSoup(response.content, 'html.parser')

# Find the presidents table; find() returns None when no such table exists
presidents_table = soup.find('table', {'class': 'wikitable'})

if response.status_code != 200:
    # Report HTTP failures explicitly instead of mislabelling an error page
    # as "table not found".
    print("Error: Unable to fetch the webpage. Status code:", response.status_code)
elif presidents_table is not None:
    data = []

    # Extract rows from the table
    rows = presidents_table.find_all('tr')

    # Extract data from each row
    for row in rows[1:]:  # Start from the second row to skip the header
        columns = row.find_all(['th', 'td'])

        # Only rows with exactly 8 cells are mapped; any other row is skipped.
        if len(columns) != 8:
            continue

        # Look the image up once; .get() yields None if 'src' is missing.
        portrait_img = columns[1].find('img')

        row_data = {
            "Number": columns[0].text.strip(),
            "Portrait": portrait_img.get('src') if portrait_img is not None else None,
            "Name": columns[2].text.strip(),
            "Term": columns[3].text.strip(),
            # columns[4] is intentionally not exported
            "Party": columns[5].text.strip(),
            "Election": columns[6].text.strip(),
            "Vice President": columns[7].text.strip(),
        }

        data.append(row_data)

    # Store the data as a JSON file. ensure_ascii=False keeps any non-ASCII
    # characters in the scraped text readable in the UTF-8 file instead of
    # \uXXXX escapes.
    output_file = 'presidents_data.json'
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)
    print(f'Data has been scraped and stored in {output_file}')
else:
    print('Unable to find the presidents table on the webpage.')


Data has been scraped and stored in presidents_data.json
