#### Exercises: Day 22

In [2]:
import requests
from bs4 import BeautifulSoup
import json

# Page to scrape: the Boston University facts & stats page.
url = 'http://www.bu.edu/president/boston-university-facts-stats/'

def scrape_bu_website(url, timeout=10):
    """Download a page and collect its title plus every fact/stat pair on it.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        A dict of the form
        ``{'title': ..., 'facts_stats': [{'title': ..., 'value': ...}, ...]}``,
        or ``None`` when the HTTP request fails. ``'title'`` is ``None`` when
        the page has no ``<title>`` element.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data from the website: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # soup.title is None for a page without a <title>; avoid AttributeError.
    page_title = soup.title.string if soup.title is not None else None

    data = {
        'title': page_title,
        'facts_stats': []
    }

    # NOTE(review): the 'fact-stat*' class names are assumed to match the
    # page markup -- confirm against the live page if the list comes back empty.
    for fact_stat in soup.find_all('div', class_='fact-stat'):
        title_tag = fact_stat.find('div', class_='fact-stat-title')
        value_tag = fact_stat.find('div', class_='fact-stat-value')

        # Skip entries missing either half instead of crashing on None.text.
        if title_tag is None or value_tag is None:
            continue

        data['facts_stats'].append({
            'title': title_tag.text.strip(),
            'value': value_tag.text.strip()
        })

    return data

def save_data_as_json(data, json_filename):
    """Write ``data`` to ``json_filename`` as indented, UTF-8 encoded JSON.

    A confirmation line is printed on success. Any exception raised while
    opening or writing the file is caught and reported with a printed message
    instead of propagating to the caller.
    """
    try:
        out_file = open(json_filename, 'w', encoding='utf-8')
        with out_file:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            json.dump(data, out_file, indent=4, ensure_ascii=False)
        print(f"Data saved successfully to {json_filename}")
    except Exception as err:
        print(f"Error saving data to JSON file: {err}")

# Example usage: scrape the page, then write the result to disk.
website_data = scrape_bu_website(url)

# scrape_bu_website returns None when the request fails, so only save on success.
if website_data:
    save_data_as_json(website_data, 'bu_facts_stats.json')


Data saved successfully to bu_facts_stats.json


In [3]:
# Extract the table at this URL (https://archive.ics.uci.edu/ml/datasets.php) and save it as a JSON file

import pandas as pd

# Page whose HTML table is extracted below (rebinds the `url` set in the earlier BU cell).
url = 'https://archive.ics.uci.edu/ml/datasets.php'

def extract_table_to_json(url, json_filename, table_index=0):
    """Read one HTML table from ``url`` and save it as a JSON array of records.

    Args:
        url: Location passed straight to ``pandas.read_html``.
        json_filename: Path of the JSON file to write.
        table_index: Position of the table to export among those found on the
            page. Defaults to 0 (the first table), which is the original
            behaviour.

    Returns:
        None. Progress and failures are reported with ``print``; exceptions
        (including a missing HTML parser backend) are caught and printed
        rather than raised.
    """
    try:
        # Use pandas to read every HTML table on the page
        tables = pd.read_html(url)

        if not tables:
            print("No tables found on the page.")
            return

        # Report a bad index clearly, and before any file is created.
        try:
            df = tables[table_index]
        except IndexError:
            print(
                f"Table index {table_index} is out of range: "
                f"the page has {len(tables)} table(s)."
            )
            return

        # One JSON object per table row
        json_data = df.to_json(orient='records')

        with open(json_filename, 'w', encoding='utf-8') as json_file:
            json_file.write(json_data)

        print(f"Table data saved successfully to {json_filename}")

    except Exception as e:
        print(f"Error extracting table data: {e}")

# Usage: write the first table found at `url` to uci_datasets.json.
json_filename = 'uci_datasets.json'
extract_table_to_json(url, json_filename)


Error extracting table data: lxml not found, please install it


In [None]:
# Scrape the presidents table and store the data as JSON

import requests
import json

# English Wikipedia API endpoint; the page to fetch is selected through `params` below.
api_url = 'https://en.wikipedia.org/w/api.php'

# Query parameters for the API request: ask for the wikitext of one section
# of the presidents list page, returned as JSON.
params = {
    'action': 'parse',
    'page': 'List_of_presidents_of_the_United_States',
    'section': 1,  # NOTE(review): assumes the presidents table is in section 1 -- confirm against the current page layout
    'prop': 'wikitext',
    'format': 'json'
}

def get_presidents_data(api_url, params, timeout=10):
    """Fetch wikitext from a MediaWiki ``action=parse`` request.

    Args:
        api_url: API endpoint to query.
        params: Query-string parameters for the request.
        timeout: Seconds to wait for the server before giving up. Passed to
            ``requests.get`` so a stalled connection cannot hang the notebook.

    Returns:
        The wikitext string found at ``data['parse']['wikitext']['*']`` in the
        JSON response, or ``None`` when the request fails, the body is not
        valid JSON, or the response does not have that shape.
    """
    try:
        response = requests.get(api_url, params=params, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers requests versions whose .json() raises a plain
        # decoding error that is not a RequestException.
        print(f"Error fetching data from Wikipedia API: {e}")
        return None

    try:
        return data['parse']['wikitext']['*']
    except (KeyError, TypeError):
        # A successful HTTP response can still carry an API-level error
        # payload with no 'parse' key; report it instead of raising KeyError.
        print(f"Unexpected response from Wikipedia API: {repr(data)[:200]}")
        return None

def parse_wikitext(wikitext):
    """Extract name / start year / end year records from table wikitext.

    This is a deliberately simplified parser. Every line that contains ``|``
    and no ``colspan=`` is split on ``|``, and the name, start year and end
    year are read from positions 1, 3 and 5. That matches single-line rows
    written as ``| name || start || end``. Lines that split into fewer than
    six pieces (table markup such as ``{|``, ``|-`` and ``|}``) are skipped
    rather than raising ``IndexError``. Adjust the positions if the table
    layout differs.

    Args:
        wikitext: Raw wikitext as a single string.

    Returns:
        A list of dicts with keys ``'name'``, ``'start_year'`` and
        ``'end_year'``. ``'end_year'`` is ``None`` when the cell reads
        ``Incumbent``.
    """
    # Highest index read below is 5, so a usable row needs at least 6 pieces.
    min_cells = 6
    presidents_data = []

    for line in wikitext.split('\n'):
        if '|' not in line or 'colspan=' in line:
            continue

        cells = line.split('|')
        if len(cells) < min_cells:
            # Not a data row (e.g. '{| class=...', '|-', '|}').
            continue

        end_cell = cells[5].strip()
        presidents_data.append({
            'name': cells[1].strip(),
            'start_year': cells[3].strip(),
            'end_year': end_cell if end_cell != 'Incumbent' else None
        })

    return presidents_data

def save_data_as_json(data, json_filename):
    """Dump ``data`` into ``json_filename`` as pretty-printed UTF-8 JSON.

    Prints a confirmation when the file is written. Any exception raised
    while opening or writing is caught and reported with a printed message,
    so the function never raises. This repeats the helper defined in the
    earlier BU cell so that this cell can run on its own.
    """
    try:
        with open(json_filename, mode='w', encoding='utf-8') as sink:
            # Four-space indent; non-ASCII characters are written unescaped.
            json.dump(data, sink, indent=4, ensure_ascii=False)
        print(f"Data saved successfully to {json_filename}")
    except Exception as exc:
        print(f"Error saving data to JSON file: {exc}")

# Usage: fetch the section's wikitext, parse it, and save the rows as JSON.
wikitext_data = get_presidents_data(api_url, params)

# get_presidents_data returns None when the request fails, so only parse and save on success.
if wikitext_data:
    parsed_data = parse_wikitext(wikitext_data)
    save_data_as_json(parsed_data, 'us_presidents.json')
