<a href="https://colab.research.google.com/github/emiliawisnios/Social-and-Public-Policy-python/blob/main/Notebooks/Social_and_Public_Policy_Coding_Python_21_28_11_24.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Collection from the Web

Today we will cover web scraping and API usage.

Next time we will focus on OCR.

# First, we need to install required libraries. Run this cell first:


In [1]:
!pip install requests beautifulsoup4 pandas



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## HTML Basics


In [3]:
# Minimal HTML document used below to demonstrate parsing: one <h1> heading
# and a "representative" <div> holding a name (<h2>) plus two paragraphs
# classed "party" and "state".
simple_html = """
<html>
    <body>
        <h1>Congressional Representatives</h1>
        <div class="representative">
            <h2>Jane Smith</h2>
            <p class="party">Democratic Party</p>
            <p class="state">California</p>
        </div>
    </body>
</html>
"""

In [4]:
# Build a parse tree from the sample markup, then show the heading text.
# `soup.h1` is BeautifulSoup shorthand for the first <h1> tag in the document.
soup = BeautifulSoup(simple_html, 'html.parser')
print("Finding the title:")
print(soup.h1.get_text())

Finding the title:
Congressional Representatives


## Basic Web Scraping Exercise

In [5]:
# Let's try scraping a real (but simple) website
def get_webpage_content(url, timeout=10):
    """
    Safely fetch webpage content.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text, or None if the request failed. Failures
        (connection errors, timeouts, HTTP error statuses) are printed
        rather than raised.
    """
    # Add headers to mimic a browser request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'}
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that accepts the connection but never answers.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx statuses into exceptions
        return response.text
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None


## Exercise 1: Scrape Wikipedia's List of Current U.S. Senators

In [26]:
def clean_text(element):
    """Return the visible text of a parsed element, or "" when it is None.

    Spans that carry no visible content are removed from the element (in
    place) before its text is extracted with surrounding whitespace stripped.
    """
    if element is None:
        return ""

    # Applied in this order: hidden spans first, then ForceAgeToShow spans.
    span_filters = (
        {'style': 'display:none'},
        {'class_': 'noprint ForceAgeToShow'},
    )
    for criteria in span_filters:
        for span in element.find_all('span', **criteria):
            span.decompose()

    return element.get_text(strip=True)

def get_cell_value(cells, index, default=""):
    """Return the cleaned text of cells[index].

    Falls back to `default` when the index is out of range or the cell at
    that position is falsy.
    """
    try:
        cell = cells[index]
        if not cell:
            return default
        return clean_text(cell)
    except IndexError:
        return default

# Column order of the resulting DataFrame; also guarantees the columns exist
# when no rows could be parsed.
_SENATOR_COLUMNS = (
    'State', 'Name', 'Party', 'Born', 'Occupation',
    'Previous_Office', 'Education', 'Assumed_Office',
)


def _build_senator_record(row, cells, state):
    """Build one senator's record from a table row, or None if it has none.

    Args:
        row: The <tr> element being processed.
        cells: All <td>/<th> cells found in `row`.
        state: State name that applies to this row.

    Returns:
        A dict keyed by `_SENATOR_COLUMNS`, or None when the row has no
        linked name in a <th> cell or lacks a name/party value.
    """
    # The senator's name is the first link inside the row's header cell.
    name_cell = row.find('th')
    name_link = name_cell.find('a') if name_cell is not None else None
    if name_link is None:
        return None
    name = clean_text(name_link)

    # The party cell sits two cells after the header cell holding the name.
    party_index = next(
        (
            i + 2
            for i, cell in enumerate(cells)
            if cell.name == 'th' and name_link in cell.find_all('a')
        ),
        None,
    )
    if party_index is None:
        return None

    record = {'State': state, 'Name': name}
    # The remaining columns follow the party cell consecutively.
    for offset, column in enumerate(_SENATOR_COLUMNS[2:]):
        record[column] = get_cell_value(cells, party_index + offset)

    # Validate the data before handing it back
    if record['Name'] and record['Party']:
        return record
    return None


def scrape_senators_example():
    """Scrape Wikipedia's list of current U.S. senators into a DataFrame.

    Returns:
        A pandas DataFrame with the columns in `_SENATOR_COLUMNS` (empty if
        the senators table is missing or no row could be parsed), or None
        when the page itself could not be downloaded.
    """
    url = "https://en.wikipedia.org/wiki/List_of_current_United_States_senators"
    content = get_webpage_content(url)
    if not content:
        return None

    soup = BeautifulSoup(content, 'html.parser')
    senators_data = []

    # Find the table with id "senators"
    table = soup.find('table', {'id': 'senators'})
    if table is not None:
        # html.parser does not synthesize <tbody>; fall back to the table
        # itself when the markup has none.
        body = table.find('tbody')
        if body is None:
            body = table
        rows = body.find_all('tr')[1:]  # Skip header row
        current_state = None

        for row in rows:
            cells = row.find_all(['td', 'th'])
            if not cells:  # Skip empty rows
                continue

            # Best effort: one malformed row must not abort the whole scrape.
            try:
                # The state cell spans several rows (rowspan), so remember it
                # for the following rows that do not repeat it.
                first_cell = cells[0]
                if first_cell.name == 'td' and first_cell.has_attr('rowspan'):
                    current_state = clean_text(first_cell)
                state = current_state if current_state else clean_text(first_cell)

                record = _build_senator_record(row, cells, state)
                if record is not None:
                    senators_data.append(record)
            except Exception as e:
                print(f"Error processing row: {e}")
                continue

    # Passing the columns explicitly keeps 'Party' etc. present even when
    # nothing was scraped, so the clean-up below cannot raise KeyError.
    df = pd.DataFrame(senators_data, columns=list(_SENATOR_COLUMNS))

    if not df.empty:
        # Clean up party column - remove footnote markers such as "[a]".
        # Non-greedy per-bracket match so text between two footnotes survives.
        df['Party'] = df['Party'].str.replace(r'\[[^\]]*\]', '', regex=True)

    # Remove any rows without essential information
    df = df.dropna(subset=['Name', 'Party'])

    return df

# Run the scraper and report either a short preview or the reason it failed.
print("Fetching current senators...")
try:
    senators_df = scrape_senators_example()
    if senators_df is None or senators_df.empty:
        print("No data was retrieved.")
    else:
        print("\nFirst few senators:")
        print(senators_df[['State', 'Name', 'Party']].head())
        print(f"\nTotal senators found: {len(senators_df)}")
except Exception as exc:
    print(f"An error occurred: {exc}")

Fetching current senators...

First few senators:
     State              Name        Party
0  Alabama  Tommy Tuberville   Republican
1  Alabama       Katie Britt   Republican
2   Alaska    Lisa Murkowski   Republican
3   Alaska      Dan Sullivan   Republican
4  Arizona    Kyrsten Sinema  Independent

Total senators found: 100


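### EXERCISE 1:
Modify the above code to show additional information about senators.
The `senator_info` dictionary already stores `Born` and `Assumed_Office`, but only State, Name and Party are printed.
Extend the printed output to include these fields:
- Assumed office date
- Born (age)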

In [None]:
# Your code here

## Understanding Web Scraping Ethics

IMPORTANT ETHICAL CONSIDERATIONS:

1. Always check robots.txt first
   - Visit website.com/robots.txt before scraping
   - Example: https://en.wikipedia.org/robots.txt

2. Be gentle with websites:
   - Add delays between requests
   - Don't overwhelm servers
   
3. Check Terms of Service
   - Some websites prohibit scraping
   - Others have specific APIs you should use instead

4. Best Practices:
   - Identify your scraper (use proper User-Agent)
   - Cache data when possible
   - Don't distribute copyrighted content

In [27]:
import time

def polite_scraper(urls, delay=1):
    """
    A polite scraper that waits between requests.

    Args:
        urls: Iterable of page addresses to fetch, in order.
        delay: Seconds to pause between two consecutive requests.

    Returns:
        List with the content of every page that was fetched successfully;
        pages that failed are skipped.
    """
    results = []
    for position, url in enumerate(urls):
        # Be polite: pause before every request except the first, so no time
        # is wasted sleeping after the final URL.
        if position:
            time.sleep(delay)

        content = get_webpage_content(url)
        if content:
            results.append(content)
    return results

## APIs

APIs (Application Programming Interfaces) are easier and more reliable
than web scraping. Many political data sources have APIs:

- Congress.gov API
- OpenSecrets API
- Federal Election Commission (FEC) API
- Data.gov APIs

In [28]:
import requests
import json

def get_api_data(url, params=None, timeout=10):
    """
    Generic function to get data from an API.

    Args:
        url: Endpoint address.
        params: Optional dict of query-string parameters.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body, or None if the request failed or the body
        was not valid JSON. Errors are printed rather than raised.
    """
    try:
        # Without an explicit timeout, requests waits indefinitely on a
        # server that never answers.
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        return response.json()
    except requests.RequestException as e:
        print(f"Error accessing API: {e}")
    except ValueError as e:
        # Older requests versions raise a plain ValueError for non-JSON bodies.
        print(f"Error decoding API response: {e}")
    return None

# Example: Using the Open States API (note: you'll need an API key)
def get_state_bills_example(state, api_key):
    """
    Query the Open States v3 bills endpoint for one jurisdiction.

    `state` is sent as the "jurisdiction" query parameter and `api_key` as
    "apikey"; the result is whatever `get_api_data` returns.
    """
    endpoint = "https://v3.openstates.org/bills"
    return get_api_data(endpoint, {"jurisdiction": state, "apikey": api_key})

### EXERCISE 2:
Register for an API key at data.gov and try to:
1. Get data about government agencies
2. Convert the response into a pandas DataFrame
3. Save the results to a CSV file

Template code below:

In [29]:
def get_government_data(api_key):
    """Template for Exercise 2: request data from a data.gov API.

    Args:
        api_key: data.gov API key, sent as the "api_key" query parameter.

    Returns:
        Whatever `get_api_data` returns for the request.
    """
    # Students will need to fill this in
    endpoint = "https://api.data.gov/..."

    # Add any other parameters to this dictionary
    query = {"api_key": api_key}

    # Get the data
    payload = get_api_data(endpoint, query)

    # Convert to DataFrame
    # df = pd.DataFrame(...)

    # Save to CSV
    # df.to_csv(...)

    return payload

## Handling API Rate Limits

In [None]:
def rate_limited_api_call(url, calls_per_second=1, timeout=10):
    """
    Make API calls while respecting rate limits.

    The call blocks until at least 1/calls_per_second seconds have passed
    since it started, so calling it in a loop stays under the given rate.

    Args:
        url: Endpoint address.
        calls_per_second: Maximum request rate; must be positive.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The `requests` response object.

    Raises:
        ValueError: If calls_per_second is zero or negative.
    """
    # Validate before sending anything; previously a zero rate only failed
    # (with ZeroDivisionError) after the request had already gone out.
    if calls_per_second <= 0:
        raise ValueError("calls_per_second must be a positive number")
    min_interval = 1 / calls_per_second

    # The monotonic clock is unaffected by system clock adjustments.
    start_time = time.monotonic()
    response = requests.get(url, timeout=timeout)

    # Wait if we're making calls too quickly
    remaining = min_interval - (time.monotonic() - start_time)
    if remaining > 0:
        time.sleep(remaining)

    return response

## Homework

Scrape some content from a website of your choice.