<a href="https://colab.research.google.com/github/cpatiffanynguyen/usda/blob/main/webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import concurrent.futures
from google.colab import drive
import re
import os

# Shared accumulator for scraped results. The state scraper functions append
# one tuple per extension center in the order
# (name, address, zip code, state, latitude, longitude); the script section
# later loads it into a DataFrame with the columns
# county / address / zipcode / state / latitude / longitude.
extension_centers = []

def fetch_and_parse_html(link, timeout=30):
    """Download ``link`` and parse the response body with ``html.parser``.

    Args:
        link: URL of the page to fetch.
        timeout: Seconds to wait for the server. ``requests.get`` has no
            default timeout, so without one a stalled server would block the
            calling (worker) thread indefinitely.

    Returns:
        A ``BeautifulSoup`` document when the server answers with HTTP 200;
        ``None`` on a request error or any other status code.
    """
    try:
        response = requests.get(link, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching {link}: {e}")
        return None

    if response.status_code != 200:
        # Report the failure rather than returning None without a trace.
        print(f"Error fetching {link}: HTTP status {response.status_code}")
        return None

    return BeautifulSoup(response.content, 'html.parser')

def get_lat_lng_add(address, timeout=10):
    """Geocode a free-form address with the Nominatim search endpoint.

    Args:
        address: Address text to look up.
        timeout: Seconds to wait for the geocoding service.

    Returns:
        ``(lat, lon)`` taken from the first search result, or ``None`` when
        the request fails, the status is not 2xx, or there are no results.
        Note that the failure value is a bare ``None`` (not a pair), so
        callers must check it before unpacking.
    """
    try:
        # Pass the query through ``params`` so characters such as '&', '#'
        # or '+' inside the address are URL-encoded instead of corrupting
        # the query string.
        r = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={'q': address, 'format': 'json'},
            timeout=timeout,
        )
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Accept the whole 2xx family (the previous range(200, 299) left out 299).
    if not 200 <= r.status_code < 300:
        print(f"Failed to get data: Status code {r.status_code}")
        return None

    try:
        results = r.json()
        if not results:
            print("No results found")
            return None
        return results[0]['lat'], results[0]['lon']
    except Exception as e:
        # Best-effort lookup: malformed JSON or an unexpected payload shape
        # is reported and treated as "no coordinates".
        print(f"An error occurred: {e}")
        return None

def get_lat_lng_from_zip(zip_code, timeout=10):
    """Look up coordinates for a zip code via the Nominatim search endpoint.

    Args:
        zip_code: Zip code (or other query text) to search for.
        timeout: Seconds to wait for the geocoding service.

    Returns:
        ``(latitude, longitude)`` from the first search result, or
        ``(None, None)`` when the request fails, the status is not 200,
        there are no results, or either coordinate is missing.
    """
    try:
        # ``params`` URL-encodes the query; a timeout keeps a stalled request
        # from blocking the caller forever.
        r = requests.get(
            "https://nominatim.openstreetmap.org/search",
            params={'q': zip_code, 'format': 'json'},
            timeout=timeout,
        )
        if r.status_code == 200:
            results = r.json()
            if results:
                latitude = results[0].get('lat')
                longitude = results[0].get('lon')
                if latitude and longitude:
                    return latitude, longitude
    except Exception as e:
        # Best-effort lookup: report the problem and fall through.
        print(f"An error occurred: {e}")
    return None, None  # Return None for both latitude and longitude if unavailable

def get_lat_lng(zip_or_address, timeout=10):
    """Try to fetch latitude and longitude based on either zip code or address.

    Args:
        zip_or_address: Query text sent to the Nominatim search endpoint.
        timeout: Seconds to wait for the geocoding service.

    Returns:
        ``(lat, lon)`` from the first search result, or ``(None, None)`` when
        the query is empty, the request fails, the status is not 200, or no
        result is returned.

    NOTE(review): the request is sent with the ``requests`` default headers;
    confirm this complies with the Nominatim usage policy for this project.
    """
    if not zip_or_address:
        # Nothing to search for; skip the network round trip.
        return None, None

    base_url = "https://nominatim.openstreetmap.org/search"
    params = {
        'q': zip_or_address,
        'format': 'json',
    }
    try:
        # A timeout keeps a stalled request from blocking the caller forever.
        response = requests.get(base_url, params=params, timeout=timeout)
        if response.status_code == 200:
            results = response.json()
            if results:
                return results[0]['lat'], results[0]['lon']
    except Exception as e:
        # Best-effort lookup: report the problem and fall through.
        print(f"Error fetching coordinates for {zip_or_address}: {e}")
    return None, None

def fetch_coordinates_for_center(address, zip_code):
    """Geocode a center, preferring its zip code and falling back to its address.

    Args:
        address: Address text used for the fallback lookup.
        zip_code: Zip code tried first.

    Returns:
        The coordinate pair from the zip-code lookup when both values are
        truthy; otherwise whatever ``get_lat_lng(address)`` returns.
    """
    zip_lat, zip_lng = get_lat_lng(zip_code)
    if not (zip_lat and zip_lng):
        # Zip lookup gave nothing usable, so retry with the full address.
        return get_lat_lng(address)
    return zip_lat, zip_lng



def Alabama(link):
    """Scrape Alabama county office pages and record each geocoded office.

    Every ``btn-county`` anchor on the directory page is followed. For each
    county page the text after the ``Address:`` label is collected, cut at
    the first five-digit number (taken as the zip code) and geocoded. Offices
    that geocode successfully are appended to the module-level
    ``extension_centers`` list as
    ``(name, address, zip, 'Alabama', lat, lng)``; the rest are reported with
    a printed message and skipped.

    Args:
        link: URL of the Alabama directory page; county hrefs are appended
            to it to build the county page URLs.

    Returns:
        None. Results are delivered through ``extension_centers``.
    """
    soup = fetch_and_parse_html(link)
    if not soup:
        print("Alabama web page error")
        return

    county_links = [
        link + anchor['href']
        for anchor in soup.find_all('a', class_='btn btn-default btn-county')
    ]
    for county_link in county_links:
        county_soup = fetch_and_parse_html(county_link)
        if not county_soup:
            continue

        # The county name is the last path segment of the canonical URL. Fall
        # back to the county link itself when the page has no canonical
        # <link>, instead of crashing on ``None['href']``.
        canonical_link = county_soup.find('link', rel='canonical')
        if canonical_link is not None and canonical_link.get('href'):
            name_source = canonical_link['href']
        else:
            name_source = county_link
        name = name_source.rstrip('/').split('/')[-1].capitalize()

        address = ''
        zip_code = ''
        address_tag = county_soup.find('strong', string='Address:')
        if address_tag:
            # Gather up to three text fragments following the label.
            address_lines = []
            sibling = address_tag.next_sibling
            while sibling is not None and len(address_lines) < 3:
                if sibling.string:
                    address_lines.append(sibling.string.strip())
                sibling = sibling.next_sibling
            match = re.search(r'(.+?\b\d{5}\b)', ' '.join(address_lines))
            if match:
                address = match.group(1).strip()
                # The regex guarantees the address ends with five digits.
                zip_code = address[-5:]
            else:
                address = "No valid address found"

        if not zip_code:
            # Without a real zip code there is nothing meaningful to geocode
            # (slicing the placeholder text would send "found" to the
            # geocoder).
            print(f"Failed to get lat/lng for address: {address}")
            continue

        lat, lng = get_lat_lng_from_zip(zip_code)
        if lat and lng:
            extension_centers.append((name, address, zip_code, 'Alabama', lat, lng))
        else:
            print(f"Failed to get lat/lng for address: {address}")


def Alaska(link):
    """Scrape Alaska district office pages and record one address per office.

    District links (hrefs containing ``ces/districts``) are collected from
    the index page; the first two and last two matches are skipped
    (presumably non-office links -- confirm against the live page). On each
    district page the first Google Maps anchor that yields address text is
    used, together with any directly following maps anchors.

    Each office is appended to the module-level ``extension_centers`` list
    as ``(name, address, zip, 'Alaska', lat, lng)``. When no zip code can be
    found the zip field holds ``"Zip code not found"`` and both coordinates
    are ``None``.

    Args:
        link: URL of the Alaska districts index page.

    Returns:
        The shared ``extension_centers`` list itself (not a copy and not only
        the Alaska rows), so callers must not extend that list with the
        returned value.
    """
    soup = fetch_and_parse_html(link)
    if soup:
        zip_code_pattern = re.compile(r'\b\d{5}(?:-\d{4})?\b')
        maps_prefixes = ("https://goo.gl/maps", "https://www.google.com/maps")

        location_links = soup.find_all('a', href=lambda href: href and "ces/districts" in href)

        # Skip the first two and the last two matches.
        for location_link in location_links[2:-2]:
            location_name = location_link.text.strip()
            location_url = link[:-1] + location_link['href'][14:]

            location_page_soup = fetch_and_parse_html(location_url)
            if not location_page_soup:
                continue

            address_tags = location_page_soup.find_all(
                'a',
                href=lambda href: bool(href) and ("goo.gl/maps" in href or "google.com/maps" in href)
            )

            for address_tag in address_tags:
                full_address = ''
                current_element = address_tag

                # Walk forward over consecutive <a>/<br> siblings, collecting
                # the text of every maps link.
                while current_element is not None:
                    # ``.get`` avoids a KeyError on an <a> without an href.
                    if current_element.name == 'a' and current_element.get('href', '').startswith(maps_prefixes):
                        full_address += current_element.get_text(strip=True) + ' '
                    current_element = current_element.find_next_sibling()
                    if current_element and current_element.name not in ['a', 'br']:
                        break

                if full_address:
                    full_address = full_address.strip()
                    zip_code_match = zip_code_pattern.search(full_address)
                    if zip_code_match:
                        zip_code = zip_code_match.group()
                        # Geocode once and use both values; previously the
                        # longitude lookup was handed the function object
                        # instead of the zip code.
                        lat, lng = get_lat_lng_from_zip(zip_code)
                    else:
                        zip_code = "Zip code not found"
                        # Do not geocode the placeholder text.
                        lat, lng = None, None
                    extension_centers.append((location_name, full_address, zip_code, 'Alaska', lat, lng))
                    break

    return extension_centers


def Connecticut(link):
    """Scrape Connecticut extension location pages and record each office.

    Location links are anchors whose href contains the locations URL; the
    first and last matches are skipped. On each location page the
    left-aligned paragraphs are concatenated until one containing a zip code
    is found.

    Each office is appended to the module-level ``extension_centers`` list
    as ``(name, address, zip, 'Connecticut', lat, lng)``. Placeholder text is
    stored when the page, the address or the zip code is unavailable, and
    the coordinates are ``None`` when there is no zip code to geocode.

    Args:
        link: URL of the Connecticut locations index page.

    Returns:
        None. Results are delivered through ``extension_centers``.
    """
    soup = fetch_and_parse_html(link)
    if not soup:
        return

    zip_code_pattern = re.compile(r'\b\d{5}(?:-\d{4})?\b')
    location_links = soup.find_all('a', href=lambda href: href and "https://cahnr.uconn.edu/extension/locations/" in href)

    # Skip the first and the last match.
    for location_link in location_links[1:-1]:
        location_name = location_link.text.strip()
        location_url = location_link['href']

        # Reset per location so a failed page load can neither reuse the
        # previous location's zip code nor hit an unbound variable.
        address = "Address not found"
        zip_code = None

        location_page_soup = fetch_and_parse_html(location_url)
        if location_page_soup:
            collected = []
            for part in location_page_soup.find_all('p', style="text-align: left;"):
                text = part.get_text(strip=True)
                collected.append(text)
                zip_code_match = zip_code_pattern.search(text)
                if zip_code_match:
                    zip_code = zip_code_match.group()
                    break  # The zip code ends the address.
            address = ' '.join(collected).strip() or "No address found"

        if zip_code:
            # One lookup supplies both coordinates.
            lat, lng = get_lat_lng_from_zip(zip_code)
        else:
            zip_code = "No zip code found"
            # Do not geocode the placeholder text.
            lat, lng = None, None

        extension_centers.append((location_name, address, zip_code, 'Connecticut', lat, lng))

def Arizona(link):
    """Placeholder for the Arizona scraper; does nothing and returns ``None``."""
    return None

def Arkansas(link):
    """Collect and print the Arkansas county links found on the index page.

    Looks at the first element of each of the classes ``col1`` .. ``col4``
    and, for every anchor inside, pairs the anchor text with ``link`` plus
    the href with its first ten characters removed. The resulting list is
    only printed; nothing is stored or returned.

    Args:
        link: URL of the Arkansas counties index page.
    """
    soup = fetch_and_parse_html(link)
    if not soup:
        return

    county_links = []
    for column_class in ("col1", "col2", "col3", "col4"):
        column = soup.find(class_=column_class)
        if not column:
            continue
        for anchor in column.find_all('a'):
            county_links.append((anchor.get_text(), link + anchor['href'][10:]))
    print(county_links)

def California(link):
    """Placeholder for the California scraper; does nothing and returns ``None``."""
    return None

def Colorado(link):
    """Placeholder for the Colorado scraper; does nothing and returns ``None``."""
    return None


# Dispatch table: maps a state's office-directory URL to the scraper function
# that handles it. Lookups use the URL exactly as written here, so a link must
# match a key character for character to be processed.
state_functions = {
    'https://ssl.acesag.auburn.edu/directory-new/': Alabama,
    'https://www.uaf.edu/ces/districts/': Alaska,
    'https://cahnr.uconn.edu/extension/locations/': Connecticut,
    # Arkansas is left out: its scraper currently only prints the county links.
    # 'https://www.uaex.uada.edu/counties/' : Arkansas
}

def process_link(link):
    """Run the scraper registered for ``link``, if there is one.

    Args:
        link: A directory URL, possibly missing (NaN/None).

    Returns:
        An empty list when ``link`` is missing or has no entry in
        ``state_functions``; otherwise whatever the registered scraper
        returns (which may be ``None``).
    """
    if pd.isna(link) or link not in state_functions:
        return []
    scraper = state_functions[link]
    return scraper(link)

# Script entry point: read the directory spreadsheet, run the registered state
# scrapers in a thread pool, fill in missing coordinates, de-duplicate, and
# write the result to a CSV file on the mounted Google Drive.
if __name__ == '__main__':
    csv_url = 'https://docs.google.com/spreadsheets/d/18p6w3btY2km9nEKq0Z8W0BAbHh8L5Z3k/export?format=csv&gid=926683666'

    try:
        df = pd.read_csv(csv_url)
        extension_links = df['Local Office Directory']

        # Materialize the results while the pool is still open; an exception
        # raised inside a scraper surfaces here and is reported below.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = list(executor.map(process_link, extension_links))

        for result in results:
            # Scrapers append to the shared list themselves. One of them also
            # returns that same list object; extending the list with itself
            # would duplicate every row collected so far, so skip it.
            if result is None or result is extension_centers:
                continue
            extension_centers.extend(result)

        print("Number of centers:", len(extension_centers))

        if extension_centers:
            df_extension_centers = pd.DataFrame(extension_centers, columns=['county', 'address', 'zipcode', 'state', 'latitude', 'longitude'])

            # Update latitude and longitude where missing
            missing_coords = df_extension_centers['latitude'].isnull() | df_extension_centers['longitude'].isnull()
            for idx, row in df_extension_centers[missing_coords].iterrows():
                lat, lng = fetch_coordinates_for_center(row['address'], row['zipcode'])
                df_extension_centers.at[idx, 'latitude'] = lat
                df_extension_centers.at[idx, 'longitude'] = lng

            df_extension_centers = df_extension_centers.drop_duplicates(subset=['address', 'zipcode', 'state'])
            df_extension_centers = df_extension_centers.sort_values(by='state', ascending=True)

            drive.mount('/content/drive', force_remount=True)
            path = '/content/drive/My Drive/testing.csv'

            df_extension_centers.to_csv(path, index=False)
            print(f"File written successfully to {path}")
        else:
            print("No extension centers to write.")

    except Exception as e:
        # Top-level boundary: report any failure instead of a raw traceback.
        print("An error occurred:", e)



Number of centers: 200
Mounted at /content/drive
File written successfully to /content/drive/My Drive/testing.csv
