## MVP 3 - The Simpsons Guest Characters Map
This MVP scrapes the Wikipedia list pages of *The Simpsons* guest stars, looks up each guest star's birthplace on Wikipedia, and then uses a geocoding API to find the latitude and longitude of each birthplace. Finally, the number of guest voices from each place is plotted on a map, using donut icons sized by that count.

In [None]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import csv
from collections import Counter
import time
import pandas as pd
import folium
from sklearn.cluster import DBSCAN
import numpy as np
from geopy.distance import geodesic
import math

### Get list of actors' Wikipedia pages

In [None]:
# Collect the Wikipedia page title of every guest star listed on the two
# "List of The Simpsons guest stars" pages.
endpoint = "https://en.wikipedia.org/w/api.php"
pages = ["List_of_The_Simpsons_guest_stars_(seasons_1–20)", "List_of_The_Simpsons_guest_stars_(seasons_21–present)"]

# Links to existing articles in the rendered HTML start with this prefix;
# red links and external links do not, and must not be sliced as if they did.
wiki_prefix = "/wiki/"

# A set keeps each actor once even when they appear in several rows/pages
actor_pages = set()
for page in pages:
    # Ask the parse API for the rendered HTML of the list page
    params = {
        "action": "parse",
        "page": page,
        "format": "json",
        "prop": "text",
    }

    # A timeout stops a stalled connection from hanging the notebook
    response = requests.get(endpoint, params=params, timeout=30)
    if response.status_code != 200:
        # Report the failure instead of silently producing a shorter list
        print(f"Request for {page} failed with status {response.status_code}")
        continue

    data = response.json()
    # Get main html content
    html_content = data['parse']['text']['*']
    soup = BeautifulSoup(html_content, 'html.parser')

    # Only the first table with this exact class string is scanned
    table = soup.find("table", {"class": "sortable wikitable plainrowheaders"})
    if not table:
        continue

    for row in table.find_all("tr"):
        # The `scope="row"` header cell of each row holds the link that is read
        header = row.find("th", {"scope": "row"})
        if not header:
            continue
        link_tag = header.find("a", href=True)
        if not link_tag:
            continue
        href = link_tag['href']
        if not href.startswith(wiki_prefix):
            # Not an article link (e.g. a red link), so there is no page to query
            continue
        # Strip the prefix and undo percent-encoding to get the page title
        actor_pages.add(urllib.parse.unquote(href[len(wiki_prefix):]))

# Convert set to list for unique actor names
actors = list(actor_pages)
print(len(actors))
for link in actors:
    print(link)

### Get Birthplaces

In [None]:
%%time

# Look up each actor's Wikipedia page and collect one birthplace string per
# actor where one can be found.
birth_places = []
birth_places_cleaned = []

# Define the Wikipedia API endpoint
endpoint = "https://en.wikipedia.org/w/api.php"
# Links to existing articles in the rendered HTML start with this prefix
wiki_prefix = "/wiki/"

for actor in actors:
    # Ask the parse API for the rendered HTML, following page redirects
    params = {
        "action": "parse",
        "page": f"{actor}",
        "format": "json",
        "prop": "text",
        "redirects": 1,
    }

    # Only the request and the JSON lookup are guarded, so unrelated bugs
    # (and Ctrl-C) are no longer reported as a bad URL.
    try:
        # A timeout stops a stalled connection from hanging the notebook
        response = requests.get(endpoint, params=params, timeout=30)
        if response.status_code != 200:
            continue
        html_content = response.json()['parse']['text']['*']
    except (requests.RequestException, ValueError, KeyError):
        print(f"Invalid URL for {actor}")
        continue

    soup = BeautifulSoup(html_content, 'html.parser')

    # First choice: the dedicated birthplace container, taken as plain text
    birthplace = soup.find('div', {'class': 'birthplace'})
    if birthplace:
        birth_places.append(birthplace.get_text(strip=True))
        continue

    # Otherwise fall back to the link in the infobox "Born" row
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        print(f"No infobox found for {actor}")
        continue

    # Some pages label the row "Born", others "Born:" with the colon
    born_row = infobox.find('th', string="Born") or infobox.find('th', string="Born:")
    if not born_row:
        continue

    # The cell following the "Born" header holds the birth details
    birth_info_cell = born_row.find_next('td')
    if birth_info_cell is None:
        continue

    # Take the first link that points at an article; footnote anchors and
    # red links are skipped because they do not name a place page.
    for anchor in birth_info_cell.find_all('a', href=True):
        href = anchor['href']
        if href.startswith(wiki_prefix):
            # Decode the title the same way actor titles are decoded, and
            # turn the underscores of the URL form back into spaces.
            title = urllib.parse.unquote(href[len(wiki_prefix):])
            birth_places.append(title.replace("_", " "))
            break

print(birth_places)

### Clean up birth places

In [None]:
# Strip every period from each birthplace (periods were found to break the
# geocoding API), keeping one entry per actor so counts stay intact.
cleaned_places = [raw_place.replace(".", "") for raw_place in birth_places]

# De-duplicated place names
unique_places = list(set(cleaned_places))

### Convert Birthplaces and Count to CSV

In [None]:
# Count how many actors share each cleaned birthplace
city_counts = Counter(cleaned_places)

# (city_name, count) pairs, one per CSV row
city_data = list(city_counts.items())
# Choose csv name
csv_filename = "birthplaces.csv"

# Write as UTF-8 explicitly: the file is read back as UTF-8, and the platform
# default encoding may not be able to represent non-ASCII place names.
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    # Header row
    writer.writerow(["City", "Count"])
    # Write each city and count in separate rows
    writer.writerows(city_data)

print(f"Places and count have been saved to {csv_filename}")

### Read Data from CSV (if needed)

In [None]:
# Rebuild the flat list of birthplaces (one entry per actor) from the
# "City,Count" rows saved in the CSV.
csv_filename = "birthplaces.csv"
cities_list = []

with open(csv_filename, mode='r', newline='', encoding='utf-8') as csv_file:
    rows = csv.reader(csv_file)
    # The first row is the header, not data
    next(rows)
    for record in rows:
        # Column 0 is the city, column 1 its count; repeat the city that often
        cities_list += [record[0]] * int(record[1])

### Get Lat and Long from Geocoding API for each city and add to dictionary 

In [None]:
# Count how many actors share each cleaned birthplace
place_counts = Counter(cleaned_places)

# Maps place name -> {"count", "lat", "lon"} for every place that geocodes
place_data = {}

geocode_url = 'https://geocode.maps.co/search'

# Loop through the places and ask the geocoding API for latitude and longitude.
# NOTE(review): `key` (the API key) is not defined in the cells shown here;
# it must already exist in the notebook session.
for place in unique_places:
    # API allows for one request a second
    time.sleep(1)
    try:
        # Pass the query through `params` so requests URL-encodes it; a raw
        # f-string breaks on characters such as "&" or "#". Unquote first so
        # a name that is already percent-encoded is not encoded twice.
        response = requests.get(
            geocode_url,
            params={"q": urllib.parse.unquote(place), "api_key": key},
            timeout=30,
        )
        if response.status_code != 200:
            print("Bad request.")
            continue
        # Use the first result returned for the query
        location_data = response.json()[0]
        # Add to dictionary with count, lat, and lon
        place_data[place] = {
            "count": place_counts[place],
            "lat": location_data['lat'],
            "lon": location_data['lon'],
        }
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, non-JSON body, empty result list or missing fields
        print(f"No location found for {place}")

### Write Place, Count, Latitude, and Longitude to CSV

In [None]:
# Save every geocoded place with its count and coordinates, one row each
with open('place_data.csv', mode='w', newline='', encoding='utf-8') as out_file:
    place_writer = csv.writer(out_file)
    # Header first, then one row per place
    place_writer.writerow(['Place', 'Count', 'Latitude', 'Longitude'])
    place_writer.writerows(
        [name, info['count'], info['lat'], info['lon']]
        for name, info in place_data.items()
    )

print("CSV file created.")

### Create map of places

In [None]:
# Read csv with place data
df = pd.read_csv('place_data.csv')

# Coordinates as [latitude, longitude] pairs for DBSCAN
coords = df[['Latitude', 'Longitude']].values

# Apply DBSCAN clustering to group cities that are near each other, which
# prevents too many small points. The coordinates are converted to radians
# for the haversine metric, so eps is an angular distance too (0.009 rad is
# roughly 57 km on Earth's surface). min_samples=1 lets a lone city form its
# own cluster.
db = DBSCAN(eps=0.009, min_samples=1, metric='haversine').fit(np.radians(coords))

# Add the cluster labels to the dataframe
df['cluster'] = db.labels_

# Row with the highest count in each cluster; its name labels the cluster
most_common = df.loc[df.groupby('cluster')['Count'].idxmax()]

# Per cluster: total count and the mean position of its cities
cluster_counts = df.groupby('cluster').agg({'Count': 'sum', 'Latitude': 'mean', 'Longitude': 'mean'}).reset_index()

# Look the label up by cluster id. Assigning a cluster-indexed Series directly
# would align on the row index, which is only right while cluster ids happen
# to equal row positions.
place_by_cluster = most_common.set_index('cluster')['Place']
cluster_counts['Place'] = cluster_counts['cluster'].map(place_by_cluster)

# Create a base Folium map
simpsons_map = folium.Map(location=[39.8283, -98.5795], tiles='Esri.WorldGrayCanvas', zoom_start=4)

# Scaling function to prevent weird sizing
def scale_size(count):
    """Return the icon size to use for a marker representing `count` voices.

    A count of exactly 1 gives 10, any other count up to 2 gives
    ``count * 7.5``, and larger counts grow logarithmically as
    ``10 + ln(count) * 14``.
    """
    if count == 1:
        return 10
    # Linear up to 2, logarithmic beyond so large counts do not dominate
    return count * 7.5 if count <= 2 else 10 + math.log(count) * 14

# Place one donut marker per cluster on the map
for _, cluster in cluster_counts.iterrows():
    # Icon edge length from scale_size, which varies with the cluster's count
    donut_px = scale_size(cluster['Count'])
    marker = folium.Marker(
        location=[cluster['Latitude'], cluster['Longitude']],
        # Square custom icon; presumably donut.png is a local image file
        icon=folium.CustomIcon(
            icon_image='donut.png',
            icon_size=(donut_px, donut_px),
        ),
        popup=f"{cluster['Count']} voice(s) from {cluster['Place']}",
    )
    marker.add_to(simpsons_map)


# Write the map to disk, then show it as the cell's output
simpsons_map.save('simpsons_map.html')
simpsons_map