In [2]:
import plotly.graph_objects as go
import plotly.io as pio

# Define the updated HorizonAnalytics template
HorizonAnalytics = go.layout.Template(
    layout=go.Layout(
        paper_bgcolor='#0d1b2a',  # Background color
        plot_bgcolor='#0d1b2a',  # Background color
        height=800,
        width=800 * 1.618,
        margin=dict(l=120, r=40, t=100, b=80),  # Increased left margin to avoid cutoff
        xaxis=dict(
            anchor='y',
            showgrid=True,
            gridcolor='rgba(255, 255, 255, 0.2)',  # Softer grid lines for contrast
            tickfont=dict(
                size=36,  # Consistent with other elements
                family='Montserrat, sans-serif',
                color='#ffffff',
                weight="bold"
            ),
            title=dict(
                text='',
                font=dict(
                    size=48,  # Increase to match other elements
                    family='Montserrat, sans-serif',
                    color='#ffffff',
                    weight="bold"
                )
            ),
            linecolor='#ffffff',  # White axis lines for contrast
            linewidth=2
        ),
        yaxis=dict(
            anchor='x',
            showgrid=True,
            gridcolor='rgba(255, 255, 255, 0.2)',  # Softer grid lines
            tickfont=dict(
                size=36,  # Consistent with x-axis
                family='Montserrat, sans-serif',
                color='#ffffff',
                weight="bold"
            ),
            title=dict(
                text='',
                font=dict(
                    size=48,  # Increase to match x-axis
                    family='Montserrat, sans-serif',
                    color='#ffffff',
                    weight="bold"
                )
            ),
            linecolor='#ffffff',  # White axis lines
            linewidth=2,
            automargin=True  # Automatically adjust margin if needed
        ),
        font=dict(
            color='#ffffff',  # White font for all text
            size=36,  # Uniform font size
            family='Montserrat, sans-serif',
            weight="bold"
        ),
        # Refined colorway for better visibility and differentiation
        colorway=["#33D7FF", "#A463FF", "#FFD700", "#ff4081",
                  "#ffc107", "#00c4a0", "#a0aec0"],
        title=dict(
            text='',
            font=dict(
                size=48,  # Big Boost in Title Size
                color='#ffffff',
                family='Montserrat, sans-serif',
                weight="bold"
            ),
            x=0.5,  # Center title
            y=0.97  # Push title higher
        )
    ),
    data=dict(
        scatter=[
            go.Scatter(
                line=dict(width=8),  # **Increased line width for better visibility**
                marker=dict(size=10)  # **Ensure markers are visible**
            )
        ]
    )
)

# Register the updated HorizonAnalytics template
pio.templates['HorizonAnalytics'] = HorizonAnalytics
pio.templates.default = 'HorizonAnalytics'


## President

In [20]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

# Get today's date
today_date = datetime.today().strftime("%Y-%m-%d")

# Wikipedia URL
url = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"

# Send request and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the table containing presidents
table = soup.find("table", {"class": "wikitable"})

# Function to clean names (remove birth/death years and footnotes)
def clean_name(name):
    return name.split("(")[0].strip()  # Remove birth/death info

# Function to clean and convert date strings into YYYY-MM-DD format
def convert_date(date_str):
    if not date_str:
        return None
    date_str = date_str.split("[")[0].strip()  # Remove footnotes
    try:
        formatted_date = datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")
        return formatted_date
    except ValueError:
        print(f"⚠️ Failed to parse date: '{date_str}'")  # Debugging output for failed conversions
        return None  # Return None if parsing fails

# Extract data
data = []
for index, row in enumerate(table.find_all("tr")[1:]):  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 3:
        continue  # Skip rows without enough columns

    name = clean_name(cols[1].text.strip())

    # Extract raw term text
    term_text = cols[2].text.strip()

    # Split at en dash (–) to separate start and end dates
    term_dates = term_text.split("–")
    start_date_text = term_dates[0].strip()
    end_date_text = term_dates[1].strip() if len(term_dates) > 1 else None

    # print(f"🔍 Extracted raw start date text for {name}: '{start_date_text}'")  # Debugging output
    # print(f"🔍 Extracted raw end date text for {name}: '{end_date_text}'")  # Debugging output

    start_date = convert_date(start_date_text)
    
    # Set today's date if "Incumbent", otherwise parse normally
    if end_date_text and "Incumbent" in end_date_text:
        end_date = today_date
    else:
        end_date = convert_date(end_date_text)

    # print(f"✅ Processed start date for {name}: {start_date}")  # Debugging output
    # print(f"✅ Processed end date for {name}: {end_date}")  # Debugging output

    data.append({
        "name": name,
        "position": "President",
        "region": "Federal",
        "start_date": start_date,
        "end_date": end_date
    })

# Store data in a DataFrame
d_service_presidents = pd.DataFrame(data)

from IPython.display import display, HTML

# Display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))
    
# Set max rows to prevent truncation
pd.set_option("display.max_rows", None)

# Show the table
display_scrollable_dataframe(d_service_presidents.head(20))


Unnamed: 0,name,position,region,start_date,end_date
0,George Washington,President,Federal,1789-04-30,1797-03-04
1,John Adams,President,Federal,1797-03-04,1801-03-04
2,Thomas Jefferson,President,Federal,1801-03-04,1809-03-04
3,James Madison,President,Federal,1809-03-04,1817-03-04
4,James Monroe,President,Federal,1817-03-04,1825-03-04
5,John Quincy Adams,President,Federal,1825-03-04,1829-03-04
6,Andrew Jackson,President,Federal,1829-03-04,1837-03-04
7,Martin Van Buren,President,Federal,1837-03-04,1841-03-04
8,William Henry Harrison,President,Federal,1841-03-04,1841-04-04
9,John Tyler,President,Federal,1841-04-04,1845-03-04


In [21]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from IPython.display import display, HTML

# Wikipedia URL for Presidents' birthdates
url = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States_by_age"

# Send request and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the correct table containing presidents' birthdates
table = soup.find("table", {"class": "wikitable"})

# Function to clean names (remove footnotes and extra info)
def clean_name(name):
    return name.split("[")[0].strip()  # Remove footnotes

# Function to clean and convert DOB strings into YYYY-MM-DD format
def convert_dob(dob_str):
    if not dob_str:
        return None
    dob_str = dob_str.split("[")[0].strip()  # Remove footnotes
    try:
        return datetime.strptime(dob_str, "%b %d, %Y").strftime("%Y-%m-%d")  # Use `%b` for short month names
    except ValueError:
        print(f"⚠️ Failed to parse DOB: '{dob_str}'")  # Debugging output for failed conversions
        return None  # Return None if parsing fails

# Extract data
data = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 2:
        continue  # Skip rows without enough columns

    name = clean_name(cols[0].text.strip())  # First column contains name

    # Extract DOB from the second column
    dob_text = cols[1].text.strip()
    dob = convert_dob(dob_text)

    data.append({
        "name": name,
        "dob": dob
    })

# Store data in a DataFrame
d_date_of_birth_president = pd.DataFrame(data)

# Display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Set max rows to prevent truncation
pd.set_option("display.max_rows", None)

# Show the table
display_scrollable_dataframe(d_date_of_birth_president.head(20))


Unnamed: 0,name,dob
0,George Washington,1732-02-22
1,John Adams,1735-10-30
2,Thomas Jefferson,1743-04-13
3,James Madison,1751-03-16
4,James Monroe,1758-04-28
5,John Quincy Adams,1767-07-11
6,Andrew Jackson,1767-03-15
7,Martin Van Buren,1782-12-05
8,William Henry Harrison,1773-02-09
9,John Tyler,1790-03-29


In [19]:
import pandas as pd
from datetime import datetime

# Ensure no duplicate 'dob' column exists before merging
if "dob" in d_service_presidents.columns:
    d_service_presidents = d_service_presidents.drop(columns=["dob"])

# Merge service records with birthdates, ensuring no column name conflicts
d_service_presidents = d_service_presidents.merge(
    d_date_of_birth_president, on="name", how="left", suffixes=("", "_dob")
)

# Rename dob_dob to just "dob" (cleaner column name)
if "dob_dob" in d_service_presidents.columns:
    d_service_presidents.rename(columns={"dob_dob": "dob"}, inplace=True)

# Convert dates to datetime format
d_service_presidents["start_date"] = pd.to_datetime(d_service_presidents["start_date"])
d_service_presidents["end_date"] = pd.to_datetime(d_service_presidents["end_date"])
d_service_presidents["dob"] = pd.to_datetime(d_service_presidents["dob"])

# Create a list of all months from the first president's term start to March 2025
start_date = d_service_presidents["start_date"].min()
end_date = datetime(2025, 3, 1)
all_months = pd.date_range(start=start_date, end=end_date, freq="MS")  # MS = Month Start

# Create a DataFrame for all months
d_presidents = pd.DataFrame({"date": all_months})
d_presidents["year"] = d_presidents["date"].dt.year
d_presidents["month"] = d_presidents["date"].dt.strftime("%B")

# Function to calculate exact age (rounded to 1 decimal place)
def calculate_exact_age(dob, current_date):
    years = current_date.year - dob.year
    months = current_date.month - dob.month
    days = current_date.day - dob.day

    # Adjust for negative month/day difference
    if months < 0 or (months == 0 and days < 0):
        years -= 1
        months += 12

    # Convert months to fraction of a year
    age = years + (months / 12)
    return round(age, 1)  # Rounded to 1 decimal place

# Function to get the president in office for a given month
def get_president_info(date):
    row = d_service_presidents[(d_service_presidents["start_date"] <= date) & (d_service_presidents["end_date"] >= date)]
    if not row.empty:
        row = row.iloc[0]  # Get the first matching row
        age = calculate_exact_age(row["dob"], date)  # More precise age calculation
        return pd.Series([row["name"], age])
    return pd.Series([None, None])  # If no president found (shouldn't happen)

# Apply the function to get the president and their age
d_presidents[["name", "age"]] = d_presidents["date"].apply(get_president_info)

# Drop the "date" column (since we have year and month separately)
d_presidents.drop(columns=["date"], inplace=True)

# Display DataFrame
from IPython.display import display, HTML

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Show the table
display_scrollable_dataframe(d_presidents.head(20))


Unnamed: 0,year,month,name,age
0,1789,May,George Washington,57.2
1,1789,June,George Washington,57.3
2,1789,July,George Washington,57.4
3,1789,August,George Washington,57.5
4,1789,September,George Washington,57.6
5,1789,October,George Washington,57.7
6,1789,November,George Washington,57.8
7,1789,December,George Washington,57.8
8,1790,January,George Washington,57.9
9,1790,February,George Washington,58.0


## Vice President

In [22]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from IPython.display import display, HTML

# Get today's date
today_date = datetime.today().strftime("%Y-%m-%d")

# Wikipedia URL for Vice Presidents
url = "https://en.wikipedia.org/wiki/List_of_vice_presidents_of_the_United_States"

# Send request and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the table containing vice presidents
table = soup.find("table", {"class": "wikitable"})

# Function to clean names (remove birth/death years, footnotes, and non-human entries)
def clean_name(name):
    name = name.split("(")[0].strip()  # Remove birth/death info
    name = name.split("[")[0].strip()  # Remove footnotes
    return name if "Office vacant" not in name and "Nullifier" not in name else None  # Ignore placeholders

# Function to clean and convert date strings into YYYY-MM-DD format
def convert_date(date_str):
    if not date_str:
        return None
    date_str = date_str.split("[")[0].strip()  # Remove footnotes
    try:
        return datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")
    except ValueError:
        print(f"⚠️ Failed to parse date: '{date_str}'")  # Debugging output for failed conversions
        return None  # Return None if parsing fails

# Extract data
data = []
for index, row in enumerate(table.find_all("tr")[1:]):  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 3:
        continue  # Skip rows without enough columns

    name = clean_name(cols[1].text.strip())
    if not name:
        continue  # Skip invalid entries

    # Extract raw term text
    term_text = cols[2].text.strip()

    # Split at en dash (–) to separate start and end dates
    term_dates = term_text.split("–")
    start_date_text = term_dates[0].strip()
    end_date_text = term_dates[1].strip() if len(term_dates) > 1 else None

    start_date = convert_date(start_date_text)
    
    # Set today's date if "Incumbent", otherwise parse normally
    if end_date_text and "Incumbent" in end_date_text:
        end_date = today_date
    else:
        end_date = convert_date(end_date_text)

    data.append({
        "name": name,
        "position": "Vice President",
        "region": "Federal",
        "start_date": start_date,
        "end_date": end_date
    })

# Store data in a DataFrame
d_service_vice_presidents = pd.DataFrame(data)

# Display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Set max rows to prevent truncation
pd.set_option("display.max_rows", None)

# Show the table
display_scrollable_dataframe(d_service_vice_presidents.head(20))


Unnamed: 0,name,position,region,start_date,end_date
0,John Adams,Vice President,Federal,1789-04-21,1797-03-04
1,Thomas Jefferson,Vice President,Federal,1797-03-04,1801-03-04
2,Aaron Burr,Vice President,Federal,1801-03-04,1805-03-04
3,George Clinton,Vice President,Federal,1805-03-04,1812-04-20
4,Elbridge Gerry,Vice President,Federal,1813-03-04,1814-11-23
5,Daniel D. Tompkins,Vice President,Federal,1817-03-04,1825-03-04
6,John C. Calhoun,Vice President,Federal,1825-03-04,1832-12-28
7,Martin Van Buren,Vice President,Federal,1833-03-04,1837-03-04
8,Richard Mentor Johnson,Vice President,Federal,1837-03-04,1841-03-04
9,John Tyler,Vice President,Federal,1841-03-04,1841-04-04


In [23]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from IPython.display import display, HTML

# Wikipedia URL for Vice Presidents' birthdates
url = "https://en.wikipedia.org/wiki/List_of_vice_presidents_of_the_United_States_by_age"

# Send request and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the correct table containing vice presidents' birthdates
table = soup.find("table", {"class": "wikitable"})

# Function to clean names (remove footnotes and extra info)
def clean_name(name):
    return name.split("[")[0].strip()  # Remove footnotes

# Function to clean and convert DOB strings into YYYY-MM-DD format
def convert_dob(dob_str):
    if not dob_str:
        return None
    dob_str = dob_str.split("[")[0].strip()  # Remove footnotes
    try:
        return datetime.strptime(dob_str, "%b %d, %Y").strftime("%Y-%m-%d")  # Use `%b` for short month names
    except ValueError:
        print(f"⚠️ Failed to parse DOB: '{dob_str}'")  # Debugging output for failed conversions
        return None  # Return None if parsing fails

# Extract data
data = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 2:
        continue  # Skip rows without enough columns

    name = clean_name(cols[0].text.strip())  # First column contains name

    # Extract DOB from the second column
    dob_text = cols[1].text.strip()
    dob = convert_dob(dob_text)

    data.append({
        "name": name,
        "dob": dob
    })

# Store data in a DataFrame
d_date_of_birth_vice_presidents = pd.DataFrame(data)

# Display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Set max rows to prevent truncation
pd.set_option("display.max_rows", None)

# Show the table
display_scrollable_dataframe(d_date_of_birth_vice_presidents.head(20))

Unnamed: 0,name,dob
0,John Adams,1735-10-30
1,Thomas Jefferson,1743-04-13
2,Aaron Burr,1756-02-06
3,George Clinton,1739-07-26
4,Elbridge Gerry,1744-07-17
5,Daniel D. Tompkins,1774-06-21
6,John C. Calhoun,1782-03-18
7,Martin Van Buren,1782-12-05
8,Richard M. Johnson,1780-10-17
9,John Tyler,1790-03-29


In [25]:
import pandas as pd
from datetime import datetime

# Ensure no duplicate 'dob' column exists before merging
if "dob" in d_service_vice_presidents.columns:
    d_service_vice_presidents = d_service_vice_presidents.drop(columns=["dob"])

# Merge service records with birthdates, ensuring no column name conflicts
d_service_vice_presidents = d_service_vice_presidents.merge(
    d_date_of_birth_vice_presidents, on="name", how="left", suffixes=("", "_dob")
)

# Rename dob_dob to just "dob" (cleaner column name)
if "dob_dob" in d_service_vice_presidents.columns:
    d_service_vice_presidents.rename(columns={"dob_dob": "dob"}, inplace=True)

# Convert dates to datetime format
d_service_vice_presidents["start_date"] = pd.to_datetime(d_service_vice_presidents["start_date"])
d_service_vice_presidents["end_date"] = pd.to_datetime(d_service_vice_presidents["end_date"])
d_service_vice_presidents["dob"] = pd.to_datetime(d_service_vice_presidents["dob"])

# Create a list of all months from the first vice president's term start to March 2025
start_date = d_service_vice_presidents["start_date"].min()
end_date = datetime(2025, 3, 1)
all_months = pd.date_range(start=start_date, end=end_date, freq="MS")  # MS = Month Start

# Create a DataFrame for all months
d_vice_presidents = pd.DataFrame({"date": all_months})
d_vice_presidents["year"] = d_vice_presidents["date"].dt.year
d_vice_presidents["month"] = d_vice_presidents["date"].dt.strftime("%B")

# Function to calculate exact age (rounded to 1 decimal place)
def calculate_exact_age(dob, current_date):
    years = current_date.year - dob.year
    months = current_date.month - dob.month
    days = current_date.day - dob.day

    # Adjust for negative month/day difference
    if months < 0 or (months == 0 and days < 0):
        years -= 1
        months += 12

    # Convert months to fraction of a year
    age = years + (months / 12)
    return round(age, 1)  # Rounded to 1 decimal place

# Function to get the vice president in office for a given month
def get_vice_president_info(date):
    row = d_service_vice_presidents[(d_service_vice_presidents["start_date"] <= date) & 
                                    (d_service_vice_presidents["end_date"] >= date)]
    if not row.empty:
        row = row.iloc[0]  # Get the first matching row
        age = calculate_exact_age(row["dob"], date)  # More precise age calculation
        return pd.Series([row["name"], age])
    return pd.Series([None, None])  # If no VP found (shouldn't happen)

# Apply the function to get the vice president and their age
d_vice_presidents[["name", "age"]] = d_vice_presidents["date"].apply(get_vice_president_info)

# Drop the "date" column (since we have year and month separately)
d_vice_presidents.drop(columns=["date"], inplace=True)

# Display DataFrame
from IPython.display import display, HTML

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Show the table
display_scrollable_dataframe(d_vice_presidents.head(20))

Unnamed: 0,year,month,name,age
0,1789,May,John Adams,53.6
1,1789,June,John Adams,53.7
2,1789,July,John Adams,53.8
3,1789,August,John Adams,53.8
4,1789,September,John Adams,53.9
5,1789,October,John Adams,54.0
6,1789,November,John Adams,54.1
7,1789,December,John Adams,54.2
8,1790,January,John Adams,54.2
9,1790,February,John Adams,54.3


## Supreme Court Justices

In [6]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from IPython.display import display, HTML
import re  # For regex cleaning

# Wikipedia URL for Supreme Court Justices
url = "https://en.wikipedia.org/wiki/List_of_justices_of_the_Supreme_Court_of_the_United_States"

# Send request and parse HTML
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the correct table containing Supreme Court Justices
table = soup.find("table", {"class": "wikitable"})

# Get today's date
today_date = datetime.today().strftime("%Y-%m-%d")

# Function to clean and convert date strings into YYYY-MM-DD format
def convert_date(date_str):
    if not date_str or date_str.strip() == "" or date_str.strip() == "Incumbent":
        return today_date  # Replace "Incumbent" or missing dates with today’s date
    
    date_str = re.sub(r"\[.*?\]", "", date_str)  # Remove footnotes (e.g., "[e]")
    date_str = re.sub(r"\(.*?\)", "", date_str)  # Remove text inside parentheses (e.g., "(Resigned)", "(Died)")
    date_str = date_str.strip()  # Remove trailing spaces

    try:
        return datetime.strptime(date_str, "%B %d, %Y").strftime("%Y-%m-%d")  # Convert to YYYY-MM-DD
    except ValueError:
        return None  # Return None if parsing fails

# Extract data
data = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 8:  # Ensure there are enough columns
        continue

    # Extract name from the correct column (cols[2])
    name_tag = cols[2].find("a")
    name = name_tag.text.strip() if name_tag else cols[2].text.strip()
    name = re.sub(r"\[.*?\]", "", name)  # Remove footnotes

    # Extract position from the correct column (cols[4])
    position = cols[4].text.strip()
    position = re.sub(r"\[.*?\]", "", position)  # Remove footnotes
    position = position.replace("ChiefJustice", "Chief Justice").replace("AssociateJustice", "Associate Justice")

    # Set fixed region
    region = "Federal"

    # Extract tenure column (Corrected to use cols[7])
    tenure_text = cols[7].text.strip()
    tenure_text = re.sub(r"\[.*?\]", "", tenure_text)  # Remove footnotes
    tenure_text = re.sub(r"\(.*?\)", "", tenure_text)  # Remove text inside parentheses

    # Split tenure by en dash (–) to separate start and end dates
    tenure_dates = tenure_text.split("–")

    start_date_text = tenure_dates[0].strip()  # The first date is always the start date
    end_date_text = tenure_dates[1].strip() if len(tenure_dates) > 1 else None  # The second date (if available) is the end date

    # Convert start and end dates
    start_date = convert_date(start_date_text)

    # ✅ Explicitly check for "Incumbent" and replace with today's date
    end_date = convert_date(end_date_text)

    data.append({
        "name": name,
        "position": position,
        "region": region,
        "start_date": start_date,
        "end_date": end_date
    })

# Store data in a DataFrame
d_service_supreme_court_justices = pd.DataFrame(data)

# Save to CSV (optional)
d_service_supreme_court_justices.to_csv("d_service_supreme_court_justices.csv", index=False)

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Show the table
display_scrollable_dataframe(d_service_supreme_court_justices.tail(20))


Unnamed: 0,name,position,region,start_date,end_date
101,Harry Blackmun,Associate Justice,Federal,1970-06-09,1994-08-03
102,Lewis F. Powell Jr.,Associate Justice,Federal,1972-01-07,1987-06-26
103,William Rehnquist,Associate Justice,Federal,1972-01-07,1986-09-26
104,John Paul Stevens,Associate Justice,Federal,1975-12-19,2010-06-29
105,Sandra Day O'Connor,Associate Justice,Federal,1981-09-25,2006-01-31
106,William Rehnquist,Chief Justice,Federal,1986-09-26,2005-09-03
107,Antonin Scalia,Associate Justice,Federal,1986-09-26,2016-02-13
108,Anthony Kennedy,Associate Justice,Federal,1988-02-18,2018-07-31
109,David Souter,Associate Justice,Federal,1990-10-09,2009-06-29
110,Clarence Thomas,Associate Justice,Federal,1991-10-23,2025-03-01


In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
from IPython.display import display, HTML
import re  # For regex cleaning
import time  # To avoid rate limiting

# Wikipedia URL for Supreme Court Justices
main_url = "https://en.wikipedia.org/wiki/List_of_justices_of_the_Supreme_Court_of_the_United_States"

# Send request and parse HTML
response = requests.get(main_url)
soup = BeautifulSoup(response.text, "html.parser")

# Find the correct table containing Supreme Court Justices
table = soup.find("table", {"class": "wikitable"})

# Extract all justice names and their Wikipedia page links
justices = []
for row in table.find_all("tr")[1:]:  # Skip the header row
    cols = row.find_all("td")
    if len(cols) < 3:  # Ensure there are enough columns
        continue

    name_tag = cols[2].find("a")  # The name and Wikipedia link are in column 2
    if name_tag:
        name = name_tag.text.strip()
        link = "https://en.wikipedia.org" + name_tag["href"]  # Full Wikipedia URL
        justices.append((name, link))

# Function to extract the date of birth from a justice's Wikipedia page
def extract_date_of_birth(wiki_url):
    try:
        response = requests.get(wiki_url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Locate the infobox, where the birth date is stored
        infobox = soup.find("table", {"class": "infobox"})
        if not infobox:
            print(f"⚠️ No infobox found on {wiki_url}")
            return None

        # Look for 'Born' row (usually inside <th> tag)
        for row in infobox.find_all("tr"):
            header = row.find("th")
            if header and "Born" in header.text:
                dob_text = row.find("td").text.strip()  # Extract the date text
                dob_text = re.sub(r"\[.*?\]", "", dob_text)  # Remove footnotes like [1]
                dob_text = dob_text.split("\n")[0]  # Remove unnecessary text

                # Use regex to extract only the date portion (ignore location)
                match = re.search(r"([A-Za-z]+ \d{1,2}, \d{4})", dob_text)
                if match:
                    dob_cleaned = match.group(1)  # Extract only the Month Day, Year format
                    # Convert to YYYY-MM-DD format
                    return datetime.strptime(dob_cleaned, "%B %d, %Y").strftime("%Y-%m-%d")

                print(f"⚠️ Failed to parse DOB: '{dob_text}' from {wiki_url}")
                return None

        print(f"⚠️ 'Born' section not found for {wiki_url}")
        return None

    except Exception as e:
        print(f"❌ Error scraping {wiki_url}: {e}")
        return None

# Extract birth dates for each justice
data = []
for name, link in justices:
    # print(f"🔍 Scraping DOB for {name}...")
    dob = extract_date_of_birth(link)
    data.append({"name": name, "dob": dob})
    time.sleep(1)  # Sleep to avoid rate limiting

# Store data in a DataFrame
d_date_of_birth_supreme_court_justice = pd.DataFrame(data)
d_date_of_birth_supreme_court_justice.to_csv("d_date_of_birth_supreme_court_justice.csv")

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Show the table
display_scrollable_dataframe(d_date_of_birth_supreme_court_justice.head(20))


Unnamed: 0,name,dob
0,John Jay,1745-12-23
1,John Rutledge,1739-09-17
2,William Cushing,1732-03-01
3,James Wilson,1742-09-14
4,John Blair,1732-04-17
5,James Iredell,1751-10-05
6,Thomas Johnson,1732-11-04
7,William Paterson,1745-12-24
8,John Rutledge,1739-09-17
9,Samuel Chase,1741-04-17


In [8]:
import pandas as pd
from datetime import datetime

# Load the service records of Supreme Court Justices from CSV
d_date_of_birth_supreme_court_justice = pd.read_csv("d_date_of_birth_supreme_court_justice.csv")

# Ensure no duplicate 'dob' column exists before merging
if "dob" in d_service_supreme_court_justices.columns:
    d_service_supreme_court_justices = d_service_supreme_court_justices.drop(columns=["dob"])

# Merge service records with birthdates, ensuring no column name conflicts
d_service_supreme_court_justices = d_service_supreme_court_justices.merge(
    d_date_of_birth_supreme_court_justice, on="name", how="left", suffixes=("", "_dob")
)

# Rename dob_dob to just "dob" (cleaner column name)
if "dob_dob" in d_service_supreme_court_justices.columns:
    d_service_supreme_court_justices.rename(columns={"dob_dob": "dob"}, inplace=True)

# Convert dates to datetime format
d_service_supreme_court_justices["start_date"] = pd.to_datetime(d_service_supreme_court_justices["start_date"])
d_service_supreme_court_justices["end_date"] = pd.to_datetime(d_service_supreme_court_justices["end_date"])
d_service_supreme_court_justices["dob"] = pd.to_datetime(d_service_supreme_court_justices["dob"])

# Create a list of all months from the first justice's term start to March 2025
start_date = d_service_supreme_court_justices["start_date"].min()
end_date = datetime(2025, 3, 1)
all_months = pd.date_range(start=start_date, end=end_date, freq="MS")  # MS = Month Start

# Create an empty list to store all rows
justice_rows = []

# Function to calculate exact age (rounded to 1 decimal place)
def calculate_exact_age(dob, current_date):
    years = current_date.year - dob.year
    months = current_date.month - dob.month
    days = current_date.day - dob.day

    # Adjust for negative month/day difference
    if months < 0 or (months == 0 and days < 0):
        years -= 1
        months += 12

    # Convert months to fraction of a year
    age = years + (months / 12)
    return round(age, 1)  # Rounded to 1 decimal place

# Iterate through all months and record every justice in office
for date in all_months:
    active_justices = d_service_supreme_court_justices[
        (d_service_supreme_court_justices["start_date"] <= date) & 
        (d_service_supreme_court_justices["end_date"] >= date)
    ]
    
    for _, row in active_justices.iterrows():
        age = calculate_exact_age(row["dob"], date) if pd.notnull(row["dob"]) else None
        justice_rows.append({
            "year": date.year,
            "month": date.strftime("%B"),
            "name": row["name"],
            "position": row["position"],
            "age": age
        })

# Convert list to DataFrame
d_supreme_court_justices = pd.DataFrame(justice_rows)

# Display DataFrame
from IPython.display import display, HTML

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Show the table
display_scrollable_dataframe(d_supreme_court_justices.tail(200))

Unnamed: 0,year,month,name,position,age
26966,2023,May,Amy Coney Barrett,Associate Justice,51.3
26967,2023,May,Ketanji Brown Jackson,Associate Justice,52.7
26968,2023,June,Clarence Thomas,Associate Justice,75.0
26969,2023,June,John Roberts,Chief Justice,68.4
26970,2023,June,Samuel Alito,Associate Justice,73.2
26971,2023,June,Sonia Sotomayor,Associate Justice,69.0
26972,2023,June,Elena Kagan,Associate Justice,63.2
26973,2023,June,Neil Gorsuch,Associate Justice,55.8
26974,2023,June,Brett Kavanaugh,Associate Justice,58.3
26975,2023,June,Amy Coney Barrett,Associate Justice,51.4


## Congress

### Senate

In [9]:
import os
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
from IPython.core.display import display, HTML

# Function to display DataFrame as a scrollable table
def display_scrollable_dataframe(df):
    display(HTML(df.to_html(notebook=True, escape=False)))

# Function to determine ordinal suffix (1st, 2nd, 3rd, 4th...)
def get_ordinal_suffix(n):
    if 11 <= (n % 100) <= 13:
        return "th"
    else:
        return {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")

# Function to extract Congress start and end dates from the infobox
def get_congress_dates(soup):
    congress_dates = soup.find('table', class_='infobox')
    start_date, end_date = "Unknown", "Unknown"

    if congress_dates:
        th_elements = congress_dates.find_all('th', class_='infobox-header')
        for th in th_elements:
            text = th.get_text(strip=True)  # Extract text from the <th>
            match = re.search(r"(\w+ \d{1,2}, \d{4})\s*–\s*(\w+ \d{1,2}, \d{4})", text)
            if match:
                start_date, end_date = match.groups()
                break  # Stop after finding the first match

    return start_date, end_date

# Function to extract senator names, links, states, and correct details
def extract_senators(congress_number):
    # Generate Wikipedia URL with correct ordinal suffix
    ordinal_suffix = get_ordinal_suffix(congress_number)
    url = f"https://en.wikipedia.org/wiki/{congress_number}{ordinal_suffix}_United_States_Congress"

    # Fetch the page content
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract Congress start and end dates
    congress_start_date, congress_end_date = get_congress_dates(soup)

    # Locate the Senate section under Members
    senate_section = soup.find('h3', id=lambda x: x and x.startswith("Senate"))

    # Initialize list to store senator data
    senators_data = []
    senator_order = []  # Maintain order of appearance

    if senate_section:
        senate_table = senate_section.find_next('table', class_='col-begin')

        if senate_table:
            current_state = None  # Tracks which state senators belong to
            for element in senate_table.find_all(['h4', 'dl']):  
                # If an <h4> tag is found, it's a new state
                if element.name == 'h4':
                    current_state = element.get_text(strip=True)  

                # If it's a <dl>, it contains senators
                if element.name == 'dl':
                    for dd in element.find_all('dd', recursive=False):  # Only top-level senators
                        a_tag = dd.find('a')  # Find senator name
                        if a_tag:
                            senator_name = a_tag.text.strip()
                            senator_link = "https://en.wikipedia.org" + a_tag['href']  # Construct full Wikipedia link

                            # Default values
                            detail = ""
                            start_date, end_date = congress_start_date, congress_end_date

                            # Extract "until"
                            text_content = dd.get_text().strip()
                            match_until = re.search(r"until (\w+ \d{1,2}, \d{4})", text_content)
                            if match_until:
                                detail = f"until {match_until.group(1)}"
                                end_date = match_until.group(1)

                            # Extract "from"
                            match_from = re.search(r"from (\w+ \d{1,2}, \d{4})", text_content)
                            if match_from and not detail:  # Only add "from" if "until" isn't already set
                                detail = f"from {match_from.group(1)}"
                                start_date = match_from.group(1)

                            # Store senator and track order
                            senators_data.append([congress_number, senator_name, senator_link, current_state, detail, start_date, end_date])
                            senator_order.append(senator_name)

                            # Handle double-nested replacements
                            nested_dl = dd.find('dl')
                            if nested_dl:
                                for nested_dd in nested_dl.find_all('dd'):
                                    nested_a_tag = nested_dd.find('a')
                                    if nested_a_tag:
                                        replacement_name = nested_a_tag.text.strip()
                                        replacement_link = "https://en.wikipedia.org" + nested_a_tag['href']

                                        # Handle "from", "until", or full date range cases
                                        nested_text_content = nested_dd.get_text().strip()
                                        match_from_nested = re.search(r"from (\w+ \d{1,2}, \d{4})", nested_text_content)
                                        match_until_nested = re.search(r"until (\w+ \d{1,2}, \d{4})", nested_text_content)
                                        match_full_range = re.search(r"(\w+ \d{1,2}, \d{4}) – (\w+ \d{1,2}, \d{4})", nested_text_content)

                                        replacement_detail = ""
                                        replacement_start = start_date
                                        replacement_end = congress_end_date

                                        if match_full_range:
                                            replacement_detail = f"{match_full_range.group(1)} – {match_full_range.group(2)}"
                                            replacement_start = match_full_range.group(1)
                                            replacement_end = match_full_range.group(2)
                                        elif match_until_nested:
                                            replacement_detail = f"until {match_until_nested.group(1)}"
                                            replacement_end = match_until_nested.group(1)
                                        elif match_from_nested:
                                            replacement_detail = f"from {match_from_nested.group(1)}"
                                            replacement_start = match_from_nested.group(1)

                                        # Store replacement senator **only if not a duplicate**
                                        senators_data.append([congress_number, replacement_name, replacement_link, current_state, replacement_detail, replacement_start, replacement_end])
                                        senator_order.append(replacement_name)

    # Create DataFrame
    columns = ["Congress Number", "Name", "Link", "State", "Detail", "Start Date", "End Date"]
    df = pd.DataFrame(senators_data, columns=columns)

    # Deduplicate while **preserving the original order**
    df["order"] = df["Name"].apply(lambda name: senator_order.index(name))  # Map order index
    df.sort_values(by=["order", "Detail"], ascending=[True, False], inplace=True)  # Keep order & prefer "until" > "from"

    # Keep only the row with the most useful information (one with Detail), keeping first occurrence in order
    df = df.drop_duplicates(subset=["Name"], keep="first").drop(columns=["order"]).reset_index(drop=True)

    return df

# Load existing data if available
csv_filename = "d_senators.csv"
if os.path.exists(csv_filename):
    existing_data = pd.read_csv(csv_filename)
else:
    existing_data = pd.DataFrame()

# Specify range of Congress sessions to scrape
congress_range = range(1, 4)  # Change this range as needed

# Scrape data for each Congress session
all_senators = []
for congress_number in congress_range:
    if existing_data.empty or congress_number not in existing_data["Congress Number"].values:
        print(f"Scraping Congress {congress_number}...")
        senators_df = extract_senators(congress_number)
        all_senators.append(senators_df)

# Save new results to CSV (append if file exists)
if all_senators:
    new_data = pd.concat(all_senators, ignore_index=True)
    final_data = pd.concat([existing_data, new_data], ignore_index=True).drop_duplicates()
    final_data.to_csv(csv_filename, index=False)
    print(f"Data saved to {csv_filename}.")

# Load and display the full dataset as a scrollable table
full_senators_df = pd.read_csv(csv_filename)
display_scrollable_dataframe(full_senators_df)


  from IPython.core.display import display, HTML


Scraping Congress 1...
Scraping Congress 2...
Scraping Congress 3...
Data saved to d_senators.csv.


Unnamed: 0,Congress Number,Name,Link,State,Detail,Start Date,End Date
0,1,Oliver Ellsworth,https://en.wikipedia.org/wiki/Oliver_Ellsworth,Connecticut,,"March 4, 1789","March 3, 1791"
1,1,William S. Johnson,https://en.wikipedia.org/wiki/William_Samuel_J...,Connecticut,,"March 4, 1789","March 3, 1791"
2,1,George Read,https://en.wikipedia.org/wiki/George_Read_(sig...,Delaware,,"March 4, 1789","March 3, 1791"
3,1,Richard Bassett,https://en.wikipedia.org/wiki/Richard_Bassett_...,Delaware,,"March 4, 1789","March 3, 1791"
4,1,William Few,https://en.wikipedia.org/wiki/William_Few,Georgia,,"March 4, 1789","March 3, 1791"
5,1,James Gunn,https://en.wikipedia.org/wiki/James_Gunn_(sena...,Georgia,,"March 4, 1789","March 3, 1791"
6,1,Charles Carroll,https://en.wikipedia.org/wiki/Charles_Carroll_...,Maryland,,"March 4, 1789","March 3, 1791"
7,1,John Henry,https://en.wikipedia.org/wiki/John_Henry_(sena...,Maryland,,"March 4, 1789","March 3, 1791"
8,1,Tristram Dalton,https://en.wikipedia.org/wiki/Tristram_Dalton,Massachusetts,,"March 4, 1789","March 3, 1791"
9,1,Caleb Strong,https://en.wikipedia.org/wiki/Caleb_Strong,Massachusetts,,"March 4, 1789","March 3, 1791"


### House of Representatives

### House of Representatives

## Pictures

In [45]:
import os
import requests
from bs4 import BeautifulSoup
import re

# Ensure soupsieve is installed
try:
    import soupsieve
except ImportError:
    os.system("pip install soupsieve")
    import soupsieve

# Create directories for images
os.makedirs("images/presidents", exist_ok=True)
os.makedirs("images/vice_presidents", exist_ok=True)

# Wikipedia URLs
PRESIDENTS_URL = "https://en.wikipedia.org/wiki/List_of_presidents_of_the_United_States"
VICE_PRESIDENTS_URL = "https://en.wikipedia.org/wiki/List_of_vice_presidents_of_the_United_States"

# Function to scrape names from Wikipedia
def get_names(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", class_="wikitable")
    names = []
    
    if table:
        for row in table.find_all("tr")[1:]:  # Skip header row
            columns = row.find_all("td")
            if columns and len(columns) > 1:  # Skip first column (row number)
                name = columns[1].text.strip()
                name = re.sub(r'\(.*?\)', '', name)  # Remove birth-death years
                name = re.sub(r'\[.*?\]', '', name)  # Remove citations
                name = name.strip()
                if name and not any(x in name.lower() for x in ["vacant", "federalist", "nullifier"]):
                    names.append(name)
    
    return names

# Fetch names
presidents = get_names(PRESIDENTS_URL)
vice_presidents = get_names(VICE_PRESIDENTS_URL)

print("Presidents List:", presidents)
print("Vice Presidents List:", vice_presidents)

# Wikipedia API to fetch image URLs
def get_wikipedia_image(name):
    api_url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{name.replace(' ', '_')}"
    response = requests.get(api_url)
    if response.status_code == 200:
        data = response.json()
        if "originalimage" in data:
            return data["originalimage"]["source"]
    return None

def download_image(url, path):
    if url:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36"
        }
        
        for attempt in range(3):  # Retry up to 3 times
            print(f"Downloading (attempt {attempt+1}) {url} -> {path}")
            response = requests.get(url, headers=headers, allow_redirects=True, stream=True)
            
            if response.status_code == 200:
                with open(path, "wb") as file:
                    for chunk in response.iter_content(1024):
                        file.write(chunk)
                # print(f"✅ Saved: {path}")
                return
            else:
                # print(f"❌ Failed (attempt {attempt+1}) with status code: {response.status_code}")
                time.sleep(2)  # Wait before retrying
        
        print(f"🚨 Skipping: {url} after 3 failed attempts")


# Process Presidents
for president in presidents:
    image_url = get_wikipedia_image(president)
    print(f"Fetching image for {president}: {image_url}")
    if image_url:
        download_image(image_url, f"images/presidents/{president}.jpg")

# Process Vice Presidents
for vice_president in vice_presidents:
    image_url = get_wikipedia_image(vice_president)
    print(f"Fetching image for {vice_president}: {image_url}")
    if image_url:
        download_image(image_url, f"images/vice_presidents/{vice_president}.jpg")

print("Images downloaded successfully!")

Presidents List: ['George Washington', 'John Adams', 'Thomas Jefferson', 'James Madison', 'James Monroe', 'John Quincy Adams', 'Andrew Jackson', 'Martin Van Buren', 'William Henry Harrison', 'John Tyler', 'James K. Polk', 'Zachary Taylor', 'Millard Fillmore', 'Franklin Pierce', 'James Buchanan', 'Abraham Lincoln', 'Andrew Johnson', 'Ulysses S. Grant', 'Rutherford B. Hayes', 'James A. Garfield', 'Chester A. Arthur', 'Grover Cleveland', 'Benjamin Harrison', 'Grover Cleveland', 'William McKinley', 'Theodore Roosevelt', 'William Howard Taft', 'Woodrow Wilson', 'Warren G. Harding', 'Calvin Coolidge', 'Herbert Hoover', 'Franklin D. Roosevelt', 'Harry S. Truman', 'Dwight D. Eisenhower', 'John F. Kennedy', 'Lyndon B. Johnson', 'Richard Nixon', 'Gerald Ford', 'Jimmy Carter', 'Ronald Reagan', 'George H. W. Bush', 'Bill Clinton', 'George W. Bush', 'Barack Obama', 'Donald Trump', 'Joe Biden', 'Donald Trump']
Vice Presidents List: ['John Adams', 'Thomas Jefferson', 'Aaron Burr', 'George Clinton', '

## APPENDIX

In [8]:
import fitz  # PyMuPDF
import re
import pandas as pd

def extract_senators_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    senators = []
    current_congress = None
    current_year = None  # Track the most recent year marker
    last_seen_start_date = None  # Track the most recent explicitly mentioned start date

    # Regex patterns
    congress_pattern = re.compile(r"(\w+)\s+CONGRESS")
    year_marker_pattern = re.compile(r"\* \* \* (\d{4}) \* \* \*")  # e.g., *** 1789 ***
    start_date_pattern = re.compile(r"^([A-Za-z]+ \d{1,2})$")  # Matches standalone start dates
    senator_pattern = re.compile(r"^([A-Za-z\s\.,'-]+)\s+\(([^)]+)-([A-Z]{2})\)")  # Name, party, state
    end_date_pattern = re.compile(r"([A-Za-z]+ \d{1,2},? \d{4})")  # Matches any full date format

    month_map = {
        "January": "01", "February": "02", "March": "03", "April": "04",
        "May": "05", "June": "06", "July": "07", "August": "08",
        "September": "09", "October": "10", "November": "11", "December": "12"
    }

    for page_num in range(len(doc)):  # Iterate through all pages
        text = doc[page_num].get_text("text")
        lines = text.split("\n")

        for i, line in enumerate(lines):
            line = line.strip()

            # Detect Congress session
            congress_match = congress_pattern.search(line)
            if congress_match:
                current_congress = congress_match.group(1) + " Congress"

            # Detect Year Marker (e.g., *** 1789 ***)
            year_match = year_marker_pattern.search(line)
            if year_match:
                current_year = year_match.group(1)  # Assign year when found

            # Detect Start Date (Left Side)
            start_date_match = start_date_pattern.match(line)
            if start_date_match and current_year:  # Ensure we have a year before using it
                month, day = start_date_match.group(1).split()
                month = month.title()  # Convert to title case to match dictionary
                day = day.zfill(2)  # Ensure two-digit day format
                last_seen_start_date = f"{current_year}-{month_map[month]}-{day}"

            # Detect Senator Entry
            senator_match = senator_pattern.search(line)
            if senator_match:
                name = senator_match.group(1).strip()
                state = senator_match.group(3).strip()

                # Use the last seen start date
                start_date = last_seen_start_date if last_seen_start_date else None

                # Extract end date immediately to the right of the name
                remaining_text = line[senator_match.end():].strip()
                end_date = None
                end_date_match = end_date_pattern.search(remaining_text)
                if end_date_match:
                    end_date_text = end_date_match.group(1)
                    parts = end_date_text.split()
                    if len(parts) == 3:
                        month, day, year = parts
                        month = month.title()  # Convert uppercase to title case
                        day = day.zfill(2)  # Ensure proper day format
                        end_date = f"{year}-{month_map[month]}-{day}"

                # Store extracted data
                senators.append({
                    "name": name,
                    "state": state,
                    "congress": current_congress,
                    "start_date": start_date,
                    "end_date": end_date,  # Will be empty if still active
                })

    return senators

# Run the function on the PDF
pdf_path = "chronlist.pdf"  # Ensure correct file path
senators_data = extract_senators_from_pdf(pdf_path)

# Convert to DataFrame and save to CSV
df = pd.DataFrame(senators_data)
df.to_csv("senators_data_final.csv", index=False)

print("Data extraction complete. Saved as senators_data_final.csv")


Data extraction complete. Saved as senators_data_final.csv
