In [1]:
!pip install torch
!pip install tensorflow
!pip install transformers
!pip install datasets



In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

#Scrape presidency.ucsb.edu website for debate links 

BASE_URL = "https://www.presidency.ucsb.edu"
DEBATES_INDEX = f"{BASE_URL}/documents/app-categories/elections-and-transitions/debates"

# Fetch index page
resp = requests.get(DEBATES_INDEX)
soup = BeautifulSoup(resp.text, "html.parser")

#See all available HTML Tags
all_tags = [tag.name for tag in soup.find_all()]
unique_tags = list(set(all_tags))  # find unique tags

print("All unique tags on page:")
print(unique_tags)

#Grab the URLs. They're found in a page's <a> tag, with an hfref attribute, we want those with debate in the href
debate_links = []

for a in soup.find_all("a", href=True):
    href = a["href"]
    
# Check if "debate" in URL and year is between 2015-2024, don't want links to search links but to actual debate transcripts
    if "debate" in href and "app-categories" not in href:
        full_url = BASE_URL + href if href.startswith("/") else href
        debate_links.append(full_url)

print(f"Found {len(debate_links)} debate links:")
for link in debate_links[:10]:
    print(link)        



All unique tags on page:
['html', 'ul', 'form', 'aside', 'button', 'li', 'p', 'label', 'head', 'img', 'body', 'script', 'h4', 'h2', 'title', 'meta', 'footer', 'section', 'link', 'span', 'nav', 'input', 'br', 'h3', 'strong', 'div', 'a']
Found 18 debate links:
https://www.presidency.ucsb.edu/documents/vice-presidential-debate-new-york-city
https://www.presidency.ucsb.edu/people/other/presidential-candidate-debates
https://www.presidency.ucsb.edu/documents/presidential-debate-philadelphia-pennsylvania
https://www.presidency.ucsb.edu/people/other/presidential-candidate-debates
https://www.presidency.ucsb.edu/documents/presidential-debate-atlanta-georgia
https://www.presidency.ucsb.edu/documents/republican-candidates-debate-des-moines-iowa-1
https://www.presidency.ucsb.edu/people/other/presidential-candidate-debates
https://www.presidency.ucsb.edu/documents/republican-candidates-debate-tuscaloosa-alabama
https://www.presidency.ucsb.edu/people/other/presidential-candidate-debates
https://www

In [3]:
#Grab speeches only between 2015-2024
from datetime import datetime

filtered_links = []

for link in debate_links:
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    date_tag = soup.find("span", class_="date-display-single")
    if date_tag:
        date_str = date_tag.get_text(strip=True)
        try:
            date_obj = datetime.strptime(date_str, "%B %d, %Y")  # e.g., 'October 19, 2016'
            if 2015 <= date_obj.year <= 2024:
                filtered_links.append(link)
        except:
            continue

print(f"Debates from 2015–2024: {len(filtered_links)}")

Debates from 2015–2024: 10


In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

BASE_URL = "https://www.presidency.ucsb.edu"
debates_data = []

for idx, link in enumerate(filtered_links):
    print(f"Processing {idx+1}/{len(filtered_links)}: {link}")
    resp = requests.get(link)
    soup = BeautifulSoup(resp.text, "html.parser")
    
    # Extract debate title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "Unknown Debate"
    
    # Extract date
    date_tag = soup.find("span", class_="date-display-single")
    date = date_tag.get_text(strip=True) if date_tag else ""
    
    # Extract participants
    participants_b = soup.find("b", string=re.compile(r"PARTICIPANTS", re.I))
    participants = []
    if participants_b:
        # Collect text following the <b>PARTICIPANTS:</b> tag until MODERATORS
        text_parts = []
        for sib in participants_b.next_siblings:
            if sib.name == "b" and "MODERATOR" in sib.get_text(strip=True).upper():
                break
            if isinstance(sib, str):
                text_parts.append(sib.strip())
            elif sib.name == "br":
                text_parts.append("\n")
            elif sib.name:
                text_parts.append(sib.get_text(" ", strip=True))
        combined = " ".join(text_parts)
        # Split by 'and' or newline, remove empty and trailing pieces
        raw_names = re.split(r"\band\b|\n", combined)
        for name in raw_names:
            name = name.strip().strip(",").strip()
            if name:
                # Extract name part before parentheses (D)/(R)
                clean_name = re.sub(r"\s*\([^)]*\)", "", name).strip()
                if clean_name:
                    participants.append(clean_name)
    print(f" Found participants: {participants}")
    
    # Extract transcript lines
    speech = []
    paragraphs = soup.find_all("p")
    for p in paragraphs:
        b = p.find("b")
        if b:
            speaker_raw = b.get_text(strip=True).rstrip(":").upper()
            # Match if speaker is one of the participants by last name
            if any(speaker_raw in name.upper() for name in participants):
                # Remove <b> tag and get rest of text
                b.extract()
                text = p.get_text(" ", strip=True)
                if text:
                    speech.append({
                        "debate_id": idx + 1,
                        "debate_title": title,
                        "debate_date": date,
                        "speaker": speaker_raw.title(),
                        "text": text
                    })
    print(f"Extracted {len(speech)} lines for {title}\n")
    debates_data.extend(speech)

    time.sleep(1) # Being nice to server


# Convert to dataframe 
df_debates = pd.DataFrame(debates_data)
print("All debates collected:", df_debates["debate_title"].nunique(), "unique debates")
df_debates.head()


Processing 1/10: https://www.presidency.ucsb.edu/documents/vice-presidential-debate-new-york-city
 Found participants: ['Senator J.D. Vance', 'Governor Tim Walz']
Extracted 97 lines for Vice Presidential Debate in New York City

Processing 2/10: https://www.presidency.ucsb.edu/documents/presidential-debate-philadelphia-pennsylvania
 Found participants: ['Vice President Kamala Harris', 'Former President Donald Trump']
Extracted 105 lines for Presidential Debate in Philadelphia, Pennsylvania

Processing 3/10: https://www.presidency.ucsb.edu/documents/presidential-debate-atlanta-georgia
 Found participants: ['President Joe Biden', 'Former President Donald Trump']
Extracted 97 lines for Presidential Debate in Atlanta, Georgia

Processing 4/10: https://www.presidency.ucsb.edu/documents/republican-candidates-debate-des-moines-iowa-1
 Found participants: ['Governor Ron DeSantis;', 'Former Governor Nikki Haley;']
Extracted 176 lines for Republican Candidates Debate in Des Moines, Iowa

Process

Unnamed: 0,debate_id,debate_title,debate_date,speaker,text
0,1,Vice Presidential Debate in New York City,"October 01, 2024",Walz,"Well, thank you. And thank you for those joini..."
1,1,Vice Presidential Debate in New York City,"October 01, 2024",Vance,"So, Margaret, I want to answer the question. F..."
2,1,Vice Presidential Debate in New York City,"October 01, 2024",Walz,"Well, look, Donald Trump was in office. We'll ..."
3,1,Vice Presidential Debate in New York City,"October 01, 2024",Vance,"Well, first of all, Margaret, diplomacy is not..."
4,1,Vice Presidential Debate in New York City,"October 01, 2024",Vance,"Sure. So first of all, let's start with the hu..."


In [5]:
import re

def clean_text(row):
    speaker = row["speaker"]
    text = row["text"]

    # Remove speaker name repetition at start (e.g. "Kamala Harris:" or "HARRIS - ")
    pattern = re.compile(rf"^{speaker}[\.\:\-\s]+", re.IGNORECASE)
    text = pattern.sub("", text)

    # Remove parenthetical notes like (applause), [laughter], (inaudible)
    text = re.sub(r"[\(\[].*?[\)\]]", "", text)

    # Collapse all whitespace, tabs, and line breaks into single spaces
    text = re.sub(r"\s+", " ", text)

    # Trim leading/trailing spaces
    return text.strip()

# Apply to DataFrame
df_debates["text"] = df_debates.apply(clean_text, axis=1)


In [6]:
df_debates.to_csv("TILIS.csv", index = False)