## 1) Web Scraping to obtain transcripts

1) Source for transcripts: https://podscript.ai/podcasts/

This script creates one text file per episode transcript inside a `Transcripts` folder in the current working directory (the folder is created if it does not exist). Ensure you have sufficient permissions and storage space for the files.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import os

In [None]:
class PodscriptScraper:
    """Selenium (Edge) scraper for podcast transcripts listed on podscript.ai.

    Typical use: ``navigate_to_podcast`` to open a podcast's episode listing,
    ``get_episode_links`` / ``navigate_to_next_page`` to walk the listing, and
    ``scrape_transcript`` / ``save_transcript`` for each episode. Call
    ``close`` when done to end the browser session.
    """

    def __init__(self, path_to_driver):
        """Start an Edge WebDriver session.

        Args:
            path_to_driver: Path to the Edge WebDriver executable.
        """
        self.path_to_driver = path_to_driver
        service = Service(executable_path=self.path_to_driver)
        self.driver = webdriver.Edge(service=service)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def navigate_to_podcast(self, podcast_name):
        """Open the podcast index and click through to the matching podcast.

        The match is a case-insensitive substring test of ``podcast_name``
        against each card's ``h2`` text.

        Args:
            podcast_name: Name (or part of the name) of the podcast card.

        Returns:
            True if a matching card was found and clicked, otherwise False.
        """
        print(f"Navigating to podcast: {podcast_name}")
        self.driver.get("https://podscript.ai/podcasts/")
        time.sleep(5)  # Wait for page to load
        wanted = podcast_name.lower()
        podcasts = self.driver.find_elements(By.CSS_SELECTOR, "article.post-entry")
        for podcast in podcasts:
            try:
                header = podcast.find_element(By.CSS_SELECTOR, "h2")
                if wanted not in header.text.lower():
                    continue
                link = podcast.find_element(By.CSS_SELECTOR, "a.entry-link")
            except NoSuchElementException:
                # A card without a header or link must not abort the search.
                continue
            link.click()
            time.sleep(5)  # Wait for navigation
            return True
        return False

    def get_episode_links(self):
        """Collect episode URLs from the listing page currently displayed.

        Returns:
            List of non-empty ``href`` values of the episode entry links.
        """
        print("Fetching episode links...")
        episodes = self.driver.find_elements(By.CSS_SELECTOR, "article.post-entry a.entry-link")
        hrefs = (episode.get_attribute('href') for episode in episodes)
        # get_attribute returns None when the attribute is absent; drop those
        # so callers can safely treat every entry as a URL string.
        return [href for href in hrefs if href]

    def navigate_to_next_page(self):
        """Follow the pagination 'next' link of the current page, if any.

        Returns:
            True if a next link was found and loaded, otherwise False.
        """
        try:
            wait = WebDriverWait(self.driver, 20)

            # Now wait for the 'next' button
            next_button = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "nav.pagination a.next")))

            next_page_url = next_button.get_attribute('href')
            if not next_page_url:
                print("Next button has no link target; stopping pagination.")
                return False

            print(f"Navigating to next page: {next_page_url}")
            self.driver.get(next_page_url)
            return True
        except (NoSuchElementException, TimeoutException) as e:
            print(f"No next button found or timed out: {e}")
            return False

    def scrape_transcript(self, episode_url):
        """Load an episode page and return the text of all its paragraphs.

        Args:
            episode_url: URL of the episode page.

        Returns:
            Non-blank ``<p>`` texts joined by newlines; an empty string when
            nothing was found or an error occurred while reading the page.
        """
        print(f"Scraping transcript from {episode_url}")
        self.driver.get(episode_url)
        time.sleep(10)  # Increased wait time for content to load

        try:
            # Targeting all paragraph tags for transcript extraction
            paragraphs = self.driver.find_elements(By.TAG_NAME, "p")
            # Read each element's text once: every .text access is a separate
            # WebDriver call.
            texts = (p.text for p in paragraphs)
            transcript = "\n".join(text for text in texts if text.strip() != "")

            if not transcript:
                print("No transcript content found.")
            else:
                print("Transcript scraped successfully.")

            return transcript
        except Exception as e:
            # Best-effort: one broken episode page should not stop the run.
            print(f"Error while scraping transcript: {e}")
            return ""

    def save_transcript(self, podcast_name, episode_title, transcript):
        """Write a transcript to ``Transcripts/<podcast>_<episode>.txt``.

        Spaces and ``/`` in the file name are replaced by ``_``. An existing
        file is never overwritten. Empty transcripts are not written, because
        an empty file would make the episode look already downloaded on every
        later run.

        Args:
            podcast_name: Podcast name used as the file-name prefix.
            episode_title: Episode identifier used as the file-name suffix.
            transcript: Transcript text to store.
        """
        if not transcript:
            print(f"Empty transcript, nothing saved for: {episode_title}")
            return

        # Create 'Transcripts' folder if it doesn't exist
        transcripts_folder = 'Transcripts'
        os.makedirs(transcripts_folder, exist_ok=True)

        # Format filename and check if it already exists
        filename = f"{podcast_name}_{episode_title}.txt".replace(" ", "_").replace("/", "_")
        filepath = os.path.join(transcripts_folder, filename)

        if os.path.exists(filepath):
            print(f"Transcript already downloaded: {filename}")
            return

        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(transcript)
        print(f"Transcript saved: {filename}")

def main():
    """Download transcripts for a fixed list of podcasts from podscript.ai.

    For each podcast, walks every listing page, skips episodes whose
    transcript file already exists in ``Transcripts``, and saves the rest.
    The browser is always closed at the end, even on error.
    """
    path_to_driver = "./msedgedriver.exe"
    scraper = PodscriptScraper(path_to_driver)
    transcripts_folder = 'Transcripts'

    try:
        podcasts = [
            "lex fridman podcast - transcripts",
            "huberman lab podcast - transcripts",
            "the diary of a ceo with steven bartlett podcast - transcripts",
            "lifespan with dr. david sinclair podcast - transcripts",
            "revolutions podcast - transcripts",
        ]

        for podcast_name in podcasts:
            if not scraper.navigate_to_podcast(podcast_name):
                print(f"Could not navigate to {podcast_name}")
                continue

            while True:
                # Scraping an episode moves the browser away from the listing,
                # so remember where the listing is for the pagination step.
                listing_url = scraper.driver.current_url
                episode_links = scraper.get_episode_links()
                for link in episode_links:
                    if not link:
                        continue
                    # Extract title (last path segment) from URL, with or
                    # without a trailing slash.
                    episode_title = link.rstrip('/').split('/')[-1]
                    # Format filename and check if it already exists
                    filename = f"{podcast_name}_{episode_title}.txt".replace(" ", "_").replace("/", "_")
                    filepath = os.path.join(transcripts_folder, filename)

                    if os.path.exists(filepath):
                        print(f"Transcript already downloaded: {filename}")
                        continue

                    transcript = scraper.scrape_transcript(link)
                    # An empty file would mark the episode as downloaded and
                    # prevent any retry on a later run.
                    if transcript:
                        scraper.save_transcript(podcast_name, episode_title, transcript)

                # Go back to the listing page before looking for its 'next' link.
                if scraper.driver.current_url != listing_url:
                    scraper.driver.get(listing_url)

                if not scraper.navigate_to_next_page():
                    break
    finally:
        scraper.close()

if __name__ == "__main__":
    main()

## Scraper for the Huberman Lab Podcast website


In [11]:
import re

class HubermanLabScraper:
    """Selenium (Edge) scraper for episode transcripts on hubermanlab.com.

    Typical use: ``navigate_to_episodes_page`` to open a listing page,
    ``get_episode_links`` / ``navigate_to_next_page`` to walk the listing, and
    ``scrape_transcript`` / ``save_transcript`` for each episode. Call
    ``close`` when done to end the browser session.
    """

    def __init__(self, path_to_driver):
        """Start an Edge WebDriver session.

        Args:
            path_to_driver: Path to the Edge WebDriver executable.
        """
        self.path_to_driver = path_to_driver
        service = Service(executable_path=self.path_to_driver)
        self.driver = webdriver.Edge(service=service)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def navigate_to_episodes_page(self, url=None):
        """Load an episode listing page.

        Args:
            url: Listing page URL; defaults to the site's all-episodes page.
        """
        if url is None:
            url = "https://www.hubermanlab.com/all-episodes"
        print(f"Navigating to Huberman Lab episodes page: {url}")
        self.driver.get(url)
        time.sleep(5)  # Wait for page to load

    def get_episode_links(self):
        """Collect episode URLs from the listing page currently displayed.

        Returns:
            List of absolute episode URLs; links without an ``href`` are
            skipped.
        """
        print("Fetching episode links...")
        episode_links = []
        episodes = self.driver.find_elements(By.CSS_SELECTOR, "h3.hit-title a.u-text-black")
        for episode in episodes:
            link = episode.get_attribute('href')
            if not link:
                # get_attribute returns None when the attribute is absent.
                continue
            # Check if the link already contains the full URL
            if link.startswith("http"):
                full_link = link
            else:
                full_link = f"https://www.hubermanlab.com{link}"
            episode_links.append(full_link)
        return episode_links

    def navigate_to_next_page(self):
        """Click the pagination 'next' link on the current listing page.

        Returns:
            The ``href`` of the clicked link, or None when no clickable next
            link appeared within the wait timeout.
        """
        print("Attempting to navigate to the next page...")
        current_url = self.driver.current_url
        print(f"Current URL before navigating to next: {current_url}")

        try:
            wait = WebDriverWait(self.driver, 10)
            # wait.until either returns the element or raises
            # TimeoutException, so no separate "not found" branch is needed.
            next_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "li.ais-Pagination-item--nextPage a.ais-Pagination-link")))

            next_page_url = next_button.get_attribute('href')
            print(f"Found next button, navigating to: {next_page_url}")
            next_button.click()
            time.sleep(5)  # Wait for the page to load after clicking
            return next_page_url  # Return the URL of the next page
        except (NoSuchElementException, TimeoutException) as e:
            print(f"No next button found or timed out: {e}")
            return None

    def scrape_transcript(self, episode_url):
        """Load an episode page and return its transcript markup.

        Args:
            episode_url: URL of the episode page.

        Returns:
            The inner HTML of the first transcript container on the page, or
            None when there is no container, it is empty, or an error
            occurred while reading the page.
        """
        print(f"Scraping transcript from {episode_url}")
        self.driver.get(episode_url)
        time.sleep(10)  # Increased wait time for content to load

        try:
            transcript_sections = self.driver.find_elements(By.CSS_SELECTOR, "div.rich-text-transcript.w-richtext")
            print(f"Found {len(transcript_sections)} transcript sections")

            if not transcript_sections:
                print("No transcript content found. Skipping this episode.")
                return None

            # Using JavaScript to get innerHTML of the transcript section
            transcript_html = self.driver.execute_script("return arguments[0].innerHTML;", transcript_sections[0])

            # Optional: Clean up HTML tags if necessary, using regex or an HTML parser

            if not transcript_html:
                print("Transcript content is empty. Skipping this episode.")
                return None

            print("Transcript scraped successfully.")
            return transcript_html
        except Exception as e:
            # Best-effort: one broken episode page should not stop the run.
            print(f"Error while scraping transcript: {e}")
            return None

    def save_transcript(self, podcast_name, episode_title, transcript):
        """Write a transcript to ``Transcripts/<podcast>_<episode>.txt``.

        Spaces and ``/`` in the file name are replaced by ``_``. An existing
        file is never overwritten. A missing or empty transcript is not
        written, because an empty file would make the episode look already
        downloaded on every later run.

        Args:
            podcast_name: Podcast name used as the file-name prefix.
            episode_title: Episode identifier used as the file-name suffix.
            transcript: Transcript content to store (may be None).
        """
        if not transcript:
            print(f"Empty transcript, nothing saved for: {episode_title}")
            return

        # Create 'Transcripts' folder if it doesn't exist
        transcripts_folder = 'Transcripts'
        os.makedirs(transcripts_folder, exist_ok=True)

        # Format filename and check if it already exists
        filename = f"{podcast_name}_{episode_title}.txt".replace(" ", "_").replace("/", "_")
        filepath = os.path.join(transcripts_folder, filename)

        if os.path.exists(filepath):
            print(f"Transcript already downloaded: {filename}")
            return

        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(transcript)
        print(f"Transcript saved: {filename}")

def main():
    """Download transcripts for every episode listed on hubermanlab.com.

    Walks the paginated all-episodes listing, skips episodes whose transcript
    file already exists in ``Transcripts``, and saves the rest. The browser is
    always closed at the end, even on error.
    """
    path_to_driver = "./msedgedriver.exe"
    scraper = HubermanLabScraper(path_to_driver)

    try:
        page_url = "https://www.hubermanlab.com/all-episodes"  # Start with the base URL
        # Guard against pagination handing back a page already processed,
        # which would otherwise loop forever.
        visited_pages = set()
        while page_url and page_url not in visited_pages:
            visited_pages.add(page_url)

            # Navigate to the current main episodes page
            scraper.navigate_to_episodes_page(page_url)
            episode_links = scraper.get_episode_links()
            for link in episode_links:
                # Extract title (last path segment, query string removed) from URL
                episode_title = link.split('?')[0].rstrip('/').split('/')[-1]
                filename = f"huberman_lab_{episode_title}.txt".replace(" ", "_").replace("/", "_")
                filepath = os.path.join('Transcripts', filename)

                if os.path.exists(filepath):
                    print(f"Transcript already downloaded: {filename}")
                    continue

                transcript = scraper.scrape_transcript(link)
                if transcript:  # Check if transcript is not None
                    scraper.save_transcript("huberman_lab", episode_title, transcript)

            print("Finished processing current page, moving to the next page...")
            # Scraping episodes moved the browser away from the listing. Go
            # back to the page just processed (not the first page) so that
            # 'next' advances from here instead of always yielding page 2.
            scraper.navigate_to_episodes_page(page_url)
            page_url = scraper.navigate_to_next_page()  # Get the URL of the next page

    finally:
        scraper.close()

if __name__ == "__main__":
    main()


Navigating to Huberman Lab episodes page: https://www.hubermanlab.com/all-episodes
Fetching episode links...
Transcript already downloaded: huberman_lab_david-goggins-how-to-build-immense-inner-strength.txt
Transcript already downloaded: huberman_lab_ama-14-2023-philanthropy-evening-routine-light-therapy-health-metrics-more.txt
Scraping transcript from https://www.hubermanlab.com/episode/rick-rubin-protocols-to-access-creative-energy-and-process
Found 1 transcript sections
Transcript content is empty. Skipping this episode.
Scraping transcript from https://www.hubermanlab.com/episode/dr-robert-lustig-how-sugar-processed-foods-impact-your-health
Found 1 transcript sections
Transcript content is empty. Skipping this episode.
Scraping transcript from https://www.hubermanlab.com/episode/dr-karen-parker-the-causes-treatments-for-autism
Found 1 transcript sections
Transcript content is empty. Skipping this episode.
Scraping transcript from https://www.hubermanlab.com/episode/robert-greene-a-

KeyboardInterrupt: 