## 1) Web Scraping to obtain transcripts

This script creates one text file per episode transcript in a `Transcripts` folder inside the current working directory (the folder is created if it does not exist). Ensure you have write permission and sufficient storage space for the files.

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
import time
import os

class PodscriptScraper:
    """Scrape podcast transcripts from podscript.ai using Selenium and Edge.

    Typical use: ``navigate_to_podcast`` to open a podcast's episode listing,
    ``get_episode_links`` / ``navigate_to_next_page`` to walk the listing,
    ``scrape_transcript`` to pull the text of one episode, and
    ``save_transcript`` to write it under the ``Transcripts`` folder.
    Call ``close`` when finished to shut the browser down.
    """

    def __init__(self, path_to_driver):
        """Start an Edge WebDriver session.

        Args:
            path_to_driver: Path to the Edge driver executable, passed to
                Selenium's ``Service`` as ``executable_path``.
        """
        self.path_to_driver = path_to_driver
        service = Service(executable_path=self.path_to_driver)
        self.driver = webdriver.Edge(service=service)

    def close(self):
        """Quit the browser and end the WebDriver session."""
        self.driver.quit()

    def navigate_to_podcast(self, podcast_name):
        """Open the podcast index and click through to the matching podcast.

        Args:
            podcast_name: Text to look for (case-insensitive substring match)
                in each listing entry's ``h2`` heading.

        Returns:
            True if a matching entry was found and clicked, False otherwise.
        """
        print(f"Navigating to podcast: {podcast_name}")
        self.driver.get("https://podscript.ai/podcasts/")
        time.sleep(5)  # Wait for page to load
        wanted = podcast_name.lower()
        for podcast in self.driver.find_elements(By.CSS_SELECTOR, "article.post-entry"):
            try:
                header = podcast.find_element(By.CSS_SELECTOR, "h2")
                if wanted not in header.text.lower():
                    continue
                link = podcast.find_element(By.CSS_SELECTOR, "a.entry-link")
            except NoSuchElementException:
                # An entry without a heading or link must not abort the
                # search; just move on to the next entry.
                continue
            link.click()
            time.sleep(5)  # Wait for navigation
            return True
        return False

    def get_episode_links(self):
        """Collect the episode URLs listed on the page currently loaded.

        Returns:
            A list of ``href`` strings, in page order. Anchors without an
            ``href`` are omitted so callers always receive usable URLs.
        """
        print("Fetching episode links...")
        anchors = self.driver.find_elements(By.CSS_SELECTOR, "article.post-entry a.entry-link")
        hrefs = (anchor.get_attribute('href') for anchor in anchors)
        return [href for href in hrefs if href]

    def navigate_to_next_page(self):
        """Follow the listing's "next" pagination link, if there is one.

        Returns:
            True if the browser was sent to the next page; False if no next
            link appeared within 10 seconds or the link carries no URL.
        """
        try:
            # Wait for up to 10 seconds for the 'next' button to be available
            next_button = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "nav.pagination a.next"))
            )
        except (NoSuchElementException, TimeoutException) as e:
            print(f"No next button found or timed out: {e}")
            return False

        next_page_url = next_button.get_attribute('href')
        if not next_page_url:
            # Without a target URL there is nowhere to go; treat it as the
            # end of the listing instead of passing None to driver.get().
            print("Next button has no link target; stopping pagination.")
            return False

        print(f"Navigating to next page: {next_page_url}")
        self.driver.get(next_page_url)
        time.sleep(5)  # Wait for page to load
        return True

    def scrape_transcript(self, episode_url):
        """Load an episode page and return the text of all its paragraphs.

        Args:
            episode_url: URL of the episode page to open.

        Returns:
            The non-blank ``<p>`` texts joined with newlines, or an empty
            string if nothing was found or reading the page failed.
        """
        print(f"Scraping transcript from {episode_url}")
        self.driver.get(episode_url)
        time.sleep(10)  # Increased wait time for content to load

        try:
            # Targeting all paragraph tags for transcript extraction.
            # Read each element's text once and reuse it for the blank check.
            paragraphs = self.driver.find_elements(By.TAG_NAME, "p")
            texts = [paragraph.text for paragraph in paragraphs]
        except Exception as e:
            # Deliberate best-effort: one unreadable page must not stop the
            # whole run, so report it and hand back an empty transcript.
            print(f"Error while scraping transcript: {e}")
            return ""

        transcript = "\n".join(text for text in texts if text.strip())
        if transcript:
            print("Transcript scraped successfully.")
        else:
            print("No transcript content found.")
        return transcript

    def save_transcript(self, podcast_name, episode_title, transcript):
        """Write a transcript to ``Transcripts/<podcast>_<episode>.txt``.

        Spaces and ``/`` in the file name are replaced with ``_``. An existing
        file is never overwritten. Empty or whitespace-only transcripts are
        not written: an empty file would make the episode look "already
        downloaded" on every later run and it would never be retried.

        Args:
            podcast_name: Podcast name used as the file-name prefix.
            episode_title: Episode identifier used as the file-name suffix.
            transcript: Text to write (UTF-8).
        """
        transcripts_folder = 'Transcripts'
        filename = f"{podcast_name}_{episode_title}.txt".replace(" ", "_").replace("/", "_")
        filepath = os.path.join(transcripts_folder, filename)

        if not transcript or not transcript.strip():
            print(f"Empty transcript, not saving: {filename}")
            return

        # Create 'Transcripts' folder if it doesn't exist
        os.makedirs(transcripts_folder, exist_ok=True)

        if os.path.exists(filepath):
            print(f"Transcript already downloaded: {filename}")
            return

        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(transcript)
        print(f"Transcript saved: {filename}")

def main():
    """Download transcripts for a fixed list of podcasts.

    For each podcast, walks every page of its episode listing, skips episodes
    whose transcript file already exists under ``Transcripts``, and scrapes
    and saves the rest. The browser is always closed on exit.
    """
    path_to_driver = "./msedgedriver.exe"
    scraper = PodscriptScraper(path_to_driver)
    transcripts_folder = 'Transcripts'

    try:
        podcasts = [
            "lex fridman podcast - transcripts",
            "huberman lab podcast - transcripts",
            "the diary of a ceo with steven bartlett podcast - transcripts",
            "lifespan with dr. david sinclair podcast - transcripts",
            "revolutions podcast - transcripts",
        ]

        for podcast_name in podcasts:
            if not scraper.navigate_to_podcast(podcast_name):
                print(f"Could not navigate to {podcast_name}")
                continue

            while True:
                # Scraping an episode moves the browser off the listing page,
                # so remember where the listing is in order to come back to
                # it before looking for the pagination link.
                listing_url = scraper.driver.current_url
                left_listing = False

                for link in scraper.get_episode_links():
                    if not link:
                        continue  # anchor without an href: nothing to fetch

                    # Last path segment of the URL, with or without a
                    # trailing slash.
                    episode_title = link.rstrip('/').rsplit('/', 1)[-1]
                    # Format filename and check if it already exists
                    filename = f"{podcast_name}_{episode_title}.txt".replace(" ", "_").replace("/", "_")
                    filepath = os.path.join(transcripts_folder, filename)

                    if os.path.exists(filepath):
                        print(f"Transcript already downloaded: {filename}")
                        continue

                    transcript = scraper.scrape_transcript(link)
                    left_listing = True
                    scraper.save_transcript(podcast_name, episode_title, transcript)

                if left_listing:
                    # Return to the listing; the "next" link lives there, not
                    # on the episode page that was scraped last.
                    scraper.driver.get(listing_url)

                if not scraper.navigate_to_next_page():
                    break
    finally:
        scraper.close()

# Entry point: start the scraping run only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Navigating to podcast: lex fridman podcast - transcripts
Fetching episode links...
Transcript already downloaded: lex_fridman_podcast_-_transcripts_374-robert-playter-boston-dynamics-ceo-on-humanoid-and-legged-robotics.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_373-manolis-kellis-evolution-of-human-civilization-and-superintelligent-ai.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_372-simone-giertz-queen-of-sh-tty-robots-innovative-engineering-and-design.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_371-max-tegmark-the-case-for-halting-ai-development.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_370-edward-frenkel-reality-is-a-paradox-mathematics-physics-truth-love.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_369-paul-rosolie-amazon-jungle-uncontacted-tribes-anacondas-and-ayahuasca.txt
Transcript already downloaded: lex_fridman_podcast_-_transcripts_368-eliezer-yudkowsk