In [2]:
# Importing the necessary Python libraries
import os
import json
import time
import yaml
from datetime import datetime

import polars as pl

import feedparser
import mlx_whisper
import requests

# Loading the Watercooler RSS feed URLs from the local YAML keys file
with open('../keys/watercooler_rss_feed.yaml', 'r') as f:
    wc_rss_urls = yaml.safe_load(f)

# Pulling out the two feed URLs (Patreon and public) by their keys
wc_rss_patreon_url = wc_rss_urls['WATERCOOLER_PATREON_FEED']
wc_rss_public_url = wc_rss_urls['WATERCOOLER_PUBLIC_FEED']

# Parsing each feed with feedparser
wc_rss_patreon = feedparser.parse(wc_rss_patreon_url)
wc_rss_public = feedparser.parse(wc_rss_public_url)

In [None]:
# Transcribing public episodes that do not have a transcript on disk yet.
# For each such episode: download the audio, transcribe it with MLX Whisper,
# save the text, and delete the temporary audio file.
df_wc_public_episodes = pl.read_csv('../data/episode-metadata/wc_public_episodes.csv')

wc_public_transcript_dir = '../data/transcripts/main'

# Iterating over all the episodes in the public episode metadata DataFrame
for episode in df_wc_public_episodes.iter_rows(named = True):

    # Setting the file paths for the episode transcript and the temporary audio download
    episode_transcript_filepath = os.path.join(wc_public_transcript_dir, f"episode_{episode['episode_num']:03d}.txt")
    episode_audio_filepath = os.path.join(wc_public_transcript_dir, f"episode_{episode['episode_num']:03d}.mp3")

    # Skipping episodes that already have a transcript so re-runs resume where they left off
    if os.path.exists(episode_transcript_filepath):
        continue

    try:
        # Downloading the audio file for the episode. The context manager releases the
        # connection, and the timeout (connect, read) keeps a stalled server from
        # hanging the cell forever.
        with requests.get(episode['link'], stream = True, timeout = (10, 60)) as response:

            # Failing loudly on HTTP errors instead of saving an error page as audio
            response.raise_for_status()

            # Writing the audio file to the disk
            with open(episode_audio_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size = 64 * 1024):
                    f.write(chunk)

        # Transcribing the audio file with the MLX Whisper API
        transcribed_text = mlx_whisper.transcribe(episode_audio_filepath, path_or_hf_repo='mlx-community/whisper-large-v3-turbo')['text']

        # Writing the transcribed text to the episode transcript file (explicit UTF-8 so
        # non-ASCII characters are written the same way on every platform)
        with open(episode_transcript_filepath, 'w', encoding = 'utf-8') as f:
            f.write(transcribed_text)

    finally:
        # Deleting the audio file, even when the download or the transcription failed
        if os.path.exists(episode_audio_filepath):
            os.remove(episode_audio_filepath)

    print(episode_transcript_filepath)

    # NOTE: stops after the first newly transcribed episode; remove this `break`
    # to process every remaining episode in one run
    break