In [1]:
# Importing the necessary Python libraries
import os
import json
import time
import yaml
import traceback
from datetime import datetime

import polars as pl

import mlx_whisper
import requests

In [None]:
# Root folder under which the transcript cells below write their output
wc_transcript_dir = '../data/transcripts/'

# Episode metadata tables, one CSV per feed (public, Patreon, movie night),
# loaded in that order and unpacked into their respective DataFrames
(
    df_wc_public_episodes,
    df_wc_patreon_episodes,
    df_wc_movie_episodes,
) = map(
    pl.read_csv,
    (
        '../data/episode-metadata/wc_public_episodes.csv',
        '../data/episode-metadata/wc_patreon_episodes.csv',
        '../data/episode-metadata/wc_movie_night_episodes.csv',
    ),
)

## Public Episodes

In [3]:
# Setting the directory for the main (public) episode transcripts and creating it
# up front, so the audio/transcript writes below cannot fail on a missing folder
wc_public_transcript_dir = os.path.join(wc_transcript_dir, 'main')
os.makedirs(wc_public_transcript_dir, exist_ok = True)

# Iterating over the episodes in the public episode metadata DataFrame
for episode in df_wc_public_episodes.iter_rows(named = True):

    # Setting the file paths for the transcript, its metadata sidecar and the temporary audio download
    episode_file_stem = f"episode_{episode['episode_num']:03d}"
    episode_transcript_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt")
    episode_transcript_metadata_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt.metadata.json")
    episode_audio_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.mp3")

    # Skipping episodes that already have a transcript, which makes the cell safe to re-run
    if os.path.exists(episode_transcript_filepath):
        continue

    # Attempting to download and transcribe the audio
    try:

        # Downloading the audio file for the episode. The timeout (connect, read) keeps a
        # stalled server from hanging the cell, and the context manager releases the connection.
        with requests.get(episode['link'], stream = True, timeout = (10, 120)) as response:

            # Failing fast on HTTP errors instead of saving an error page as an .mp3
            response.raise_for_status()

            # Writing the audio file to the disk
            with open(episode_audio_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size = 64 * 1024):
                    f.write(chunk)

        # Transcribing the audio file with MLX Whisper
        transcribed_text = mlx_whisper.transcribe(episode_audio_filepath, path_or_hf_repo = 'mlx-community/whisper-large-v3-turbo')['text']

        # Writing the transcribed text to the episode transcript file
        with open(episode_transcript_filepath, 'w', encoding = 'utf-8') as f:
            f.write(transcribed_text)

    except Exception as e:
        # Log the error details to a file
        error_log_path = '../data/logs/transcript_errors.log'
        os.makedirs(os.path.dirname(error_log_path), exist_ok = True)

        with open(error_log_path, 'a', encoding = 'utf-8') as log_file:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            log_file.write(f"[{timestamp}] Error processing episode {episode['episode_num']}: {e}\n")
            log_file.write(f"Episode link: {episode['link']}\n")
            log_file.write(f"Traceback: {traceback.format_exc()}\n\n")

        print(f"Error processing episode {episode['episode_num']}. See log file for details.")

        # Moving on to the next episode; the finally clause below still runs first
        continue

    finally:
        # Deleting the temporary audio file on success, on failure and on interruption
        if os.path.exists(episode_audio_filepath):
            os.remove(episode_audio_filepath)

    # Forming the metadata content
    episode_metadata = {
        'metadataAttributes': {
            'episode_title': episode['title'],
            'episode_summary': episode['summary'],
            'episode_num': episode['episode_num'],
            'episode_upload_date': episode['timestamp']
        }
    }

    # Writing the metadata content to the episode transcript metadata file
    with open(episode_transcript_metadata_filepath, 'w', encoding = 'utf-8') as f:
        json.dump(episode_metadata, f, indent = 4)

    # NOTE(review): this stops the loop after the first newly transcribed episode, so each
    # run of the cell handles at most one episode. It looks like a leftover from testing;
    # remove it to process every remaining episode in a single run.
    break

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

## Patreon Episodes

In [4]:
# Setting the directory for the Patreon episode transcripts and creating it up front,
# so the audio/transcript writes below cannot fail on a missing folder.
# (The variable name is the one this notebook reuses for every feed's output directory.)
wc_public_transcript_dir = os.path.join(wc_transcript_dir, 'patreon')
os.makedirs(wc_public_transcript_dir, exist_ok = True)

# Iterating over the episodes in the Patreon episode metadata DataFrame
for episode in df_wc_patreon_episodes.iter_rows(named = True):

    # Setting the file paths for the transcript, its metadata sidecar and the temporary audio download.
    # The episode number is zero-padded via str(), so it works whether the value is an int or a string.
    episode_file_stem = f"episode_{str(episode['episode_num']).zfill(3)}"
    episode_transcript_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt")
    episode_transcript_metadata_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt.metadata.json")
    episode_audio_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.mp3")

    # Skipping episodes that already have a transcript, which makes the cell safe to re-run
    if os.path.exists(episode_transcript_filepath):
        continue

    # Attempting to download and transcribe the audio
    try:

        # Downloading the audio file for the episode. The timeout (connect, read) keeps a
        # stalled server from hanging the cell, and the context manager releases the connection.
        with requests.get(episode['link'], stream = True, timeout = (10, 120)) as response:

            # Failing fast on HTTP errors instead of saving an error page as an .mp3
            response.raise_for_status()

            # Writing the audio file to the disk
            with open(episode_audio_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size = 64 * 1024):
                    f.write(chunk)

        # Transcribing the audio file with MLX Whisper
        transcribed_text = mlx_whisper.transcribe(episode_audio_filepath, path_or_hf_repo = 'mlx-community/whisper-large-v3-turbo')['text']

        # Writing the transcribed text to the episode transcript file
        with open(episode_transcript_filepath, 'w', encoding = 'utf-8') as f:
            f.write(transcribed_text)

    except Exception as e:
        # Log the error details to a file
        error_log_path = '../data/logs/transcript_errors.log'
        os.makedirs(os.path.dirname(error_log_path), exist_ok = True)

        with open(error_log_path, 'a', encoding = 'utf-8') as log_file:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            log_file.write(f"[{timestamp}] Error processing episode {episode['episode_num']}: {e}\n")
            log_file.write(f"Episode link: {episode['link']}\n")
            log_file.write(f"Traceback: {traceback.format_exc()}\n\n")

        print(f"Error processing episode {episode['episode_num']}. See log file for details.")

        # Moving on to the next episode; the finally clause below still runs first
        continue

    finally:
        # Deleting the temporary audio file on success, on failure and on interruption
        if os.path.exists(episode_audio_filepath):
            os.remove(episode_audio_filepath)

    # Forming the metadata content
    episode_metadata = {
        'metadataAttributes': {
            'episode_title': episode['title'],
            'episode_summary': episode['summary'],
            'episode_num': episode['episode_num'],
            'episode_upload_date': episode['timestamp']
        }
    }

    # Writing the metadata content to the episode transcript metadata file
    with open(episode_transcript_metadata_filepath, 'w', encoding = 'utf-8') as f:
        json.dump(episode_metadata, f, indent = 4)

    # NOTE(review): this stops the loop after the first newly transcribed episode, so each
    # run of the cell handles at most one episode. It looks like a leftover from testing;
    # remove it to process every remaining episode in a single run.
    break

## Movie Night Episodes

In [None]:
# Setting the directory for the movie night episode transcripts and creating it up front,
# so the audio/transcript writes below cannot fail on a missing folder.
# (The variable name is the one this notebook reuses for every feed's output directory.)
wc_public_transcript_dir = os.path.join(wc_transcript_dir, 'movie-night')
os.makedirs(wc_public_transcript_dir, exist_ok = True)

# Iterating over the episodes in the movie night episode metadata DataFrame.
# (This previously looped over df_wc_public_episodes, which would have written public
# episodes into the movie-night folder and never touched the movie night table.)
# NOTE(review): assumes the movie night CSV has the same columns as the other feeds
# (episode_num, link, title, summary, timestamp) — confirm against the CSV.
for episode in df_wc_movie_episodes.iter_rows(named = True):

    # Setting the file paths for the transcript, its metadata sidecar and the temporary audio download.
    # The episode number is zero-padded via str(), so it works whether the value is an int or a string.
    episode_file_stem = f"episode_{str(episode['episode_num']).zfill(3)}"
    episode_transcript_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt")
    episode_transcript_metadata_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.txt.metadata.json")
    episode_audio_filepath = os.path.join(wc_public_transcript_dir, f"{episode_file_stem}.mp3")

    # Skipping episodes that already have a transcript, which makes the cell safe to re-run
    if os.path.exists(episode_transcript_filepath):
        continue

    # Attempting to download and transcribe the audio
    try:

        # Downloading the audio file for the episode. The timeout (connect, read) keeps a
        # stalled server from hanging the cell, and the context manager releases the connection.
        with requests.get(episode['link'], stream = True, timeout = (10, 120)) as response:

            # Failing fast on HTTP errors instead of saving an error page as an .mp3
            response.raise_for_status()

            # Writing the audio file to the disk
            with open(episode_audio_filepath, 'wb') as f:
                for chunk in response.iter_content(chunk_size = 64 * 1024):
                    f.write(chunk)

        # Transcribing the audio file with MLX Whisper
        transcribed_text = mlx_whisper.transcribe(episode_audio_filepath, path_or_hf_repo = 'mlx-community/whisper-large-v3-turbo')['text']

        # Writing the transcribed text to the episode transcript file
        with open(episode_transcript_filepath, 'w', encoding = 'utf-8') as f:
            f.write(transcribed_text)

    except Exception as e:
        # Log the error details to a file
        error_log_path = '../data/logs/transcript_errors.log'
        os.makedirs(os.path.dirname(error_log_path), exist_ok = True)

        with open(error_log_path, 'a', encoding = 'utf-8') as log_file:
            timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            log_file.write(f"[{timestamp}] Error processing episode {episode['episode_num']}: {e}\n")
            log_file.write(f"Episode link: {episode['link']}\n")
            log_file.write(f"Traceback: {traceback.format_exc()}\n\n")

        print(f"Error processing episode {episode['episode_num']}. See log file for details.")

        # Moving on to the next episode; the finally clause below still runs first
        continue

    finally:
        # Deleting the temporary audio file on success, on failure and on interruption
        if os.path.exists(episode_audio_filepath):
            os.remove(episode_audio_filepath)

    # Forming the metadata content
    episode_metadata = {
        'metadataAttributes': {
            'episode_title': episode['title'],
            'episode_summary': episode['summary'],
            'episode_num': episode['episode_num'],
            'episode_upload_date': episode['timestamp']
        }
    }

    # Writing the metadata content to the episode transcript metadata file
    with open(episode_transcript_metadata_filepath, 'w', encoding = 'utf-8') as f:
        json.dump(episode_metadata, f, indent = 4)

    # NOTE(review): this stops the loop after the first newly transcribed episode, so each
    # run of the cell handles at most one episode. It looks like a leftover from testing;
    # remove it to process every remaining episode in a single run.
    break