# Metadata Managers

### Imports and constants

In [217]:
from abc import ABC, abstractmethod
import json

# Names of the metadata fields. The metadata managers below use them as
# dictionary keys, and the media sources expose them as attribute names.
IMAGE_URL = "image_url"
CREATORS = "creators"
CHANNEL_NAME = "channel_name"
TITLE = "title"
# NOTE(review): SUMMARY is not referenced by the code visible in this file.
SUMMARY = "summary"
AUDIO_URL= "audio_url"

### Base Metadata Manager

In [218]:
# Root directory (relative path) under which channel/item directories are built.
CONTENT_DIRECTORY_NAME = "content"
# File names used inside each media item's directory. The part before the
# extension ("audio", "transcript", "summary") is the key of the store-path dict.
AUDIO_FILE_NAME = "audio.mp3"
TRANSCRIPT_FILE_NAME = "transcript.json"
SUMMARY_FILE_NAME = "summary.txt"

class BaseMetadataManager(ABC):
    """ Base class for metadata managers.

    Concrete managers receive their source objects through ``__init__`` and
    expose the collected data through ``fetch_metadata``.
    """

    @abstractmethod
    def fetch_metadata(self):
        """ Fetches metadata from the source. For example title, author, etc """
        # Instance method taking only ``self``, matching the signature every
        # concrete manager in this file implements.
        pass


#### RSS Podcast Metadata Manager

In [219]:
from pathlib import Path
import utils

class PodcastMetadataManager(BaseMetadataManager):
    """ Metadata manager for a single episode of an RSS podcast feed """

    def __init__(self, feed, entry) -> None:
        """ Initialises PodcastMetadataManager

        feed:  channel-level feed object; its ``title``, ``image.href`` and
               ``authors`` attributes are read.
        entry: episode-level entry object; its ``title`` and ``links``
               attributes are read.
        """
        self.feed = feed
        self.entry = entry

    def _fetch_podcast_metadata(self) -> dict:
        """ Fetches podcast (channel level) metadata from RSS feed

        Raises ValueError if no feed was provided.
        """
        if self.feed is None:
            raise ValueError("Podcast feed is None")
        return {CHANNEL_NAME: self.feed.title,
                IMAGE_URL: self.feed.image.href,
                CREATORS: [author.name for author in self.feed.authors]}

    def _fetch_episode_metadata(self) -> dict:
        """ Fetches episode metadata from entry

        Raises ValueError if no entry was provided or if the entry has no
        link of type 'audio/mpeg'.
        """
        if self.entry is None:
            raise ValueError("Episode entry is None")
        # Take the first MP3 link; links without a ``type`` are skipped
        # instead of raising AttributeError.
        audio_url = next((link.href for link in self.entry.links
                          if getattr(link, "type", None) == 'audio/mpeg'),
                         None)
        if audio_url is None:
            raise ValueError("Episode entry has no 'audio/mpeg' link")
        return {TITLE: self.entry.title,
                AUDIO_URL: audio_url}

    def _fetch_store_paths(self) -> dict:
        """ Builds (and creates on disk) the storage directory of the episode

        Returns a dict with the keys "audio", "transcript" and "summary"
        mapped to the corresponding file paths inside
        content/<channel>/<episode>/.
        """
        # Channel directory, then one directory per episode inside it
        channel_dir = Path(CONTENT_DIRECTORY_NAME).joinpath(utils.to_filename(self.feed.title))
        media_source_dir = channel_dir.joinpath(utils.to_filename(self.entry.title))
        # parents=True creates the content and channel directories as needed
        media_source_dir.mkdir(parents=True, exist_ok=True)
        # Determine audio, transcript and summary paths
        return {AUDIO_FILE_NAME.split('.')[0]: media_source_dir.joinpath(AUDIO_FILE_NAME),
                TRANSCRIPT_FILE_NAME.split('.')[0]: media_source_dir.joinpath(TRANSCRIPT_FILE_NAME),
                SUMMARY_FILE_NAME.split('.')[0]: media_source_dir.joinpath(SUMMARY_FILE_NAME)}

    def fetch_metadata(self) -> tuple:
        """ Fetches metadata from the source. For example title, author, etc

        Returns a ``(store_paths, metadata)`` tuple: the dict of file paths
        and the merged podcast + episode metadata dict.
        """
        # Validate and collect the metadata first so that a missing feed or
        # entry raises ValueError before any directory is created.
        metadata = {**self._fetch_podcast_metadata(),
                    **self._fetch_episode_metadata()}
        return (self._fetch_store_paths(), metadata)


### YouTube Metadata Manager

In [220]:
from pytube import YouTube

class YouTubeVideoMetadataManager(BaseMetadataManager):
    """ Youtube video metadata manager """
    def __init__(self, feed : YouTube) -> None:
        """ Initialises YouTubeVideoMetadataManager

        feed: pytube YouTube object of the video; its ``author``, ``title``,
              ``thumbnail_url`` and ``streams`` attributes are read.
        """
        self.feed = feed

    def _fetch_store_paths(self) -> dict:
        """ Builds (and creates on disk) the storage directory of the video

        Returns a dict with the keys "audio", "transcript" and "summary"
        mapped to the corresponding file paths inside
        content/<author>/<video title>/.
        """
        # Channel directory (named after the author), then one directory per video
        channel_dir = Path(CONTENT_DIRECTORY_NAME).joinpath(utils.to_filename(self.feed.author))
        media_source_dir = channel_dir.joinpath(utils.to_filename(self.feed.title))
        # parents=True creates the content and channel directories as needed
        media_source_dir.mkdir(parents=True, exist_ok=True)
        # Determine audio, transcript and summary paths
        return {AUDIO_FILE_NAME.split('.')[0]: media_source_dir.joinpath(AUDIO_FILE_NAME),
                TRANSCRIPT_FILE_NAME.split('.')[0]: media_source_dir.joinpath(TRANSCRIPT_FILE_NAME),
                SUMMARY_FILE_NAME.split('.')[0]: media_source_dir.joinpath(SUMMARY_FILE_NAME)}

    def fetch_metadata(self) -> tuple:
        """ Fetches metadata from the source. For example title, author, etc

        Returns a ``(store_paths, metadata)`` tuple: the dict of file paths
        and the metadata dict of the video.
        Raises ValueError if the video exposes no audio-only stream.
        """
        # Audio-only streams sorted by descending bitrate; first() returns
        # None when the query is empty, so check before reading ``url``.
        best_audio = self.feed.streams.filter(only_audio=True).order_by('abr').desc().first()
        if best_audio is None:
            raise ValueError("No audio-only stream available for this video")
        metadata = {CHANNEL_NAME: self.feed.author,
                    IMAGE_URL: self.feed.thumbnail_url,
                    CREATORS: [self.feed.author],
                    TITLE: self.feed.title,
                    AUDIO_URL: best_audio.url}
        return (self._fetch_store_paths(), metadata)
        

# Media Sources

#### Imports

In [221]:
from abc import ABC, abstractmethod

import pytube
import feedparser
import json

from fuzzywuzzy import fuzz

### Base Media Source

In [222]:
from  pathlib import Path
import utils

class MediaSource(ABC):
    """ Abstract interface shared by every media source """

    @abstractmethod
    def fetch_metadata(self):
        """ Loads the descriptive data of the source (title, author, ...) """

    @abstractmethod
    def fetch_audio_paths_urls(self):
        """ Provides the audio download information of the source """


### Podcast from RSS Source

In [223]:
class RSSPodcast(MediaSource):
    """ Podcast episode sourced from an RSS feed """

    def __init__(self, url : str, episode_title: str = None) -> None:
        """ Parses the feed at ``url`` and optionally selects an episode

        When ``episode_title`` is given, the best matching episode is
        selected and its metadata is loaded immediately.
        """
        self.url = url
        self._feed = feedparser.parse(self.url)
        self.channel_name, self.title = self._feed.feed.title, None
        self._episode = None
        if episode_title:
            self._episode = self.find_entry_from_title(episode_title)
            self.fetch_metadata()

    def list_episodes(self):
        """ Prints the title of every episode of the feed """
        for entry in self._feed.entries:
            print("Title:", entry.title)

    def find_entry_from_title(self, title : str, similarity_threshold: float = 80):
        """
        Returns the entry whose title is most similar to ``title``.

        The similarity is the ``fuzz.ratio`` score. The best score must be
        at least ``similarity_threshold``, otherwise ValueError is raised.
        ValueError is also raised when the feed has no entries.
        """
        entries = self._feed.entries
        if not entries:
            raise ValueError("Feed contains no episodes")
        similarity_scores = [fuzz.ratio(title, entry.title) for entry in entries]
        max_score = max(similarity_scores)
        # Enforce the documented threshold instead of silently returning an
        # unrelated episode.
        if max_score < similarity_threshold:
            raise ValueError(
                f"No episode title matches {title!r} "
                f"(best score {max_score}, threshold {similarity_threshold})")
        # index() returns the first entry reaching the best score
        return entries[similarity_scores.index(max_score)]

    def fetch_metadata(self):
        """ Fetches podcast metadata from RSS feed

        Stores the file paths in ``self.store_paths`` and sets every
        metadata field as an attribute of this object.
        Raises ValueError if no episode is selected.
        """
        if self._episode is None:
            raise ValueError("No episode selected")
        self.store_paths, metadata = PodcastMetadataManager(self._feed.feed, self._episode).fetch_metadata()
        for key, value in metadata.items():
            setattr(self, key, value)

    def fetch_audio_paths_urls(self):
        """ Returns a dict mapping the audio store path to the download url

        Raises ValueError if no episode is selected.
        """
        # Without an episode, store_paths and the audio url were never set
        if self._episode is None:
            raise ValueError("No episode selected")
        return {self.store_paths["audio"]: getattr(self, AUDIO_URL)}

    def __repr__(self):
        """ Representation of Podcast Object """
        return f"""Podcast[Podcast = {self.channel_name}, Episode = {self.title}]"""

### YouTube Video Source

In [224]:
class YouTubeVideo(MediaSource):
    """ Media source backed by a single YouTube video """

    def __init__(self, url : str) -> None:
        """ Creates the pytube handle for ``url`` and loads its metadata """
        self.url = url
        self._feed = pytube.YouTube(self.url)
        self.fetch_metadata()

    def fetch_metadata(self) -> None:
        """ Loads store paths and metadata; metadata fields become attributes """
        manager = YouTubeVideoMetadataManager(self._feed)
        self.store_paths, details = manager.fetch_metadata()
        for field_name in details:
            setattr(self, field_name, details[field_name])

    def fetch_audio_paths_urls(self):
        """ Maps the audio store path of the video to its download url """
        audio_path = self.store_paths["audio"]
        audio_url = getattr(self, AUDIO_URL)
        return {audio_path: audio_url}

    def __repr__(self):
        """ Text form showing the channel and the title of the video """
        return f"""YouTubeVideo[Channel = {self.channel_name}, Title = {self.title}]"""

### YouTube Playlist Source

In [234]:
class YouTubePlaylist(MediaSource):
    """ YouTube playlist source class """
    def __init__(self, url : str) -> None:
        """ Creates the pytube playlist for ``url`` and loads its videos """
        self.url = url
        self._feed = pytube.Playlist(self.url)
        self.title = self._feed.title
        self.fetch_metadata()

    def fetch_metadata(self) -> None:
        """ Fetches YouTube playlist metadata

        Builds one YouTubeVideo per video url of the playlist and stores
        them in ``self.youtube_videos``.
        """
        self.youtube_videos = [YouTubeVideo(video_url) for video_url in self._feed.video_urls]

    def fetch_audio_paths_urls(self):
        """ Returns a dict mapping each video's audio store path to its download url """
        audio_paths_urls = {}
        for video in self.youtube_videos:
            audio_paths_urls.update(video.fetch_audio_paths_urls())
        # The merged dict must be returned; callers iterate over its items
        return audio_paths_urls

    def __repr__(self):
        """ Representation of YouTubePlaylist Object """
        return f"""YouTubePlaylist[Playlist = {self.title}]"""



### Source Factory

In [226]:
class SourceFactory:
    """ Factory building the media source that matches a source type """

    @staticmethod
    def create_source(source_type: str, url: str, episode_title: str = None) -> MediaSource:
        """ Creates a media source

        source_type:   "rss", "youtube" or "youtube_playlist"
        url:           url of the feed, video or playlist
        episode_title: episode to select; only used for "rss"

        Raises ValueError for an unsupported source type.
        """
        if source_type == "rss":
            return RSSPodcast(url, episode_title)
        if source_type == "youtube":
            return YouTubeVideo(url)
        if source_type == "youtube_playlist":
            return YouTubePlaylist(url)
        # Fail loudly instead of returning None to the caller
        raise ValueError(f"Unsupported source type: {source_type!r}")

### Tests

In [227]:
# Build an RSS podcast source for one episode and show its public attributes
rss = SourceFactory.create_source(
    'rss',
    'https://lexfridman.com/feed/podcast/',
    'John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3',
)
rss.fetch_metadata()

for key, value in vars(rss).items():
    # Attributes starting with an underscore are internal; skip them
    if key.startswith('_'):
        continue
    print(f"{key}:", value)

print(rss)

url: https://lexfridman.com/feed/podcast/
channel_name: Lex Fridman Podcast
title: #401 – John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3
store_paths: {'audio': PosixPath('content/lex_fridman_podcast/401_john_mearsheimer_israel_palestine_russia_ukraine_china_nato_and_ww3/audio.mp3'), 'transcript': PosixPath('content/lex_fridman_podcast/401_john_mearsheimer_israel_palestine_russia_ukraine_china_nato_and_ww3/transcript.json'), 'summary': PosixPath('content/lex_fridman_podcast/401_john_mearsheimer_israel_palestine_russia_ukraine_china_nato_and_ww3/summary.txt')}
image_url: https://lexfridman.com/wordpress/wp-content/uploads/powerpress/artwork_3000-230.png
creators: ['Lex Fridman']
audio_url: https://media.blubrry.com/takeituneasy/content.blubrry.com/takeituneasy/lex_ai_john_mearsheimer.mp3
Podcast[Podcast = Lex Fridman Podcast, Episode = #401 – John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3]


In [241]:
# Build a YouTube video source and show its public attributes
ytv = SourceFactory.create_source(
    'youtube',
    'https://www.youtube.com/watch?v=CvQ7e6yUtnw',
)
for key, value in vars(ytv).items():
    # Attributes starting with an underscore are internal; skip them
    if key.startswith('_'):
        continue
    print(f"{key}:", value)
print(ytv)

url: https://www.youtube.com/watch?v=CvQ7e6yUtnw
store_paths: {'audio': PosixPath('content/arjancodes/this_is_why_python_data_classes_are_awesome/audio.mp3'), 'transcript': PosixPath('content/arjancodes/this_is_why_python_data_classes_are_awesome/transcript.json'), 'summary': PosixPath('content/arjancodes/this_is_why_python_data_classes_are_awesome/summary.txt')}
channel_name: ArjanCodes
image_url: https://i.ytimg.com/vi/CvQ7e6yUtnw/hq720.jpg
creators: ['ArjanCodes']
title: This Is Why Python Data Classes Are Awesome
audio_url: https://rr2---sn-uhvcpaxoa-5hne.googlevideo.com/videoplayback?expire=1700761032&ei=aDlfZbrIOcyp1gLQt7ToAQ&ip=94.157.1.0&id=o-AF0H1HSkOYZl3mKa_rdqfbuSwkVOcdy52c10bNH9w6J1&itag=251&source=youtube&requiressl=yes&mh=Gm&mm=31%2C29&mn=sn-uhvcpaxoa-5hne%2Csn-5hne6nzk&ms=au%2Crdu&mv=m&mvi=2&pl=23&initcwndbps=2317500&vprv=1&mime=audio%2Fwebm&gir=yes&clen=18946649&dur=1338.941&lmt=1651195320148263&mt=1700738960&fvip=5&keepalive=yes&fexp=24007246&c=ANDROID_EMBEDDED_PLAYER&

In [240]:
# Build a YouTube playlist source and show its public attributes
ytp = YouTubePlaylist(
    'https://www.youtube.com/playlist?list=PLrAXtmErZgOeiKm4sgNOknGvNjby9efdf'
)
for key, value in vars(ytp).items():
    # Attributes starting with an underscore are internal; skip them
    if key.startswith('_'):
        continue
    print(f"{key}:", value)
print(ytp)

url: https://www.youtube.com/playlist?list=PLrAXtmErZgOeiKm4sgNOknGvNjby9efdf
title: Select Lectures
youtube_videos: [YouTubeVideo[Channel = Lex Fridman, Title = Deep Learning State of the Art (2020)], YouTubeVideo[Channel = Lex Fridman, Title = Deep Learning Basics: Introduction and Overview]]
YouTubePlaylist[Playlist = Select Lectures]


# Audio Downloader

In [230]:
import requests

class AudioDownloader:
    """ Audio downloader class """

    @staticmethod
    def download_audio_from_url(audio_url, audio_path, timeout=30):
        """
        Downloads audio content from a given URL in chunks and saves it to a specified local file path.

        audio_url:  url of the audio content
        audio_path: local file path the content is written to (binary mode)
        timeout:    seconds passed to ``requests.get`` as its ``timeout``

        Raises the ``requests`` exception produced by ``raise_for_status``
        when the response status indicates an error.
        """
        # A timeout keeps the download from blocking forever on a server
        # that stops responding.
        with requests.get(audio_url, stream=True, timeout=timeout) as response:
            # Raise for an error status before the target file is created
            response.raise_for_status()
            with open(audio_path, 'wb') as f:
                # Stream in 8KB chunks so large files are never held in memory
                for chunk in response.iter_content(chunk_size=8192):
                    # Skip empty keep-alive chunks
                    if chunk:
                        f.write(chunk)

    @staticmethod
    def download_from_media_source(source: MediaSource, timeout=30):
        """ Downloads every audio file of a media source to its store path

        Iterates over the ``{audio_path: audio_url}`` dict returned by
        ``source.fetch_audio_paths_urls()`` and downloads each entry.
        """
        for audio_path, audio_url in source.fetch_audio_paths_urls().items():
            AudioDownloader.download_audio_from_url(audio_url, audio_path, timeout=timeout)

In [231]:
# Download the audio of the RSS episode selected above to its "audio" store path
AudioDownloader.download_from_media_source(rss)