# Media

## Podcasts

In [5]:
from . import utils
from pathlib import Path
import feedparser

class Podcast:
    """Hold a parsed podcast feed together with its headline metadata.

    Attributes:
        feed: Object returned by ``feedparser.parse(url)``.
        title: Title read from the feed.
        image: ``href`` of the feed image.
        subtitle: Feed subtitle, or ``None`` when the feed has none.
        episode: ``Episode`` built from the first entry of the feed.
    """

    def __init__(self, url):
        """Parse the feed at *url* and wrap its first entry.

        Args:
            url: Value handed to ``feedparser.parse``.

        Raises:
            IndexError: If the parsed feed contains no entries.
        """
        self.feed = feedparser.parse(url)
        # Podcast Information
        self.title = self.feed.feed.title
        self.image = self.feed.feed.image.href
        # The subtitle is optional. Always define the attribute so later
        # reads cannot fail, and only report the gap; a missing key is the
        # one error that is deliberately tolerated here.
        self.subtitle = getattr(self.feed.feed, "subtitle", None)
        if self.subtitle is None:
            print("No subtitle found")
        # First entry of the feed (presumably the most recent episode).
        self.episode = Episode(self.title, self.feed.entries[0])

    def __repr__(self):
        """Return a short text showing the podcast and episode titles."""
        return f"""Podcast[Podcast={self.title},
                    Episode={self.episode.title}]"""


class Episode:
    """Hold one episode's metadata and the local paths of its files.

    Attributes:
        podcast: Title of the podcast the episode belongs to.
        title: Episode title read from the feed entry.
        url: ``href`` of the entry's ``audio/mpeg`` link, or ``None`` when
            the entry has no such link.
        file_paths: Mapping returned by ``setup_paths``.
    """

    def __init__(self, podcast_title, episode_feed):
        """Read title and audio URL from *episode_feed* and set up paths.

        Args:
            podcast_title: Title of the owning podcast.
            episode_feed: Feed entry exposing ``title`` and ``links``.
        """
        self.podcast = podcast_title
        # Episode Information
        self.title = episode_feed.title
        self.url = self._find_audio_url(episode_feed.links)
        # Save files paths
        self.file_paths = self.setup_paths()

    @staticmethod
    def _find_audio_url(links):
        """Return the href of the ``audio/mpeg`` link in *links*, if any.

        Links without a ``type`` or ``href`` are skipped instead of raising.
        When several links qualify, the last one wins.
        """
        audio_url = None
        for link in links:
            if link.get('type') == 'audio/mpeg' and link.get('href'):
                audio_url = link.get('href')
        return audio_url

    def setup_paths(self):
        """Create the episode's folders and return the paths of its files.

        Returns:
            dict: The keys ``'audio'``, ``'transcript'``, ``'summary'`` and
            ``'highlights'`` mapped to ``pathlib.Path`` objects inside
            ``podcasts/<podcast folder>/episode_0``.
        """
        podcast_folder_name = utils.to_filename(self.podcast)
        # NOTE(review): the episode folder is always "episode_0", so every
        # episode of a podcast shares one folder. Left unchanged so existing
        # files stay where they are; confirm whether a per-episode name was
        # intended.
        episode_directory = Path("podcasts") / podcast_folder_name / "episode_0"
        # One call creates the content, podcast and episode folders.
        episode_directory.mkdir(parents=True, exist_ok=True)
        return {
            'audio': episode_directory / "audio.mp3",
            'transcript': episode_directory / "transcript.json",
            'summary': episode_directory / "summary.txt",
            'highlights': episode_directory / "highlights.txt",
        }

    def download(self):
        """Download the episode audio to ``file_paths['audio']``.

        Raises:
            ValueError: If the feed entry carried no ``audio/mpeg`` link.
        """
        if self.url is None:
            raise ValueError(f"Episode {self.title!r} has no audio/mpeg link")
        utils.download_audio(self.url, audio_path=self.file_paths['audio'])

    def __repr__(self):
        """Return a short text showing the podcast and episode titles."""
        return f"""Episode[Podcast={self.podcast},
                    Episode={self.title}]"""

ImportError: attempted relative import with no known parent package

# AudioDownloaders

## Audio Downloaders Utilities

In [None]:
from pytube.query import StreamQuery
from pytube import Stream



## Audio Downloaders

In [None]:
from abc import ABC, abstractmethod

class BaseAudioDownloader(ABC):
    """Common ancestor of the audio downloaders.

    It declares no members of its own; it only gives the concrete
    downloaders a shared base type.
    """

class YouTubeAudioDownloader(BaseAudioDownloader):
    """Select the best audio stream of a YouTube video.

    Attributes:
        account_name: Account name stored by ``set_auth_credentials``.
        password: Password stored by ``set_auth_credentials``.
    """

    def __init__(self, account_name: str = None, password: str = None) -> None:
        """Store the optional credentials on the new instance."""
        self.set_auth_credentials(account_name, password)

    def set_auth_credentials(self, account_name: str, password: str) -> None:
        """Replace the stored authentication credentials."""
        self.account_name = account_name
        self.password = password

    @staticmethod
    def get_highest_abr_audio_stream(streams : StreamQuery) -> Stream:
        """Return the audio-only stream with the highest ``abr``.

        The query is filtered to audio-only streams, ordered by ``abr`` in
        descending order, and its first element is returned.
        """
        return streams.filter(only_audio=True).order_by('abr').desc().first()

    # The annotation is quoted because no ``YouTubeVideo`` name is visible
    # in this file; unquoted it is evaluated when the class body runs and
    # raises NameError.
    def download(self, youtube : "YouTubeVideo") -> None:
        """Select the best audio stream of *youtube*.

        Args:
            youtube: Object whose ``feed.streams`` is a stream query.
        """
        # get_highest_abr_audio_stream already restricts to audio-only
        # streams, so no extra filter is needed here.
        stream = self.get_highest_abr_audio_stream(youtube.feed.streams)
        # TODO: the selected stream is not saved anywhere yet.
        

        


In [None]:
# Demo: build a downloader with the literal credentials 'user' / 'pass'.
ytad = YouTubeAudioDownloader(account_name='user', password='pass')

In [None]:
# Demo: open one video with pytube (the `pytube` module must be imported).
yt = pytube.YouTube('https://www.youtube.com/watch?v=CvQ7e6yUtnw')

In [None]:
# Demo: pick the best audio stream. `yt` is a pytube.YouTube object, which
# exposes its stream query as `.streams` (it has no `.feed` attribute).
YouTubeAudioDownloader.get_highest_abr_audio_stream(yt.streams)

<Stream: itag="251" mime_type="audio/webm" abr="160kbps" acodec="opus" progressive="False" type="audio">

# Metadata Managers

### Imports and constants

In [6]:
from abc import ABC, abstractmethod
import json

# Keys of the metadata dictionaries built by the metadata managers.
TITLE = "title"
SUMMARY = "summary"
CHANNEL_NAME = "channel_name"
CREATORS = "creators"
IMAGE_URL = "image_url"
DOWNLOAD_URL = "download_url"

### Base Metadata Manager

In [7]:
class BaseMetadataManager(ABC):
    """Interface shared by the metadata managers.

    Both operations are abstract static methods, so a subclass can only be
    instantiated once it overrides each of them.
    """

    @staticmethod
    @abstractmethod
    def fetch_metadata(feed, entry):
        """Collect metadata (title, author, ...) from *feed* and *entry*."""

    @staticmethod
    @abstractmethod
    def fetch_store_paths():
        """Provide the store paths used for the metadata."""

#### RSS Podcast Metadata Manager

In [8]:

class PodcastMetadataManager(BaseMetadataManager):
    """Metadata manager for podcast episodes read from a parsed RSS feed."""

    @staticmethod
    def _fetch_podcast_metadata(feed: "feedparser.FeedParserDict") -> dict:
        """Extract channel-level metadata from the feed.

        Args:
            feed: Channel part of a parsed feed; must expose ``title``.

        Returns:
            dict: ``CHANNEL_NAME``, ``IMAGE_URL`` (``None`` when the feed
            has no image) and ``CREATORS`` (empty when the feed lists no
            authors).

        Raises:
            ValueError: If *feed* is ``None``.
        """
        if feed is None:
            raise ValueError("Podcast feed is None")
        # Image and authors are optional; a feed without them yields
        # None / [] instead of an AttributeError.
        image = getattr(feed, "image", None)
        authors = getattr(feed, "authors", [])
        return {CHANNEL_NAME: feed.title,
                IMAGE_URL: getattr(image, "href", None),
                CREATORS: [author["name"] for author in authors
                           if author.get("name")]}

    @staticmethod
    def _fetch_episode_metadata(entry: "feedparser.FeedParserDict") -> dict:
        """Extract episode-level metadata from one feed entry.

        Args:
            entry: Feed entry; must expose ``title`` and ``links``.

        Returns:
            dict: ``TITLE`` and ``DOWNLOAD_URL`` (href of the first
            ``audio/mpeg`` link).

        Raises:
            ValueError: If *entry* is ``None`` or has no ``audio/mpeg`` link.
        """
        if entry is None:
            raise ValueError("Episode entry is None")
        # .get() tolerates links that carry no "type" or "href" key.
        audio_urls = [link.get("href") for link in entry.links
                      if link.get("type") == 'audio/mpeg' and link.get("href")]
        if not audio_urls:
            raise ValueError("Episode entry has no audio/mpeg link")
        return {TITLE: entry.title,
                DOWNLOAD_URL: audio_urls[0]}

    @staticmethod
    def fetch_metadata(feed: "feedparser.FeedParserDict",
                       entry: "feedparser.FeedParserDict") -> dict:
        """Merge channel metadata and episode metadata into one dict.

        Raises:
            ValueError: If *feed* or *entry* is ``None``, or the entry has
                no ``audio/mpeg`` link.
        """
        return {**PodcastMetadataManager._fetch_podcast_metadata(feed),
                **PodcastMetadataManager._fetch_episode_metadata(entry)}

    @staticmethod
    def fetch_store_paths():
        """Fetch store paths for podcast metadata (not implemented yet)."""
        pass

### YouTube Metadata Manager

In [9]:
from pytube import YouTube

class YouTubeVideoMetadataManager(BaseMetadataManager):
    """Metadata manager for a single YouTube video."""

    @staticmethod
    def fetch_metadata(feed: YouTube) -> dict:
        """Collect the video's metadata into a dict.

        Args:
            feed: ``pytube.YouTube`` object of the video.

        Returns:
            dict: ``CHANNEL_NAME``, ``IMAGE_URL``, ``CREATORS``, ``TITLE``
            and ``DOWNLOAD_URL`` (URL of the audio-only stream with the
            highest ``abr``).

        Raises:
            ValueError: If *feed* is ``None`` or has no audio-only stream.
        """
        if feed is None:
            raise ValueError("YouTube feed is None")
        # first() returns None for an empty query; check before reading .url
        # so the failure is explicit.
        audio_stream = (feed.streams.filter(only_audio=True)
                        .order_by('abr').desc().first())
        if audio_stream is None:
            raise ValueError("YouTube video has no audio-only stream")
        return {CHANNEL_NAME: feed.author,
                IMAGE_URL: feed.thumbnail_url,
                CREATORS: [feed.author],
                TITLE: feed.title,
                DOWNLOAD_URL: audio_stream.url}

    @staticmethod
    def fetch_store_paths():
        """Fetch store paths for video metadata (not implemented yet)."""
        pass
        

# Media Sources

#### Imports

In [10]:
from abc import ABC, abstractmethod

import pytube
import feedparser
import json

from fuzzywuzzy import fuzz



### Base Media Source

In [11]:
class MediaSource(ABC):
    """Abstract parent of the media sources.

    A concrete source must override ``fetch_metadata`` before it can be
    instantiated.
    """

    @abstractmethod
    def fetch_metadata(self):
        """Load the source's metadata, such as title and author."""

### Podcast from RSS Source

In [12]:
class RSSSource(MediaSource):
    """Podcast episode source backed by an RSS feed.

    Attributes:
        url: Feed URL.
        channel_name: Title of the feed.
        title: Episode title; ``None`` until metadata has been fetched.

    ``fetch_metadata`` sets further attributes, one per key of the dict
    returned by ``PodcastMetadataManager.fetch_metadata``.
    """

    def __init__(self, url : str, episode_title: str = None) -> None:
        """Parse the feed and, when a title is given, select that episode.

        Args:
            url: Feed URL handed to ``feedparser.parse``.
            episode_title: Approximate title of the wanted episode. When
                given, the best-matching entry is selected and its metadata
                is fetched immediately.

        Raises:
            ValueError: If *episode_title* is given but no entry title is
                similar enough to it.
        """
        self.url = url
        self._feed = feedparser.parse(self.url)
        self.channel_name, self.title = self._feed.feed.title, None
        self._episode = None
        if episode_title:
            self._episode = self.find_entry_from_title(episode_title)
            if self._episode is None:
                raise ValueError(
                    f"No episode title close enough to {episode_title!r}")
            self.fetch_metadata()

    def list_episodes(self):
        """Print the title of every entry in the feed."""
        for entry in self._feed.entries:
            print("Title:", entry.title)

    def find_entry_from_title(self, title : str, similarity_threshold: float = 80):
        """Return the entry whose title is most similar to *title*.

        Similarity is ``fuzz.ratio(title, entry.title)``. On a tie the
        earliest entry wins.

        Args:
            title: Title to look for.
            similarity_threshold: Minimum score (inclusive) the best match
                must reach.

        Returns:
            The best-matching entry, or ``None`` when the feed has no
            entries or the best score is below *similarity_threshold*.
        """
        entries = self._feed.entries
        if not entries:
            return None
        scores = [fuzz.ratio(title, entry.title) for entry in entries]
        best_score = max(scores)
        # Enforce the documented threshold so an unrelated episode is not
        # returned as a "match".
        if best_score < similarity_threshold:
            return None
        return entries[scores.index(best_score)]

    def fetch_metadata(self):
        """Fetch the metadata and copy each item onto this object.

        Raises:
            ValueError: Raised by ``PodcastMetadataManager`` when no episode
                has been selected.
        """
        metadata = PodcastMetadataManager.fetch_metadata(self._feed.feed, self._episode)
        for key, value in metadata.items():
            setattr(self, key, value)

    def __repr__(self):
        """Return a short text showing the channel and episode titles."""
        return f"""Podcast[Podcast = {self.channel_name}, Episode = {self.title}]"""

### YouTube Video Source

In [13]:
class YouTubeVideoSource(MediaSource):
    """Media source wrapping a single YouTube video."""

    def __init__(self, url : str) -> None:
        """Remember *url* and open it with ``pytube.YouTube``."""
        super().__init__()
        self.url = url
        self._feed = pytube.YouTube(url)

    def fetch_metadata(self) -> None:
        """Copy every item of the video's metadata onto this object."""
        fetched = YouTubeVideoMetadataManager.fetch_metadata(self._feed)
        for attribute_name in fetched:
            setattr(self, attribute_name, fetched[attribute_name])

### Source Factory

In [14]:
class SourceFactory:
    """Factory that builds the media source matching a source type."""

    @staticmethod
    def create_source(source_type: str, url: str, episode_title: str = None) -> "MediaSource":
        """Create the media source for *source_type*.

        Args:
            source_type: ``"rss"`` or ``"youtube"``.
            url: Feed URL (rss) or video URL (youtube).
            episode_title: Episode to select; only used for ``"rss"``.

        Returns:
            An ``RSSSource`` or a ``YouTubeVideoSource``.

        Raises:
            ValueError: If *source_type* is not a supported type.
        """
        if source_type == "rss":
            return RSSSource(url, episode_title)
        if source_type == "youtube":
            return YouTubeVideoSource(url)
        # Fail loudly rather than returning None, which would only surface
        # later as an AttributeError on the caller's side.
        raise ValueError(
            f"Unknown source type {source_type!r}; expected 'rss' or 'youtube'")

# Tests

In [17]:
# Smoke test: build an RSS source for one episode and display it.
rss = SourceFactory.create_source(
    'rss',
    'https://lexfridman.com/feed/podcast/',
    'John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3',
)
rss.fetch_metadata()

# Show the public attributes only (names without a leading underscore).
for key, value in vars(rss).items():
    if key.startswith('_'):
        continue
    print(f"{key}:", value)

print(rss)

url: https://lexfridman.com/feed/podcast/
channel_name: Lex Fridman Podcast
title: #401 – John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3
image_url: https://lexfridman.com/wordpress/wp-content/uploads/powerpress/artwork_3000-230.png
creators: ['Lex Fridman']
download_url: https://media.blubrry.com/takeituneasy/content.blubrry.com/takeituneasy/lex_ai_john_mearsheimer.mp3
Podcast[Podcast = Lex Fridman Podcast, Episode = #401 – John Mearsheimer: Israel-Palestine, Russia-Ukraine, China, NATO, and WW3]


In [None]:
# Smoke test: build a YouTube source, fetch its metadata and display it.
ytv = SourceFactory.create_source(
    'youtube',
    'https://www.youtube.com/watch?v=CvQ7e6yUtnw',
)
ytv.fetch_metadata()
# Show the public attributes only (names without a leading underscore).
for key, value in vars(ytv).items():
    if key.startswith('_'):
        continue
    print(f"{key}:", value)

url: https://www.youtube.com/watch?v=CvQ7e6yUtnw
channel_name: ArjanCodes
image_url: https://i.ytimg.com/vi/CvQ7e6yUtnw/hq720.jpg
creators: ['ArjanCodes']
title: This Is Why Python Data Classes Are Awesome
download_url: https://rr2---sn-uhvcpaxoa-5hne.googlevideo.com/videoplayback?expire=1700702774&ei=1lVeZbHNL8C_6dsPoeCKyAw&ip=94.157.1.0&id=o-AIZefq_J3RAfLGp-Bm6IB3d6g2k_04RxZ9treyg5OZJD&itag=251&source=youtube&requiressl=yes&mh=Gm&mm=31%2C29&mn=sn-uhvcpaxoa-5hne%2Csn-5hne6nzk&ms=au%2Crdu&mv=m&mvi=2&pl=23&initcwndbps=2355000&vprv=1&mime=audio%2Fwebm&gir=yes&clen=18946649&dur=1338.941&lmt=1651195320148263&mt=1700680857&fvip=5&keepalive=yes&fexp=24007246&c=ANDROID_EMBEDDED_PLAYER&txp=4432434&sparams=expire%2Cei%2Cip%2Cid%2Citag%2Csource%2Crequiressl%2Cvprv%2Cmime%2Cgir%2Cclen%2Cdur%2Clmt&sig=ANLwegAwRAIgUu7upxWOiqaKsZHaID9sR9jxse87ZmTmvB5T4qo3-ocCIB2kG_GidSOfk_PJqogT5QaI88JuoDhvdeI_MfDE0CBi&lsparams=mh%2Cmm%2Cmn%2Cms%2Cmv%2Cmvi%2Cpl%2Cinitcwndbps&lsig=AM8Gb2swRAIgdz_pR5sRQQi_HBPem3LFJ5Ei