##### *Importing Libraries*

In [1]:
import os
import time
import requests

import regex as re
from bs4 import BeautifulSoup

In [2]:
# Podcast display name -> URL of that podcast's RSS feed.
rss_feed_urls = {
    'Lex Fridman': "https://lexfridman.com/feed/podcast/",
    'Tim Ferris': "https://rss.art19.com/tim-ferriss-show",
    'Andrew Huberman': "https://feeds.megaphone.fm/hubermanlab",
    'Peter Attia': "https://peterattiamd.com/feed/",
}

In [3]:
# List every configured feed as a (name, url) tuple, one per line.
for name, url in rss_feed_urls.items():
    print((name, url))

('Lex Fridman', 'https://lexfridman.com/feed/podcast/')
('Tim Ferris', 'https://rss.art19.com/tim-ferriss-show')
('Andrew Huberman', 'https://feeds.megaphone.fm/hubermanlab')
('Peter Attia', 'https://peterattiamd.com/feed/')


##### *Podcast to download.*

In [4]:
# Key into `rss_feed_urls` selecting which feed the cells below fetch.
podcast = 'Lex Fridman'

##### *Fetch the RSS feed content.*

In [5]:
# Download the selected RSS feed. The timeout keeps a stalled server from
# hanging the kernel, and raise_for_status() stops the notebook on an HTTP
# error instead of silently parsing an error page into an empty item list.
page = requests.get(rss_feed_urls[podcast], timeout=30)
page.raise_for_status()

# Parse the raw bytes with BeautifulSoup's XML parser.
soup = BeautifulSoup(page.content, 'xml')

##### *Find all podcast items in the RSS feed.*

In [6]:
# Every <item> element of the parsed feed; the download loop iterates over these.
pod_items = soup.find_all('item')

In [7]:
# Display the first <item> to see which child tags the feed provides.
pod_items[0]

<item>
<title>#398 – Mark Zuckerberg: First Interview in the Metaverse</title>
<link>https://lexfridman.com/mark-zuckerberg-3/?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=mark-zuckerberg-3</link>
<pubDate>Thu, 28 Sep 2023 21:15:22 +0000</pubDate>
<guid isPermaLink="false">https://lexfridman.com/?p=5661</guid>
<comments>https://lexfridman.com/mark-zuckerberg-3/#respond</comments>
<wfw:commentRss>https://lexfridman.com/mark-zuckerberg-3/feed/</wfw:commentRss>
<slash:comments>0</slash:comments>
<category>ai</category>
<description>Mark Zuckerberg is CEO of Meta. Please support this podcast by checking out our sponsors:
- LMNT: https://drinkLMNT.com/lex to get free sample pack
- InsideTracker: https://insidetracker.com/lex to get 20% off
- Eight Sleep: https://www.eightsleep.com/lex to get special savings
- AG1: https://drinkag1.com/lex to get 1 month supply of fish oil
- NetSuite: http://netsuite.com/lex to get free product tour

Transcript: https://lexfridman.com/mark-zuckerberg-3

In [8]:
# Display the first item's <description>; the download loop searches this text for a keyword.
pod_items[0].find('description')

<description>Mark Zuckerberg is CEO of Meta. Please support this podcast by checking out our sponsors:
- LMNT: https://drinkLMNT.com/lex to get free sample pack
- InsideTracker: https://insidetracker.com/lex to get 20% off
- Eight Sleep: https://www.eightsleep.com/lex to get special savings
- AG1: https://drinkag1.com/lex to get 1 month supply of fish oil
- NetSuite: http://netsuite.com/lex to get free product tour

Transcript: https://lexfridman.com/mark-zuckerberg-3-transcript

EPISODE LINKS:
Mark's Facebook: https://facebook.com/zuck
Mark's Instagram: https://instagram.com/zuck
Mark's Threads: https://threads.net/@zuck
Meta AI: https://ai.meta.com/
Meta Quest: https://www.meta.com/quest/
Meta Connect 2023: https://www.metaconnect.com

PODCAST INFO:
Podcast website: https://lexfridman.com/podcast
Apple Podcasts: https://apple.co/2lwqZIr
Spotify: https://spoti.fi/2nEwCF8
RSS: https://lexfridman.com/feed/podcast/
YouTube Full Episodes: https://youtube.com/lexfridman
YouTube Clips: http

##### *Create a folder to store downloaded **MP3** files.*

In [9]:
# Directory that receives the downloaded MP3 files.
download_folder = './downloads'

# Create it (and any missing parents); an already existing directory is not an error.
os.makedirs(name=download_folder, exist_ok=True)

#### **Adding filters to find and download multiple specific podcasts.**

##### *Counter for downloaded podcasts.*

In [10]:
# Number of MP3 files downloaded so far; incremented by the download loop.
count = 0

In [11]:
# Wall-clock timestamp (seconds since the epoch) taken before the download loop runs.
start_time = time.time()

##### *Loop through podcast items and download MP3 files.*

In [12]:
# Walk the feed items and download the first episode whose description
# mentions the keyword and whose enclosure points at an MP3 file.
for pod in pod_items:
    # Stop as soon as one episode has been downloaded successfully.
    if count == 1:
        break

    title_tag = pod.find('title')
    description_tag = pod.find('description')
    enclosure = pod.find('enclosure')

    # Items without a title, description or <enclosure> carry no audio to
    # fetch; skip them instead of crashing on `None`.
    if title_tag is None or description_tag is None or enclosure is None:
        continue

    title = title_tag.text
    description = description_tag.text
    mp3_url = enclosure.get('url')
    if not mp3_url:
        continue

    if not (re.search(r'zuckerberg', description, re.I) and re.search(r'\.mp3', mp3_url, re.I)):
        continue

    # Sanitize the title to remove characters that are invalid in file names.
    sanitized_title = re.sub(r'[\\/:*?"<>|]', '', title)
    target_path = os.path.join(download_folder, f'{sanitized_title}.mp3')
    # Write to a temporary name first so an interrupted transfer never leaves
    # a truncated file that looks like a finished .mp3.
    partial_path = target_path + '.part'

    print('Title : ', title)
    print('MP3 url : ', mp3_url)
    print('Downloading ...')

    try:
        # A single streamed GET: requests follows redirects on its own, so
        # there is no need for a first request just to resolve the final URL
        # (which used to download the entire file twice). Streaming also
        # avoids holding the whole episode in memory.
        with requests.get(mp3_url, stream=True, timeout=30) as mp3_file:
            if mp3_file.status_code == 200:
                with open(partial_path, 'wb') as f:
                    for chunk in mp3_file.iter_content(chunk_size=1024 * 1024):
                        f.write(chunk)
                os.replace(partial_path, target_path)

                print('Download Complete !!\n\n')
                count += 1

            else:
                print(f'Failed to download: {mp3_url} (Status code: {mp3_file.status_code})\n')

    except (requests.RequestException, OSError) as e:
        # Network and file-system failures are reported and the loop moves on
        # to the next item; programming errors are no longer swallowed.
        print(f'Error downloading: {mp3_url}')
        print(e)
        print('\n')

Title :  #398 – Mark Zuckerberg: First Interview in the Metaverse
MP3 url :  https://media.blubrry.com/takeituneasy/content.blubrry.com/takeituneasy/lex_ai_mark_zuckerberg_3.mp3
Downloading ...
Download Complete !!




In [13]:
# Wall-clock timestamp taken after the download loop has finished.
end_time = time.time()
# Seconds elapsed between the `start_time` cell and this cell.
elapsed_time = end_time - start_time

In [14]:
# Report how many files were fetched and how long the download step took.
summary_lines = (
    f"Downloaded a total of : {count} MP3 file / files.",
    f"Total time taken was {elapsed_time:.2f} seconds.",
)
print(*summary_lines, sep="\n")

Downloaded a total of : 1 MP3 file / files.
Total time taken was 57.79 seconds.


#### **Transcribing Podcasts.**

In [15]:
# The AssemblyAI key comes from the environment so it never appears in the notebook.
# A missing ASSEMBLY_AI_KEY variable raises KeyError here.
api_key = os.environ['ASSEMBLY_AI_KEY']

# Header dict sent with the AssemblyAI requests below.
headers = {'authorization': api_key}

In [18]:
# Directory that holds the downloaded episodes.
files = "./downloads"

# Keep only MP3 files: os.listdir() returns every directory entry, including
# stray ones such as .DS_Store, .ipynb_checkpoints or unfinished downloads,
# and those must not be uploaded for transcription. Sorting makes the order
# reproducible, since os.listdir() returns entries in arbitrary order.
file_names = sorted(
    name for name in os.listdir(files)
    if name.lower().endswith('.mp3')
)

print(file_names)

['#398 – Mark Zuckerberg First Interview in the Metaverse.mp3']


In [20]:
def read_file(filename, chunk_size=5242880):
    """Lazily yield the binary contents of ``filename`` in pieces.

    Args:
        filename: Path of the file to read; it is opened in binary mode.
        chunk_size: Maximum number of bytes per yielded piece
            (default 5242880, i.e. 5 MiB).

    Yields:
        Consecutive ``bytes`` objects of at most ``chunk_size`` bytes each.
        Nothing is yielded for an empty file.
    """
    with open(filename, 'rb') as stream:
        # iter(callable, sentinel) keeps calling read() until it returns b'',
        # which is what read() gives back at end of file.
        yield from iter(lambda: stream.read(chunk_size), b'')

In [22]:
# Upload every listed file to AssemblyAI and keep each JSON reply in `urls`:
# the transcription cell iterates over `urls` and reads `url['upload_url']`,
# so the replies have to be stored here rather than only printed.
urls = []
for file in file_names:
    file_path = os.path.join(files, file)
    print("Uploading ...", file)

    # read_file() is a generator, so the request body is sent in chunks
    # instead of loading the whole MP3 into memory.
    response = requests.post('https://api.assemblyai.com/v2/upload', headers=headers, data=read_file(file_path))
    # Fail loudly on an HTTP error (e.g. a bad API key) instead of storing a
    # reply that has no 'upload_url'.
    response.raise_for_status()

    upload_info = response.json()
    urls.append(upload_info)

    print(upload_info)
    print('Upload Complete !!')

#398 – Mark Zuckerberg First Interview in the Metaverse.mp3
{'upload_url': 'https://cdn.assemblyai.com/upload/285fe7c6-cfb9-4122-9eba-463d55301698'}


In [None]:
# Ask AssemblyAI for a transcript of every uploaded file and collect the ids
# of the created transcripts in `output_ids`.
# `urls` is expected to hold one dict per upload with an 'upload_url' key.

# Loop-invariant request settings, built once instead of on every iteration.
endpoint = "https://api.assemblyai.com/v2/transcript"
headers = {
    "authorization": os.environ['ASSEMBLY_AI_KEY'],
    "content-type": "application/json"
}

count = 0
output_ids = []
for url in urls:
    print("Transcription #", count)

    # Named `payload` rather than `json` so the dict does not shadow the
    # conventional name of the standard-library json module.
    payload = {
        "audio_url": url['upload_url'],
        # Only part of the audio is transcribed. NOTE(review): AssemblyAI
        # documents these offsets as milliseconds (300000-600000 would be
        # minutes 5 to 10) - confirm against the API reference.
        "audio_start_from": 300000,
        "audio_end_at": 600000,
    }

    response = requests.post(endpoint, json=payload, headers=headers)
    print(response)
    # Surface HTTP errors directly; otherwise a failed request shows up as an
    # unhelpful KeyError on the missing 'id' field.
    response.raise_for_status()
    output_ids.append(response.json()['id'])
    count += 1