##### *Importing Libraries*

In [2]:
import os
import time
import requests

import regex as re
from bs4 import BeautifulSoup

In [3]:
rss_feed_urls = {'Lex Fridman': "https://lexfridman.com/feed/podcast/",
                 'Tim Ferris' : "https://rss.art19.com/tim-ferriss-show",
                 'Andrew Huberman': "https://feeds.megaphone.fm/hubermanlab",
                 'Peter Attia' : "https://peterattiamd.com/feed/",}

In [4]:
for val in rss_feed_urls.items():
    print(val)

('Lex Fridman', 'https://lexfridman.com/feed/podcast/')
('Tim Ferris', 'https://rss.art19.com/tim-ferriss-show')
('Andrew Huberman', 'https://feeds.megaphone.fm/hubermanlab')
('Peter Attia', 'https://peterattiamd.com/feed/')


##### *Podcast to download.*

In [5]:
podcast = 'Lex Fridman'

##### *Fetch the RSS feed content.*

In [6]:
page = requests.get(rss_feed_urls[podcast])
soup = BeautifulSoup(page.content, 'xml')

##### *Find all podcast items in the RSS feed.*

In [7]:
pod_items = soup.find_all('item')

In [9]:
pod_items[1]

<item>
<title>#397 – Greg Lukianoff: Cancel Culture, Deplatforming, Censorship &amp; Free Speech</title>
<link>https://lexfridman.com/greg-lukianoff/?utm_source=rss&amp;utm_medium=rss&amp;utm_campaign=greg-lukianoff</link>
<pubDate>Mon, 25 Sep 2023 01:12:01 +0000</pubDate>
<guid isPermaLink="false">https://lexfridman.com/?p=5654</guid>
<comments>https://lexfridman.com/greg-lukianoff/#respond</comments>
<wfw:commentRss>https://lexfridman.com/greg-lukianoff/feed/</wfw:commentRss>
<slash:comments>0</slash:comments>
<category>ai</category>
<description>Greg Lukianoff is a free speech advocate, first-amendment attorney, president of FIRE - Foundation for Individual Rights and Expression, and co-author of The Coddling of the American Mind and a new book The Canceling of the American Mind. Please support this podcast by checking out our sponsors:
- Policygenius: https://www.policygenius.com/
- Babbel: https://babbel.com/lexpod and use code Lexpod to get 55% off
- BetterHelp: https://betterhel

In [8]:
pod_items[0].find('description')

<description>Mark Zuckerberg is CEO of Meta. Please support this podcast by checking out our sponsors:
- LMNT: https://drinkLMNT.com/lex to get free sample pack
- InsideTracker: https://insidetracker.com/lex to get 20% off
- Eight Sleep: https://www.eightsleep.com/lex to get special savings
- AG1: https://drinkag1.com/lex to get 1 month supply of fish oil
- NetSuite: http://netsuite.com/lex to get free product tour

Transcript: https://lexfridman.com/mark-zuckerberg-3-transcript

EPISODE LINKS:
Mark's Facebook: https://facebook.com/zuck
Mark's Instagram: https://instagram.com/zuck
Mark's Threads: https://threads.net/@zuck
Meta AI: https://ai.meta.com/
Meta Quest: https://www.meta.com/quest/
Meta Connect 2023: https://www.metaconnect.com

PODCAST INFO:
Podcast website: https://lexfridman.com/podcast
Apple Podcasts: https://apple.co/2lwqZIr
Spotify: https://spoti.fi/2nEwCF8
RSS: https://lexfridman.com/feed/podcast/
YouTube Full Episodes: https://youtube.com/lexfridman
YouTube Clips: http

##### *Create a folder to store downloaded **MP3** files.*

In [9]:
download_folder = './downloads'
os.makedirs(download_folder, exist_ok=True)

#### **Adding filters to find and dowload multiple specific podcasts.** 

##### *Counter for downloaded podcasts.*

In [10]:
count = 0

In [11]:
start_time = time.time()

##### *Loop through podcast items and download MP3 files.*

In [12]:
for pod in pod_items:
    if count == 1: 
        break

    title = pod.find('title').text 
    description = pod.find('description').text
    mp3_url = pod.find('enclosure')['url']

    # Sanitize the title to remove invalid characters.
    sanitized_title = re.sub(r'[\\/:*?"<>|]', '', title)

    if re.search(r'zuckerberg', description, re.I) and re.search(r'\.mp3', mp3_url, re.I):
        print(f'Title : ', title)
        print(f'MP3 url : ', mp3_url)
        print(f'Downloading ...')

        try: 
            redirect_url = requests.get(mp3_url).url
            mp3_file = requests.get(redirect_url)

            if mp3_file.status_code == 200:
                with open(os.path.join(download_folder, f'{sanitized_title}.mp3'), 'wb') as f: 
                    f.write(mp3_file.content)
                
                print('Download Complete !!\n\n')
                count += 1

            else: 
                print(f'Failed to download: {mp3_url} (Status code: {mp3_file.status_code})\n')


        except Exception as e:
            print(f'Error downloading: {mp3_url}')
            print(e)
            print('\n')

Title :  #398 – Mark Zuckerberg: First Interview in the Metaverse
MP3 url :  https://media.blubrry.com/takeituneasy/content.blubrry.com/takeituneasy/lex_ai_mark_zuckerberg_3.mp3
Downloading ...
Download Complete !!




In [13]:
end_time = time.time()
elapsed_time = end_time - start_time

In [14]:
print(f"Downloaded a total of : {count} MP3 file / files.")
print(f"Total time taken was {elapsed_time:.2f} seconds.")

Downloaded a total of : 1 MP3 file / files.
Total time taken was 57.79 seconds.


#### **Trabscribing Podcasts.**

In [15]:
api_key = os.environ['ASSEMBLY_AI_KEY']
headers = {'authorization': os.environ['ASSEMBLY_AI_KEY']}

In [18]:
files = "./downloads"
file_names = os.listdir(files)

print(file_names)

['#398 – Mark Zuckerberg First Interview in the Metaverse.mp3']


In [20]:
def read_file(filename, chunk_size=5242880): 
    with open(filename, 'rb') as _file:
        while True: 
            data = _file.read(chunk_size)
            if not data: 
                break 
            yield data 

In [22]:
for file in file_names:
    file_path = f"./downloads/{file}"
    print("Uploading ...", file)

    response = requests.post('https://api.assemblyai.com/v2/upload', headers=headers, data=read_file(file_path))

    print(response.json())
    print('Upload Complete !!')

#398 – Mark Zuckerberg First Interview in the Metaverse.mp3
{'upload_url': 'https://cdn.assemblyai.com/upload/285fe7c6-cfb9-4122-9eba-463d55301698'}


In [None]:
count=0
output_ids = []
for url in urls:
    print("Transcription #", count)
    endpoint = "https://api.assemblyai.com/v2/transcript"
    
    json = {
        "audio_url": url['upload_url'],
        "audio_start_from": 300000,
        "audio_end_at": 600000,
    }

    headers = {
        "authorization": os.environ['ASSEMBLY_AI_KEY'],
        "content-type": "application/json"
    }
    response = requests.post(endpoint, json=json, headers=headers)
    print(response)
    output_ids.append(response.json()['id'])
    count+=1

In [10]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [11]:
ls

 Volume in drive D is New Volume
 Volume Serial Number is F4D1-A60B

 Directory of D:\code\Python portfolio\Pod-Castaway

09/10/2023  18:40    <DIR>          .
08/10/2023  15:57    <DIR>          ..
09/10/2023  18:30    <DIR>          .ipynb_checkpoints
07/10/2023  19:47    <DIR>          __pycache__
08/10/2023  01:50             2,272 download.py
08/10/2023  15:57    <DIR>          downloads
09/10/2023  18:40            28,281 extract-transcripts.ipynb
07/10/2023  19:47             1,378 podcast.py
08/10/2023  14:35             4,477 README.md
08/10/2023  11:54            15,708 requirements.txt
07/10/2023  19:24                40 runall.sh
07/10/2023  17:32    <DIR>          transcripts
08/10/2023  01:51             4,578 transcripts.py
               7 File(s)         56,734 bytes
               6 Dir(s)  112,490,369,024 bytes free


In [12]:
!ls

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [13]:
import pandas as pd

In [16]:
pd.options

<pandas._config.config.DictWrapper at 0x19ba3e4eb80>

In [17]:
!pip install jupyternotify

Collecting jupyternotify
  Downloading jupyternotify-0.1.15.tar.gz (7.2 kB)
Building wheels for collected packages: jupyternotify
  Building wheel for jupyternotify (setup.py): started
  Building wheel for jupyternotify (setup.py): finished with status 'done'
  Created wheel for jupyternotify: filename=jupyternotify-0.1.15-py3-none-any.whl size=8743 sha256=bb751940bbb5d3102c477227a90108786bff05a47c2202f3d24007d64f6d8a0f
  Stored in directory: c:\users\faizan\appdata\local\pip\cache\wheels\db\f4\43\06c94fe0f5bacf0029ea8ebb8d080f372b97661740be7b3d74
Successfully built jupyternotify
Installing collected packages: jupyternotify
Successfully installed jupyternotify-0.1.15


In [20]:
import time 
import random

In [22]:
%load_ext jupyternotify

<IPython.core.display.Javascript object>

In [23]:
%%notify

r = random.randint(10, 15)
time.sleep(r)

<IPython.core.display.Javascript object>