## Import libraries

In [1]:
!pip install beautifulsoup4
!pip install requests
!pip install lxml



In [2]:
from bs4 import BeautifulSoup
import requests
import os
import re

In [3]:
# For the sake of tests we will use a podcast with shorter duration (only 3 minutes)
rss_feed_url = 'https://feeds.buzzsprout.com/1829765.rss'

# Page content from Website URL
page = requests.get( rss_feed_url )
  
# parse html content
soup = BeautifulSoup( page.content , 'html.parser')



In [4]:
print(soup.prettify())

<?xml version="1.0" encoding="UTF-8" ?>
<?xml-stylesheet href="https://feeds.buzzsprout.com/styles.xsl" type="text/xsl"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd" xmlns:podcast="https://podcastindex.org/namespace/1.0">
 <channel>
  <atom:link href="https://feeds.buzzsprout.com/1829765.rss" rel="self" type="application/rss+xml">
  </atom:link>
  <atom:link href="https://pubsubhubbub.appspot.com/" rel="hub" xmlns="http://www.w3.org/2005/Atom">
  </atom:link>
  <title>
   ORT Shorts
  </title>
  <lastbuilddate>
   Wed, 22 Feb 2023 06:46:26 -0500
  </lastbuilddate>
  <link/>
  http://thomasjayoord.com/
  <language>
   en-us
  </language>
  <copyright>
   © 2023 ORT Shorts
  </copyright>
  <podcast:locked>
   yes
  </podcast:locked>
  <podcast:guid>
   bddeb632-ca4b-5e52-ac27-d27a758ef874
  </podcast:guid>
  <itunes:author>
   Thomas Jay Oord
  </itunes:autho

In [5]:
# Get all items
items = soup.find_all('item')
print('Found', len(items), 'items.')

Found 148 items.


In [6]:
ep_desc = items[0].find('enclosure')['url']
print(ep_desc)

https://www.buzzsprout.com/1829765/11649222-ep-148-near-death-experiences.mp3


In [7]:
try:
    os.mkdir('downloads')
except FileExistsError:
    print('Folder /downloads already exists.')

Folder /downloads already exists.


In [8]:
def download_and_save(url, filename):
    mp3 = requests.get(url)
    with open(filename, 'wb') as f:
        f.write(mp3.content)
        
download_and_save(items[0].find('enclosure')['url'], 'downloads/first_i.mp3')

In [9]:
# Only limit # of items will be transcripted
count, limit = 0, 2

for item in items:
    title = item.find('title').text
    url = item.find('enclosure')['url']
    description = item.find('description').text
    
    # We can configure regex pattern for searching in item description
    if re.search('.*', description, re.I):
        print('Title:', title)
        download_and_save(url, 'downloads/'+title+'.mp3')
        print('Downoloaded ', title)
        count += 1
        if count == limit:
            break

Title: Ep. 148: Near Death Experiences
Downoloaded  Ep. 148: Near Death Experiences
Title: Ep. 147: Creator & Creatures (7) are Valuable in Themselves
Downoloaded  Ep. 147: Creator & Creatures (7) are Valuable in Themselves


## Transcribe a podcast

In [10]:
!pip install boto3



## Reading creadentials. You should have file named aws_credentials.json in the same directory

In [11]:
import json

with open('aws_credentials.json', 'r') as f:
    credentials = json.load(f)

ACCESS_KEY = credentials['aws_key']
SECRET_ACCESS_KEY = credentials['aws_secret_key']
REGION = credentials['aws_region']
BUCKET_NAME = credentials['aws_s3_bucket_name']
    
print('ACCESS_KEY', ACCESS_KEY[:2])
print('SECRET_ACCESS_KEY', SECRET_ACCESS_KEY[:2])
print('REGION', REGION)
print('BUCKET_NAME', BUCKET_NAME[:2])

ACCESS_KEY AK
SECRET_ACCESS_KEY gU
REGION eu-central-1
BUCKET_NAME di


## Upload files to AWS S3

In [12]:
import boto3
from botocore.exceptions import ClientError

## Create s3 client

In [13]:
s3_client = boto3.client('s3',
    aws_access_key_id = ACCESS_KEY, #access key ID,
    aws_secret_access_key = SECRET_ACCESS_KEY, # secret access key
    region_name = REGION)# region: "eu-central-1"

In [14]:
def create_bucket(s3_client, bucket_name, region=None):
    buckets_response = s3_client.list_buckets()
    for bucket in buckets_response['Buckets']:
        if bucket['Name'] == bucket_name:
            print('Bucket with name', bucket_name, 'already exists. Skip creating')
            return True
    try:
        if region is None:
            s3_client.create_bucket(Bucket=bucket_name)
        else:
            location = {'LocationConstraint': region}
            s3_client.create_bucket(Bucket=bucket_name,
                                    CreateBucketConfiguration=location)
    except ClientError as e:
        print('Error creating bucket', e)
        return False
    return True

In [15]:
response = s3_client.list_buckets()

# Output the bucket names
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

Existing buckets:
  dimo-transcribe-hubermanlab-podcasts


In [16]:
create_bucket(s3_client, BUCKET_NAME, region=REGION)

Bucket with name dimo-transcribe-hubermanlab-podcasts already exists. Skip creating


True

## Add all files from the downloads folder to the bucket

In [17]:
def upload_file(s3_client, file_name, bucket, object_name=None):
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        print('ERROR uploading file', file_name, 'to bucket', bucket, e)
        return False
    return True

In [18]:
def modify_file_name(filename):
    modified_file_name = filename.replace(' ', '-').replace(':', '')
    modified_file_name = modified_file_name.lower()
    return modified_file_name

for filename in os.listdir('downloads/'):
    print(modify_file_name(filename))

ep.-139-creator-&-creatures-give-&-receive.mp3
transcribedimo1677091590359.json
transcribedimo1677013500488.json
ep.-141-creating-a-better-world.mp3
ep.-148-near-death-experiences.mp3
first_i.mp3
transcribedimo1677013500488.p
ep.-147-creator-&-creatures-(7)-are-valuable-in-themselves.mp3
ep.-140-creator-&-creatures-(2)-experience-the-ongoing-flow-of-time.mp3
ep.-145-creator-&-creatures-(6)-have-needs.mp3
ep.-142-creator-&-creatures-(3)-promote-well-being.mp3
ep.-143-creator-&-creatures-(4)-are-experiencers.mp3
ep.-145-creator-&-creatures-(5)-are-free-agents.mp3
ep.-146-emil-brunner's-theology-of-love.mp3


In [19]:
import pathlib

for filename in os.listdir('downloads/'):
    f = os.path.join('downloads/', filename)
    # checking if it is a file
    if pathlib.Path(filename).suffix != '.mp3':
        print('Not a mp3 file', filename)
        continue
    if not os.path.isfile(f):
        print('Skip not a file', filename)
        continue
    modified_file_name = modify_file_name(filename)
    print('Uploading ', modified_file_name)
    upload_file(s3_client, f, BUCKET_NAME, modified_file_name)

Uploading  ep.-139-creator-&-creatures-give-&-receive.mp3
Not a mp3 file transcribeDimo1677091590359.json
Not a mp3 file transcribeDimo1677013500488.json
Not a mp3 file transcribeDimo1677013500488.p
Uploading  ep.-140-creator-&-creatures-(2)-experience-the-ongoing-flow-of-time.mp3


## Transcribe a mp3 file from S3 bucket

In [20]:
import time

def amazon_transcribe(transcribe, filename, bucket):
    file_uri = 's3://' + bucket + '/' + filename
    print('Going to transcribe file from ', file_uri)
    job_name = 'transcribeDimo' + str(round(time.time()*1000))
    file_format = filename.split('.')[-1]
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat = file_format,
        LanguageCode='en-US')
    while True:
        result = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        status = result['TranscriptionJob']['TranscriptionJobStatus']
        print('Status: ', status)
        if status == 'FAILED':
            print('Job with name ', job_name, 'failed', result)
            return
        if status == 'COMPLETED':
            print('Job ends with status COMPLETED')
            return result['TranscriptionJob']['Transcript']['TranscriptFileUri']
        time.sleep(10)

In [21]:
transcribe = boto3.client('transcribe',
    aws_access_key_id = ACCESS_KEY, #access key ID,
    aws_secret_access_key = SECRET_ACCESS_KEY, # secret access key
    region_name = REGION)# region: "eu-central-1"

In [22]:
# first_file_name = modify_file_name(os.listdir('downloads/')[0])
import pandas as pd

def get_transcription(transcribe, filename, bucket_name):
    file_transcription_uri = amazon_transcribe(transcribe, filename, bucket_name)
    if file_transcription_uri:
        print('File transcription URI:', file_transcription_uri)
        return pd.read_json(file_transcription_uri)
        
        
file_transcription = get_transcription(transcribe, 'Paul-Rosenberg.mp3', BUCKET_NAME)
file_transcription

Going to transcribe file from  s3://dimo-transcribe-hubermanlab-podcasts/Paul-Rosenberg.mp3
Status:  IN_PROGRESS
Status:  IN_PROGRESS
Status:  IN_PROGRESS
Status:  IN_PROGRESS
Status:  COMPLETED
Job ends with status COMPLETED
File transcription URI: https://s3.eu-central-1.amazonaws.com/aws-transcribe-eu-central-1-prod/282245150475/transcribeDimo1677095355131/4b78be9e-8c70-4b0f-b366-e7d63358f234/asrOutput.json?X-Amz-Security-Token=IQoJb3JpZ2luX2VjEEMaDGV1LWNlbnRyYWwtMSJHMEUCIQDW9G9UVOHQ9txlj2vVP0uYBRI3OAihf4IAfQvZ3ygLGwIgbR5S7RGQgWICSEPSO8LoRqGN3HGFiw8tmwAsya6%2FhEIq3wQI3P%2F%2F%2F%2F%2F%2F%2F%2F%2F%2FARADGgw2MDI2NTkyODE0NDUiDBHyi5nGQ4w4t70UpCqzBGvmEJYJsrwjxGzPgwMH6rryRPdJHhadJZlv83c51%2BQZv1RnQ43yTTzUitQ%2BgMHGcH8k44fBpO7%2BsiCmxSLFhyVKFvhuz5cxorhsePzZSCXVv67snl7Mdm3N%2BofVII1Z2ugpLLBZkrmct6V8RHHOATkMI1CgpB5Ei7d5ld0SYHeR%2BjdjsqfHZ9Xb7DbzCZ5eYeO8tVo4PN8ht%2B5XoslM33rCcJ8FKW4REDo%2FKzbioLsaiwnM0GzZYfg6hBceu%2BmhL0HrCN2qL0%2FZcX1I9qyX%2FiqhIURBRhq56Uh5S3VJzLbzePj1jxdWJO2e4o9qsuoz0TTInZI

Unnamed: 0,jobName,accountId,results,status
items,transcribeDimo1677095355131,282245150475,"[{'start_time': '0.0', 'end_time': '0.36', 'al...",COMPLETED
transcripts,transcribeDimo1677095355131,282245150475,[{'transcript': 'Hey em it's paul. Um I listen...,COMPLETED


In [23]:
job_name = file_transcription['jobName'][0]
file_transcription.to_json('downloads/' + job_name + '.json', indent=2)

## Process all .mp3 files 

In [None]:
for filename in os.listdir('downloads/'):
    f = os.path.join('downloads/', filename)
    # checking if it is a file
    if pathlib.Path(filename).suffix != '.mp3':
        print('Not a mp3 file', filename)
        continue
    if not os.path.isfile(f):
        print('Skip not a file', filename)
        continue
    modified_file_name = modify_file_name(filename)
    print('Transcripting ', modified_file_name)
    file_transcription = get_transcription(transcribe, modified_file_name, BUCKET_NAME)
    if file_transcription:
        job_name = file_transcription['jobName'][0]
        file_transcription.to_json('downloads/' + job_name + '.json', indent=2)