## Import libraries

In [None]:
!pip install beautifulsoup4
!pip install requests
!pip install lxml

In [None]:
from bs4 import BeautifulSoup
import requests
import os
import re

In [None]:
# For the sake of tests we will use a podcast with shorter duration (only 3 minutes)
rss_feed_url = 'https://feeds.buzzsprout.com/1829765.rss'

# Fetch the RSS feed. The timeout keeps the cell from hanging forever on a
# stalled connection.
page = requests.get(rss_feed_url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page and silently
# finding zero items in the following cells.
page.raise_for_status()

# Parse the feed markup
soup = BeautifulSoup(page.content, 'html.parser')

In [None]:
# Dump the parsed feed, re-indented, to inspect its structure by eye.
print(soup.prettify())

In [None]:
# Collect every <item> element of the parsed feed (presumably one per
# episode) and report how many were found.
items = soup.find_all('item')
print('Found', len(items), 'items.')

In [None]:
# Read the 'url' attribute of the first item's <enclosure>; the download
# cells fetch the audio from this attribute.
# NOTE(review): despite its name, ep_desc holds a URL, not a description.
ep_desc = items[0].find('enclosure')['url']
print(ep_desc)

In [None]:
# Create the local folder the audio files are saved into; if something
# named 'downloads' is already there, just report it and carry on.
try:
    os.mkdir('downloads')
except FileExistsError:
    print('Folder /downloads already exists.')

In [None]:
def download_and_save(url, filename, timeout=60):
    """Download *url* and write the response body to *filename*.

    The body is streamed to disk in chunks, so a long episode is never held
    in memory as a whole.

    Args:
        url: Address of the file to download.
        filename: Path of the local file to create (overwritten if present).
        timeout: Seconds to wait for the server to connect or to send data
            before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            check runs before the file is opened, so no file is created.
    """
    with requests.get(url, stream=True, timeout=timeout) as mp3:
        # Check the status first so that an error page is never saved under
        # an .mp3 name.
        mp3.raise_for_status()
        with open(filename, 'wb') as f:
            for chunk in mp3.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
        
# Smoke test: download only the first item's audio under a fixed file name.
download_and_save(items[0].find('enclosure')['url'], 'downloads/first_i.mp3')

In [None]:
# Download at most `limit` items whose description passes the filter below.
count, limit = 0, 10

for item in items:
    title_tag = item.find('title')
    enclosure = item.find('enclosure')
    description_tag = item.find('description')
    # An item without a title or an audio URL cannot be downloaded; skip it
    # instead of crashing the whole loop.
    if title_tag is None or enclosure is None or not enclosure.get('url'):
        continue
    title = title_tag.text
    url = enclosure['url']
    description = description_tag.text if description_tag is not None else ''

    # '.*' matches every description; replace it with a keyword pattern to
    # download only selected items.
    if re.search('.*', description, re.I):
        print('Title:', title)
        # A path separator inside the title would point at a sub-folder that
        # does not exist (or outside downloads/), so it is replaced.
        safe_title = title.replace('/', '-').replace('\\', '-')
        download_and_save(url, 'downloads/'+safe_title+'.mp3')
        print('Downoloaded ', title)
        count += 1
        if count == limit:
            break

## Transcribe a podcast

In [None]:
!pip install boto3

## Reading credentials. You should have a file named aws_credentials.json in the same directory

In [None]:
import json

# Load the AWS settings from a local JSON file so that no secret is
# hard-coded in the notebook.
with open('aws_credentials.json', 'r') as f:
    credentials = json.load(f)

# Assumes the file holds a JSON object with these four keys; a missing key
# raises KeyError here.
ACCESS_KEY = credentials['aws_key']
SECRET_ACCESS_KEY = credentials['aws_secret_key']
REGION = credentials['aws_region']
BUCKET_NAME = credentials['aws_s3_bucket_name']
    
# print('ACCESS_KEY', ACCESS_KEY)
# print('SECRET_ACCESS_KEY', SECRET_ACCESS_KEY)
# print('REGION', REGION)
# print('BUCKET_NAME', BUCKET_NAME)

## Upload files to AWS S3

In [None]:
import boto3
from botocore.exceptions import ClientError

## Create s3 client

In [None]:
# S3 client authenticated with the key pair and region read from
# aws_credentials.json.
s3_client = boto3.client('s3',
    aws_access_key_id = ACCESS_KEY, # access key ID
    aws_secret_access_key = SECRET_ACCESS_KEY, # secret access key
    region_name = REGION)# region, e.g. "eu-central-1"

In [None]:
def create_bucket(s3_client, bucket_name, region=None):
    """Create an S3 bucket unless the account already lists one with that name.

    Args:
        s3_client: boto3 S3 client used for all calls.
        bucket_name: Name of the bucket to create.
        region: Region to create the bucket in. With ``None`` (or
            ``'us-east-1'``) no location constraint is sent.

    Returns:
        True if the bucket is already listed or was created, False if the
        creation call raised ``ClientError``.
    """
    existing = s3_client.list_buckets()['Buckets']
    if any(bucket['Name'] == bucket_name for bucket in existing):
        print('Bucket with name', bucket_name, 'already exists. Skip creating')
        return True

    create_args = {'Bucket': bucket_name}
    # us-east-1 is S3's default location and is rejected when passed as an
    # explicit LocationConstraint, so it is handled like "no region".
    if region is not None and region != 'us-east-1':
        create_args['CreateBucketConfiguration'] = {'LocationConstraint': region}

    try:
        s3_client.create_bucket(**create_args)
    except ClientError as e:
        print('Error creating bucket', e)
        return False
    return True

In [None]:
# Ask S3 for the buckets visible to these credentials.
response = s3_client.list_buckets()

# Output the bucket names, one per line
print('Existing buckets:')
for bucket in response['Buckets']:
    print(f'  {bucket["Name"]}')

In [None]:
# Make sure the target bucket exists (created in the configured region if
# it is not listed yet) before anything is uploaded.
create_bucket(s3_client, BUCKET_NAME, region=REGION)

## Add all files from the downloads folder to the bucket

In [None]:
def upload_file(s3_client, file_name, bucket, object_name=None):
    """Upload a local file to an S3 bucket.

    Args:
        s3_client: boto3 S3 client used for the upload.
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the file under. Defaults to the base name
            of ``file_name``.

    Returns:
        True if the upload call returned normally, False if it raised an S3
        error (which is printed).
    """
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    try:
        s3_client.upload_file(file_name, bucket, object_name)
    except (ClientError, boto3.exceptions.S3UploadFailedError) as e:
        # boto3's managed transfer re-raises a failed request as
        # S3UploadFailedError, so ClientError alone would let upload
        # failures escape this handler.
        print('ERROR uploading file', file_name, 'to bucket', bucket, e)
        return False
    return True

In [None]:
def modify_file_name(filename):
    """Return *filename* normalised for use as an object name.

    Spaces become hyphens, colons are dropped and the result is lower-cased.
    """
    # One translation pass applies both character rules.
    char_rules = str.maketrans({' ': '-', ':': None})
    return filename.translate(char_rules).lower()

# Preview the normalised name of every entry in downloads/.
for filename in os.listdir('downloads/'):
    print(modify_file_name(filename))

In [None]:
import pathlib

# Push every regular .mp3 file in downloads/ to the bucket under its
# normalised name; report and skip everything else.
for filename in os.listdir('downloads/'):
    f = os.path.join('downloads/', filename)
    if pathlib.Path(filename).suffix != '.mp3':
        # Wrong extension: not an audio file we downloaded.
        print('Not a mp3 file', filename)
    elif not os.path.isfile(f):
        # Right extension but not a regular file (e.g. a directory).
        print('Skip not a file', filename)
    else:
        modified_file_name = modify_file_name(filename)
        print('Uploading ', modified_file_name)
        upload_file(s3_client, f, BUCKET_NAME, modified_file_name)

## Transcribe an mp3 file from the S3 bucket

In [None]:
import time

def amazon_transcribe(transcribe, filename, bucket,
                      language_code='en-US', poll_interval=10):
    """Start a transcription job for an S3 object and wait until it ends.

    Args:
        transcribe: boto3 Transcribe client.
        filename: Key of the audio object inside ``bucket``; its extension
            is sent as the media format.
        bucket: Name of the S3 bucket holding the object.
        language_code: Language code passed to the job.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The transcript file URI reported by the job when it completes, or
        None when the job fails.
    """
    file_uri = f's3://{bucket}/{filename}'
    print('Going to transcribe file from ', file_uri)
    # A millisecond timestamp keeps job names distinct between calls.
    job_name = 'transcribeDimo' + str(round(time.time()*1000))
    # The service expects lower-case format names, so an upper-case
    # extension such as '.MP3' has to be normalised.
    file_format = filename.rsplit('.', 1)[-1].lower()
    transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': file_uri},
        MediaFormat=file_format,
        LanguageCode=language_code)

    # Poll until the job reaches a terminal state.
    while True:
        result = transcribe.get_transcription_job(TranscriptionJobName=job_name)
        status = result['TranscriptionJob']['TranscriptionJobStatus']
        print('Status: ', status)
        if status == 'FAILED':
            print('Job with name ', job_name, 'failed', result)
            return None
        if status == 'COMPLETED':
            print('Job ends with status COMPLETED')
            return result['TranscriptionJob']['Transcript']['TranscriptFileUri']
        time.sleep(poll_interval)

In [None]:
# Transcribe client built with the same key pair and region as the S3 client.
transcribe = boto3.client('transcribe',
    aws_access_key_id = ACCESS_KEY, # access key ID
    aws_secret_access_key = SECRET_ACCESS_KEY, # secret access key
    region_name = REGION)# region, e.g. "eu-central-1"

In [None]:
# first_file_name = modify_file_name(os.listdir('downloads/')[0])
import pandas as pd

def get_transcription(transcribe, filename, bucket_name):
    """Transcribe *filename* from *bucket_name* and load the transcript.

    Returns what ``pd.read_json`` builds from the transcript URI, or None
    when ``amazon_transcribe`` yields no URI.
    """
    transcript_uri = amazon_transcribe(transcribe, filename, bucket_name)
    # Guard clause: nothing to load without a transcript URI.
    if not transcript_uri:
        return None
    print('File transcription URI:', transcript_uri)
    return pd.read_json(transcript_uri)
        
        
# The upload cell stores every file under modify_file_name(<name>), and S3
# keys are case-sensitive, so the key is derived with the same function
# instead of being spelled by hand.
file_transcription = get_transcription(
    transcribe, modify_file_name('Paul-Rosenberg.mp3'), BUCKET_NAME)
file_transcription

In [None]:
# get_transcription() returns None when no transcript was produced, so
# there is nothing to save in that case.
if file_transcription is not None:
    # .iloc picks the first row by position, whatever labels the frame's
    # index carries.
    job_name = file_transcription['jobName'].iloc[0]
    file_transcription.to_json('downloads/' + job_name + '.json', indent=2)

## Process all .mp3 files 

In [None]:
# Transcribe every regular .mp3 file in downloads/ and store each
# transcript next to the audio as <job name>.json.
for filename in os.listdir('downloads/'):
    f = os.path.join('downloads/', filename)
    # checking if it is a file
    if pathlib.Path(filename).suffix != '.mp3':
        print('Not a mp3 file', filename)
        continue
    if not os.path.isfile(f):
        print('Skip not a file', filename)
        continue
    modified_file_name = modify_file_name(filename)
    print('Transcripting ', modified_file_name)
    file_transcription = get_transcription(transcribe, modified_file_name, BUCKET_NAME)
    # Compare against None explicitly: truth-testing a DataFrame raises
    # ValueError ("truth value of a DataFrame is ambiguous").
    if file_transcription is not None:
        # .iloc picks the first row by position, whatever labels the
        # frame's index carries.
        job_name = file_transcription['jobName'].iloc[0]
        file_transcription.to_json('downloads/' + job_name + '.json', indent=2)