## Download podcasts

In [1]:
import os
import re
import time

import lxml
import requests
from bs4 import BeautifulSoup

In [2]:
rss_feed_url = "https://feeds.megaphone.fm/darknetdiaries"

# Fetch the feed and fail fast on an HTTP error status, so an error page is
# never parsed as if it were the RSS document (which would silently yield
# zero <item> elements further down).
page = requests.get(rss_feed_url)
page.raise_for_status()

# The feed is XML, so use BeautifulSoup's 'xml' parser rather than an HTML one.
soup = BeautifulSoup(page.content, 'xml')

In [3]:
# Collect every <item> element found in the parsed feed document.
podcast_items = soup.find_all('item')

In [4]:
# Stop after this many matching episodes have been saved.
MAX_DOWNLOADS = 5
DOWNLOAD_DIR = './downloads'

# Create the target directory up front; open() below would otherwise raise
# FileNotFoundError when the notebook is run in a fresh working directory.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

count = 0
for podcast in podcast_items:
    if count == MAX_DOWNLOADS:
        break

    # Keep only episodes whose description mentions "hackers" (case-insensitive).
    description = podcast.find('description').text
    if not re.search(r'hackers', description, re.I):
        continue

    # Turn the episode title into a file-system friendly slug: drop everything
    # except word characters, whitespace and hyphens, then collapse runs of
    # whitespace/hyphens into a single hyphen.
    title = podcast.find('title').text
    title = re.sub(r'[^\w\s-]', '', title).strip()
    title = re.sub(r'[-\s]+', '-', title)

    mp3_url = podcast.find('enclosure')['url']
    mp3_file = requests.get(mp3_url)
    # Do not save an HTTP error body under an .mp3 name.
    mp3_file.raise_for_status()

    with open(f'{DOWNLOAD_DIR}/{title}.mp3', 'wb') as f:
        f.write(mp3_file.content)

    count += 1

## Transcribe podcasts

In [5]:
# Read the API key from the ASSEMBLY_AI_KEY environment variable; this raises
# KeyError if the variable is not set.
# NOTE(review): `api_key` is not referenced by the cells visible here -- the
# request headers read the environment variable again directly.
api_key = os.environ['ASSEMBLY_AI_KEY']

In [6]:
def read_file(filename, chunk_size=5242880):
    """Lazily yield the binary contents of ``filename`` in fixed-size pieces.

    Args:
        filename: Path of the file to read; it is opened in binary mode.
        chunk_size: Maximum number of bytes per yielded piece.

    Yields:
        Consecutive ``bytes`` chunks of at most ``chunk_size`` bytes. Nothing
        is yielded for an empty file.
    """
    with open(filename, 'rb') as stream:
        # iter() with a sentinel keeps calling read() until it returns b'',
        # which is what a binary file returns at end-of-file.
        for piece in iter(lambda: stream.read(chunk_size), b''):
            yield piece

In [7]:
# Headers sent with every AssemblyAI request made below.
headers = {
    "authorization": os.environ['ASSEMBLY_AI_KEY'],
    "content-type": "application/json",
}

# Loop-invariant endpoints, built once instead of on every iteration.
base_url = "https://api.assemblyai.com/v2"
endpoint = base_url + "/transcript"

# Only a slice of each episode is transcribed. AssemblyAI takes these offsets
# in milliseconds, so this window is minute 5 through minute 10.
AUDIO_START_MS = 300000
AUDIO_END_MS = 600000

# Upload podcasts to Assembly AI for transcription
# Restrict the listing to .mp3 files so stray directory entries are not
# uploaded, and sort it because os.listdir() returns entries in arbitrary order.
file_names = sorted(
    name for name in os.listdir('./downloads') if name.lower().endswith('.mp3')
)
upload_urls = []
output_ids = []
for file in file_names:
    file_path = f'./downloads/{file}'
    print("Uploading", file)

    # Stream the file to the upload endpoint in chunks via the read_file generator.
    upload_response = requests.post(base_url + "/upload",
                                    headers=headers,
                                    data=read_file(file_path))
    # Surface HTTP failures here rather than as a KeyError on the JSON below.
    upload_response.raise_for_status()

    upload_url = upload_response.json()["upload_url"]
    upload_urls.append(upload_url)

    # Named so that it does not shadow the standard-library `json` module.
    transcript_request = {
        "audio_url": upload_url,
        "audio_start_from": AUDIO_START_MS,
        "audio_end_at": AUDIO_END_MS,
    }

    transcript_response = requests.post(endpoint, json=transcript_request, headers=headers)
    transcript_response.raise_for_status()

    # Remember the job id; the transcript itself is fetched later by id.
    transcript_id = transcript_response.json()['id']
    output_ids.append(transcript_id)

Uploading 103-Cloud-Hopper.mp3
Uploading 112-Dirty-Coms.mp3
Uploading 123-Newswires.mp3
Uploading 94-Mariposa.mp3
Uploading 98-Zero-Day-Brokers.mp3


In [8]:
# Seconds to wait between status checks while a transcript is still being produced.
POLL_INTERVAL_SECONDS = 180
TRANSCRIPT_DIR = './transcripts'

# Make sure the output directory exists before the first transcript is written.
os.makedirs(TRANSCRIPT_DIR, exist_ok=True)

for transcript_id in output_ids:
    endpoint = f"https://api.assemblyai.com/v2/transcript/{transcript_id}"
    while True:
        response = requests.get(endpoint, headers=headers)
        response.raise_for_status()
        # Parse the body once and reuse it for both the status and the text.
        result = response.json()
        status = result['status']

        if status == 'completed':
            print("got transcript")
            # Explicit UTF-8 so the write does not depend on the platform's
            # default encoding.
            with open(f'{TRANSCRIPT_DIR}/{transcript_id}.txt', 'w', encoding='utf-8') as f:
                f.write(result['text'])
            break

        if status == 'error':
            # 'error' is a terminal state: polling again would never succeed,
            # so report it and move on to the next transcript instead of
            # looping forever.
            print(f"Transcription failed for {transcript_id}: {result.get('error')}")
            break

        # Still queued or processing: wait, then poll again.
        print("Couldn't retrieve transcript; try again later")
        time.sleep(POLL_INTERVAL_SECONDS)

got transcript
got transcript
got transcript
got transcript
Couldn't retrieve transcript; try again later
got transcript
