In [18]:
import re
from copy import deepcopy
from urllib.parse import urljoin, urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup

### URLs

In [2]:
# Index pages that list Joe Rogan Experience (JRE) episode transcripts.
# Both are fetched once below and then scraped for per-episode links.
url_podgist = 'https://www.podgist.com/joe-rogan-experience/index.html'
url_podscribe = 'https://podscribe.app/feeds/http-joeroganexpjoeroganlibsynprocom-rss'

### Requests

In [4]:
# Download both transcript index pages.
# requests waits indefinitely unless a timeout is given, so set one to keep
# the cell from hanging on an unresponsive server. raise_for_status() makes
# an HTTP error fail here rather than as a parse error in a later cell.
resp_podgist = requests.get(url_podgist, timeout=30)
resp_podgist.raise_for_status()

resp_podscribe = requests.get(url_podscribe, timeout=30)
resp_podscribe.raise_for_status()

### Beautiful Soup Pages

In [5]:
# Parse the downloaded HTML of each index page into a BeautifulSoup tree,
# using Python's built-in 'html.parser' backend.
page_podgist = BeautifulSoup(resp_podgist.text, 'html.parser')
page_podscribe = BeautifulSoup(resp_podscribe.text, 'html.parser')

### Scrape Podgist

In [25]:
# Collect every podgist episode whose row is flagged as transcribed.
# Each entry is a dict: {'name': <link text>, 'url': <absolute episode URL>}.
podgist_episode_list = []

# scheme://host of the index page
uri_podgist = urlparse(url_podgist)
domain_podgist = '{uri.scheme}://{uri.netloc}'.format(uri=uri_podgist)

# rows of the episode table inside <main role="source-list">
rows_podcasts = (
    page_podgist
    .find('main', role='source-list')
    .find('table')
    .find_all('tr')
)

for row in rows_podcasts:
    cells = row.find_all('td')

    # need at least an icon cell and a title cell
    if len(cells) < 2:
        continue

    # keep only rows whose first cell has the transcribed icon; a
    # multi-class string matches the class attribute's exact value
    if not cells[0].find('span', class_='icon-ghost icon-transcribed'):
        continue

    # skip rows without a usable link instead of failing on None
    link = cells[1].find('a')
    if link is None or not link.get('href'):
        continue

    podgist_episode_list.append({
        'name': link.text,
        # urljoin resolves root-relative, page-relative and absolute hrefs
        'url': urljoin(url_podgist, link['href']),
    })

### Scrape Podscribe

In [31]:
# Collect every podscribe episode whose link carries a transcribed icon.
# Each entry is a dict: {'name': <link text>, 'url': <absolute episode URL>}.
podscribe_episode_list = []

# scheme://host of the index page
uri_podscribe = urlparse(url_podscribe)
domain_podscribe = '{uri.scheme}://{uri.netloc}'.format(uri=uri_podscribe)

# body rows of the first table on the page
rows_podcasts = (
    page_podscribe
    .find('table')
    .find('tbody')
    .find_all('tr')
)

for row in rows_podcasts:
    cells = row.find_all('td')

    # the episode link is looked up in the second cell
    if len(cells) < 2:
        continue

    # skip rows without a usable link instead of failing on a missing href
    link = cells[1].find('a')
    if link is None or not link.get('href'):
        continue

    # an <i> element inside the link is treated as the transcribed marker
    if not link.find('i'):
        continue

    podscribe_episode_list.append({
        'name': link.text,
        # urljoin resolves root-relative, page-relative and absolute hrefs
        'url': urljoin(url_podscribe, link['href']),
    })

### Eliminate Duplicates

In [None]:
# Merge both sources and drop exact duplicates.
# dicts are unhashable, so set() over the episode dicts raises TypeError;
# dedupe on the hashable (name, url) pair instead. A dict keeps insertion
# order, so the first occurrence of each episode stays in place.
# NOTE(review): the same episode listed by both sites has different URLs and
# is therefore kept twice - confirm whether cross-site merging is wanted.
unique_episodes = {
    (episode['name'], episode['url']): episode
    for episode in podscribe_episode_list + podgist_episode_list
}
total_episodes = list(unique_episodes.values())
len(total_episodes)