In [1]:
import sys
# record the interpreter version this notebook was executed with
print("Python Version:", sys.version)

Python Version: 3.7.4 (default, Aug 13 2019, 20:35:49) 
[GCC 7.3.0]


In [2]:
#######################
# standard code block #
#######################

# load IPython's autoreload extension so edited local modules are picked up without a kernel restart
%load_ext autoreload
# mode 1: only modules registered with %aimport are reloaded
%autoreload 1

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from copy import deepcopy
from urllib.parse import urlparse

%aimport utils.scrape
from utils import scrape

### URLs

In [4]:
# index pages listing Joe Rogan Experience (JRE) episodes on the two transcript sites;
# both are fetched once and then searched for links to transcribed episodes
url_podgist = 'https://www.podgist.com/joe-rogan-experience/index.html'
url_podscribe = 'https://podscribe.app/feeds/http-joeroganexpjoeroganlibsynprocom-rss'

### Requests

In [5]:
# Fetch both episode index pages.
# - timeout: requests waits indefinitely by default, so a stalled server would
#   hang the kernel without it
# - raise_for_status(): stop here on a 4xx/5xx instead of handing an error page
#   to the parser and failing later with an unrelated AttributeError
REQUEST_TIMEOUT_S = 30

resp_podgist = requests.get(url_podgist, timeout=REQUEST_TIMEOUT_S)
resp_podgist.raise_for_status()

resp_podscribe = requests.get(url_podscribe, timeout=REQUEST_TIMEOUT_S)
resp_podscribe.raise_for_status()

### Beautiful Soup Pages

In [6]:
# parse the two downloaded index pages with the 'html.parser' backend so the
# episode tables can be searched in the cells below
page_podgist = BeautifulSoup(resp_podgist.text, 'html.parser')
page_podscribe = BeautifulSoup(resp_podscribe.text, 'html.parser')

### Get Podgist URLs

In [7]:
# Collect name / url / episode number for every podgist episode whose index row
# carries the "transcribed" icon.
podgist_episode_list = []

# scheme + host of the index page; each href is appended to it unchanged, so
# hrefs are assumed to be root-relative ("/...") -- TODO confirm against the page
uri_podgist = urlparse(url_podgist)
domain_podgist = '{uri.scheme}://{uri.netloc}'.format(uri=uri_podgist)

# walk down to the episode table one step at a time so a layout change fails
# with a clear message instead of "'NoneType' object has no attribute 'find'"
main_podgist = page_podgist.find('main', role='source-list')
table_podgist = main_podgist.find('table') if main_podgist is not None else None
if table_podgist is None:
    raise ValueError(f'episode table not found on {url_podgist}')
rows_podcasts = table_podgist.find_all('tr')

for row in rows_podcasts:
    cells = row.find_all('td')

    # a row needs at least an icon cell and a link cell
    if len(cells) < 2:
        continue

    # only episodes flagged with the transcribed icon
    if not cells[0].find('span', class_='icon-ghost icon-transcribed'):
        continue

    # look the link up once; skip rows with no usable link (same policy as the
    # podscribe cell, which also ignores rows without an <a>)
    link = cells[1].find('a')
    if link is None or not link.get('href'):
        continue

    episode = {
        'name': link.text,
        'url': domain_podgist + link['href'],
    }

    # first run of digits in the title is taken as the episode number (kept as
    # a string); raw string avoids the invalid-escape warning for '\d'
    match_episode_num = re.search(r'\d+', episode['name'])
    episode['number'] = match_episode_num.group(0) if match_episode_num else None

    # episode is a fresh dict of plain strings, so no copy is needed
    podgist_episode_list.append(episode)

print(f'Scraped {len(podgist_episode_list)} urls')

Scraped 805 urls


### Get Podscribe URLs

In [8]:
# Collect name / url / episode number for every podscribe episode whose link
# carries the "transcribed" marker.
podscribe_episode_list = []

# scheme + host of the feed page; each href is appended to it unchanged, so
# hrefs are assumed to be root-relative ("/...") -- TODO confirm against the page
uri_podscribe = urlparse(url_podscribe)
domain_podscribe = '{uri.scheme}://{uri.netloc}'.format(uri=uri_podscribe)

# walk down to the table body one step at a time so a layout change fails with
# a clear message instead of "'NoneType' object has no attribute 'find'"
table_podscribe = page_podscribe.find('table')
tbody_podscribe = table_podscribe.find('tbody') if table_podscribe is not None else None
if tbody_podscribe is None:
    raise ValueError(f'episode table body not found on {url_podscribe}')
rows_podcasts = tbody_podscribe.find_all('tr')

for row in rows_podcasts:
    cells = row.find_all('td')

    # the episode link is read from the second cell
    if len(cells) < 2:
        continue

    episode_cell = cells[1].find('a')
    if episode_cell is None:
        continue

    # an <i> element inside the link is treated as the transcribed icon
    if not episode_cell.find('i'):
        continue

    # skip links without a target rather than raising KeyError mid-loop
    href = episode_cell.get('href')
    if not href:
        continue

    # episode name and url
    episode = {
        'name': episode_cell.text,
        'url': domain_podscribe + href,
    }

    # first run of digits in the title is taken as the episode number (kept as
    # a string); raw string avoids the invalid-escape warning for '\d'
    match_episode_num = re.search(r'\d+', episode['name'])
    episode['number'] = match_episode_num.group(0) if match_episode_num else None

    # episode is a fresh dict of plain strings, so no copy is needed
    podscribe_episode_list.append(episode)

print(f'Scraped {len(podscribe_episode_list)} urls')

Scraped 322 urls


### Scrape Podgist

In [10]:
# progress log for the podgist scrape, opened in append mode so earlier runs are kept;
# the handle stays open until the "close files" cell runs
prog_file = open('logs/podgist_progress.txt','a')
# error log is opened lazily by the scrape loop, only if an error actually occurs
err_file = None

In [11]:
def _log_podgist_error(message):
    """Append ``message`` to the podgist error log, opening the log on first use."""
    global err_file
    if not err_file:
        err_file = open('logs/podgist_errors.txt', 'a')
    scrape.rec_error(err_file, message)


# documents to push to mongo; the podscribe section appends to this same list
mongo_docs = []

# total podcasts, used for progress reporting
tot = len(podgist_episode_list)

# go through episodes
for i, ep in enumerate(podgist_episode_list):
    try:
        # fetch inside the try so a single failed request is logged and skipped
        # instead of aborting the whole run with the log files still open
        # NOTE(review): 2.0 is presumably a delay/timeout in seconds -- confirm in utils.scrape
        resp_ep = scrape.get_page(ep['url'], 2.0)

        # anything but 200 is logged and skipped; previously the error page was
        # still parsed and stored as an episode document
        if resp_ep.status_code != 200:
            _log_podgist_error(f"status code !=200 for url: {ep['url']}")
            continue

        soup_ep = BeautifulSoup(resp_ep.text, 'html.parser')

        # transcript text lives in <span>s inside <div class="transcription">;
        # pieces are concatenated with no separator, as before
        text_ep = ''.join(
            span.text
            for div in soup_ep.find_all('div', class_='transcription')
            for span in div.find_all('span')
        )

        # new dict per episode (name, url, number + text, source); values are
        # strings/None, so a shallow copy of ep is sufficient
        mongo_docs.append({**ep, 'text': text_ep, 'source': 'podgist'})

    except Exception as e:
        _log_podgist_error(f"Error {str(e)}\nurl: {ep['url']}")
        continue

    # only successfully stored episodes are recorded as progress
    scrape.rec_progress(i, tot, prog_file, ep['url'])

In [None]:
# close the podgist log handles; the error log only exists if the scrape loop opened it
# NOTE(review): run this before the podscribe section rebinds prog_file / err_file,
# otherwise the podgist handles are never closed
prog_file.close()
if err_file:
    err_file.close()

### Scrape Podscribe

In [32]:
# progress log for the podscribe scrape, opened in append mode so earlier runs are kept;
# the handle stays open until the "close files" cell runs
prog_file = open('logs/podscribe_progress.txt','a')
# error log is opened lazily by the scrape loop, only if an error actually occurs
err_file = None

In [33]:
def _log_podscribe_error(message):
    """Append ``message`` to the podscribe error log, opening the log on first use."""
    global err_file
    if not err_file:
        err_file = open('logs/podscribe_errors.txt', 'a')
    scrape.rec_error(err_file, message)


# drop podscribe documents left by an earlier run of this cell so re-running it
# does not queue duplicates; documents from other sources are kept
mongo_docs = [doc for doc in mongo_docs if doc.get('source') != 'podscribe']

# total podcasts, used for progress reporting
tot = len(podscribe_episode_list)

# go through episodes
for i, ep in enumerate(podscribe_episode_list):
    try:
        # fetch inside the try so a single failed request is logged and skipped
        # instead of aborting the whole run with the log files still open
        # NOTE(review): 2.0 is presumably a delay/timeout in seconds -- confirm in utils.scrape
        resp_ep = scrape.get_page(ep['url'], 2.0)

        # anything but 200 is logged and skipped; previously the error page was
        # still parsed below
        if resp_ep.status_code != 200:
            _log_podscribe_error(f"status code !=200 for url: {ep['url']}")
            continue

        soup_ep = BeautifulSoup(resp_ep.text, 'html.parser')

        # both lookups start from <main>; a missing <main> raises AttributeError
        # here and is logged by the except below
        main_ep = soup_ep.find('main')

        # episode description
        ep_desc = main_ep.find('p', class_='episode-content').text

        # transcript paragraphs: anchor on a <p> found with class_=None, then
        # take the <p class_=''> elements that follow it in the document
        # NOTE(review): find_all_next yields elements after the anchor, so the
        # anchor paragraph's own text is not included -- confirm this is intended
        ep_p_list = (
            main_ep
            .find('p', class_=None)
            .find_all_next('p', class_='')
        )

        # paragraphs are concatenated with no separator, as before
        text_ep = ''.join(p.text for p in ep_p_list)

        # new dict per episode (name, url, number + text, source, desc); values
        # are strings/None, so a shallow copy of ep is sufficient
        mongo_docs.append({
            **ep,
            'text': text_ep,
            'source': 'podscribe',
            'desc': ep_desc,
        })

    except Exception as e:
        _log_podscribe_error(f"Error {str(e)}\nurl: {ep['url']}")
        continue

    # only successfully stored episodes are recorded as progress
    scrape.rec_progress(i, tot, prog_file, ep['url'])

In [34]:
# close the podscribe log handles; the error log only exists if the scrape loop opened it
prog_file.close()
if err_file:
    err_file.close()

### Connect to Database

In [35]:
from pymongo import MongoClient
%aimport credentials.cred
from credentials import cred


# connection settings, all read from the local credentials module
config = dict(
    host=cred.mongo_host,
    username=cred.mongo_user,
    password=cred.mongo_pass,
    authSource=cred.mongo_auth_db,
)

# open the client, then select the raw database and its podcasts collection
client = MongoClient(**config)
jre_raw = client['jre_raw']
podcasts = jre_raw['podcasts']

### Push to Mongo

In [38]:
# bulk-insert every scraped document into the podcasts collection
# NOTE(review): this is a plain insert, not an upsert -- running it against a collection
# that already holds these episodes would presumably store them twice; confirm before re-running
podcasts.insert_many(mongo_docs)

<pymongo.results.InsertManyResult at 0x7f6baff22960>

### Check push

In [41]:
# sanity check: the collection holds exactly as many documents as were just pushed
# NOTE(review): only meaningful if the collection was empty before the insert -- confirm
podcasts.count_documents({}) == len(mongo_docs)

True