In [1]:
#import statements
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import time
import html2text
import re
import os

In [2]:
#helper functions
#source: https://realpython.com/python-web-scraping-practical-introduction/
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the raw response body.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait on the server before giving up; passed straight to
        the `get` call. Without a timeout a stalled connection would block
        the notebook indefinitely.

    Returns
    -------
    The value of `resp.content` when `is_good_response` accepts the
    response, otherwise None. None is also returned (after the error is
    reported through `log_error`) when the request raises a
    `RequestException`, which includes timeouts.
    """
    try:
        # closing() guarantees the streamed connection is released even if
        # the response is rejected or reading the body fails.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains 'html' (compared case-insensitively). A response that
    carries no Content-Type header is reported as False instead of raising
    KeyError.
    """
    # .get() tolerates a missing header; `or ''` also covers an explicit None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error.

    Currently the report is just the string form of `e` printed to
    standard output; swap the body for real logging if needed.
    """
    print(str(e))

In [3]:
#scrape main tumblr page
main_url = 'https://cecilspeaks.tumblr.com/'
main_html = simple_get(main_url)
# simple_get returns None on a request error or when the response is not a
# 200 HTML page; stop here with a clear message rather than failing later
# while parsing.
if main_html is None:
    raise RuntimeError('Could not download {0}'.format(main_url))

In [4]:
# Parse the downloaded front page so it can be searched for tags.
# Note: this rebinds main_html from the raw download to the parsed document.
main_html = BeautifulSoup(markup=main_html, features='html.parser')

In [5]:
# Gather every <li> element found in the parsed front page.
divparent = main_html.find_all(name='li')

In [6]:
#find hrefs with links to episodes
#1. match every '<a href="/...' opening inside the li elements
#2. drop the last 5 matches (rss, random, archive, ask, and mobile)
#3. cut off the first 10 characters (the '<a href="/' prefix) so only the
#   path after the slash remains
result = [
    anchor[10:]
    for anchor in re.findall(r"<a\shref=\"\/[\w\-\:]*", str(divparent))[:-5]
]

In [7]:
# Full post URLs that are fetched individually, plus the dictionary key to
# store each one under. The two lists are parallel: k[i] labels urls[i].
urls = [
    'https://cecilspeaks.tumblr.com/post/182482146001/141-save-dark-owl-records',
    'https://cecilspeaks.tumblr.com/post/182853449756/142-ufo-sighting-reports',
    'https://cecilspeaks.tumblr.com/post/183149396746/143-pioneer-days',
]
k = [
    'ep141',
    'ep142',
    'ep143',
]

In [8]:
#create dictionary for html
# Maps each entry of `result` to the page content simple_get returned for it.
wtnv_html_dict = dict()

#in loop, tack link onto main url
for r in result:
    # one-second pause between requests (presumably to go easy on the server)
    time.sleep(1)
    raw_string = main_url+r
    key = r
    print(key, raw_string, time.localtime())
    #pull in raw html
    tz = simple_get(raw_string)
    # simple_get returns None when the download fails or the response is not
    # a 200 HTML page. Storing None would make the later parsing step crash,
    # so report the failure and leave the entry out.
    if tz is None:
        log_error('No HTML retrieved for {0} ({1}); skipping'.format(key, raw_string))
        continue
    wtnv_html_dict[key] = tz

ep01 https://cecilspeaks.tumblr.com/ep01 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=0, tm_wday=6, tm_yday=69, tm_isdst=1)
ep02 https://cecilspeaks.tumblr.com/ep02 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=2, tm_wday=6, tm_yday=69, tm_isdst=1)
ep03 https://cecilspeaks.tumblr.com/ep03 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=3, tm_wday=6, tm_yday=69, tm_isdst=1)
ep04 https://cecilspeaks.tumblr.com/ep04 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=5, tm_wday=6, tm_yday=69, tm_isdst=1)
ep05 https://cecilspeaks.tumblr.com/ep05 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=7, tm_wday=6, tm_yday=69, tm_isdst=1)
ep06 https://cecilspeaks.tumblr.com/ep06 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=19, tm_sec=9, tm_wday=6, tm_yday=69, tm_isdst=1)
ep07 https://cecilspeaks.tum

ep47 https://cecilspeaks.tumblr.com/ep47 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=36, tm_wday=6, tm_yday=69, tm_isdst=1)
ep48 https://cecilspeaks.tumblr.com/ep48 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=38, tm_wday=6, tm_yday=69, tm_isdst=1)
ep49a https://cecilspeaks.tumblr.com/ep49a time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=40, tm_wday=6, tm_yday=69, tm_isdst=1)
ep49b https://cecilspeaks.tumblr.com/ep49b time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=42, tm_wday=6, tm_yday=69, tm_isdst=1)
ep50 https://cecilspeaks.tumblr.com/ep50 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=43, tm_wday=6, tm_yday=69, tm_isdst=1)
ep51 https://cecilspeaks.tumblr.com/ep51 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=20, tm_sec=45, tm_wday=6, tm_yday=69, tm_isdst=1)
ep52 https://cecil

ep87 https://cecilspeaks.tumblr.com/ep87 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=13, tm_wday=6, tm_yday=69, tm_isdst=1)
ep88 https://cecilspeaks.tumblr.com/ep88 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=15, tm_wday=6, tm_yday=69, tm_isdst=1)
live7 https://cecilspeaks.tumblr.com/live7 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=17, tm_wday=6, tm_yday=69, tm_isdst=1)
ep89 https://cecilspeaks.tumblr.com/ep89 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=19, tm_wday=6, tm_yday=69, tm_isdst=1)
bonus5 https://cecilspeaks.tumblr.com/bonus5 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=20, tm_wday=6, tm_yday=69, tm_isdst=1)
ep89-1 https://cecilspeaks.tumblr.com/ep89-1 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=22, tm_sec=22, tm_wday=6, tm_yday=69, tm_isdst=1)
ep90 https:/

ep131 https://cecilspeaks.tumblr.com/ep131 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=44, tm_wday=6, tm_yday=69, tm_isdst=1)
ep132 https://cecilspeaks.tumblr.com/ep132 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=45, tm_wday=6, tm_yday=69, tm_isdst=1)
live8 https://cecilspeaks.tumblr.com/live8 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=47, tm_wday=6, tm_yday=69, tm_isdst=1)
ep133 https://cecilspeaks.tumblr.com/ep133 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=49, tm_wday=6, tm_yday=69, tm_isdst=1)
ep314 https://cecilspeaks.tumblr.com/ep314 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=50, tm_wday=6, tm_yday=69, tm_isdst=1)
ep135 https://cecilspeaks.tumblr.com/ep135 time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=23, tm_sec=52, tm_wday=6, tm_yday=69, tm_isdst=1)
ep136 http

In [10]:
# Download the posts listed in `urls`, storing each one in wtnv_html_dict
# under the matching label from `k` (the two lists are paired by position).
for key, raw_string in zip(k, urls):
    # one-second pause between requests (presumably to go easy on the server)
    time.sleep(1)
    print(key, raw_string, time.localtime())
    #pull in raw html
    tz = simple_get(raw_string)
    # simple_get returns None when the download fails or the response is not
    # a 200 HTML page. Storing None would make the later parsing step crash,
    # so report the failure and leave the entry out.
    if tz is None:
        log_error('No HTML retrieved for {0} ({1}); skipping'.format(key, raw_string))
        continue
    wtnv_html_dict[key] = tz

ep141 https://cecilspeaks.tumblr.com/post/182482146001/141-save-dark-owl-records time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=25, tm_sec=15, tm_wday=6, tm_yday=69, tm_isdst=1)
ep142 https://cecilspeaks.tumblr.com/post/182853449756/142-ufo-sighting-reports time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=25, tm_sec=17, tm_wday=6, tm_yday=69, tm_isdst=1)
ep143 https://cecilspeaks.tumblr.com/post/183149396746/143-pioneer-days time.struct_time(tm_year=2019, tm_mon=3, tm_mday=10, tm_hour=12, tm_min=25, tm_sec=18, tm_wday=6, tm_yday=69, tm_isdst=1)


In [14]:
def _meta_content(soup, prop):
    """Return the `content` attribute of the <meta property=prop> tag, or None if absent."""
    tag = soup.find("meta", property=prop)
    return tag.get('content') if tag is not None else None


# Write one text file per downloaded page: first line is the og:title,
# the rest is the og:description. Files are named wtnv1.txt, wtnv2.txt, ...
# in dictionary order. `count` ends up one past the number of files written.
count = 1
for entry in wtnv_html_dict:
    raw_page = wtnv_html_dict[entry]
    # A None entry means the download failed; it cannot be parsed, so skip it
    # without consuming a file number.
    if raw_page is None:
        log_error('No HTML stored for {0}; skipping'.format(entry))
        continue

    #BeautifulSoup to html
    soup = BeautifulSoup(raw_page, 'html.parser')

    # Read the attribute value directly rather than splitting the serialized
    # tag on quotes, so HTML entities come back decoded (e.g. '&' and not
    # '&amp;').
    title = _meta_content(soup, "og:title")
    if title is None:
        # hard-coded fallback for a page without an og:title tag
        title = 'The Librarian (Preview) featuring Horoscopes'

    content = _meta_content(soup, "og:description")
    if content is None:
        log_error('No og:description found for {0}; writing title only'.format(entry))
        content = ''

    print('Writing ',title)

    #open/make file
    fileName = 'wtnv'+str(count)+'.txt'
    # Write the text itself, UTF-8 encoded. Wrapping the encoded bytes in
    # str() would store the b'...' repr with escape sequences instead of the
    # actual characters. The with-block closes the file even if a write fails.
    with open(fileName, "w", encoding="utf-8") as f:
        #write title
        f.write(title)
        f.write("\n")
        #write text
        f.write(content)

    count+=1

Writing  Episode 1 - Pilot
Writing  Episode 2 - Glow Cloud
Writing  Episode 3 - Station Management
Writing  Episode 4 - PTA Meeting
Writing  Episode 5 - The Shape in Grove Park
Writing  Episode 6 - The Drawbridge
Writing  Episode 7 - History Week
Writing  Episode 8 - The Lights in Radon Canyon
Writing  Episode 9 - “PYRAMID”
Writing  Episode 10 - Feral Dogs
Writing  Episode 11 - Wheat &amp; Wheat By-Products
Writing  Episode 12 - The Candidate
Writing  Episode 13 - A Story About You.
Writing  Episode 14 - The Man in the Tan Jacket
Writing  Episode 15 - Street Cleaning Day
Writing  Episode 16 - The Phone Call
Writing  Episode 17 - Valentine
Writing  Episode 18 - The Traveler
Writing  Episode 19A - The Sandstorm (Night Vale)
Writing  Episode 19B - The Sandstorm (Desert Bluffs)
Writing  Episode 20 - Poetry Week
Writing  Episode 21 - A Memory of Europe
Writing  Episode 22 - The Whispering Forest
Writing  Episode 23 - Eternal Scouts
Writing  Episode 24 - The Mayor
Writing  Episode 25 - One Y

In [15]:
# count starts at 1 and is incremented after each file is written, so the
# value shown is one more than the number of files written.
print(count)

167
