In [5]:
from dotenv import load_dotenv
import pandas as pd
import json
import importlib
from sources import earwolf, rss
import contentful

# Load variables from a local .env file into the process environment
# (python-dotenv). Must run before any code that reads those variables.
# NOTE(review): presumably this supplies credentials for the local modules
# imported above — confirm which variables they expect.
load_dotenv()

True

In [42]:
# Scrape each episode source and snapshot the result to its own CSV file.
# The next cell reads these files back, so scraping does not need to be
# repeated every time the downstream cells are re-run.

earwolf_csv_path = "data/episodes/episodes_earwolf.csv"
rss_csv_path = "data/episodes/episodes_rss.csv"

# Earwolf listing first (its scraper logs entries whose episode number it
# cannot parse), then the RSS feed.
earwolf_eps = earwolf.scrape()
earwolf_eps.to_csv(earwolf_csv_path, index=False)

rss_eps = rss.scrape()
rss_eps.to_csv(rss_csv_path, index=False)

STARTING EARWOLF
Could not parse number for <li>Ep #UB1 - <a href="/episode/bonus-the-mysterious-secrets-of-uncle-berties-botanarium-episode-1/">Bonus! The Mysterious Secrets Of Uncle Bertie’s Botanarium – Episode 1</a> (<span>Jemaine Clement</span>)</li>
Could not parse number for <li>Ep #B3 - <a href="/episode/live-from-ucb-ny/">Live from UCB NY!</a> (<span>Zach Galifianakis</span>, <span>David Cross</span>, <span>Todd Barry</span>, <span>John Gemberling</span> &amp; <span>Ted Leo</span>)</li>
Could not parse number for <li>Ep #B2 - <a href="/episode/live-from-sf-sketchfest/">Live from SF Sketchfest!</a> (<span>Doug Benson</span>, <span>Michael Ian Black</span>, <span>Dana Gould</span>, <span>Paul F. Tompkins</span> &amp; <span>Reggie Watts</span>)</li>
Could not parse number for <li>Ep #B1 - <a href="/episode/live-from-vancouver/">Live from Vancouver!</a> (<span>Todd Barry</span>, <span>Nick Thune</span>, <span>Paul F. Tompkins</span>, <span>Garfunkel &amp; Oates</span> &amp; <span>

In [43]:
# Load the per-source episode snapshots and join them into a single table.

earwolf_eps = pd.read_csv("data/episodes/episodes_earwolf.csv")
rss_eps = pd.read_csv("data/episodes/episodes_rss.csv")

# Inner join: only episodes present in BOTH sources (matching on number,
# bestOf and live) survive the merge.
merge_keys = ["number", "bestOf", "live"]
episodes = earwolf_eps.merge(rss_eps, how="inner", on=merge_keys)

# The merge suffixes the overlapping "guests" column as guests_x (left,
# Earwolf) and guests_y (right, RSS). Keep the Earwolf one under the plain name.
episodes = episodes.drop(columns="guests_y").rename(columns={"guests_x": "guests"})

episodes.to_csv("data/episodes/episodes_merged.csv", index=False)

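### Remove duplicates and fix misspellings

The merge cell above writes `data/episodes/episodes_merged.csv`, but the cells below read `data/episodes/episodes_merged_final.csv`. No cell shown in this notebook creates the `_final` file, so it presumably comes from cleaning the merged file outside the notebook (removing duplicate rows and correcting misspelled guest names) before continuing.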

In [44]:
# Tally how many episodes each guest appears in and save the counts.

episodes = pd.read_csv("data/episodes/episodes_merged_final.csv")

# Every "guests" cell is a JSON-encoded list of names; decode each one and
# flatten them into a single list with one entry per (episode, guest) pair.
all_guests = [
    guest_name
    for encoded_guests in episodes["guests"]
    for guest_name in json.loads(encoded_guests)
]

# value_counts() collapses the flat list into one row per distinct guest,
# indexed by guest name.
guest_appearances = pd.Series(all_guests, name="numberOfAppearances").value_counts()
guest_appearances.to_csv("data/guests/guests.csv")

In [46]:
# Pick up any edits made to the contentful module since it was first imported.
importlib.reload(contentful)

# Create one Contentful entry per guest and remember the ID assigned to each.

guests = pd.read_csv("data/guests/guests.csv", index_col=0)

# The first CSV column (the guest name) is the index, so iterating the index
# visits each guest once.
for guest_name in guests.index:
    print(f"Writing: {guest_name}")
    created = contentful.writeGuest(guest_name)
    # The response carries the new entry's ID under sys.id; store it next to
    # the guest so episodes can link to it later.
    guests.at[guest_name, "contentfulId"] = created["sys"]["id"]

guests.to_csv("data/guests/guests_ids.csv")

In [27]:
# Pick up any edits made to the contentful module since it was first imported.
importlib.reload(contentful)

# Create one Contentful entry per episode, linked to its guests' entry IDs.

guests = pd.read_csv("data/guests/guests_ids.csv", index_col=0)
episodes = pd.read_csv("data/episodes/episodes_merged_final.csv")

for row_label, episode in episodes.iterrows():
    print(f"Writing: {episode['number']} {episode['title']}")

    # The "guests" cell is a JSON-encoded list of names; translate each name
    # into the Contentful ID recorded when the guests were written.
    guest_names = json.loads(episode["guests"])
    guest_ids = [guests.loc[name, "contentfulId"] for name in guest_names]

    created = contentful.writeEpisode(
        title=episode["title"],
        number=episode["number"],
        releaseDate=episode["releaseDate"],
        guest_ids=guest_ids,
        bestOf=episode["bestOf"],
        earwolfUrl=episode["earwolfUrl"],
    )
    # Keep the ID of the newly created entry alongside the episode row.
    episodes.at[row_label, "contentfulId"] = created["sys"]["id"]

episodes.to_csv("data/episodes/episodes_ids.csv")

verything Is Horrible and Wonderful
Writing: 531.0. Atmosphere Bully
Writing: 530.0. Zoom Zoom
Writing: 529.0. Am I The What?
Writing: 528.0. This Is The Story of My Life
Writing: 527.0. Hootie Hoo
Writing: 526.0. Air Lift Me Out
Writing: 2017.4. Best of 2017 Pt. 4
Writing: 2017.3. Best of 2017 Pt. 3
Writing: 2017.2. Best of 2017 Pt. 2
Writing: 2017.1. Best of 2017 Pt. 1 - Nagada: A Star Wars Story
Writing: 525.0. 2017 Holiday Spectacular
Writing: 524.0. Merry Chunky Christmas with Neil Patrick Harris
Writing: 523.0. Chekhov's Phone
Writing: 522.0. Charlie's Bucket List
Writing: 521.0. Bing Bong Goodbye
Writing: 520.0. Dead Body Rap
Writing: 519.0. Law O Ver Everything
Writing: 518.0. Corn Dog Horndog
Writing: 517.0. Aha Moments with The National
Writing: 516.0. Solo Bolo Cincolo
Writing: 515.0. Return to Suicide House Part 666
Writing: 514.0. The Calvins Twins Return
Writing: 513.0. Hobo Code
Writing: 512.0. What's Your AIM?
Writing: 511.0. Morzouksnick Interruption
Writing: 510.0. Po