In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# load the line-level transcript rows from the raw parquet file
transcript_df = pd.read_parquet(
    "~/grimoire/this-viz-will-kill-you/data/raw/transcript_df.parquet"
)

# sanity-check the load: frame dimensions first, then the distinct
# episode ids and a preview of the first rows
print(transcript_df.shape)
display(transcript_df["episode_id"].unique(), transcript_df.head())

(63307, 4)


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165])

Unnamed: 0,episode_id,episode_name,speaker,text
0,1,TPWKY-Episode-95-Tetanus,Erin Welsh,Hey everyone. We're including a content warnin...
1,1,TPWKY-Episode-95-Tetanus,Erin Allmann Updyke,"""Mrs. Cyril, Orchard Street, age 24 was delive..."
2,1,TPWKY-Episode-95-Tetanus,,"The wrinkled forehead, the elevated brow, the ..."
3,1,TPWKY-Episode-95-Tetanus,,"The paroxysm was renewed by a slight noise, th..."
4,1,TPWKY-Episode-95-Tetanus,TPWKY,(This Podcast Will Kill You intro theme)


In [3]:
# process the transcripts into single-episode chunks:
# collapse the line-level rows into one row per (episode_id, episode_name),
# joining that episode's text with single spaces in the existing row order.
# - groupby/agg builds a new frame, so transcript_df is never modified and
#   no defensive copy is needed
# - rows with missing text are skipped first, because str.join raises
#   TypeError as soon as it meets a non-string value
episode_df = (
    transcript_df
    .dropna(subset=["text"])
    .groupby(["episode_id", "episode_name"])["text"]
    .agg(" ".join)
    .reset_index()
)

# check output
display(episode_df.head())

Unnamed: 0,episode_id,episode_name,text
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...
1,2,TPWKY-Bonus-Episode-Ed-Yong,"Hi, I'm Erin Welsh and this is This Podcast Wi..."
2,3,TPWKY-Episode-88-Endometriosis,"This is Exactly Right. Hi, my name is Susie. F..."
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ..."
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a..."


In [4]:
# flag each episode whose transcript contains the phrase "wash your hands"
# (case-insensitive substring match on the concatenated episode text)
episode_df = episode_df.assign(
    wash_your_hands=episode_df["text"].str.contains(
        r"wash your hands", flags=re.IGNORECASE, regex=True
    )
)

# preview the frame with the new flag column
display(episode_df.head())

Unnamed: 0,episode_id,episode_name,text,wash_your_hands
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...,True
1,2,TPWKY-Bonus-Episode-Ed-Yong,"Hi, I'm Erin Welsh and this is This Podcast Wi...",False
2,3,TPWKY-Episode-88-Endometriosis,"This is Exactly Right. Hi, my name is Susie. F...",True
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True


In [5]:
# flag each episode whose transcript contains "vaccine"
# (case-insensitive substring match, so "vaccines" counts as well)
episode_df = episode_df.assign(
    vaccine=episode_df["text"].str.contains(
        r"vaccine", flags=re.IGNORECASE, regex=True
    )
)

# show only the episodes where the flag is set
display(episode_df.loc[episode_df["vaccine"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...,True,True
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True,True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True
5,6,TPWKY-Episode-121-Tularemia,"""E. F., male, age 49, physician began investig...",True,True
6,7,TPWKY-Episode-38-Lead-Poisoning,"""John was a well-nourished, playful, and coope...",True,True
...,...,...,...,...,...
159,160,Ch3_Control_Transcript,My name is Colleen Kraft. So I wear a number o...,True,True
160,161,TPWKY-Episode-101-Immortality,"Hello listeners. Before we get to the episode,...",True,True
162,163,TPWKY-Episode-89-Hepatitis-B,This is Exactly Right. My name is Su Wang. I a...,True,True
163,164,TPWKY-Episode-68-Coccidioidomycosis,This is Exactly Right. So my name is Tori and ...,True,True


In [6]:
# flag each episode whose transcript contains "bacteria"
# (case-insensitive substring match, so "bacterial" counts as well)
episode_df = episode_df.assign(
    bacteria=episode_df["text"].str.contains(
        r"bacteria", flags=re.IGNORECASE, regex=True
    )
)

# show only the episodes where the flag is set
display(episode_df.loc[episode_df["bacteria"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...,True,True,True
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True,True,True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True,True
5,6,TPWKY-Episode-121-Tularemia,"""E. F., male, age 49, physician began investig...",True,True,True
7,8,TPWKY-Episode-53-Radiation,"Just a warning out there, this is a pretty gru...",True,True,True
...,...,...,...,...,...,...
152,153,TPWKY-Episode-39-Toxoplasmosis,"""I contracted toxoplasmosis in my first trimes...",True,True,True
156,157,TPWKY-COVID-15-Disease-II,"This is Exactly Right. My name is Vince, 36 ye...",True,True,True
162,163,TPWKY-Episode-89-Hepatitis-B,This is Exactly Right. My name is Su Wang. I a...,True,True,True
163,164,TPWKY-Episode-68-Coccidioidomycosis,This is Exactly Right. So my name is Tori and ...,True,True,True


In [7]:
# flag each episode whose transcript contains "virus"
# (case-insensitive substring match, so "viruses" counts as well)
episode_df = episode_df.assign(
    virus=episode_df["text"].str.contains(
        r"virus", flags=re.IGNORECASE, regex=True
    )
)

# show only the episodes where the flag is set
display(episode_df.loc[episode_df["virus"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...,True,True,True,True
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True,True,True,True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True,True,True
5,6,TPWKY-Episode-121-Tularemia,"""E. F., male, age 49, physician began investig...",True,True,True,True
6,7,TPWKY-Episode-38-Lead-Poisoning,"""John was a well-nourished, playful, and coope...",True,True,False,True
...,...,...,...,...,...,...,...
157,158,TPWKY-COVID-12-Control-II,"This is Exactly Right. My name is Andrew, I li...",True,True,False,True
159,160,Ch3_Control_Transcript,My name is Colleen Kraft. So I wear a number o...,True,True,False,True
162,163,TPWKY-Episode-89-Hepatitis-B,This is Exactly Right. My name is Su Wang. I a...,True,True,True,True
163,164,TPWKY-Episode-68-Coccidioidomycosis,This is Exactly Right. So my name is Tori and ...,True,True,True,True


In [8]:
# flag each episode whose transcript contains "fungi"
# (case-insensitive substring match; "fungus" and "fungal" do not match)
episode_df = episode_df.assign(
    fungi=episode_df["text"].str.contains(
        r"fungi", flags=re.IGNORECASE, regex=True
    )
)

# show only the episodes where the flag is set
display(episode_df.loc[episode_df["fungi"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi
17,18,TPWKY-Episode-67-HPV,"This is Exactly Right. My name is Stephanie, I...",True,True,True,True,True
21,22,TPWKY-Episode-26-Vaccines-I,"This is Exactly Right. ""I am one of the increa...",True,True,True,True,True
49,50,TPWKY-Episode-113-Vitamin-D,This particular illness started in late Decemb...,True,False,False,False,True
61,62,TPWKY-Episode-76-Chickenpox,This is Exactly Right. We were having hat day ...,True,True,True,True,True
66,67,TPWKY-Episode-40-Dancing-Plague,"This is Exactly Right. ""It was a week before t...",True,True,True,True,True
67,68,TPWKY-Bonus-Episode-Coprolites,"Hi, I'm Erin Welsh and this is This Podcast Wi...",False,False,True,True,True
72,73,TPWKY-Episode-33-Chytrid,"I've always loved frogs, toads, salamanders, a...",True,True,True,False,True
83,84,TPWKY-Episode-63-Poison-Ivy,This is Exactly Right. Before we get started o...,True,True,True,False,True
84,85,TPWKY-Episode-15-MRSA,This is Exactly Right. Warning for the squeami...,True,True,True,True,True
89,90,TPWKY-Episode-54-Caffeine,"(This Podcast Will Kill You intro theme) Hi, I...",True,False,False,False,True


In [9]:
# flag each episode whose transcript contains "parasite"
# (case-insensitive substring match, so "parasites" counts as well)
episode_df = episode_df.assign(
    parasite=episode_df["text"].str.contains(
        r"parasite", flags=re.IGNORECASE, regex=True
    )
)

# show only the episodes where the flag is set
display(episode_df.loc[episode_df["parasite"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True,True,True,False,True
13,14,TPWKY-Episode-42-Dengue,My name is Alex Trillo and I am an assistant p...,True,True,True,True,False,True
19,20,TPWKY-Episode-47-Schistosomiasis,"""Soon after my attention had been directed to ...",True,True,True,True,False,True
27,28,TPWKY-Episode-52-Rinderpest,"""But a far more general plague than the smallp...",True,True,True,True,False,True
28,29,TPWKY-Bonus-Episode-Deborah-Blum,"Hi, I'm Erin Welsh and this is This Podcast Wi...",False,False,True,False,False,True
32,33,TPWKY-Episode-123-Hand-Foot-Mouth-Disease,"On a Saturday in the late summer of 2018, I we...",True,True,True,True,False,True
38,39,TPWKY-Episode-37-E.-coli,By the time paramedics removed Brianne Kiner f...,True,True,True,True,False,True
39,40,TPWKY-Episode-31-Giardia,"""July 3rd. I woke that morning lethargic. The ...",True,True,True,False,False,True
41,42,TPWKY-Episode-94-Chlamydia,"""In high school I heard the rumors and saw the...",True,True,True,True,False,True
44,45,TPWKY-Episode-10-Yellow-Fever,"This is Exactly Right. ""Yellow fever had a mac...",True,True,False,True,False,True


In [10]:
# did the erins mention ticks?
# The word boundaries (\b) keep the pattern from matching inside longer
# words such as "sticks" or "chopsticks", which a bare substring search for
# "ticks" would flag. The optional "s" also counts the singular "tick",
# including hyphenated uses like "tick-borne".
episode_df = episode_df.assign(
    ticks = lambda df: df["text"].str.contains(r"\bticks?\b", flags=re.IGNORECASE, regex=True)
)

# check output: only the episodes where the flag is set
display(episode_df.loc[episode_df["ticks"]])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite,ticks
5,6,TPWKY-Episode-121-Tularemia,"""E. F., male, age 49, physician began investig...",True,True,True,True,False,False,True
20,21,TPWKY-Episode-22-Belladonna,This is Exactly Right. (This Podcast Will Kill...,True,False,False,False,False,False,True
44,45,TPWKY-Episode-10-Yellow-Fever,"This is Exactly Right. ""Yellow fever had a mac...",True,True,False,True,False,True,True
54,55,TPWKY-Episode-102-Arsenic,Hello listeners. Before we get to the episode ...,True,False,False,False,False,False,True
61,62,TPWKY-Episode-76-Chickenpox,This is Exactly Right. We were having hat day ...,True,True,True,True,True,True,True
68,69,TPWKY-Episode-32-Ask-The-Erins,(This Podcast Will Kill You intro theme) Hi. H...,True,True,True,True,False,True,True
71,72,TPWKY-Episode-65-Sweating-Sickness,"This is Exactly Right. ""In the same year, a ne...",True,True,True,True,False,False,True
73,74,TPWKY-Episode-18-Hantavirus,"This is Exactly Right. ""She began to feel feve...",True,True,True,True,False,False,True
81,82,TPWKY-Episode-49-Eastern-Equine-Encephalitis,"""On the 7th of September, my boy took her from...",True,True,False,True,False,False,True
89,90,TPWKY-Episode-54-Caffeine,"(This Podcast Will Kill You intro theme) Hi, I...",True,False,False,False,True,True,True


In [11]:
# did the Erins mention pandemics, and (separately) epidemics?
# both flags are case-insensitive substring matches on the episode text
episode_df = episode_df.assign(
    pandemic=episode_df["text"].str.contains(
        r"pandemic", flags=re.IGNORECASE, regex=True
    ),
    epidemic=episode_df["text"].str.contains(
        r"epidemic", flags=re.IGNORECASE, regex=True
    ),
)

# show the pandemic episodes first, then the epidemic episodes
display(
    episode_df.loc[episode_df["pandemic"]],
    episode_df.loc[episode_df["epidemic"]],
)

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite,ticks,pandemic,epidemic
0,1,TPWKY-Episode-95-Tetanus,Hey everyone. We're including a content warnin...,True,True,True,True,False,False,False,True,False
1,2,TPWKY-Bonus-Episode-Ed-Yong,"Hi, I'm Erin Welsh and this is This Podcast Wi...",False,False,False,False,False,False,False,True,False
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True,True,True,True,False,False,False,True,True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True,True,True,False,True,False,True,True
6,7,TPWKY-Episode-38-Lead-Poisoning,"""John was a well-nourished, playful, and coope...",True,True,False,True,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
156,157,TPWKY-COVID-15-Disease-II,"This is Exactly Right. My name is Vince, 36 ye...",True,True,True,True,False,False,False,True,True
157,158,TPWKY-COVID-12-Control-II,"This is Exactly Right. My name is Andrew, I li...",True,True,False,True,False,False,False,True,False
159,160,Ch3_Control_Transcript,My name is Colleen Kraft. So I wear a number o...,True,True,False,True,False,False,False,True,True
163,164,TPWKY-Episode-68-Coccidioidomycosis,This is Exactly Right. So my name is Tori and ...,True,True,True,True,True,False,False,True,False


Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite,ticks,pandemic,epidemic
2,3,TPWKY-Episode-88-Endometriosis,"This is Exactly Right. Hi, my name is Susie. F...",True,False,False,False,False,False,False,False,True
3,4,TPWKY-Episode-119-Marburg-Virus,"""The laboratory assistant Marga K. had worked ...",True,True,True,True,False,False,False,True,True
4,5,TPWKY-Episode-61-Typhoid,"This is Exactly Right. Number one. ""The camp a...",True,True,True,True,False,True,False,True,True
5,6,TPWKY-Episode-121-Tularemia,"""E. F., male, age 49, physician began investig...",True,True,True,True,False,False,True,False,True
6,7,TPWKY-Episode-38-Lead-Poisoning,"""John was a well-nourished, playful, and coope...",True,True,False,True,False,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
154,155,TPWKY-COVID-8-Disparities,"This is Exactly Right. ""I am a public defender...",True,True,False,True,False,False,False,True,True
156,157,TPWKY-COVID-15-Disease-II,"This is Exactly Right. My name is Vince, 36 ye...",True,True,True,True,False,False,False,True,True
159,160,Ch3_Control_Transcript,My name is Colleen Kraft. So I wear a number o...,True,True,False,True,False,False,False,True,True
162,163,TPWKY-Episode-89-Hepatitis-B,This is Exactly Right. My name is Su Wang. I a...,True,True,True,True,False,False,False,False,True


In [12]:
# process the text data in episode_df:
# normalise the transcript text, make sure the NLTK resources exist,
# then tokenize each transcript and drop stop words.

# clean the text up (overwrites the "text" column with the normalised version)
episode_df = episode_df.assign(
    text = lambda df: df["text"]
    .str.encode('ascii', 'ignore').str.decode('utf-8') # drop every non-ASCII character
    .str.lower()                                       # convert to lowercase
    .str.replace(r'[^a-z\s]', '', regex=True)       # keep only letters and whitespace (digits, punctuation and apostrophes are removed)
    .str.replace(r'\s+', ' ', regex=True)              # collapse whitespace runs into a single space
)

# make sure the stop word corpus is available
try:
    # try to access stop words
    stopwords.words('english')
except LookupError:
    # if stop words are not found, download them
    nltk.download("stopwords")

# make sure the tokenizer model is available
try:
    # try to access word tokenizer
    word_tokenize("hello world")
except LookupError:
    # older NLTK releases load the "punkt" resource, recent releases load
    # "punkt_tab" instead; fetch both so the tokenizer works on either
    nltk.download('punkt')
    nltk.download('punkt_tab')

# assign stopwords to a variable
stop_words = set(stopwords.words('english'))

# set up custom stop words; contractions appear without apostrophes
# ("im", "dont", ...) because the cleaning step above strips punctuation
custom_stop_words = ["im", "theyre", "hey", "hi", "us", "lot", "thats", "like", "use", "well", "could", "much", "theres", "also", "get", "dont", "yeah", "okay", "episode", "lets", "youre", "gon", "weve", "na", "didnt", "doesnt", "whats", "cant", "th"]
stop_words.update(custom_stop_words)

# tokenize each transcript, keeping only tokens that are not stop words
episode_df = episode_df.assign(
    processed_text = lambda df: df["text"].apply(lambda x: [word for word in word_tokenize(x) if word not in stop_words])
)

# check output
display(episode_df.head())

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite,ticks,pandemic,epidemic,processed_text
0,1,TPWKY-Episode-95-Tetanus,hey everyone were including a content warning ...,True,True,True,True,False,False,False,True,False,"[everyone, including, content, warning, firsth..."
1,2,TPWKY-Bonus-Episode-Ed-Yong,hi im erin welsh and this is this podcast will...,False,False,False,False,False,False,False,True,False,"[erin, welsh, podcast, kill, welcome, everyone..."
2,3,TPWKY-Episode-88-Endometriosis,this is exactly right hi my name is susie from...,True,False,False,False,False,False,False,False,True,"[exactly, right, name, susie, pretty, young, a..."
3,4,TPWKY-Episode-119-Marburg-Virus,the laboratory assistant marga k had worked si...,True,True,True,True,False,False,False,True,True,"[laboratory, assistant, marga, k, worked, sinc..."
4,5,TPWKY-Episode-61-Typhoid,this is exactly right number one the camp at c...,True,True,True,True,False,True,False,True,True,"[exactly, right, number, one, camp, chickamaug..."


In [13]:
# validation summary: frame dimensions, then the distinct episode ids
# and a preview of the first rows
print(episode_df.shape)
display(episode_df["episode_id"].unique(), episode_df.head())

# persist the processed episode frame as parquet
episode_df.to_parquet(
    "~/grimoire/this-viz-will-kill-you/data/raw/episode_df.parquet"
)

(165, 13)


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165])

Unnamed: 0,episode_id,episode_name,text,wash_your_hands,vaccine,bacteria,virus,fungi,parasite,ticks,pandemic,epidemic,processed_text
0,1,TPWKY-Episode-95-Tetanus,hey everyone were including a content warning ...,True,True,True,True,False,False,False,True,False,"[everyone, including, content, warning, firsth..."
1,2,TPWKY-Bonus-Episode-Ed-Yong,hi im erin welsh and this is this podcast will...,False,False,False,False,False,False,False,True,False,"[erin, welsh, podcast, kill, welcome, everyone..."
2,3,TPWKY-Episode-88-Endometriosis,this is exactly right hi my name is susie from...,True,False,False,False,False,False,False,False,True,"[exactly, right, name, susie, pretty, young, a..."
3,4,TPWKY-Episode-119-Marburg-Virus,the laboratory assistant marga k had worked si...,True,True,True,True,False,False,False,True,True,"[laboratory, assistant, marga, k, worked, sinc..."
4,5,TPWKY-Episode-61-Typhoid,this is exactly right number one the camp at c...,True,True,True,True,False,True,False,True,True,"[exactly, right, number, one, camp, chickamaug..."


In [14]:
# drop the free-text columns (cleaned text and token lists) so only the
# episode identifiers and the keyword flag columns are published
final_episode_df = episode_df.drop(columns=["text", "processed_text"])

# save to csv for Tableau; index=False stops the positional row index from
# being written as an extra, unnamed first column in the published file
final_episode_df.to_csv("~/grimoire/this-viz-will-kill-you/data/publish/episode_df.csv", index=False)