In [1]:
import os
import pandas as pd
import numpy as np
from docx import Document

In [2]:
# list all .docx files in the transcripts data directory
transcript_files = [f for f in os.listdir("/Users/databug/grimoire/this-viz-will-kill-you/data/raw/transcripts") if f.endswith(".docx")]

# check output
print(transcript_files[:5])

['TPWKY-Episode-95-Tetanus.docx', 'TPWKY-Bonus-Episode-Ed-Yong.docx', 'TPWKY-Episode-88-Endometriosis.docx', 'TPWKY-Episode-119-Marburg-Virus.docx', 'TPWKY-Episode-61-Typhoid.docx']


In [3]:
# change the directory to find the files
os.chdir('/Users/databug/grimoire/this-viz-will-kill-you/data/raw/transcripts')

# initialize empty lists to hold data
dfs = []

# loop through the files
for idx, file in enumerate(transcript_files):
    episode_id = idx + 1 # episode ID
    episode_name = file[:-5] # episode name, remove .docx
    # read the transcript
    doc = Document(file)
    transcript = []
    for table in doc.tables:
        for row in table.rows:
            speaker = row.cells[0].text
            text = row.cells[-1].text
            transcript_dict = {
                "speaker": speaker,
                "text": text
            }
            transcript.append(transcript_dict)

    temp_df = pd.DataFrame(transcript)
    # Add episode_id and episode_name to the DataFrame
    temp_df['episode_id'] = episode_id
    temp_df['episode_name'] = episode_name
    
    dfs.append(temp_df[['episode_id', 'episode_name', 'speaker', 'text']])

transcript_df = pd.concat(dfs, ignore_index=True)
transcript_df["text"] = transcript_df["text"].replace('', np.nan, regex=True)
transcript_df.dropna(subset=["text"], inplace=True)
transcript_df.reset_index(drop=True, inplace=True)

# check output
display(transcript_df.head())

Unnamed: 0,episode_id,episode_name,speaker,text
0,1,TPWKY-Episode-95-Tetanus,Erin Welsh,Hey everyone. We're including a content warnin...
1,1,TPWKY-Episode-95-Tetanus,Erin Allmann Updyke,"""Mrs. Cyril, Orchard Street, age 24 was delive..."
2,1,TPWKY-Episode-95-Tetanus,,"The wrinkled forehead, the elevated brow, the ..."
3,1,TPWKY-Episode-95-Tetanus,,"The paroxysm was renewed by a slight noise, th..."
4,1,TPWKY-Episode-95-Tetanus,TPWKY,(This Podcast Will Kill You intro theme)


In [4]:
# get shape and check values for validation purposes
print(transcript_df.shape)
display(transcript_df["episode_id"].unique())
display(transcript_df.head())

(63307, 4)


array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165])

Unnamed: 0,episode_id,episode_name,speaker,text
0,1,TPWKY-Episode-95-Tetanus,Erin Welsh,Hey everyone. We're including a content warnin...
1,1,TPWKY-Episode-95-Tetanus,Erin Allmann Updyke,"""Mrs. Cyril, Orchard Street, age 24 was delive..."
2,1,TPWKY-Episode-95-Tetanus,,"The wrinkled forehead, the elevated brow, the ..."
3,1,TPWKY-Episode-95-Tetanus,,"The paroxysm was renewed by a slight noise, th..."
4,1,TPWKY-Episode-95-Tetanus,TPWKY,(This Podcast Will Kill You intro theme)


In [5]:
# save to raw data folder
transcript_df.to_parquet("/Users/databug/grimoire/this-viz-will-kill-you/data/raw/transcript_df.parquet")