In [2]:
import os
import json
import pandas as pd

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
files = [f for f in os.listdir(DATASET_DIR) if f.endswith(".json") and "_fixed" not in f]

all_episodes = []
for fname in files:
    with open(os.path.join(DATASET_DIR, fname), encoding='utf-8') as f:
        data = json.load(f)
        if isinstance(data, list):
            all_episodes.extend(data)
        elif isinstance(data, dict) and "episodes" in data:
            all_episodes.extend(data["episodes"])

df = pd.DataFrame(all_episodes)
print("Loaded", len(df), "episodes from", len(files), "podcast files.")
print(df.head())


Loaded 826 episodes from 42 podcast files.
     episode_id                           title release_date  \
0  CASEFILE-001         The Wanda Beach Murders   2016-01-09   
1  CASEFILE-002                The Somerton Man   2016-01-16   
2  CASEFILE-003   Lauria Bible & Ashley Freeman   2016-01-23   
3  CASEFILE-004  Who Put Bella in the Witch Elm   2016-01-30   
4  CASEFILE-005                Katherine Knight   2016-02-06   

                                             summary    series  length  \
0  Explores the unsolved murders of Marianne Schm...  Season 1  57 min   
1  The baffling case of a well-dressed dead man f...  Season 1  71 min   
2  Details the 1999 disappearance of Oklahoma tee...  Season 1  78 min   
3  Focuses on the mysterious discovery of a woman...  Season 1  54 min   
4  The gruesome 2000 murder of John Price by his ...  Season 1  79 min   

                                          utterances  \
0  [On 11 January 1965, two teenagers vanished fr...   
1  [On 1 Decemb

In [3]:
df_filled = df.fillna("Not Available")
print(df_filled.head())


     episode_id                           title release_date  \
0  CASEFILE-001         The Wanda Beach Murders   2016-01-09   
1  CASEFILE-002                The Somerton Man   2016-01-16   
2  CASEFILE-003   Lauria Bible & Ashley Freeman   2016-01-23   
3  CASEFILE-004  Who Put Bella in the Witch Elm   2016-01-30   
4  CASEFILE-005                Katherine Knight   2016-02-06   

                                             summary    series  length  \
0  Explores the unsolved murders of Marianne Schm...  Season 1  57 min   
1  The baffling case of a well-dressed dead man f...  Season 1  71 min   
2  Details the 1999 disappearance of Oklahoma tee...  Season 1  78 min   
3  Focuses on the mysterious discovery of a woman...  Season 1  54 min   
4  The gruesome 2000 murder of John Price by his ...  Season 1  79 min   

                                          utterances  \
0  [On 11 January 1965, two teenagers vanished fr...   
1  [On 1 December 1948, a man's body was discover...   
2 

In [4]:
# REQUIRED LIBRARIES:
# pip install pandas bertopic sentence-transformers tqdm
import os
import json
import pandas as pd
from tqdm import tqdm
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

# --------------------------------------
# 1. LOAD EPISODE DATA
DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
files = [f for f in os.listdir(DATASET_DIR) if f.endswith(".json") and "_fixed" not in f]

all_episodes = []
for fname in files:
    with open(os.path.join(DATASET_DIR, fname), encoding='utf-8') as f:
        data = json.load(f)
        if isinstance(data, list):
            all_episodes.extend(data)
        elif isinstance(data, dict) and "episodes" in data:
            all_episodes.extend(data["episodes"])

df = pd.DataFrame(all_episodes)
df = df.fillna("Not Available")

# Optional: select only relevant columns
cols_to_keep = ['episode_id', 'title', 'summary', 'transcript', 'series', 'length', 'topics']
df = df[cols_to_keep]
print(f"Loaded {len(df)} episodes.")

# --------------------------------------
# 2. TOPIC MODELING (BERTopic)
print("Running BERTopic topic modeling on transcripts...")
texts = df['transcript'].astype(str).tolist()
topic_model = BERTopic(verbose=True)
topics, probs = topic_model.fit_transform(texts)
df['topic_label'] = topics
df['topic_keywords'] = topic_model.get_topics()

# --------------------------------------
# 3. SEMANTIC EMBEDDINGS (Sentence-BERT)
print("Encoding semantic vectors...")
embedder = SentenceTransformer('all-MiniLM-L6-v2')
# For large dataset, batch process for speed
df['embedding'] = list(embedder.encode(df['transcript'].astype(str).tolist(), show_progress_bar=True))

# 4. SENTIMENT ANALYSIS

from transformers import pipeline
sentiment_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
df['sentiment'] = df['transcript'].apply(lambda x: sentiment_analyzer(str(x))[0]['label'])

# --------------------------------------
# 5. SAVE FEATURES
df.to_pickle("podcast_episodes_with_topics_embeddings.pkl")
print("Saved DataFrame with topics and embeddings to podcast_episodes_with_topics_embeddings.pkl")





2025-11-05 16:11:29,390 - BERTopic - Embedding - Transforming documents to embeddings.


Loaded 282 episodes.
Running BERTopic topic modeling on transcripts...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

2025-11-05 16:11:33,492 - BERTopic - Embedding - Completed ✓
2025-11-05 16:11:33,494 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-05 16:11:33,875 - BERTopic - Dimensionality - Completed ✓
2025-11-05 16:11:33,875 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-05 16:11:33,886 - BERTopic - Cluster - Completed ✓
2025-11-05 16:11:33,897 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-05 16:11:33,932 - BERTopic - Representation - Completed ✓


Encoding semantic vectors...


Batches:   0%|          | 0/9 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Saved DataFrame with topics and embeddings to podcast_episodes_with_topics_embeddings.pkl


In [6]:
import os

save_dir = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui"
os.makedirs(save_dir, exist_ok=True)  # Creates the directory if it doesn't exist

save_path = os.path.join(save_dir, "podcast_episodes_with_topics_embeddings.pkl")
df.to_pickle(save_path)
print(f"File saved at {save_path}")


File saved at D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl


In [8]:
import os
import json

folder = r'D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds'

series_map = {
    "Casefile": "https://podcasts.apple.com/gb/podcast/casefile-true-crime/id998568017",
    "RedHanded": "https://podcasts.apple.com/gb/podcast/redhanded/id1250599915",
    "The Magnus Archives": "https://podcasts.apple.com/gb/podcast/the-magnus-archives/id1095138637",
    "Lore": "https://podcasts.apple.com/gb/podcast/lore/id978052928",
    "The Rest Is History": "https://podcasts.apple.com/gb/podcast/the-rest-is-history/id1537788786",
    "You Must Remember This": "https://podcasts.apple.com/gb/podcast/you-must-remember-this/id858124601",
    "Revolutions": "https://podcasts.apple.com/gb/podcast/revolutions/id703889772",
    "Science Vs": "https://podcasts.apple.com/gb/podcast/science-vs/id1051557000",
    "StarTalk Radio": "https://podcasts.apple.com/gb/podcast/startalk-radio/id325404506",
    "Freakonomics Radio": "https://podcasts.apple.com/gb/podcast/freakonomics-radio/id354668519",
    "The Adventure Podcast": "https://podcasts.apple.com/gb/podcast/the-adventure-podcast/id1446862825",
    "Terra Incognita": "https://podcasts.apple.com/gb/podcast/terra-incognita-by-sara-wheeler-free-audiobook/id1818794475?i=1000711666862",
    "The Joe Rogan Experience": "https://podcasts.apple.com/gb/podcast/the-joe-rogan-experience/id360084272",
    "Conan O'Brien Needs a Friend": "https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347"
}

# Guests mapping
jre_guests = ["duncan trussell", "dax shepard", "aubrey plaza", "andy samberg", "ali wong"]
conan_guests = ["julia louis-dreyfus", "will ferrell", "bill burr", "fred armisen"]

def get_audio_link(episode, base_series):
    title = episode.get('title', '').lower()
    series = episode.get('series', '').lower()
    # Guest overrides for mixed/compound files
    if any(g in title for g in jre_guests):
        return series_map["The Joe Rogan Experience"]
    if any(g in title for g in conan_guests):
        return series_map["Conan O'Brien Needs a Friend"]
    # Series override (per-file)
    if base_series in series_map:
        return series_map[base_series]
    # Fallback to exact match
    for s, url in series_map.items():
        if s.lower() == series:
            return url
    return ""

for fname in os.listdir(folder):
    if not fname.lower().endswith('.json'):
        continue
    file_path = os.path.join(folder, fname)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # The "base_series" is the filename, stripped of extension and normalized for mapping
    base_series = os.path.splitext(fname)[0]
    # Correct series names that differ (if needed)
    if base_series == "Terra Incognita":
        base_series = "Terra Incognita: The Adventure Podcast"
    
    for ep in data:
        ep['audio_link'] = get_audio_link(ep, base_series)

    out_path = os.path.join(folder, f"{base_series}_with_links.json")
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Processed {fname}, saved to {out_path}")

print("All JSON podcast files updated with audio_link field!")


Processed Casefile.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Casefile_with_links.json
Processed Conan O'Brien Needs a Friend.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Conan O'Brien Needs a Friend_with_links.json
Processed Freakonomics Radio.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Freakonomics Radio_with_links.json
Processed Lore.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Lore_with_links.json
Processed RedHanded.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\RedHanded_with_links.json
Processed Revolutions.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Revolutions_with_links.json
Processed Science Vs.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Science Vs_with_links.json
Processed StarTalk Radio.json, saved to D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\StarTalk Radio_with_links.j

In [9]:
import os
import json

# Folder containing all podcast JSON files
folder = r'D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds'

# Map each podcast series to its Apple Podcasts homepage
series_map = {
    "Casefile": "https://podcasts.apple.com/gb/podcast/casefile-true-crime/id998568017",
    "RedHanded": "https://podcasts.apple.com/gb/podcast/redhanded/id1250599915",
    "The Magnus Archives": "https://podcasts.apple.com/gb/podcast/the-magnus-archives/id1095138637",
    "Lore": "https://podcasts.apple.com/gb/podcast/lore/id978052928",
    "The Rest Is History": "https://podcasts.apple.com/gb/podcast/the-rest-is-history/id1537788786",
    "You Must Remember This": "https://podcasts.apple.com/gb/podcast/you-must-remember-this/id858124601",
    "Revolutions": "https://podcasts.apple.com/gb/podcast/revolutions/id703889772",
    "Science Vs": "https://podcasts.apple.com/gb/podcast/science-vs/id1051557000",
    "StarTalk Radio": "https://podcasts.apple.com/gb/podcast/startalk-radio/id325404506",
    "Freakonomics Radio": "https://podcasts.apple.com/gb/podcast/freakonomics-radio/id354668519",
    "The Adventure Podcast": "https://podcasts.apple.com/gb/podcast/the-adventure-podcast/id1446862825",
    "Terra Incognita": "https://podcasts.apple.com/gb/podcast/terra-incognita-by-sara-wheeler-free-audiobook/id1818794475?i=1000711666862",
    "The Joe Rogan Experience": "https://podcasts.apple.com/gb/podcast/the-joe-rogan-experience/id360084272",
    "Conan O'Brien Needs a Friend": "https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347"
}

# Guest name mapping (for special episodes with guest titles)
jre_guests = ["duncan trussell", "dax shepard", "aubrey plaza", "andy samberg", "ali wong"]
conan_guests = ["julia louis-dreyfus", "will ferrell", "bill burr", "fred armisen"]

def get_audio_link(episode, base_series):
    title = episode.get('title', '').lower()
    series = episode.get('series', '').lower()
    
    # Guest-based mapping
    if any(g in title for g in jre_guests):
        return series_map["The Joe Rogan Experience"]
    if any(g in title for g in conan_guests):
        return series_map["Conan O'Brien Needs a Friend"]
    
    # File-based series mapping
    if base_series in series_map:
        return series_map[base_series]
    
    # Exact series field match
    for s, url in series_map.items():
        if s.lower() == series:
            return url
    
    return ""

# Process each JSON file in the folder
episode_count = 0
for fname in os.listdir(folder):
    if not fname.lower().endswith('.json'):
        continue
    
    file_path = os.path.join(folder, fname)
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Determine base series name from filename
    base_series = os.path.splitext(fname)[0]
    
    # Add audio_link to each episode
    for ep in data:
        ep['audio_link'] = get_audio_link(ep, base_series)
        episode_count += 1
    
    # Save updated file
    out_path = os.path.join(folder, f"{base_series}_with_links.json")
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    
    print(f"✓ Processed {fname} ({len(data)} episodes)")

print(f"\n✓ All done! {episode_count} episodes across all files now have audio_link fields.")


✓ Processed Casefile.json (30 episodes)
✓ Processed Casefile_with_links.json (30 episodes)
✓ Processed Conan O'Brien Needs a Friend.json (30 episodes)
✓ Processed Conan O'Brien Needs a Friend_with_links.json (30 episodes)
✓ Processed Freakonomics Radio.json (20 episodes)
✓ Processed Freakonomics Radio_with_links.json (20 episodes)
✓ Processed Lore.json (20 episodes)
✓ Processed Lore_with_links.json (20 episodes)
✓ Processed RedHanded.json (30 episodes)
✓ Processed RedHanded_with_links.json (30 episodes)
✓ Processed Revolutions.json (20 episodes)
✓ Processed Revolutions_with_links.json (20 episodes)
✓ Processed Science Vs.json (20 episodes)
✓ Processed Science Vs_with_links.json (20 episodes)
✓ Processed StarTalk Radio.json (20 episodes)
✓ Processed StarTalk Radio_with_links.json (20 episodes)
✓ Processed Terra Incognita.json (20 episodes)
✓ Processed The Adventure Podcast.json (30 episodes)
✓ Processed The Adventure Podcast_with_links.json (30 episodes)
✓ Processed The Joe Rogan Experi

In [11]:
print(df['series'].unique())


['Season 1' 'Season 2' 'Season 3' 'Season 4' 'Season 5' '']


In [12]:
import pandas as pd
import json
import os

# Load your pickle
df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

# Load correct series names from JSON files
json_folder = r'D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds'
title_to_series = {}

for fname in os.listdir(json_folder):
    if fname.endswith('.json') and not fname.endswith('_with_links.json'):
        # Extract series name from filename (without .json)
        series_name = os.path.splitext(fname)[0]
        file_path = os.path.join(json_folder, fname)
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                for ep in data:
                    title = ep.get('title', '')
                    if title:
                        title_to_series[title] = series_name
        except:
            pass

# Update series column in pickle
df['series'] = df['title'].map(title_to_series).fillna(df['series'])

# Save updated pickle
df.to_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings_fixed.pkl")
print("✓ Updated pickle saved with correct series names!")


✓ Updated pickle saved with correct series names!


In [15]:
df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings_fixed.pkl")
df


Unnamed: 0,episode_id,title,summary,transcript,series,length,topics,topic_label,topic_keywords,embedding,sentiment
0,CASEFILE-001,The Wanda Beach Murders,Explores the unsolved murders of Marianne Schm...,"It began as a typical summer day in Sydney, wi...",Casefile,57 min,"[Unsolved, Child Murders, Australia, Cold Cases]",0,"[(the, 0.06592653752731294), (of, 0.0514138948...","[0.022910284, 0.1029885, 0.015534607, 0.055338...",POSITIVE
1,CASEFILE-002,The Somerton Man,The baffling case of a well-dressed dead man f...,"In 1948, a body was found on Australia’s Somer...",Casefile,71 min,"[Unidentified, Mystery, Australia, Espionage]",0,"[(conan, 0.11585577611976432), (and, 0.1073654...","[-0.0713791, 0.11432176, 0.0023223248, 0.06591...",POSITIVE
2,CASEFILE-003,Lauria Bible & Ashley Freeman,Details the 1999 disappearance of Oklahoma tee...,When a mobile home in rural Oklahoma burned in...,Casefile,78 min,"[Disappearance, Double Homicide, Cold Cases, USA]",0,"[(and, 0.06531411265849738), (the, 0.050738143...","[-0.028233984, 0.070574544, 0.04120928, 0.0168...",NEGATIVE
3,CASEFILE-004,Who Put Bella in the Witch Elm,Focuses on the mysterious discovery of a woman...,"One spring day in 1943, boys playing in Hagley...",Casefile,54 min,"[Unsolved, Wartime, Witchcraft, UK]",0,"[(longworth, 0.09210352922598819), (and, 0.082...","[-0.058513053, 0.10916227, -0.018435324, 0.040...",POSITIVE
4,CASEFILE-005,Katherine Knight,The gruesome 2000 murder of John Price by his ...,Katherine Knight’s brutality was legendary eve...,Casefile,79 min,"[Murder, Cannibalism, Australia, Psychopathy]",0,"[(the, 0.10821691302068585), (duncan, 0.076801...","[0.0037221308, 0.06621627, -0.07222428, 0.0478...",POSITIVE
...,...,...,...,...,...,...,...,...,...,...,...
277,YMRT-016,The Children of Hollywood,Explores the unique pressures and challenges o...,"Shares stories of Shirley Temple, Judy Garland...",You Must Remember This,48 min,"[Child Stars, Hollywood, Fame, Family]",3,,"[0.03599285, 0.039195757, 0.012789645, 0.09473...",POSITIVE
278,YMRT-017,Rita Hayworth: Love Goddess,"Charts Rita Hayworth’s controversial career, p...","Explores Hayworth’s breakthrough roles, person...",You Must Remember This,44 min,"[Rita Hayworth, Reinvention, Marriage, Burden]",3,,"[0.051432725, 0.02765081, 0.06407432, -0.04176...",POSITIVE
279,YMRT-018,The Legend of Bogie and Bacall,Recounts the partnership of Humphrey Bogart an...,"Delves into their first meeting, collaboration...",You Must Remember This,40 min,"[Bogart, Bacall, Legend, Romance]",1,,"[-0.029916244, -0.10645205, 0.04197093, 0.0023...",POSITIVE
280,YMRT-019,Mae West: Sex and Censorship,Explores Mae West’s career as a sexual trailbl...,"Chronicles West’s legal struggles, artistic in...",You Must Remember This,40 min,"[Mae West, Censorship, Sex, Laws]",3,,"[0.040137965, -0.059647903, -0.018245239, -0.0...",POSITIVE


In [16]:
import pandas as pd

# Load pickle
df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

# Map Season numbers to actual podcast series based on episodes
series_fix = {
    "Season 1": "Casefile",
    "Season 2": "RedHanded", 
    "Season 3": "The Magnus Archives",
    "Season 4": "Lore",
    "Season 5": "The Rest Is History"
}

# Replace series column
df['series'] = df['series'].replace(series_fix)

# Save fixed pickle
df.to_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

print("✓ Pickle file fixed!")


✓ Pickle file fixed!


In [17]:
import pandas as pd

df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

print("All unique series values and episode counts:")
print(df['series'].value_counts())
print("\nSample episodes from each series:")
for series in df['series'].unique():
    print(f"\n{series}:")
    print(df[df['series'] == series][['episode_id', 'title']].head(2))


All unique series values and episode counts:
series
Casefile               255
RedHanded               13
The Magnus Archives      7
Lore                     3
The Rest Is History      2
                         2
Name: count, dtype: int64

Sample episodes from each series:

Casefile:
     episode_id                    title
0  CASEFILE-001  The Wanda Beach Murders
1  CASEFILE-002         The Somerton Man

RedHanded:
          episode_id                               title
134  REVOLUTIONS-005  Prelude to the American Revolution
135  REVOLUTIONS-006            The Road to Independence

The Magnus Archives:
          episode_id                                              title
138  REVOLUTIONS-009                   Prelude to the French Revolution
139  REVOLUTIONS-010  The Estates-General and the Storming of the Ba...

Lore:
          episode_id                                  title
145  REVOLUTIONS-016  Haitian Revolution: Slaves and Empire
146  REVOLUTIONS-017   Toussaint Louverture

In [18]:
import pandas as pd

df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

# Fix series based on episode_id prefix
def fix_series(row):
    ep_id = str(row['episode_id']).upper()
    if ep_id.startswith('CASEFILE'):
        return 'Casefile'
    elif ep_id.startswith('REDHANDED'):
        return 'RedHanded'
    elif ep_id.startswith('MAGNUS'):
        return 'The Magnus Archives'
    elif ep_id.startswith('LORE'):
        return 'Lore'
    elif ep_id.startswith('REST'):
        return 'The Rest Is History'
    elif ep_id.startswith('REVOLUTIONS'):
        return 'Revolutions'
    elif ep_id.startswith('YMRT'):
        return 'You Must Remember This'
    elif ep_id.startswith('SCIENCE'):
        return 'Science Vs'
    elif ep_id.startswith('STARTALK'):
        return 'StarTalk Radio'
    elif ep_id.startswith('FREAK'):
        return 'Freakonomics Radio'
    elif ep_id.startswith('ADVENTURE'):
        return 'The Adventure Podcast'
    elif ep_id.startswith('TERRA'):
        return 'Terra Incognita: The Adventure Podcast'
    elif ep_id.startswith('JRE') or 'ROGAN' in ep_id:
        return 'The Joe Rogan Experience'
    elif ep_id.startswith('CONAN'):
        return "Conan O'Brien Needs a Friend"
    else:
        return row['series']  # Keep original if no match

df['series'] = df.apply(fix_series, axis=1)

# Save fixed pickle
df.to_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

print("✓ Series column fixed based on episode IDs!")
print("\nNew series counts:")
print(df['series'].value_counts())


✓ Series column fixed based on episode IDs!

New series counts:
series
Casefile                                  30
Conan O'Brien Needs a Friend              30
RedHanded                                 30
The Adventure Podcast                     30
Freakonomics Radio                        20
Lore                                      20
Revolutions                               20
Science Vs                                20
StarTalk Radio                            20
Terra Incognita: The Adventure Podcast    20
The Joe Rogan Experience                  20
You Must Remember This                    20
                                           2
Name: count, dtype: int64


In [19]:
import pandas as pd

# Load your fixed pickle
df = pd.read_pickle(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\Testing Ui\podcast_episodes_with_topics_embeddings.pkl")

# Series to link mapping
series_map = {
    "Casefile": "https://podcasts.apple.com/gb/podcast/casefile-true-crime/id998568017",
    "RedHanded": "https://podcasts.apple.com/gb/podcast/redhanded/id1250599915",
    "The Magnus Archives": "https://podcasts.apple.com/gb/podcast/the-magnus-archives/id1095138637",
    "Lore": "https://podcasts.apple.com/gb/podcast/lore/id978052928",
    "The Rest Is History": "https://podcasts.apple.com/gb/podcast/the-rest-is-history/id1537788786",
    "You Must Remember This": "https://podcasts.apple.com/gb/podcast/you-must-remember-this/id858124601",
    "Revolutions": "https://podcasts.apple.com/gb/podcast/revolutions/id703889772",
    "Science Vs": "https://podcasts.apple.com/gb/podcast/science-vs/id1051557000",
    "StarTalk Radio": "https://podcasts.apple.com/gb/podcast/startalk-radio/id325404506",
    "Freakonomics Radio": "https://podcasts.apple.com/gb/podcast/freakonomics-radio/id354668519",
    "The Adventure Podcast": "https://podcasts.apple.com/gb/podcast/the-adventure-podcast/id1446862825",
    "Terra Incognita: The Adventure Podcast": "https://podcasts.apple.com/gb/podcast/terra-incognita-by-sara-wheeler-free-audiobook/id1818794475",
    "The Joe Rogan Experience": "https://podcasts.apple.com/gb/podcast/the-joe-rogan-experience/id360084272",
    "Conan O'Brien Needs a Friend": "https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347"
}

# Map links
df['audio_link'] = df['series'].map(series_map).fillna("")

# Test: Show sample episodes from each podcast with their links
print("=== LINK TEST: Sample Episodes from Each Podcast ===\n")
for series in df['series'].unique():
    if series and series.strip():  # Skip blank series
        sample = df[df['series'] == series].iloc[0]
        link = sample['audio_link']
        print(f"✓ {series}")
        print(f"  Episode: {sample['title']}")
        print(f"  Link: {link}")
        if not link:
            print("  ⚠️ WARNING: No link found!")
        print()

# Summary
total_episodes = len(df)
episodes_with_links = df['audio_link'].str.len().gt(0).sum()
episodes_without_links = total_episodes - episodes_with_links

print("=== SUMMARY ===")
print(f"Total episodes: {total_episodes}")
print(f"Episodes with links: {episodes_with_links}")
print(f"Episodes without links: {episodes_without_links}")
print(f"\n✓ All podcasts correctly mapped!" if episodes_without_links <= 2 else "⚠️ Some episodes missing links")


=== LINK TEST: Sample Episodes from Each Podcast ===

✓ Casefile
  Episode: The Wanda Beach Murders
  Link: https://podcasts.apple.com/gb/podcast/casefile-true-crime/id998568017

✓ Conan O'Brien Needs a Friend
  Episode: Will Ferrell
  Link: https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347

✓ Freakonomics Radio
  Episode: The Dangers of Safety
  Link: https://podcasts.apple.com/gb/podcast/freakonomics-radio/id354668519

✓ Lore
  Episode: Black Stockings
  Link: https://podcasts.apple.com/gb/podcast/lore/id978052928

✓ RedHanded
  Episode: Murder of Jessie Earl
  Link: https://podcasts.apple.com/gb/podcast/redhanded/id1250599915

✓ Revolutions
  Episode: The English Revolution I: King and Parliament
  Link: https://podcasts.apple.com/gb/podcast/revolutions/id703889772

✓ Science Vs
  Episode: Science Vs: Organic Food - What’s the Real Difference?
  Link: https://podcasts.apple.com/gb/podcast/science-vs/id1051557000

✓ StarTalk Radio
  Episode: Science of St

In [21]:
podcast_details = {
    "The Rest Is History": {
        "title": "The Rest Is History",
        "url": "https://podcasts.apple.com/gb/podcast/the-rest-is-history/id1537788786",
        "thumbnail": "https://is3-ssl.mzstatic.com/image/thumb/Podcasts116/v4/6c/73/66/6c736624-6e43-675e-9244-cf6eb101455d/mza_16431944777842192543.jpg/600x600bb.jpg",
        "genres": "History, Deep Dives, Documentary",
        "duration": "50 min, 2x weekly",
        "release_date": "Nov 2020 – present",
        "creators": "Tom Holland, Dominic Sandbrook",
        "hosts": "Tom Holland, Dominic Sandbrook",
        "rating": "9.7 IMDb, 11M+ monthly, UK Top 10",
        "summary": "Irreverent and enthusiastic deep-dives on history with expert guests, contemporary pop culture context and in-depth explorations.",
        "more_info_url": "https://en.wikipedia.org/wiki/The_Rest_Is_History_(podcast)"
    },
    "You Must Remember This": {
        "title": "You Must Remember This",
        "url": "https://podcasts.apple.com/gb/podcast/you-must-remember-this/id858124601",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts125/v4/c7/34/db/c734db94-7ee2-d275-6236-9112c50e1ee9/mza_9334009523243857506.jpg/600x600bb.jpg",
        "genres": "History, Hollywood, Film, Cultural Commentary",
        "duration": "40–60+ min, varies",
        "release_date": "Apr 2014 – present",
        "creators": "Karina Longworth",
        "hosts": "Karina Longworth",
        "rating": "4.8⭐, critical acclaim",
        "summary": "Exploring the secret and/or forgotten histories of Hollywood’s first century. Deep-dive storytelling, thoroughly researched and narrated.",
        "more_info_url": "http://www.youmustrememberthispodcast.com"
    },
    "Revolutions": {
        "title": "Revolutions",
        "url": "https://podcasts.apple.com/us/podcast/revolutions/id703889772",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts123/v4/65/29/e5/6529e52a-f477-a81b-c3ee-ac112d7b7613/mza_2294281234174034732.jpeg/600x600bb.jpg",
        "genres": "History, Political History, Documentary",
        "duration": "~32 min",
        "release_date": "Sep 2013 – present",
        "creators": "Mike Duncan",
        "hosts": "Mike Duncan",
        "rating": "4.9⭐, 26K+ reviews",
        "summary": "Major historical revolutions of the modern world—each season covers a different revolution in exceptional depth.",
        "more_info_url": "https://podnews.net/podcast/iahz"
    },
    "Freakonomics Radio": {
        "title": "Freakonomics Radio",
        "url": "https://podcasts.apple.com/zw/podcast/freakonomics-radio/id354668519",
        "thumbnail": "https://is2-ssl.mzstatic.com/image/thumb/Podcasts126/v4/5e/eb/5a/5eeb5af8-426e-b637-b11e-03520b7d4b6e/mza_10716374623723333712.jpg/600x600bb.jpg",
        "genres": "Economics, Social Science, Science, Human Behavior",
        "duration": "Weekly, 58 min",
        "release_date": "Sep 2010 – present",
        "creators": "Stephen J. Dubner, Steven Levitt",
        "hosts": "Stephen J. Dubner",
        "rating": "Millions/downloads, Top Apple Podcast, 4.7⭐",
        "summary": "Uncovers the hidden side of everything—behavior, economics, life, and society. Mixing stories, science, and statistics.",
        "more_info_url": "https://en.wikipedia.org/wiki/Freakonomics_Radio"
    },
    "Science Vs": {
        "title": "Science Vs",
        "url": "https://open.spotify.com/show/5lY4b5PGOvMuOYOjOVEcb9",
        "thumbnail": "https://is2-ssl.mzstatic.com/image/thumb/Podcasts115/v4/21/8e/af/218eafda-d948-814f-413a-463e0c56543f/mza_9891187203477542083.jpeg/600x600bb.jpg",
        "genres": "Science, Myth Busting, Health, Debate",
        "duration": "~40 min",
        "release_date": "2015 – present",
        "creators": "Wendy Zukerman",
        "hosts": "Wendy Zukerman",
        "rating": "Popular, Spotify Original",
        "summary": "Pits facts against fads and fiction. Myth-busting science podcast—current events, wellness, and trends.",
        "more_info_url": "https://en.wikipedia.org/wiki/Science_Vs"
    },
    "StarTalk Radio": {
        "title": "StarTalk Radio",
        "url": "https://podcasts.apple.com/gb/podcast/startalk-radio/id325404506",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts126/v4/24/01/3c/24013c53-91f6-896d-a407-77130f4efa41/mza_11568007232803842427.jpg/600x600bb.jpg",
        "genres": "Science, Astronomy, Pop Culture, Comedy",
        "duration": "Weekly, ~1 hr",
        "release_date": "2009 – present",
        "creators": "Neil deGrasse Tyson, Startalk Media",
        "hosts": "Neil deGrasse Tyson, guests",
        "rating": "Global, 4.6⭐, 700+ reviews",
        "summary": "Where science, pop culture, and comedy collide—hosted by astrophysicist Neil deGrasse Tyson and diverse guests.",
        "more_info_url": "https://startalkmedia.com/about-us/"
    },
    "The Adventure Podcast": {
        "title": "The Adventure Podcast",
        "url": "https://podcasts.apple.com/gb/podcast/the-adventure-podcast/id1446862825",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts116/v4/37/b0/ed/37b0edef-eee2-1fd9-d45f-2dde564d7db7/mza_11902358472217933787.jpg/600x600bb.jpg",
        "genres": "Adventure, Nature, Exploration, Travel",
        "duration": "~60 min",
        "release_date": "2018 – present",
        "creators": "Matt Pycroft, Coldhouse Collective",
        "hosts": "Matt Pycroft",
        "rating": "4.7⭐, 290+ Apple reviews",
        "summary": "Long-form conversations with adventurers and explorers; storytellers reveal what drives them to extraordinary acts.",
        "more_info_url": "https://shows.acast.com/the-adventure-podcast/about"
    },
    "Terra Incognita: The Adventure Podcast": {
        "title": "Terra Incognita by Sara Wheeler (Audiobook)",
        "url": "https://podcasts.apple.com/gb/podcast/terra-incognita-by-sara-wheeler-free-audiobook/id1818794475?i=1000711666862",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts113/v4/46/81/05/468105fc-85e3-5e49-9d8d-6a0d9ee8abfc/mza_7246351934581444100.jpg/600x600bb.jpg",
        "genres": "Adventure, Essays, Travel, Audiobook",
        "duration": "13h 42m (audiobook)",
        "release_date": "Dec 7, 2011",
        "creators": "Sara Wheeler, Whole Story Audiobooks",
        "hosts": "Narrator: Patricia Gallimore",
        "rating": "N/A",
        "summary": "Sara Wheeler recounts months in Antarctica—the wild, beautiful, and harsh continent—blending essay, memoir, and reportage.",
        "more_info_url": "https://hotaudiobook.com/free"
    },
    "The Joe Rogan Experience": {
        "title": "The Joe Rogan Experience",
        "url": "https://open.spotify.com/show/4rOoJ6Egrf8K2IrywzwOMk",
        "thumbnail": "https://is2-ssl.mzstatic.com/image/thumb/Podcasts114/v4/02/0e/10/020e10a5-e676-f77a-80e3-2743e38e7865/mza_3006026283836933792.jpg/600x600bb.jpg",
        "genres": "Comedy, Society, Interviews",
        "duration": "2-4+ hrs/episode",
        "release_date": "Dec 2009 – present",
        "creators": "Joe Rogan",
        "hosts": "Joe Rogan",
        "rating": "Top global charts, millions of listeners",
        "summary": "Long-form interviews and freewheeling conversations on every topic imaginable—comedy, science, culture, politics, health.",
        "more_info_url": "https://en.wikipedia.org/wiki/The_Joe_Rogan_Experience"
    },
    "Conan O'Brien Needs a Friend": {
        "title": "Conan O'Brien Needs a Friend",
        "url": "https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347",
        "thumbnail": "https://is1-ssl.mzstatic.com/image/thumb/Podcasts125/v4/84/2c/3d/842c3dea-63c0-abe2-5965-c09ef96356c7/mza_18208937749951826714.jpg/600x600bb.jpg",
        "genres": "Comedy, Pop Culture, Interviews",
        "duration": "50–90 min/episode",
        "release_date": "Nov 2018 – present",
        "creators": "Team Coco, Earwolf",
        "hosts": "Conan O'Brien, Matt Gourley, Sona Movsesian",
        "rating": "Top Apple Podcast, 4.8⭐",
        "summary": "Warm, deep, funny celebrity interviews and banter; every episode is a mix of laughs, heart, and insight.",
        "more_info_url": "https://en.wikipedia.org/wiki/Conan_O'Brien_Needs_a_Friend"
    }
}


In [4]:
list(df.columns)

['episode_id',
 'title',
 'release_date',
 'summary',
 'series',
 'length',
 'utterances',
 'transcript',
 'transcript_link',
 'audio_link',
 'topics',
 'podcast_id',
 'creators',
 'provider',
 'category',
 'language',
 'country',
 'years_active',
 'description',
 'website',
 'episodes']

In [1]:
import base64

# Load image and encode to base64
with open(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\pod.png", "rb") as imgf:
    img_bytes = imgf.read()
img_b64 = base64.b64encode(img_bytes).decode()
img_src = f"data:image/png;base64,{img_b64}"
