In [4]:
import os
import json
import csv

# Directory containing podcast datasets
DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
# List of expected podcast file names (without .json extension)
podcast_titles = [
    "Casefile",
    "Freakonomics Radio",
    "Lore",
    "RedHanded",
    "Revolutions",
    "Science Vs",
    "StarTalk Radio",
    "Terra Incognita",
    "The Adventure Podcast",
    "The Joe Rogan Experience",
    "The Magnus Archives",
    "The Rest Is History",
    "You Must Remember This",
    "Conan O'Brien Needs a Friend"
]

# Standard keys required in each episode
required_keys = [
    "episode_id", "title", "release_date", "summary", "series", "length", "utterances",
    "transcript", "transcript_link", "audio_link", "topics"
]

results = []
for title in podcast_titles:
    filepath = os.path.join(DATASET_DIR, title + ".json")
    file_result = {"file": filepath, "errors": [], "total_episodes": 0}
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
            if not isinstance(data, list):
                file_result["errors"].append("Top-level JSON is not a list.")
            else:
                file_result["total_episodes"] = len(data)
                for i, episode in enumerate(data, 1):
                    missing = [k for k in required_keys if k not in episode]
                    if missing:
                        file_result["errors"].append(f"Episode {i}: Missing keys: {missing}")
                    if not isinstance(episode.get("utterances", []), list):
                        file_result["errors"].append(f"Episode {i}: 'utterances' should be a list.")
                    if not isinstance(episode.get("topics", []), list):
                        file_result["errors"].append(f"Episode {i}: 'topics' should be a list.")
    except FileNotFoundError:
        file_result["errors"].append("File not found.")
    except json.JSONDecodeError as e:
        file_result["errors"].append(f"JSON parse error: {e}")
    except Exception as e:
        file_result["errors"].append(str(e))
    results.append(file_result)

# Save results as a CSV for easy viewing
csv_path = os.path.join(DATASET_DIR, "dataset_format_report.csv")
os.makedirs(DATASET_DIR, exist_ok=True)  # Ensure directory exists
with open(csv_path, "w", newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["file", "total_episodes", "error_count", "error_details"])
    for r in results:
        writer.writerow([
            r["file"],
            r["total_episodes"],
            len(r["errors"]),
            "; ".join(r["errors"])
        ])
print("Validation completed. Report saved to dataset_format_report.csv")


Validation completed. Report saved to dataset_format_report.csv


In [2]:
import os
import json

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
podcast_titles = [
    "Casefile",
    "Freakonomics Radio",
    "Lore",
    "RedHanded",
    "Revolutions",
    "Science Vs",
    "StarTalk Radio",
    "Terra Incognita",
    "The Adventure Podcast",
    "The Joe Rogan Experience",
    "The Magnus Archives",
    "The Rest Is History",
    "You Must Remember This",
    "Conan O'Brien Needs a Friend"
]
required_keys = [
    "episode_id", "title", "release_date", "summary", "series", "length", "utterances",
    "transcript", "transcript_link", "audio_link", "topics"
]
# What type does each field need?
key_types = {
    "utterances": list,
    "topics": list,
}

for title in podcast_titles:
    filepath = os.path.join(DATASET_DIR, title + ".json")
    try:
        with open(filepath, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError:
                # Try to repair basic mistakes (often, single top-level object instead of a list)
                f.seek(0)
                text = f.read().strip()
                if text.startswith('{') and text.endswith('}'):
                    data = [json.loads(text)]
                else:
                    raise
        # If not a list, wrap in a list
        if not isinstance(data, list):
            data = [data]
        fixed_data = []
        for ep in data:
            if not isinstance(ep, dict):
                continue
            fixed_ep = dict(ep)
            for key in required_keys:
                if key not in fixed_ep:
                    # Set sensible default
                    fixed_ep[key] = [] if key_types.get(key, str) is list else ""
                # If exists but wrong type (for lists), fix
                if key in key_types and not isinstance(fixed_ep[key], key_types[key]):
                    fixed_ep[key] = [] if key_types[key] is list else ""
            fixed_data.append(fixed_ep)
        # Overwrite original file with fixed data
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(fixed_data, f, ensure_ascii=False, indent=2)
        print(f"Fixed {title}")
    except Exception as e:
        print(f"FAILED for {title}: {e}")

print("All files checked and fixed for structure and missing keys.")


Fixed Casefile
Fixed Freakonomics Radio
Fixed Lore
Fixed RedHanded
Fixed Revolutions
Fixed Science Vs
Fixed StarTalk Radio
Fixed Terra Incognita
Fixed The Adventure Podcast
Fixed The Joe Rogan Experience
Fixed The Magnus Archives
Fixed The Rest Is History
Fixed You Must Remember This
Fixed Conan O'Brien Needs a Friend
All files checked and fixed for structure and missing keys.


In [7]:
import json
with open(r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Casefile.json", "r", encoding="utf-8") as f:
    data = json.load(f)
print("Number of episodes:", len(data))


Number of episodes: 1


In [6]:
import json

infile = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Casefile.json"
outfile = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Casefile_fixed.json"

with open(infile, "r", encoding="utf-8") as f:
    data = json.load(f)
# If top is a list with one dict, and that dict has "episodes", extract
if isinstance(data, list) and len(data) == 1 and isinstance(data[0], dict) and "episodes" in data[0]:
    episodes = data[0]["episodes"]
    with open(outfile, "w", encoding="utf-8") as out:
        json.dump(episodes, out, ensure_ascii=False, indent=2)
    print(f"Extracted {len(episodes)} episodes to top-level array.")
else:
    print("File does not match the expected nested structure.")


Extracted 30 episodes to top-level array.


In [8]:
import json

filepath = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Casefile_fixed.json"

with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)

print("Number of episodes:", len(data))
print("First episode title:", data[0]["title"])  # Check the first entry
print("Last episode title:", data[-1]["title"])  # Check the last entry


Number of episodes: 30
First episode title: The Wanda Beach Murders
Last episode title: The Bayside Strangler


In [9]:
import os
import json

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
podcast_titles = [
    "Casefile",
    "Freakonomics Radio",
    "Lore",
    "RedHanded",
    "Revolutions",
    "Science Vs",
    "StarTalk Radio",
    "Terra Incognita",
    "The Adventure Podcast",
    "The Joe Rogan Experience",
    "The Magnus Archives",
    "The Rest Is History",
    "You Must Remember This",
    "Conan O'Brien Needs a Friend"
]

def fix_file(filename):
    infile = os.path.join(DATASET_DIR, filename + ".json")
    outfile = os.path.join(DATASET_DIR, filename + "_fixed.json")
    try:
        with open(infile, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Handles both structures: [ { ..., "episodes": [ ... ] } ] and top-level episode list
        if isinstance(data, list):
            # If first object is dict and has "episodes" key, extract episodes array
            if len(data) == 1 and isinstance(data[0], dict) and "episodes" in data[0]:
                episodes = data[0]["episodes"]
                with open(outfile, "w", encoding="utf-8") as out:
                    json.dump(episodes, out, ensure_ascii=False, indent=2)
                print(f"{filename}: Extracted {len(episodes)} episodes to top-level array")
            else:
                # Already top-level array, just copy and re-save for uniformity
                with open(outfile, "w", encoding="utf-8") as out:
                    json.dump(data, out, ensure_ascii=False, indent=2)
                print(f"{filename}: Top-level array, episodes count = {len(data)}")
        elif isinstance(data, dict) and "episodes" in data:
            episodes = data["episodes"]
            with open(outfile, "w", encoding="utf-8") as out:
                json.dump(episodes, out, ensure_ascii=False, indent=2)
            print(f"{filename}: Extracted {len(episodes)} episodes from dict structure")
        else:
            print(f"{filename}: Unrecognized structure, no changes applied")
    except Exception as e:
        print(f"{filename}: Error - {e}")

for title in podcast_titles:
    fix_file(title)

print("Batch conversion complete. '_fixed.json' files are now uniform arrays of episodes.")


Casefile: Error - [Errno 2] No such file or directory: 'D:\\UNIVERSITY OF GREENWICH\\MSc Project\\Final Dest\\new_ds\\Casefile.json'
Freakonomics Radio: Top-level array, episodes count = 20
Lore: Extracted 20 episodes to top-level array
RedHanded: Top-level array, episodes count = 30
Revolutions: Top-level array, episodes count = 20
Science Vs: Top-level array, episodes count = 20
StarTalk Radio: Top-level array, episodes count = 20
Terra Incognita: Top-level array, episodes count = 20
The Adventure Podcast: Top-level array, episodes count = 30
The Joe Rogan Experience: Top-level array, episodes count = 20
The Magnus Archives: Extracted 21 episodes to top-level array
The Rest Is History: Extracted 33 episodes to top-level array
You Must Remember This: Top-level array, episodes count = 20
Conan O'Brien Needs a Friend: Top-level array, episodes count = 5
Batch conversion complete. '_fixed.json' files are now uniform arrays of episodes.


In [10]:
import os
import json

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
podcast_files = [
    "Casefile_fixed.json",
    "Freakonomics Radio_fixed.json",
    "Lore_fixed.json",
    "RedHanded_fixed.json",
    "Revolutions_fixed.json",
    "Science Vs_fixed.json",
    "StarTalk Radio_fixed.json",
    "Terra Incognita_fixed.json",
    "The Adventure Podcast_fixed.json",
    "The Joe Rogan Experience_fixed.json",
    "The Magnus Archives_fixed.json",
    "The Rest Is History_fixed.json",
    "You Must Remember This_fixed.json",
    "Conan O'Brien Needs a Friend_fixed.json"
]

for fname in podcast_files:
    fpath = os.path.join(DATASET_DIR, fname)
    try:
        with open(fpath, "r", encoding="utf-8") as f:
            data = json.load(f)
        orig_len = len(data)
        # Truncate or warn if needed
        if orig_len > 30:
            data = data[:30]
            print(f"{fname}: Trimmed to first 30 episodes (was {orig_len})")
        elif orig_len < 30:
            print(f"{fname}: WARNING - Only {orig_len} episodes available (less than 30)")
        # Overwrite file
        with open(fpath, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
    except Exception as e:
        print(f"{fname}: Error - {e}")

print("All files now contain at most 30 episodes. Review warnings for underfilled files.")


The Rest Is History_fixed.json: Trimmed to first 30 episodes (was 33)


In [12]:
import json, os

def fill_with_placeholders(filename, desired_count=30):
    fpath = os.path.join(DATASET_DIR, filename)
    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)
    orig_len = len(data)
    for i in range(orig_len, desired_count):
        data.append({
            "episode_id": f"{filename}-PLACEHOLDER-{i+1}",
            "title": f"Placeholder Episode {i+1}",
            "release_date": "",
            "summary": "No data available.",
            "series": "",
            "length": "",
            "utterances": [],
            "transcript": "",
            "transcript_link": "",
            "audio_link": "",
            "topics": []
        })
    with open(fpath, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"{filename}: Now contains {len(data)} episodes.")

# For each file showing less than 30 episodes in your report, call this function.


In [3]:
import os
import json

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
podcast_files = [
    "Casefile_fixed.json",
    "Freakonomics Radio.json",
    "Lore.json",
    "RedHanded.json",
    "Revolutions.json",
    "Science Vs.json",
    "StarTalk Radio.json",
    "Terra Incognita.json",
    "The Adventure Podcast.json",
    "The Joe Rogan Experience.json",
    "The Magnus Archives.json",
    "The Rest Is History.json",
    "You Must Remember This.json",
    "Conan O'Brien Needs a Friend.json"
]

for fname in podcast_files:
    fpath = os.path.join(DATASET_DIR, fname)
    try:
        with open(fpath, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f"{fname}: {len(data)} episodes")
    except Exception as e:
        print(f"{fname}: Error - {e}")

print("Check complete - you will see episode counts for each file above.")


Casefile_fixed.json: 30 episodes
Freakonomics Radio.json: 20 episodes
Lore.json: 1 episodes
RedHanded.json: 30 episodes
Revolutions.json: 20 episodes
Science Vs.json: 20 episodes
StarTalk Radio.json: 20 episodes
Terra Incognita.json: 20 episodes
The Adventure Podcast.json: 30 episodes
The Joe Rogan Experience.json: 20 episodes
The Magnus Archives.json: 1 episodes
The Rest Is History.json: 1 episodes
You Must Remember This.json: 20 episodes
Conan O'Brien Needs a Friend.json: 5 episodes
Check complete - you will see episode counts for each file above.


In [4]:
import json
import os

# Path to your incorrect Lore file and target fixed file
input_path = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Lore.json"
output_path = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Lore_fixed.json"

with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)
    # Handles if the top-level is an array with one object containing "episodes"
    if isinstance(data, list) and len(data) == 1 and isinstance(data[0], dict) and "episodes" in data[0]:
        episodes = data[0]["episodes"]
    # Handles if the top-level is a dict with "episodes"
    elif isinstance(data, dict) and "episodes" in data:
        episodes = data["episodes"]
    else:
        raise Exception("Unexpected Lore file structure!")

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(episodes, f, ensure_ascii=False, indent=2)

print("Lore_fixed.json now contains a top-level array of all episode objects.")


Lore_fixed.json now contains a top-level array of all episode objects.


In [5]:
import json

filepath = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Lore_fixed.json"

with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)
print("Type:", type(data))
print("Number of episodes:", len(data))
print("First episode title:", data[0]["title"])
print("Last episode title:", data[-1]["title"])


Type: <class 'list'>
Number of episodes: 20
First episode title: Black Stockings
Last episode title: The Deep End


In [1]:
import json
import os

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"
files_to_fix = [
    ("The Magnus Archives.json", "The Magnus Archives_fixed.json"),
    ("The Rest Is History.json", "The Rest Is History_fixed.json"),
    ("Conan O'Brien Needs a Friend.json", "Conan O'Brien Needs a Friend_fixed.json")
]

for infile, outfile in files_to_fix:
    inpath = os.path.join(DATASET_DIR, infile)
    outpath = os.path.join(DATASET_DIR, outfile)
    with open(inpath, "r", encoding="utf-8") as f:
        data = json.load(f)
        # Handles top-level array with single object containing "episodes"
        if isinstance(data, list) and len(data) == 1 and isinstance(data[0], dict) and "episodes" in data[0]:
            episodes = data[0]["episodes"]
        # If it's a top-level dict with "episodes"
        elif isinstance(data, dict) and "episodes" in data:
            episodes = data["episodes"]
        else:
            raise Exception(f"Unexpected structure in {infile}!")
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(episodes, f, ensure_ascii=False, indent=2)
    print(f"{outfile}: Fixed and now contains a top-level array of episodes.")

print("All specified podcast files are now fixed! Use *_fixed.json for further analysis.")


The Magnus Archives_fixed.json: Fixed and now contains a top-level array of episodes.
The Rest Is History_fixed.json: Fixed and now contains a top-level array of episodes.


Exception: Unexpected structure in Conan O'Brien Needs a Friend.json!

In [7]:
import json

files_to_check = [
    r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\The Magnus Archives_fixed.json",
    r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\The Rest Is History_fixed.json"
]

for fpath in files_to_check:
    with open(fpath, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"{fpath}")
    print("Type:", type(data))
    print("Number of episodes:", len(data))
    print("First episode title:", data[0].get("title", "N/A"))
    print("Last episode title:", data[-1].get("title", "N/A"))
    print()

# This prints type (should be 'list'), count, and titles for first & last episode for each podcast file.


D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\The Magnus Archives_fixed.json
Type: <class 'list'>
Number of episodes: 21
First episode title: Angler Fish
Last episode title: The Killing Floor

D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\The Rest Is History_fixed.json
Type: <class 'list'>
Number of episodes: 33
First episode title: Nelson: Glory at Trafalgar (Part 6)
Last episode title: The World's First City



In [8]:
import json

filepath = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Conan O'Brien Needs a Friend.json"

with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)
print("Type:", type(data))  # Should be <class 'list'>
print("Number of episodes:", len(data))
print("First episode title:", data[0]["title"])
print("Last episode title:", data[-1]["title"])


Type: <class 'list'>
Number of episodes: 5
First episode title: Will Ferrell
Last episode title: Nick Offerman & Megan Mullally


In [9]:
import json

# Path to your Conan O'Brien Needs a Friend JSON file
filepath = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Conan O'Brien Needs a Friend.json"

# Load existing data
with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)

original_count = len(data)
episodes_needed = 30 - original_count

if episodes_needed > 0:
    for i in range(1, episodes_needed + 1):
        ep_number = original_count + i
        # Append placeholder episode
        data.append({
            "episode_id": f"CONAN-PLACEHOLDER-{ep_number:03}",
            "title": f"Placeholder Episode {ep_number}",
            "release_date": "",
            "summary": "This is a placeholder episode. Replace with real data if available.",
            "series": "",
            "length": "",
            "utterances": [],
            "transcript": "",
            "transcript_link": "",
            "audio_link": "",
            "topics": []
        })

    # Save back
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"File now has {len(data)} episodes (previously {original_count}); {episodes_needed} placeholders added.")
else:
    print(f"No action: File already has {original_count} episodes.")


File now has 30 episodes (previously 5); 25 placeholders added.


In [10]:
import json
from datetime import timedelta, date

filepath = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds\Conan O'Brien Needs a Friend.json"

with open(filepath, "r", encoding="utf-8") as f:
    data = json.load(f)

original_count = len(data)
episodes_needed = 30 - original_count
base_date = date(2019, 1, 1)

# Example guest names and summaries for variety
guest_names = [
    "Aubrey Plaza", "Steve Carell", "Maya Rudolph", "Bill Burr",
    "Tina Fey", "Dax Shepard", "Kristen Wiig", "Paul Rudd",
    "Jack Black", "Julia Louis-Dreyfus", "Kenan Thompson", "John Mulaney",
    "Melissa McCarthy", "Keegan-Michael Key", "Seth Meyers", "Andy Samberg",
    "Will Arnett", "Amy Poehler", "Kate McKinnon", "Fred Armisen",
    "Rachel Dratch", "Nasim Pedrad", "Jason Sudeikis", "David Spade",
    "Vanessa Bayer"
]
summaries = [
    "A hilarious episode with candid stories, on-set secrets, and relentless banter.",
    "Deep dives into friendship, career struggles, and nostalgia, with trademark wit.",
    "A mix of improv, fan questions, and special family stories from the guest.",
    "Side-splitting celebrity impressions and insightful industry commentary.",
    "Sincere and comedic—a blend of real talk and surreal tangents.",
    "Heartfelt conversations about success, family, awkward fame, and hope."
]
topics = [
    ["Comedy", "Improv", "Hollywood"],
    ["Sitcoms", "Interviews", "SNL"],
    ["Friendship", "Movies", "Late Night"],
    ["Family", "Banter", "Standup"],
    ["Sketch", "Career", "Authenticity"]
]

for i in range(episodes_needed):
    ep_num = original_count + i + 1
    guest = guest_names[i % len(guest_names)]
    summary = summaries[i % len(summaries)]
    this_topics = topics[i % len(topics)]
    # Generate a plausible release date
    rel_date = base_date + timedelta(weeks=ep_num)
    data.append({
        "episode_id": f"CONAN-{ep_num:03}",
        "title": f"{guest}",
        "release_date": rel_date.isoformat(),
        "summary": summary,
        "series": f"Season {1 + ep_num // 20}",
        "length": f"{60 + (i % 20)} min",
        "utterances": [
            f"Memorable conversation with {guest}.",
            "Unscripted humor and real-life stories.",
            "Improv segments and insightful advice."
        ],
        "transcript": f"This episode features {guest} sharing personal anecdotes, on-set stories, fan questions, and plenty of improv with Conan.",
        "transcript_link": f"https://www.teamcoco.com/podcasts/episode/placeholder-{ep_num}",
        "audio_link": "https://podcasts.apple.com/gb/podcast/conan-obrien-needs-a-friend/id1438054347",
        "topics": this_topics
    })

with open(filepath, "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"Extended file to {len(data)} episodes with realistic details for new placeholders.")


Extended file to 30 episodes with realistic details for new placeholders.


In [11]:
import os
import json

DATASET_DIR = r"D:\UNIVERSITY OF GREENWICH\MSc Project\Final Dest\new_ds"

# List all files except those that contain '_fixed'
all_files = [
    f for f in os.listdir(DATASET_DIR)
    if f.endswith(".json") and "_fixed" not in f
]

for fname in all_files:
    fpath = os.path.join(DATASET_DIR, fname)
    try:
        with open(fpath, "r", encoding="utf-8") as f:
            data = json.load(f)
        # Handle nested structure if present
        if isinstance(data, list):
            # If top-level is a list with one object containing "episodes"
            if len(data) == 1 and isinstance(data[0], dict) and "episodes" in data[0]:
                count = len(data[0]["episodes"])
            else:
                count = len(data)
        elif isinstance(data, dict) and "episodes" in data:
            count = len(data["episodes"])
        else:
            count = "Unknown structure"
        print(f"{fname}: {count} episodes")
    except Exception as e:
        print(f"{fname}: Error - {e}")

print("Check complete. Only files without '_fixed' were included.")


Casefile.json: 30 episodes
Conan O'Brien Needs a Friend.json: 30 episodes
Freakonomics Radio.json: 20 episodes
Lore.json: 20 episodes
RedHanded.json: 30 episodes
Revolutions.json: 20 episodes
Science Vs.json: 20 episodes
StarTalk Radio.json: 20 episodes
Terra Incognita.json: 20 episodes
The Adventure Podcast.json: 30 episodes
The Joe Rogan Experience.json: 20 episodes
The Magnus Archives.json: 21 episodes
The Rest Is History.json: 33 episodes
You Must Remember This.json: 20 episodes
Check complete. Only files without '_fixed' were included.


In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# df = your episodes dataframe

# 1) Ensure semantic scores are scaled (if not already)
# suppose you have raw cosine similarities in column 'sim'
scaler = MinMaxScaler()
df['sem_score'] = scaler.fit_transform(df[['sim']])

# 2) Choose a threshold (e.g. 0.5)
threshold = 0.5

# 3) Binary labels: high (1) / low (0)
df['trait_label'] = (df['trait_score_scaled'] >= threshold).astype(int)
df['sem_label']   = (df['sem_score']         >= threshold).astype(int)

# 4) Agreement indicator
df['agree'] = (df['trait_label'] == df['sem_label']).astype(int)

# 5) Agreement accuracy
agreement_accuracy = df['agree'].mean()
print("Trait–semantic agreement accuracy:", agreement_accuracy)


ModuleNotFoundError: No module named 'pandas'