In [None]:
# Install the third-party packages imported below (cohere, python-dotenv, tqdm).
# NOTE(review): pandas is imported by this notebook but not installed here —
# presumably it is already present in the kernel environment; confirm.
%pip install cohere python-dotenv tqdm

In [None]:
import pandas as pd
import cohere
from dotenv import load_dotenv
import os
from tqdm import tqdm

# Read variables from a local .env file into the process environment,
# then look up the Cohere key (None when the variable is not set).
load_dotenv()
api_key = os.getenv("COHERE_API_KEY")

# Report the outcome; a falsy value covers both a missing and an empty key.
if not api_key:
    print("API key not found. Make sure .env file is present and contains COHERE_API_KEY.")
else:
    print("API key loaded successfully.")

In [None]:
# Build the shared Cohere client from the key read above; later cells call
# `co.chat(...)` through this object.
co = cohere.Client(api_key=api_key)
print("Cohere client initialized.")

In [None]:
# Location of the raw transcript CSV (update path if needed).
csv_path = "../data/youtube_transcripts.csv"

# Read the file, discard every row that lacks a topic, title, or transcript,
# and renumber the index so it is contiguous again.
df = (
    pd.read_csv(csv_path)
    .dropna(subset=["topic", "title", "transcript"])
    .reset_index(drop=True)
)

print(f"Loaded dataset with {len(df)} entries after cleaning.")
df.head()

In [None]:
import time
import random

def truncate(text, max_chars=1500):
    """Return ``text`` cut down to at most ``max_chars`` leading characters.

    Text that is already within the limit is returned unchanged. Used to keep
    transcripts short enough to avoid token limit issues in the prompt.
    """
    if len(text) > max_chars:
        return text[:max_chars]
    return text

# validate topic using cohere.chat()
def _parse_yes_no(reply):
    """Map a free-text model reply onto "Yes", "No", or "Unclear".

    The reply is lower-cased and split into alphabetic words; the first word
    that is exactly "yes" or "no" decides the result. Matching whole words
    (rather than substrings) keeps replies such as "I do not know" or
    "Unknown" from being read as "No" merely because they contain "no".
    """
    words = "".join(ch if ch.isalpha() else " " for ch in reply.lower()).split()
    for word in words:
        if word == "yes":
            return "Yes"
        if word == "no":
            return "No"
    return "Unclear"


def validate_topic(title, transcript, topic, retries=3, delay=0.4):
    """Ask the Cohere chat model whether a video matches its expected topic.

    Args:
        title: Video title inserted into the prompt.
        transcript: Video transcript; shortened with ``truncate`` before it
            is placed in the prompt.
        topic: Expected topic label the video is checked against.
        retries: Maximum number of chat attempts before giving up.
        delay: Base pause in seconds after a successful call; a random
            0.1-0.3 s is added on top.

    Returns:
        "Yes" or "No" when the reply contains that word, "Unclear" when it
        contains neither, or "Error" when every attempt raised an exception.
    """
    prompt = f"""
You are helping validate whether YouTube videos are correctly labeled.

Video Title: "{title}"

Transcript:
\"\"\"
{truncate(transcript)}
\"\"\"

The expected topic of this video is: {topic}

Does the content of the video (based on title and transcript) actually appear to be about this topic?

Respond ONLY with "Yes" or "No".
""".strip()

    for attempt in range(retries):
        try:
            response = co.chat(
                model="command-r",
                message=prompt,
                temperature=0.2,
            )
            answer = response.text.strip().lower()
            # Pause after each successful call (presumably to stay under the
            # API rate limit — confirm against the account's quota).
            time.sleep(delay + random.uniform(0.1, 0.3))
            return _parse_yes_no(answer)

        except Exception as e:
            # Best-effort: report the failure and try again.
            print(f"Error on attempt {attempt + 1}: {e}")
            # Only wait when another attempt will actually follow.
            if attempt + 1 < retries:
                time.sleep(1.5)

    return "Error"

In [None]:
# Register tqdm's `progress_apply` on pandas objects, then label every row
# with the model's verdict ("Yes", "No", "Unclear", or "Error").
tqdm.pandas()
print("Validating topics using Cohere. This may take a few minutes depending on dataset size...")
df["matches_topic"] = df.progress_apply(
    lambda video: validate_topic(
        title=video["title"],
        transcript=video["transcript"],
        topic=video["topic"],
    ),
    axis=1,
)
print("Topic validation completed.")

In [None]:
# Destination for the rows that passed validation.
output_path = "../data/cleaned_youtube_dataset.csv"

# Keep only the rows whose verdict is exactly "Yes" and write them out
# without the index column.
df_cleaned = df.loc[df["matches_topic"].eq("Yes")]
df_cleaned.to_csv(output_path, index=False)

print(f"Saved cleaned dataset with {len(df_cleaned)} rows to {output_path}")
df_cleaned.head()

In [None]:
# Rows the model explicitly answered "No" for are saved as the rejected set.
df_rejected = df[df["matches_topic"] == "No"]
df_rejected.to_csv("../data/rejected_youtube_dataset.csv", index=False)

print(f"Saved {len(df_rejected)} rejected rows to rejected_youtube_dataset.csv")

# Rows labelled neither "Yes" nor "No" (validate_topic can also return
# "Unclear" or "Error") are written to no output file. Report how many there
# are so they do not disappear silently.
unresolved_counts = df.loc[
    ~df["matches_topic"].isin(["Yes", "No"]), "matches_topic"
].value_counts()
if not unresolved_counts.empty:
    print(
        f"Note: {int(unresolved_counts.sum())} rows were neither kept nor rejected "
        f"(label counts: {unresolved_counts.to_dict()})."
    )