In [3]:
!pip install sentence-transformers youtube-transcript-api numpy pandas scikit-learn plotly




In [None]:
!pip install kaleido

In [2]:
import numpy as np
from sentence_transformers import SentenceTransformer
from youtube_transcript_api import YouTubeTranscriptApi
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "iframe"


In [3]:
from urllib.parse import urlparse, parse_qs

def get_youtube_transcript(url):
    parsed = urlparse(url)
    vid = parse_qs(parsed.query).get("v", [None])[0]

    if not vid:   # handles youtu.be format
        vid = parsed.path.split("/")[-1]

    transcript = YouTubeTranscriptApi.get_transcript(vid)

    segments = [
        {
            "start": t["start"],
            "end": t["start"] + t["duration"],
            "text": t["text"]
        }
        for t in transcript
    ]

    full_text = " ".join([t["text"] for t in transcript])
    return full_text, segments


In [4]:
def chunk_text(text, chunk_size=10):
    words = text.split()
    segments = []
    for i in range(0, len(words), chunk_size):
        chunk = " ".join(words[i:i+chunk_size])
        start = i * 0.5    # fake timestamp
        end = (i+chunk_size) * 0.5
        segments.append({
            "start": start,
            "end": end,
            "text": chunk
        })
    return segments


In [5]:
embedder = SentenceTransformer("all-MiniLM-L6-v2")


In [6]:
def get_similarity_scores(title, description, segments):
    anchor = title + " " + (description if description else "")

    texts = [anchor] + [s["text"] for s in segments]
    embeddings = embedder.encode(texts)

    anchor_emb = embeddings[0]
    seg_embs = embeddings[1:]

    sims = cosine_similarity([anchor_emb], seg_embs)[0]
    sims = (sims + 1) / 2  # normalize to 0–1

    return sims


In [7]:
def compute_overall_score(segments, sims):
    weighted = 0
    total = 0

    for seg, sim in zip(segments, sims):
        dur = seg["end"] - seg["start"]
        weighted += sim * dur
        total += dur

    return round(100 * (weighted / total), 2)


In [15]:
sample_text = """
Welcome to today's session on AI in Education...
This video explains machine learning, adaptive learning...
Now please visit our sponsor at deals.com for discounts...
Then we continue our topic...
"""


In [21]:
def evaluate_video(title, description, transcript=None, url=None):
    # Load transcript
    if url:
        transcript, segments = get_youtube_transcript(url)
    else:
        segments = chunk_text(transcript)

    # Similarity scores
    sims = get_similarity_scores(title, description, segments)
    score = compute_overall_score(segments, sims)

    # DataFrame
    df = pd.DataFrame({
        "start": [s["start"] for s in segments],
        "end": [s["end"] for s in segments],
        "midpoint": [(s["start"] + s["end"]) / 2 for s in segments],
        "similarity": sims
    })

    # ---- PASTE SAFE FIGURE CODE HERE ----
    try:
        fig = px.scatter(
            df,
            x="midpoint",
            y="similarity",
            title="Relevance by Timestamp",
            labels={"midpoint": "Time (sec)", "similarity": "Relevance (0–1)"}
        )
    except Exception as e:
        print("Plotly Error:", e)
        fig = px.scatter(
            x=df["midpoint"],
            y=df["similarity"],
            title="Fallback Plot"
        )
    # -------------------------------------

    return float(score), fig, df, transcript


In [22]:
score, fig, df, transcript = evaluate_video(
    title="AI in Education",
    description="How AI helps teachers",
    transcript=sample_text   # or url="https://www.youtube.com/..."
)

# 3) print numeric score and dataframe
print("Score:", score)
print(df)
print("df.shape:", df.shape)

Score: 65.44999694824219
   start   end  midpoint  similarity
0    0.0   5.0       2.5    0.788485
1    5.0  10.0       7.5    0.660154
2   10.0  15.0      12.5    0.514894
df.shape: (3, 4)


In [23]:
score, fig, df, transcript = evaluate_video(
    title="AI in Education",
    description="Test description",
    transcript=sample_text
)

fig.show()
df

Unnamed: 0,start,end,midpoint,similarity
0,0.0,5.0,2.5,0.816186
1,5.0,10.0,7.5,0.687822
2,10.0,15.0,12.5,0.498498


In [24]:
print(df)
print(df.shape)

   start   end  midpoint  similarity
0    0.0   5.0       2.5    0.816186
1    5.0  10.0       7.5    0.687822
2   10.0  15.0      12.5    0.498498
(3, 4)


In [20]:
words = transcript.split()
print("Word count:", len(words))

Word count: 29
