In [None]:
import json
import sys

# Extend the module search path so the project package imported below can be found.
# NOTE(review): these are absolute, machine-specific directories; on any other
# machine the `src.backend` import will fail unless the same layout exists.
sys.path.append("/Users/personal/Desktop/scriptie")
sys.path.append("/Users/personal/Desktop/scriptie/src")
sys.path.append("/Users/personal/Desktop/scriptie/src/backend")
from src.backend.LlmClient import MlxLlama

In [None]:
# Module-level LLM client; the helper functions in this notebook read this
# global and call `llm.run(messages)` on it.
# NOTE(review): presumably constructing MlxLlama loads the model weights and is
# slow — confirm in src/backend/LlmClient.
llm = MlxLlama(model_name="mlx-community/Meta-Llama-3-8B-Instruct-8bit")

In [None]:
def get_transcript(path):
    """Return the full transcript text stored in a transcript JSON file.

    Args:
        path: Path to a JSON file whose top-level object has a ``"text"`` key.

    Returns:
        The value stored under ``"text"``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If the document has no ``"text"`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII characters load the same way on every machine.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return data["text"]


def get_transcript_times(path):
    """Return the text and start value of every segment in a transcript file.

    Args:
        path: Path to a JSON file whose top-level object has a ``"segments"``
            list; each segment must provide ``"text"`` and ``"start"``.

    Returns:
        A list of ``{"text": ..., "start": ...}`` dicts in file order. Any
        other keys present on a segment are dropped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
        KeyError: If ``"segments"``, ``"text"`` or ``"start"`` is missing.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII characters load the same way on every machine.
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [{"text": d["text"], "start": d["start"]} for d in data["segments"]]


def split_transcript(transcript, chunk_size=10000):
    """Cut a transcript into consecutive slices of at most ``chunk_size`` items.

    Slicing is purely positional (for a string: by character), so a slice
    boundary may fall in the middle of a word. The last slice holds whatever
    remains and can be shorter than ``chunk_size``.

    Args:
        transcript: Sliceable sequence, typically the transcript string.
        chunk_size: Maximum length of each slice.

    Returns:
        A list of slices in original order; empty when ``transcript`` is empty.
    """
    chunks = []
    for offset in range(0, len(transcript), chunk_size):
        chunks.append(transcript[offset : offset + chunk_size])
    return chunks


def get_points(text):
    """Ask the module-level ``llm`` for the main topics of ``text``.

    A fixed Dutch system prompt instructs the model to answer with at most
    five main topics separated by ';' and nothing else; ``text`` is sent as
    the user message.

    Args:
        text: The piece of transcript to summarise into topics.

    Returns:
        Whatever ``llm.run`` returns for the two-message conversation
        (callers in this notebook treat it as a ';'-separated string).
    """
    system_message = {
        "role": "system",
        "content": "Je bent een behulpzame assistent die van een gegeven stuk tekst de hoofd onderwerpen geeft, gescheiden met een ';'. Geef alleen de aller belangrijkste punten mee. Geef enkel en alleen de punten, gescheiden met ';' mee, geen extra informatie of tekst. Genereer maximaal 5 punten en vermijd vergelijkbare punten die je al eerder hebt genoemd.",
    }
    user_message = {"role": "user", "content": text}

    return llm.run([system_message, user_message])

In [None]:
def generate_agenda(points):
    """Convert raw topic strings into agenda entries.

    Each point has double newlines removed and surrounding whitespace
    stripped. Two kinds of fragments are skipped:

    * points whose stripped text starts with "Hier" (presumably an LLM
      preamble rather than a topic; only the prefix is checked);
    * points that are empty after cleaning, which happens when the
      ';'-separated reply has a trailing or doubled separator.

    Args:
        points: Iterable of raw topic strings.

    Returns:
        A list of ``{"agendaPoint": <cleaned text>}`` dicts in input order.
    """
    agenda = []
    for p in points:
        cleaned = p.replace("\n\n", "").strip()
        # Skip blank fragments so no empty agenda point is produced.
        if not cleaned or p.strip().startswith("Hier"):
            continue
        agenda.append({"agendaPoint": cleaned})
    return agenda


def clear_dups(agenda):
    """Remove duplicate agenda entries, keeping the first occurrence of each.

    Two entries are duplicates when they have the same key/value pairs,
    regardless of key order. Values must be hashable.

    Args:
        agenda: List of dicts (e.g. ``{"agendaPoint": ...}``).

    Returns:
        A new list of dicts without duplicates, in first-occurrence order.
        Each returned dict has its keys in sorted order.
    """
    # dict.fromkeys keeps insertion order, unlike a set, so the agenda stays
    # in its original sequence instead of being shuffled by hash order.
    unique_items = dict.fromkeys(tuple(sorted(d.items())) for d in agenda)

    return [dict(t) for t in unique_items]


def get_agendapoint_start(point, path):
    """Find the start value of the first segment where an agenda point begins.

    The transcript segments are loaded with ``get_transcript_times(path)`` and
    walked in order. For each segment the module-level ``llm`` is asked (in
    Dutch) whether the agenda point starts at that text, expecting 'ja' or
    'nee'. One LLM call is made per segment until the first 'ja'.

    Args:
        point: The agenda point text.
        path: Path to the transcript JSON file.

    Returns:
        The ``"start"`` value of the first segment the model answered 'ja'
        for, or ``-1`` when no segment matched.
    """
    system_prompt = "Je bent een behulpzame assistent die van een gegeven stuk tekst een agenda punt, aan geeft of dit agenda punt begint bij de tekst. Reageer alleen 'ja' als het overeenkomt, en 'nee' als dit niet zo is."

    for sentence in get_transcript_times(path):
        resp = llm.run(
            [
                {"role": "system", "content": system_prompt},
                {
                    "role": "user",
                    "content": f"agenda punt: {point}\n\nText: {sentence['text']}",
                },
            ]
        )
        # Progress/debug trace of every question and answer.
        print("tekt", sentence["text"], "punt", point, "antwoord:", resp)

        # Normalise the reply so variants such as "Ja", "ja." or "'ja'" count
        # as a yes; an exact case-sensitive match would silently miss them.
        answer = resp.strip().strip(".!,'\"").strip().lower()
        if answer == "ja":
            return sentence["start"]

    return -1


def get_agenda_times(agenda, filepath):
    """Attach a start time to every agenda point that can be located.

    Each entry's ``"agendaPoint"`` text is looked up with
    ``get_agendapoint_start``; entries for which that lookup returns ``-1``
    are left out of the result.

    Args:
        agenda: List of dicts with an ``"agendaPoint"`` key.
        filepath: Path to the transcript JSON file, passed through unchanged.

    Returns:
        A new list of ``{"agendaPoint": ..., "time": ...}`` dicts in input order.
    """
    timed = []

    for entry in agenda:
        label = entry["agendaPoint"]
        begin = get_agendapoint_start(label, filepath)
        if begin == -1:
            # Not found in the transcript: drop the point.
            continue
        timed.append({"agendaPoint": label, "time": begin})

    return timed

In [None]:
def run(path):
    """Build an agenda for one transcript file and print it.

    The transcript text is split into chunks, topics are requested for each
    chunk, the ';'-separated reply is turned into agenda entries, and all
    entries are collected in chunk order. The collected list is printed,
    followed by a line break. Nothing is returned.

    Args:
        path: Path to the transcript JSON file.
    """
    agenda = []

    for chunk in split_transcript(get_transcript(path)):
        raw_points = get_points(chunk).split(";")
        agenda.extend(generate_agenda(raw_points))

    print(agenda)
    print("\n")
    # Timestamp lookup is currently disabled:
    # print(get_agenda_times(agenda, path))


In [None]:
# Transcript JSON files to process, one per recording (2022 and 2023 folders).
# NOTE(review): absolute, machine-specific paths — this cell only runs on a
# machine with the same directory layout.
paths = [
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2022/transcripts/919518.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2022/transcripts/919603.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2022/transcripts/953476.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1068473.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1064226.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1065017.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1068441.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1068443.mp4.json",
    "/Users/personal/Desktop/scriptie/notebooks/data/hoekschewaard/2023/transcripts/1068448.mp4.json",
]

# Generate and print an agenda for every transcript, in list order.
for p in paths:
    run(p)

In [None]:
# Segmentation evaluation
#
# Hard-coded counts per transcript; the numeric suffix of each variable is the
# transcript file id (same ids and order as the entries in `paths`).

# False positives per transcript.
fp_919518 = 5
fp_919603 = 4
fp_953476 = 5
fp_1068473 = 7
fp_1064226 = 7
fp_1065017 = 5
fp_1068441 = 4
fp_1068443 = 4
# This id was previously assigned twice (3, then 4); only the last assignment
# ever took effect, so the dead first one is dropped.
fp_1068448 = 4
false_positives = [
    fp_919518,
    fp_919603,
    fp_953476,
    fp_1068473,
    fp_1064226,
    fp_1065017,
    fp_1068441,
    fp_1068443,
    fp_1068448,
]

# Missed agenda points per transcript.
miss_919518 = 3
miss_919603 = 1
miss_953476 = 3
miss_1068473 = 0
miss_1064226 = 1
miss_1065017 = 3
miss_1068441 = 1
miss_1068443 = 0
# This id was previously assigned twice (2, then 1); only the last assignment
# ever took effect, so the dead first one is dropped.
miss_1068448 = 1
missed = [
    miss_919518,
    miss_919603,
    miss_953476,
    miss_1068473,
    miss_1064226,
    miss_1065017,
    miss_1068441,
    miss_1068443,
    miss_1068448,
]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Evaluation counts, one entry per transcript.
false_positives = [5, 4, 5, 7, 7, 5, 4, 4, 4]
missed = [3, 1, 3, 0, 1, 3, 1, 0, 1]

# Bar heights: mean count per category.
avg_false_positives = np.mean(false_positives)
avg_missed = np.mean(missed)

# Error bars: standard deviation per category (np.std default, ddof=0).
std_false_positives = np.std(false_positives)
std_missed = np.std(missed)

# One bar per category, in this order.
categories = ["False Positives", "Missed"]
averages = [avg_false_positives, avg_missed]
std_devs = [std_false_positives, std_missed]

bar_width = 0.4

# Draw everything on the explicit Axes object of a single 8x5 inch figure.
fig, ax = plt.subplots(figsize=(8, 5))

bars = ax.bar(
    categories,
    averages,
    yerr=std_devs,
    color=["b", "r"],
    width=bar_width,
    capsize=10,
    edgecolor="grey",
)

ax.set_ylabel("Number")
ax.set_title("Topic segmentation evaluation results")

# Annotate each bar with its rounded mean, slightly above the bar top and
# shifted left of the bar centre.
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2 - 0.1, yval + 0.1, round(yval, 2))

plt.show()