In [None]:
# Imports and one-time initialization (API client, embedding model, tokenizer)

import json
import os
import re
import textwrap
import uuid
from datetime import date

import tiktoken
import torch
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer

# Read the Hugging Face access token from the environment so that no secret is
# stored in the notebook (and therefore in version control or shared copies).
# NOTE(review): a token was previously committed in this cell; treat it as
# leaked and revoke it in the Hugging Face account settings.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "before running this notebook."
    )

# OpenAI-compatible client pointed at the Hugging Face inference router.
client = OpenAI(
    base_url="https://router.huggingface.co/v1",
    api_key=HF_TOKEN,
)

# Sentence-embedding model used to score dialogue turns against summaries.
model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

# GPT-2 tokenizer, used only by chunk_text to split text into token windows.
# NOTE(review): the very large model_max_length presumably silences the
# "sequence longer than max length" warning on long transcripts - confirm.
tokenizer = AutoTokenizer.from_pretrained("gpt2", model_max_length=1000000)


In [None]:
# Helper functions used by the pipeline (chunking, entity extraction, parsing, summarization, scoring)
################################################################################

def chunk_transcript(transcript: str, model_max_tokens: int = 4096, reserved_tokens: int = 512):
    """Split a transcript into consecutive, non-overlapping token windows.

    The text is tokenized with tiktoken's ``cl100k_base`` encoding and cut into
    windows of ``model_max_tokens - reserved_tokens`` tokens; each window is
    decoded back to text.

    Args:
        transcript: Full transcript text.
        model_max_tokens: Token budget of the target model's context window.
        reserved_tokens: Tokens to keep free for the prompt and the reply.

    Returns:
        A list of text chunks in transcript order (empty for empty input).

    Raises:
        ValueError: If ``reserved_tokens`` leaves no room for transcript tokens.
    """
    chunk_size = model_max_tokens - reserved_tokens
    # A negative step makes range() yield nothing, which used to drop the whole
    # transcript silently; a zero step raised an unhelpful range() error.
    if chunk_size <= 0:
        raise ValueError(
            f"reserved_tokens ({reserved_tokens}) must be smaller than "
            f"model_max_tokens ({model_max_tokens})"
        )

    # NOTE(review): cl100k_base is an OpenAI encoding, so counts are only an
    # approximation of what the downstream model's own tokenizer will see.
    enc = tiktoken.get_encoding("cl100k_base")
    tokens = enc.encode(transcript)
    return [
        enc.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

################################################################################

def extract_entities(transcript: str):
    """Ask the LLM to extract entities from every chunk of a transcript.

    The transcript is split with ``chunk_transcript`` (default token budget)
    and each chunk is sent, embedded in a fixed instruction prompt, to the
    Mistral chat model through the module-level ``client``.

    Args:
        transcript: Full transcript text.

    Returns:
        A list with the raw message content of the model's reply for each
        chunk, in chunk order.
    """

    def build_prompt(segment: str) -> str:
        # The prompt body is flush-left on purpose: it is sent verbatim.
        # NOTE(review): the last instruction says "format shown below" although
        # the format is listed above it - confirm the intended wording.
        return f"""Extract entities and information from the following transcript. Leave lists/dictionaries empty if the conversation doesnt have them.

Output format must strictly follow:

persons: [list of people mentioned]
locations: [list of locations mentioned]
dates: {{"date": "context where date appears"}}
events: [list of events mentioned]
action_items: {{"task/to-do": person it is entrusted to}}

For action_items, include all tasks implied or explicitly mentioned that require effort, planning, or follow-up. For action_items keep person as "unknown", if it is unclear whom it is assigned to and keep the tasks slightly detailed (4-5 words)
By dates, i mean all sorts of mentions in the transcript - normalized ones (eg: dd-mm-yy), explicit ones (eg: 10th of august, the 8th (here you will have to guess the month by context)), or relative ones(eg: next friday)
If a date is mentioned without month/year (e.g., ‘the 12th’), still include it in the output with the exact wording used.
If relative dates are used (e.g., ‘next Tuesday’), include them as they are.
Remember the following at all costs:
Output only in the format shown below. Do not include explanations, transcripts, or extra text. Do not include comments.
{segment}"""

    raw_outputs = []
    for segment in chunk_transcript(transcript):
        response = client.chat.completions.create(
            model="mistralai/Mistral-7B-Instruct-v0.2:featherless-ai",
            messages=[{"role": "user", "content": build_prompt(segment)}],
            temperature=0.1,
        )
        raw_outputs.append(response.choices[0].message.content)
    return raw_outputs

################################################################################

def normalize_output(text: str) -> str:
    """Collapse all whitespace runs (including newlines) into single spaces."""
    return " ".join(text.split())

def parse_llm_output(text: str) -> dict:
    """Parse the labelled fields of one entity-extraction reply into a dict.

    Looks for ``persons``, ``locations`` and ``events`` followed by a
    ``[...]`` list, and ``dates`` and ``action_items`` followed by a ``{...}``
    object, and decodes each captured span as JSON.

    A field that is absent, or whose captured span is not valid JSON (for
    example unquoted names), is left out of the result instead of raising, so
    one malformed field does not abort the whole pipeline.

    Args:
        text: Raw reply text from the model.

    Returns:
        A dict containing only the fields that were found and decoded.
    """
    text = normalize_output(text)

    # Non-greedy captures stop at the first closing bracket/brace, so nested
    # structures are truncated and then rejected by the JSON decoder below.
    field_patterns = {
        "persons": r"persons:\s*(\[.*?\])",
        "locations": r"locations:\s*(\[.*?\])",
        "dates": r"dates:\s*(\{.*?\})",
        "events": r"events:\s*(\[.*?\])",
        "action_items": r"action_items:\s*(\{.*?\})",
    }

    data = {}
    for field, pattern in field_patterns.items():
        match = re.search(pattern, text)
        if match is None:
            continue
        try:
            data[field] = json.loads(match.group(1))
        except json.JSONDecodeError:
            # The model did not emit valid JSON for this field; skip it.
            continue

    return data

def merge_results(parsed_list):
    """Combine per-chunk extraction dicts into one result.

    Lists (``persons``, ``locations``, ``events``) are concatenated and
    de-duplicated keeping first-seen order, so the output is the same on every
    run. Dicts (``dates``, ``action_items``) are merged with later chunks
    overriding earlier ones on duplicate keys.

    Args:
        parsed_list: Iterable of dicts as returned by ``parse_llm_output``;
            missing keys are treated as empty.

    Returns:
        A dict with the keys ``persons``, ``locations``, ``dates``, ``events``
        and ``action_items``.
    """
    merged = {"persons": [], "locations": [], "dates": {}, "events": [], "action_items": {}}
    list_fields = ("persons", "locations", "events")
    dict_fields = ("dates", "action_items")

    for parsed in parsed_list:
        for field in list_fields:
            merged[field].extend(parsed.get(field, []))
        for field in dict_fields:
            # Later chunks win when the same key appears more than once.
            merged[field].update(parsed.get(field, {}))

    # dict.fromkeys de-duplicates while preserving insertion order, unlike
    # set(), whose iteration order for strings varies between interpreter runs.
    for field in list_fields:
        merged[field] = list(dict.fromkeys(merged[field]))

    return merged

################################################################################

def parse_conversation(text, min_words=5):
    """Turn a "Speaker: text" transcript into a list of (speaker, dialogue) turns.

    Any line containing a colon starts a new turn: the part before the first
    colon is the speaker, the rest is the start of the dialogue. Lines without
    a colon are appended to the current turn (or ignored if no turn is open).
    A turn is kept only when it has a non-empty speaker, non-empty text and at
    least ``min_words`` whitespace-separated words.

    Args:
        text: Transcript text.
        min_words: Minimum word count for a turn to be kept.

    Returns:
        List of ``(speaker, dialogue)`` tuples in transcript order.
    """
    turns = []

    def store(name, pieces):
        # Record the finished turn if it has a speaker, text and enough words.
        if not name:
            return
        utterance = " ".join(pieces).strip()
        if utterance and len(utterance.split()) >= min_words:
            turns.append((name.strip(), utterance))

    speaker, buffered = None, []
    for raw_line in text.strip().splitlines():
        head, colon, tail = raw_line.partition(":")
        if colon:
            # A colon marks a new speaker line: close the previous turn first.
            store(speaker, buffered)
            speaker, buffered = head.strip(), [tail.strip()]
        elif speaker:
            # Continuation of the current speaker's dialogue.
            buffered.append(raw_line.strip())

    # Close the final turn.
    store(speaker, buffered)
    return turns

################################################################################

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks by tokens.

    The text is encoded with the module-level ``tokenizer``; windows of
    ``chunk_size`` tokens are taken every ``chunk_size - overlap`` tokens and
    decoded back to text.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk (must be positive).
        overlap: Tokens shared between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of text chunks in order (empty for empty input).

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would give a non-advancing
            window or leave gaps between chunks.
    """
    # overlap >= chunk_size gives a zero/negative step (error or no chunks at
    # all); a negative overlap would skip tokens between chunks.
    if chunk_size <= 0 or overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"need chunk_size > 0 and 0 <= overlap < chunk_size "
            f"(got chunk_size={chunk_size}, overlap={overlap})"
        )

    step = chunk_size - overlap
    tokens = tokenizer.encode(text)
    chunks = []
    for start in range(0, len(tokens), step):
        chunks.append(tokenizer.decode(tokens[start:start + chunk_size]))
        # Stop once a window reaches the end of the text; a further window
        # would contain only tokens already covered by this one.
        if start + chunk_size >= len(tokens):
            break
    return chunks

def summarize_chunk(chunk):
    """Summarize one chunk using Mistral.

    Sends the chunk to the chat model through the module-level ``client`` with
    a summarization instruction, capped at 200 completion tokens.

    Args:
        chunk: Text of one conversation chunk.

    Returns:
        The message content of the model's first choice.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that summarizes conversations.",
    }
    user_message = {
        "role": "user",
        "content": f"Summarize this conversation without omitting any important names, dates mentioned:\n\n{chunk}",
    }

    response = client.chat.completions.create(
        model="mistralai/Mistral-7B-Instruct-v0.2:featherless-ai",
        messages=[system_message, user_message],
        max_tokens=200,
        temperature=0.1,
    )
    return response.choices[0].message.content

def full_summary(chunks):
    """Summarize every chunk, one model call per chunk.

    Args:
        chunks: Iterable of text chunks.

    Returns:
        List of summaries in the same order as ``chunks``.
    """
    return [summarize_chunk(piece) for piece in chunks]

################################################################################

def important_dialogues(conversation_text, summaries, threshold=0.5):
    """Select dialogue turns that are semantically close to any summary.

    Summaries and dialogue turns are embedded with the module-level ``model``
    (normalized embeddings). A turn is kept when its highest cosine similarity
    against the summaries is at least ``threshold``.

    Args:
        conversation_text: Raw transcript, parsed with ``parse_conversation``.
        summaries: List of summary strings.
        threshold: Minimum best-match cosine similarity for a turn to be kept.

    Returns:
        A pair ``(important, summs)`` where ``important`` is a list of
        ``(speaker, dialogue, embedding)`` tuples and ``summs`` is a list of
        ``(summary, embedding)`` tuples.
    """
    summary_embs = model.encode(summaries, convert_to_tensor=True, normalize_embeddings=True)
    turns = parse_conversation(conversation_text)

    selected = []
    for speaker, dialogue in turns:
        turn_emb = model.encode(dialogue, convert_to_tensor=True, normalize_embeddings=True)
        # Best match of this turn against all summaries.
        best_score = torch.max(util.cos_sim(turn_emb, summary_embs)).item()
        if best_score >= threshold:
            selected.append((speaker, dialogue, turn_emb))

    # Pair every summary with its own embedding row.
    paired_summaries = [(summary, summary_embs[idx]) for idx, summary in enumerate(summaries)]
    return selected, paired_summaries

################################################################################




In [None]:
def generate_json(transcript):
    """Run the full pipeline on a transcript and assemble the result dict.

    Steps: summarize overlapping chunks, pick the dialogue turns that match
    the summaries, extract and merge entity metadata, then key every summary
    and important turn by ``<today>_s_<id>`` / ``<today>_d_<id>`` where
    ``<id>`` is the first 8 hex characters of a random UUID.

    Args:
        transcript: Full transcript text.

    Returns:
        A dict with the keys ``metadata``, ``summary`` and
        ``important_dialogues``. The ``embedding`` values are passed through
        exactly as returned by ``important_dialogues``.
        NOTE(review): those are requested as tensors, so the result presumably
        needs conversion before it can be dumped with ``json`` - confirm.
    """
    summaries = full_summary(chunk_text(transcript))
    important, summs = important_dialogues(transcript, summaries)

    raw_outputs = extract_entities(transcript)
    metadata = merge_results([parse_llm_output(raw) for raw in raw_outputs])

    today = str(date.today())

    summary_dict = {
        f"{today}_s_{uuid.uuid4().hex[:8]}": {
            "text": text,
            "embedding": embedding,
        }
        for text, embedding in summs
    }

    dialogue_dict = {
        f"{today}_d_{uuid.uuid4().hex[:8]}": {
            "speaker": speaker,
            "text": dialogue,
            "embedding": embedding,
        }
        for speaker, dialogue, embedding in important
    }

    return {
        "metadata": metadata,
        "summary": summary_dict,
        "important_dialogues": dialogue_dict,
    }

In [32]:
# Load the sample transcript from disk and run the full pipeline on it.
with open("test2.txt", "r", encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

final = generate_json(transcript)


In [34]:
# Number of entries under "important_dialogues", i.e. dialogue turns kept by the pipeline.
print(len(final["important_dialogues"]))

5
