In [None]:

import psycopg2
from llama_cpp import Llama
import os, ast
import datetime
from collections import defaultdict

In [3]:
# Snapshot of today's date, taken once when this cell runs; extract_info()
# interpolates it into the prompt so the model can resolve relative dates.
current_date = datetime.datetime.now().date()

In [None]:
#all helper functions can be found here
################################################################################

def chunk_transcript(transcript: str, model_max_tokens: int = 4096, reserved_tokens: int = 512) -> list:
    """Split a transcript into consecutive, non-overlapping token-bounded chunks.

    The transcript is tokenized with the module-level ``llm``; every chunk holds
    at most ``model_max_tokens - reserved_tokens`` tokens and is detokenized
    back into text.

    Args:
        transcript: Full transcript text.
        model_max_tokens: Token budget of the model context.
        reserved_tokens: Tokens held back from each chunk (e.g. for the prompt
            and the generated answer).

    Returns:
        List of chunk strings in transcript order; empty if the transcript
        produces no tokens.

    Raises:
        ValueError: If ``reserved_tokens`` leaves no room for transcript tokens.
    """
    chunk_size = model_max_tokens - reserved_tokens
    if chunk_size <= 0:
        # A zero step makes range() raise an opaque error and a negative step
        # silently yields no chunks at all, so reject both up front.
        raise ValueError(
            "reserved_tokens must be smaller than model_max_tokens "
            f"(got model_max_tokens={model_max_tokens}, reserved_tokens={reserved_tokens})"
        )

    tokens = llm.tokenize(transcript.encode("utf-8"))
    # errors="ignore": a chunk boundary can fall inside a multi-byte character;
    # the undecodable partial bytes are dropped instead of raising.
    return [
        llm.detokenize(tokens[start:start + chunk_size]).decode("utf-8", errors="ignore")
        for start in range(0, len(tokens), chunk_size)
    ]

################################################################################

def chunk_text(text, chunk_size=1000, overlap=200):
    """Split text into overlapping chunks by tokens.

    The text is tokenized with the module-level ``llm``. Each chunk contains up
    to ``chunk_size`` tokens and starts ``chunk_size - overlap`` tokens after
    the previous one, so neighbouring chunks share ``overlap`` tokens.

    Args:
        text: Text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings in order; empty if the text produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive (got {chunk_size})")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size gives a zero/negative stride (error or silently
        # empty result); a negative overlap would skip tokens between chunks.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size (got overlap={overlap}, chunk_size={chunk_size})"
        )

    tokens = llm.tokenize(text.encode('utf-8'))
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        window = tokens[start:start + chunk_size]
        chunks.append(llm.detokenize(window).decode('utf-8', errors="ignore"))
        if start + chunk_size >= len(tokens):
            # This window already reaches the last token; any further window
            # would only repeat a suffix of it.
            break
    return chunks




# GPU settings (for ROCm), applied through process environment variables.
os.environ.update({
    "HSA_OVERRIDE_GFX_VERSION": "10.3.0",
    "HIP_VISIBLE_DEVICES": "0",
})

# Top-level keys every extraction result is expected to carry, each mapped to
# an empty default. Used as the fallback / merge base when parsing model output.
REQUIRED_KEYS = dict(events={}, action_items={})
# Prompt to get the info from the transcript, needs tweaking
def extract_info(transcript: str) -> str:
    """Ask the LLM to extract events and actions from a transcript.

    Builds the extraction prompt (including the module-level ``current_date``),
    sends it to the module-level ``llm`` and returns the model's answer.

    Args:
        transcript: Transcript text to analyse.

    Returns:
        The raw completion text with surrounding whitespace stripped. It is
        expected to be a Python dict literal but is NOT parsed or validated
        here; pass it to ``dict_convert`` to obtain a dictionary.
    """
    # NOTE(review): the schema below asks for an "actions" key while
    # REQUIRED_KEYS uses "action_items", and the "events" entry does not wrap
    # the people list in its own brackets — confirm the intended schema before
    # changing the prompt text.
    prompt = f"""
You are an information extraction system.
Read the transcript below and return ONLY a valid Python dictionary in this exact schema:

{{
"events": {{"The event mentioned in the transcript": [list of people mentioned involving the event, [date the event is on in the format YYYY-MM-DD, as supported by SQL. If month and year not mentioned, use the current month and year. Do not include anything that is ambiguous about the DD],[list of locations where the event occurs]]}}   
"actions": {{"task/to-do": [[list of "persons entrusted], [date for the task to be completed, in the format YYYY-MM-DD, as supported by SQL. If month and year not mentioned, use the current month and year. if DD is not mentioned, put the last of the current month"],[list of locations where the action is to be performed]]"}}
}}

If no values exist, use [] or {{}} accordingly.
Do not add explanations or extra text.
Do not add ```python in the beginning.
Do not add any mentions about not knowing the current date, I have supplied it to you here: current date is {current_date}
Transcript: \"\"\"{transcript}\"\"\"

"""

    # Generation stops at the first blank line or after 512 new tokens.
    response = llm(prompt, max_tokens=512, stop=["\n\n"])
    return response["choices"][0]["text"].strip()





In [None]:
# Running Mistral-7B-Instruct (4-bit Q4_K_M quantization) via llama.cpp.
# TODO: also test gemma4b.
#
# The model file can be overridden with the LLAMA_MODEL_PATH environment
# variable; the default is the original local path.
MODEL_PATH = os.environ.get(
    "LLAMA_MODEL_PATH",
    "/home/vijay/llama.cpp/models/mistral/mistral-7b-instruct-v0.2.Q4_K_M.gguf",
)
N_CTX = 4096  # context window requested from llama.cpp

llm = Llama(
    model_path=MODEL_PATH,
    verbose=False,  # presumably True prints backend info while loading — confirm
    n_ctx=N_CTX,
    n_gpu_layers=-1,  # NOTE(review): -1 presumably offloads all layers to the GPU — confirm in llama-cpp-python docs
)


llama_context: n_ctx_per_seq (4096) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


In [6]:
# llm("Hello", max_tokens=1)  # optional warm-up call, currently disabled

# Read the complete transcript file as UTF-8 text.
with open("keyextract.txt", mode="r", encoding="utf-8") as transcript_file:
    transcript = transcript_file.read()

# Run the extraction prompt over the whole transcript.
output_dict = extract_info(transcript)

{'events': {'conference': [['Alice', 'Bob'], ['2025-09-12'], ['Bangalore']],
  'company retreat': [['Alice'], ['2025-09-25'], ['Goa']]},
 'action_items': {},
 'actions': {'book venue': [['Bob'], ['2025-09-XX'], ['Bangalore']],
  'prepare slides': [['Charlie'], ['2025-09-XX'], ['']],
  'send budget report': [['Bob'], ['2025-09-XX'], ['']],
  'assign client visit preparation': [['Charlie'], ['2025-10-10'], ['Delhi']],
  'follow up with marketing team about social media campaign': [['Bob'],
   ['2025-09-XX'],
   ['']]}}

In [7]:
def dict_convert(input_dict):
    """Turn the model's answer into a dictionary that has every required key.

    Args:
        input_dict: Either the raw model output (a string expected to hold a
            Python dict literal) or an already-parsed dictionary.

    Returns:
        A new dictionary containing every key of ``REQUIRED_KEYS`` (with empty
        defaults) overlaid with the parsed values. If the input cannot be
        parsed into a dict, only the empty defaults are returned.
    """
    def _defaults():
        # Fresh copies so callers can mutate the result without corrupting the
        # shared REQUIRED_KEYS template.
        return {key: dict(value) for key, value in REQUIRED_KEYS.items()}

    if isinstance(input_dict, dict):
        # Already parsed; literal_eval would reject it and the data would be
        # silently replaced by the empty fallback.
        parsed = input_dict
    else:
        try:
            # literal_eval only evaluates Python literals, never arbitrary code.
            parsed = ast.literal_eval(input_dict)
        except Exception:
            return _defaults()  # fallback on bad output
        if not isinstance(parsed, dict):
            return _defaults()

    # Ensure all required keys are present
    return {**_defaults(), **parsed}

# Parse the extraction output and fill in any missing top-level keys from REQUIRED_KEYS.
converted_dict = dict_convert(output_dict)

SyntaxError: invalid syntax (3916002200.py, line 1)

In [None]:
# Connect to the PostgreSQL server.
# The password is read from the PGPASSWORD environment variable when set.
# NOTE(review): the fallback below is still a hardcoded credential — remove it
# once the environment variable is configured everywhere this notebook runs.
try:
    conn = psycopg2.connect(
        database="userdata",
        user="postgres",
        host="localhost",
        password=os.environ.get("PGPASSWORD", '1312345'),
        port=5432,
    )
except psycopg2.Error:
    # Catch only database errors (not KeyboardInterrupt etc.) and re-raise:
    # without a connection `conn` stays undefined (or stale from an earlier
    # run) and every later cell would fail in a more confusing way.
    print('unable to connect')
    raise

In [None]:
# Update the database with the extracted events.
# ACTIONS NOT IMPLEMENTED because the LLM output for them is unreliable.
#
# `with conn` wraps the work in one transaction: it is committed when the block
# finishes and rolled back if any statement raises. All values are passed as
# query parameters, so names containing quotes cannot break or inject SQL.
# Generated ids come from INSERT ... RETURNING instead of a follow-up
# "latest row" SELECT.
#
# NOTE(review): a person/location/date that appears in several events is
# inserted again for each event (duplicate rows) — confirm whether these
# tables should be de-duplicated.
# NOTE(review): each location string is stored as both country and city.
with conn, conn.cursor() as curs:
    person_dict = {}                 # person name -> id of the most recently inserted row
    location_dict = {}               # location name -> id of the most recently inserted row
    date_dict = {}                   # date string -> id of the most recently inserted row
    event_dict = defaultdict(list)   # event name -> event ids (one row per location)

    for event, info_list in converted_dict['events'].items():
        # Each entry is expected to be [people, dates, locations].
        person_list = info_list[0]
        date_list = info_list[1]
        location_list = info_list[2]

        for person in person_list:
            curs.execute(
                "INSERT INTO persons (person_name) VALUES (%s) RETURNING person_id",
                (person,),
            )
            person_dict[person] = curs.fetchone()[0]

        for location in location_list:
            curs.execute(
                "INSERT INTO locations (country, city) VALUES (%s, %s) RETURNING location_id",
                (location, location),
            )
            location_dict[location] = curs.fetchone()[0]

            curs.execute(
                "INSERT INTO events (event_name, location_id) VALUES (%s, %s) RETURNING event_id",
                (event, location_dict[location]),
            )
            event_dict[event].append(curs.fetchone()[0])

        for date in date_list:
            curs.execute(
                "INSERT INTO dates (event_date) VALUES (%s) RETURNING date_id",
                (date,),
            )
            date_dict[date] = curs.fetchone()[0]

        # Link every person to every event row created above. The event id is
        # used here; the location id must not be written into event_id.
        for person in person_list:
            for event_id in event_dict[event]:
                curs.execute(
                    "INSERT INTO person_events (person_id, event_id) VALUES (%s, %s)",
                    (person_dict[person], event_id),
                )

        print(person_dict)
        print(location_dict)
        print(date_dict)
    

{'Alice': 73, 'Bob': 74}
{'Bangalore': 41}
{'2025-09-12': 17}
{'Alice': 75, 'Bob': 74}
{'Bangalore': 41, 'Goa': 42}
{'2025-09-12': 17, '2025-09-25': 18}


In [None]:
# Clear database.
# `with conn` commits the deletes when the block finishes and rolls them back
# on error; without a commit the rows would reappear after the session ends.
# The link table person_events is emptied first because its rows carry
# person_id / event_id values pointing at the other tables (presumably foreign
# keys — confirm against the schema).
# NOTE(review): the events table is not cleared here — confirm whether it should be.
with conn, conn.cursor() as curs:
    curs.execute("DELETE FROM person_events")
    curs.execute("DELETE FROM locations")
    curs.execute("DELETE FROM persons")
    curs.execute("DELETE FROM dates")

In [None]:
"""{{
  "persons": [list of people mentioned],
  "locations": [list of locations mentioned],
  "dates": {{"date in the format YYYY-MM-DD, as supported by SQL. If month and year not mentioned, use the current month and year. Do not include anything that is ambiguous about the DD": "context where date appears"}},
  "events": [list of events mentioned],
  "action_items": {{"task/to-do": "person it is entrusted to"}}
}}"""

"""
"events": {{"The event mentioned in the transcript": [[list of people mentioned involving the event], [date the event is on in the format YYYY-MM-DD, as supported by SQL. If month and year not mentioned, use the current month and year. Do not include anything that is ambiguous about the DD],[list of locations where the event occurs]]}}   
"actions": {{"task/to-do": [[list of "persons entrusted], [date for the task to be completed, in the format YYYY-MM-DD, as supported by SQL. If month and year not mentioned, use the current month and year. Do not include anything that is ambiguous about the DD"],[list of locations where the action is to be performed]]"}}
"""