In [1]:
import os
import psycopg2
import pandas as pd
import json, ast

from pypdf import PdfReader

from importlib import reload

from dotenv import load_dotenv

from pgvector.psycopg2 import register_vector

from utils import api, db, prompt, prompting

# Runs before the connection is opened, because the connection string below is
# read from the environment (presumably populated from a local .env file by
# python-dotenv -- TODO confirm the file location).
load_dotenv()

# Notebook-wide database handles: `conn` and `cursor` are module-level globals
# that the cells below use directly (e.g. for inserts and commits).
# The connection string comes from the POSTGRES_CONFIG environment variable;
# os.getenv returns None when the variable is not set.
conn = psycopg2.connect(os.getenv("POSTGRES_CONFIG"))
# Register pgvector's psycopg2 integration on this connection.
register_vector(conn)
cursor = conn.cursor()

# Extract Fact

In [None]:
def extract_summary_fact(p_content, user_language):
    """Summarise one page and split it into fact chunks via the GPT-3.5 helper.

    Builds a single user message from
    ``prompting.summary_prompt(p_content, user_language)``, sends it through
    ``api.get_completions_gpt35`` (second argument ``0``; presumably the
    sampling temperature -- TODO confirm) and parses the reply text.

    The reply is parsed as JSON first, so ``true``/``false``/``null`` and any
    text that merely contains the word "null" are handled correctly. Replies
    written as Python literals (e.g. single-quoted keys) fall back to
    ``ast.literal_eval``; the historical ``null`` -> ``None`` textual
    substitution is kept only as a last resort.

    Args:
        p_content: Text of the page to process.
        user_language: Language name forwarded to the prompt builder.

    Returns:
        tuple: ``(fact_list, current_summary)`` where ``fact_list`` holds the
        ``'chunk'`` value of every item under ``'chunks'`` and
        ``current_summary`` is the reply's ``'summary'`` value.

    Raises:
        ValueError, SyntaxError: If the reply cannot be parsed by any strategy.
        KeyError: If the parsed reply lacks ``'chunks'``, ``'chunk'`` or
            ``'summary'``.
    """
    messages = [
        {
            "role": "user",
            "content": prompting.summary_prompt(p_content, user_language),
        },
    ]
    completion = api.get_completions_gpt35(messages, 0)
    raw_reply = completion.choices[0].message.content

    try:
        # Preferred path: the prompt asks for JSON, so parse it as JSON.
        res_json = json.loads(raw_reply)
    except (TypeError, ValueError):
        try:
            # The model sometimes answers with a Python-style literal instead.
            res_json = ast.literal_eval(raw_reply)
        except Exception:
            # Legacy last resort for literal-style replies that contain JSON
            # nulls. `Exception` (not a bare except) so Ctrl-C still works.
            res_json = ast.literal_eval(raw_reply.replace("null", "None"))

    fact_list = [item['chunk'] for item in res_json['chunks']]
    current_summary = res_json['summary']
    return fact_list, current_summary

In [None]:
def extract_summary_fact_rec(page, user_language):
    """Summarise a page and split it into fact chunks using the ``_rec`` prompt.

    Same flow as the plain summary extraction, but the user message is built
    with ``prompting.summary_prompt_rec(page, user_language)``. The request is
    sent through ``api.get_completions_gpt35`` with second argument ``0``
    (presumably the sampling temperature -- TODO confirm).

    The reply is parsed as JSON first, so ``true``/``false``/``null`` and any
    text that merely contains the word "null" are handled correctly. Replies
    written as Python literals fall back to ``ast.literal_eval``; the
    historical ``null`` -> ``None`` textual substitution is kept only as a
    last resort.

    Args:
        page: Page content forwarded to the prompt builder.
        user_language: Language name forwarded to the prompt builder.

    Returns:
        tuple: ``(fact_list, current_summary)`` where ``fact_list`` holds the
        ``'chunk'`` value of every item under ``'chunks'`` and
        ``current_summary`` is the reply's ``'summary'`` value.

    Raises:
        ValueError, SyntaxError: If the reply cannot be parsed by any strategy.
        KeyError: If the parsed reply lacks ``'chunks'``, ``'chunk'`` or
            ``'summary'``.
    """
    messages = [
        {
            "role": "user",
            "content": prompting.summary_prompt_rec(page, user_language),
        },
    ]
    completion = api.get_completions_gpt35(messages, 0)
    raw_reply = completion.choices[0].message.content

    try:
        # Preferred path: the prompt asks for JSON, so parse it as JSON.
        res_json = json.loads(raw_reply)
    except (TypeError, ValueError):
        try:
            # The model sometimes answers with a Python-style literal instead.
            res_json = ast.literal_eval(raw_reply)
        except Exception:
            # Legacy last resort for literal-style replies that contain JSON
            # nulls. `Exception` (not a bare except) so Ctrl-C still works.
            res_json = ast.literal_eval(raw_reply.replace("null", "None"))

    fact_list = [item['chunk'] for item in res_json['chunks']]
    current_summary = res_json['summary']
    return fact_list, current_summary

In [None]:
def find_split_point(point, chunk):
    """Locate the first space in ``chunk`` at or after index ``point``.

    Args:
        point: Index where the scan starts (inclusive).
        chunk: Text to scan.

    Returns:
        The index of the first ``' '`` character found, or ``None`` when no
        space exists between ``point`` and the end of ``chunk``. Callers must
        handle the ``None`` case before doing index arithmetic.
    """
    space_positions = (idx for idx in range(point, len(chunk)) if chunk[idx] == ' ')
    return next(space_positions, None)

In [None]:
def split_half(chunk):
    """Split ``chunk`` into two pieces near its midpoint.

    The cut is made at the first space at or after the midpoint and that
    space is dropped. When there is no such space (``find_split_point``
    returns ``None``, e.g. unspaced text or one very long trailing token) the
    text is cut exactly at the midpoint and no character is dropped, instead
    of failing with ``TypeError`` on ``None + 1``.

    Args:
        chunk: Text to split.

    Returns:
        list: Two strings, ``[first_part, second_part]``.
    """
    midpoint = len(chunk) // 2

    split_point = find_split_point(midpoint, chunk)
    if split_point is None:
        # No word boundary in the second half: hard cut keeps the two pieces
        # balanced and loses no text.
        return [chunk[:midpoint], chunk[midpoint:]]
    return [chunk[:split_point], chunk[split_point + 1:]]

In [None]:
def split_third(chunk):
    """Split ``chunk`` into three pieces near its one-third and two-thirds marks.

    Each cut is made at the first space at or after its nominal boundary and
    that space is dropped. Two degenerate situations are handled instead of
    crashing or emitting an empty piece:

    * no space at or after a boundary (``find_split_point`` returns ``None``):
      the text is cut exactly at that boundary and no character is dropped;
    * both scans land on the same space (no space between the one-third and
      two-thirds marks): the first cut falls back to a hard cut at the
      one-third mark so the middle piece is not empty.

    Args:
        chunk: Text to split.

    Returns:
        list: Three strings, ``[first_part, middle_part, last_part]``.
    """
    one_third = len(chunk) // 3
    two_thirds = one_third * 2

    first_split = find_split_point(one_third, chunk)
    second_split = find_split_point(two_thirds, chunk)

    # Each cut is described as (end of left piece, start of right piece):
    # a cut on a space skips the space, a hard cut keeps every character.
    if second_split is None:
        second_end, second_start = two_thirds, two_thirds
    else:
        second_end, second_start = second_split, second_split + 1

    if first_split is None or first_split >= second_end:
        first_end, first_start = one_third, one_third
    else:
        first_end, first_start = first_split, first_split + 1

    return [chunk[:first_end], chunk[first_start:second_end], chunk[second_start:]]

In [None]:
def split_chunk(chunk, n):
    """Split ``chunk`` into two or three pieces.

    Args:
        chunk: Text to split.
        n: Requested number of pieces. Exactly ``2`` selects the two-way
            split; every other value selects the three-way split.

    Returns:
        list: The pieces produced by ``split_half`` or ``split_third``.
    """
    splitter = split_half if n == 2 else split_third
    return splitter(chunk)

In [None]:
def extract_summary_fact_rec1(page_content, chunk, user_language):
    """Ask the GPT-3.5 helper for the context of one chunk within its page.

    Builds a single user message from
    ``prompting.summary_prompt_rec1(page_content, chunk, user_language)``,
    sends it through ``api.get_completions_gpt35`` (second argument ``0``;
    presumably the sampling temperature -- TODO confirm) and parses the reply.

    The reply is parsed as JSON first, so ``true``/``false``/``null`` and any
    text that merely contains the word "null" are handled correctly. Replies
    written as Python literals fall back to ``ast.literal_eval``; the
    historical ``null`` -> ``None`` textual substitution is kept only as a
    last resort.

    Args:
        page_content: Text of the page the chunk was taken from.
        chunk: The chunk whose context is requested.
        user_language: Language name forwarded to the prompt builder.

    Returns:
        The ``'context'`` value of the parsed reply.

    Raises:
        ValueError, SyntaxError: If the reply cannot be parsed by any strategy.
        KeyError: If the parsed reply has no ``'context'`` key.
    """
    messages = [
        {
            "role": "user",
            "content": prompting.summary_prompt_rec1(page_content, chunk, user_language),
        },
    ]
    completion = api.get_completions_gpt35(messages, 0)
    raw_reply = completion.choices[0].message.content

    try:
        # Preferred path: the prompt asks for JSON, so parse it as JSON.
        res_json = json.loads(raw_reply)
    except (TypeError, ValueError):
        try:
            # The model sometimes answers with a Python-style literal instead.
            res_json = ast.literal_eval(raw_reply)
        except Exception:
            # Legacy last resort for literal-style replies that contain JSON
            # nulls. `Exception` (not a bare except) so Ctrl-C still works.
            res_json = ast.literal_eval(raw_reply.replace("null", "None"))
    return res_json["context"]

In [None]:
def execute_summary_prompt(pdf_path, resource_id, user_language):
    """Extract facts from every page of a PDF and store them in the ``fact`` table.

    For each page the text is extracted and passed to
    ``extract_summary_fact``. Each returned fact is then stored together with
    its embedding (``api.get_embeddings_ada``):

    * more than 600 tokens: split into three pieces with ``split_chunk``;
    * more than 300 tokens: split into two pieces with ``split_chunk``;
    * otherwise: stored unchanged.

    Split pieces share one context obtained from
    ``extract_summary_fact_rec1`` for the unsplit fact; unsplit facts use the
    page summary as their context. Inserts go through the notebook-level
    ``cursor`` and are committed on ``conn`` once per page. Pages with no
    extractable text are skipped (their page number is still consumed).

    Args:
        pdf_path: Path passed to ``PdfReader``.
        resource_id: Value stored in the ``resource_id`` column of every row.
        user_language: Language name forwarded to the prompt helpers.

    Raises:
        Exception: Any error raised while processing is re-raised after the
            uncommitted work of the current page has been rolled back, so the
            connection is not left in an aborted transaction. Pages committed
            before the failure stay committed.
    """
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    reader = PdfReader(pdf_path)
    try:
        # Page numbers stored in the `number` column are 1-based.
        for page_num, page in enumerate(reader.pages, start=1):
            p_content = page.extract_text()
            if not p_content or not p_content.strip():
                # Nothing to summarise (e.g. an image-only page).
                continue

            fact_list, cur_summary = extract_summary_fact(p_content, user_language)
            for f1 in fact_list:
                token_count = prompt.count_tokens_tiktoken(f1)
                if token_count > 600:
                    pieces = split_chunk(f1, 3)
                elif token_count > 300:
                    pieces = split_chunk(f1, 2)
                else:
                    pieces = None

                if pieces is None:
                    # Small enough to store as-is; the page summary doubles
                    # as the context.
                    context, facts = cur_summary, [f1]
                else:
                    # One context request per oversized fact, shared by all
                    # of its pieces.
                    context = extract_summary_fact_rec1(p_content, f1, user_language)
                    facts = pieces

                for fact in facts:
                    embed_result = api.get_embeddings_ada(fact)
                    cursor.execute(
                        insert_query,
                        (context, fact, resource_id, embed_result, cur_summary, page_num),
                    )
            conn.commit()
    except Exception:
        conn.rollback()
        raise

In [None]:
# Ingestion driver: fill in the PDF path and the resource id before running.
pdf_path = ""
resource_id = ""
user_language = "indonesian"
# Calls the ingestion function defined in this notebook. The previous name,
# `execute_fragment_prompt`, is not defined or imported anywhere here and
# raised NameError on a fresh kernel.
execute_summary_prompt(pdf_path, resource_id, user_language)

# Get Answer (Chat Completion)

In [None]:
# Inputs for one question-answering round; replace the placeholders before running.
question = "question"
resource_ids = []
# NOTE(review): the ingestion cell uses "indonesian" while this cell uses
# "indonesia" -- confirm which spelling the prompt helpers expect.
user_lang = "indonesia"

# Fetch stored knowledge for the question through the shared notebook cursor.
# The trailing 25 is presumably the maximum number of rows to retrieve -- TODO confirm.
retrieved = db.get_retrieved_knowledge(cursor, question, resource_ids, 25)
# Keep only the second field of every returned row (presumably the fact text;
# verify against db.get_retrieved_knowledge). Note this rebinds `retrieved`.
retrieved = [k[1] for k in retrieved]

# Build the chat prompt from the question and the retrieved entries.
chat_prompt = prompt.get_chat_prompt(question, retrieved, memory_max_tokens=500, lang=user_lang)
# Derived from the final prompt; not used again in this cell, kept for inspection.
knowledges = prompt.get_knowledge_from_prompt(chat_prompt)
# Single-turn request: the whole prompt is sent as one user message with temp=0.
chat_completion = api.get_completions_dolphin([
    {
        "role": "user",
        "content": chat_prompt,
    }
], temp=0)

# Text of the first returned choice.
response = chat_completion.choices[0].message.content