In [1]:
import os
import psycopg2
import json

from dotenv import load_dotenv

from pgvector.psycopg2 import register_vector

from importlib import reload
from utils import db, prompting, api, prompt

from pypdf import PdfReader

# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast with an actionable message when the DSN is not configured;
# passing None on to psycopg2.connect gives a far less helpful error.
postgres_dsn = os.getenv("POSTGRES_CONFIG")
if postgres_dsn is None:
    raise RuntimeError(
        "POSTGRES_CONFIG is not set; define it in the environment or in .env"
    )

conn = psycopg2.connect(postgres_dsn)
# Register the pgvector type on this connection before any query runs
# (presumably needed for the `embeddings` column written further down).
register_vector(conn)
# Shared cursor used by the helper functions and cells below.
cursor = conn.cursor()

In [2]:
# Re-import the local helper modules so on-disk edits are picked up without
# restarting the kernel. The reload order is unchanged.
for _helper_module in (prompt, api, db):
    reload(_helper_module)
# Kept as the final bare expression so the cell still echoes the module repr.
reload(prompting)

<module 'utils.prompting' from '/Volumes/Personal/Work/Feedloop/fact_ext/from_github/AI-Research/utils/prompting.py'>

In [58]:
# PDF to ingest; the path is relative to the notebook's working directory.
pdf_path = "./pdfs/Coffee_and_health - summary_fact.pdf"
# Forwarded to the prompt builders in utils.prompting
# (presumably the language the model should answer in — confirm there).
user_language = "English"

In [59]:
# Look up the id of the resource by its name through the project helper.
# NOTE(review): the name literal repeats the file stem of pdf_path; keep the two in sync.
resource_id = db.get_resource_id(cursor, "Coffee_and_health - summary_fact")
print(resource_id)

f35f69dc-1889-4dde-b6dd-b6351ec98a93


In [62]:
# Count the facts currently stored for this resource.
# The recorded output is `(0,)`: the helper returned a one-element tuple, not a bare int.
fact_count = db.count_facts_resource(cursor, resource_id)
print(fact_count)

(0,)


In [61]:
# NOTE(review): destructive — presumably removes every stored fact for resource_id
# (confirm in utils.db). Run deliberately, not as part of "Run All".
db.delete_facts_resource(conn, cursor, resource_id)

# Write to DB

## Fragment Prompt

In [23]:
def execute_fragment_prompt(pdf_path, resource_id, user_language):
    """Extract fragments from every page of a PDF and store them as facts.

    For each page, the page text and the running document summary are turned
    into a prompt by ``prompting.fragment_prompt`` and sent to the chat model.
    The reply is parsed as JSON and must provide ``fragments`` (a list of
    objects with ``topic`` and ``fragment`` keys) and ``docSummary``. Each
    fragment is embedded and inserted into the ``fact`` table; the rows of one
    page are committed together.

    Uses the notebook-level ``conn`` and ``cursor``.

    Args:
        pdf_path: Path of the PDF file to read.
        resource_id: Value written to ``fact.resource_id`` for every row.
        user_language: Forwarded to ``prompting.fragment_prompt``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised while embedding or inserting is re-raised
            after the current page's uncommitted rows are rolled back. Pages
            committed before the failure stay in the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    cur_summary = ""
    # Page numbers stored in the `number` column are 1-based.
    for page_num, page in enumerate(reader.pages, start=1):
        messages = [
            {
                "role": "user",
                "content": prompting.fragment_prompt(page.extract_text(), cur_summary, user_language),
            },
        ]
        # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
        completion = api.get_completions_gpt35(messages, 0)
        res_json = json.loads(completion.choices[0].message.content)

        # Unpack the reply before touching the database so a malformed reply
        # fails without leaving any pending rows.
        fragments = [(item['topic'], item['fragment']) for item in res_json['fragments']]
        cur_summary = res_json['docSummary']

        try:
            for topic, fragment in fragments:
                embed_result = api.get_embeddings_ada(fragment)
                cursor.execute(insert_query, (topic, fragment, resource_id, embed_result, cur_summary, page_num))
            conn.commit()
        except BaseException:
            # Also covers a kernel interrupt: without the rollback the partial
            # page would stay pending and be committed by the next commit().
            conn.rollback()
            raise

In [24]:
# Ingest the configured PDF with the fragment prompt; this calls the model per page and writes to the `fact` table.
execute_fragment_prompt(pdf_path, resource_id, user_language)

## Fact New Prompt

In [3]:
test_text = """--- Document Data:\nCurrent document summary: \nCurrent page content: 1\nS\ni\nm\np\na\nn\na\nn\n:\nB\nR\nI\nm\ne\nm\ni\nl\ni\nk\ni\n3\np\nr\no\nd\nu\nk\nS\ni\nm\np\na\nn\na\nn\ny\na\nn\ng\nd\na\np\na\nt\nk\na\nm\nu\np\ni\nl\ni\nh\n,\ny\na\ni\nt\nu\nT\na\nb\nu\nn\ng\na\nn\n,\nD\ne\np\no\ns\ni\nt\no\n,\nd\na\nn\nG\ni\nr\no\nB\nR\nI\np\nr\no\nd\nu\nc\nt\n1\n:\nB\nr\ni\nt\na\nm\na\nB\ni\ns\nn\ni\ns\n-\nJ\ne\nn\ni\ns\n:\nS\ni\nm\np\na\nn\na\nn\n-\nD\ne\ns\nk\nr\ni\np\ns\ni\n:\nU\nn\nt\nu\nk\nm\ne\nm\ne\nn\nu\nh\ni\nk\ne\nb\nu\nt\nu\nh\na\nn\nb\ni\ns\nn\ni\ns\nm\nu\n,\nB\nR\nI\nm\ne\nn\na\nw\na\nr\nk\na\nn\nt\na\nb\nu\nn\ng\na\nn\nB\nr\ni\nt\na\nm\na\nB\ni\ns\nn\ni\ns\n.\nB\nr\ni\nt\na\nm\na\nB\ni\ns\nn\ni\ns\na\nd\na\nl\na\nh\nt\na\nb\nu\nn\ng\na\nn\nu\nn\nt\nu\nk\nm\ne\nn\nu\nn\nj\na\nn\ng\nt\nr\na\nn\ns\na\nk\ns\ni\nk\ne\nb\nu\nt\nu\nh\na\nn\nb\ni\ns\nn\ni\ns\nm\nu\n.\n-\nF\ni\nt\nu\nr\n:\nF\ni\nt\nu\nr\nu\nn\ng\ng\nu\nl\na\nn\nB\nr\ni\nt\na\nm\na\nB\ni\ns\nn\ni\ns\n:\n1\n.\nP\ne\nn\nc\na\nt\na\nt\na\nn\nt\nr\na\nn\ns\na\nk\ns\ni\nl\ne\nb\ni\nh\nd\ne\nt\na\ni\nl\n;\n2\n.\nG\nr\na\nt\ni\ns\na\ns\nu\nr\na\nn\ns\ni\nk\ne\nc\ne\nl\na\nk\na\na\nn\nh\ni\nn\ng\ng\na\nR\np\n1\n5\n0\n.\n0\n0\n0\n.\n0\n0\n0\n,\n-\n;\n3\n.\nA\nk\ns\ne\ns\ni\nb\ni\nl\ni\nt\na\ns\nK\na\nr\nt\nu\nD\ne\nb\ni\nt\nB\nR\nI\nd\ni\nj\na\nr\ni\nn\ng\na\nn\nB\nR\nI\n,\nA\nT\nM\nB\ne\nr\ns\na\nm\na\n,\nL\ni\nn\nk\n,\nP\nr\ni\nm\na\n,\nC\ni\nr\nr\nu\ns\n,\nM\na\ne\ns\nt\nr\no\nd\na\nn\nM\na\ns\nt\ne\nr\nC\na\nr\nd\nb\na\ni\nk\nd\ni\nd\na\nl\na\nm\nm\na\nu\np\nu\nn\nd\ni\nl\nu\na\nr\nn\ne\ng\ne\nr\ni\n;\n4\n.\nL\ni\nm\ni\nt\nt\nr\na\nn\ns\na\nk\ns\ni\nm\ne\nn\nc\na\np\na\ni\nR\np\n1\n.\n0\n0\n0\n.\n0\n0\n0\n.\n0\n0\n0\n,\n-\n;\n5\n.\nF\na\ns\ni\nl\ni\nt\na\ns\nt\nr\na\nn\ns\na\nk\ns\ni\no\nt\no\nm\na\nt\ni\ns\n-\nC\na\nr\na\np\ne\nm\nb\nu\nk\na\na\nn\nr\ne\nk\ne\nn\ni\nn\ng\nb\na\nr\nu\n:\nK\na\nm\nu\nb\ni\ns\na\nm\ne\nm\nb\nu\nk\na\nr\ne\n\n--- Instructions:\n1. 
Extract all facts from current page\n2. Output as following JSON format in  in indonesia languages\n{\n  \"facts\": <{\"fact\": string, \"context\":<what is this fact about>}[]>\n  \"docSummary\": \"<resummarize the document by combining previous page summary and current page content>\"\n  \"context\": <create maximum 50 words description about the document based on docSummary>\n}"""

In [4]:
# Send the recorded prompt as a single user message and keep the raw reply on disk for inspection.
messages = [
    {
        "role": "user",
        "content": test_text
    },
]
completion = api.get_completions_gpt35(messages, 0)
file_name = "./result/summary1/random_test1.txt"
# Create the output folder on first use instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Explicit UTF-8: the reply may contain non-ASCII text and the platform
# default encoding is not UTF-8 everywhere.
with open(file_name, 'w', encoding="utf-8") as f:
    f.write(completion.choices[0].message.content)

In [5]:
# Store the prompt text that was sent, next to the saved reply.
file_prompt = "./result/summary1/random_prompt.txt"
# Create the output folder on first use instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(file_prompt), exist_ok=True)
# Explicit UTF-8 so the file content does not depend on the platform default encoding.
with open(file_prompt, 'w', encoding="utf-8") as f:
    f.write(test_text)

In [17]:
def execute_fact_prompt(pdf_path, resource_id, user_language):
    """Extract facts from every page of a PDF and store them in the database.

    For each page, the page text and the running document summary are turned
    into a prompt by ``prompting.fact_prompt`` and sent to the chat model.
    The reply is parsed as JSON and must provide ``facts`` (a list of objects
    with ``fact`` and ``context`` keys) and ``docSummary``. Each fact is
    embedded and inserted into the ``fact`` table; the rows of one page are
    committed together.

    Uses the notebook-level ``conn`` and ``cursor``.

    Args:
        pdf_path: Path of the PDF file to read.
        resource_id: Value written to ``fact.resource_id`` for every row.
        user_language: Forwarded to ``prompting.fact_prompt``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised while embedding or inserting is re-raised
            after the current page's uncommitted rows are rolled back. Pages
            committed before the failure stay in the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    cur_summary = ""
    # Page numbers stored in the `number` column are 1-based.
    for page_num, page in enumerate(reader.pages, start=1):
        messages = [
            {
                "role": "user",
                "content": prompting.fact_prompt(page.extract_text(), cur_summary, user_language),
            },
        ]
        # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
        completion = api.get_completions_gpt35(messages, 0)
        res_json = json.loads(completion.choices[0].message.content)

        # Unpack the reply before touching the database so a malformed reply
        # fails without leaving any pending rows.
        facts = [(item['context'], item['fact']) for item in res_json['facts']]
        cur_summary = res_json['docSummary']

        try:
            for context, fact in facts:
                embed_result = api.get_embeddings_ada(fact)
                cursor.execute(insert_query, (context, fact, resource_id, embed_result, cur_summary, page_num))
            conn.commit()
        except BaseException:
            # Also covers a kernel interrupt: without the rollback the partial
            # page would stay pending and be committed by the next commit().
            conn.rollback()
            raise

In [18]:
# Ingest the configured PDF with the fact prompt; this calls the model per page and writes to the `fact` table.
execute_fact_prompt(pdf_path, resource_id, user_language)

## Context Prompt

In [29]:
def execute_context_prompt(pdf_path, resource_id, user_language):
    """Extract chunked facts from every page of a PDF and store them.

    For each page, the page text is turned into a prompt by
    ``prompting.context_prompt`` and sent to the chat model. The reply is
    parsed as JSON and must provide ``chunks`` (a list of objects with a
    ``chunk`` text and a ``facts`` list whose items have a ``fact`` key) and
    ``main_idea``. Each fact is embedded and inserted into the ``fact`` table
    with the context ``"<main_idea>\\n<chunk>"``; the rows of one page are
    committed together. No summary is carried over between pages.

    Uses the notebook-level ``conn`` and ``cursor``.

    Args:
        pdf_path: Path of the PDF file to read.
        resource_id: Value written to ``fact.resource_id`` for every row.
        user_language: Forwarded to ``prompting.context_prompt``.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised while embedding or inserting is re-raised
            after the current page's uncommitted rows are rolled back. Pages
            committed before the failure stay in the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    # Page numbers stored in the `number` column are 1-based.
    for page_num, page in enumerate(reader.pages, start=1):
        messages = [
            {
                "role": "user",
                "content": prompting.context_prompt(page.extract_text(), user_language),
            },
        ]
        # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
        completion = api.get_completions_gpt35(messages, 0)
        res_json = json.loads(completion.choices[0].message.content)

        # Unpack the reply before touching the database so a reply without the
        # top-level keys fails without leaving any pending rows.
        chunks = [(item['chunk'], item['facts']) for item in res_json['chunks']]
        current_summary = res_json['main_idea']

        try:
            for chunk_text, chunk_facts in chunks:
                # Same context for every fact of the chunk, so build it once.
                context_data = f"{current_summary}\n{chunk_text}"
                for fact in chunk_facts:
                    fact_text = fact['fact']
                    embed_result = api.get_embeddings_ada(fact_text)
                    cursor.execute(insert_query, (context_data, fact_text, resource_id, embed_result, current_summary, page_num))
            conn.commit()
        except BaseException:
            # Also covers a kernel interrupt: without the rollback the partial
            # page would stay pending and be committed by the next commit().
            conn.rollback()
            raise

In [30]:
# Ingest the configured PDF with the context prompt; this calls the model per page and writes to the `fact` table.
execute_context_prompt(pdf_path, resource_id, user_language)

## Summary Prompt

### Trial

In [305]:
# Trial run: send every page through the summary prompt and save each raw reply to its own file.
reader = PdfReader(pdf_path)
# Create the output folder up front so the first write cannot fail with FileNotFoundError.
os.makedirs("./result/summary1", exist_ok=True)
for i, page in enumerate(reader.pages):
    messages = [
    {
        "role": "user",
        # user_language is passed the same way extract_summary_fact calls this helper.
        "content": prompting.summary_prompt(page.extract_text(), user_language)
    },
    ]
    completion = api.get_completions_gpt35(messages, 0)
    # Files are numbered from 1 to match page numbers.
    file_name = f"./result/summary1/BPK_A_P{i+1}.txt"
    # Explicit UTF-8: replies may contain non-ASCII text and the platform
    # default encoding is not UTF-8 everywhere.
    with open(file_name, 'w', encoding="utf-8") as f:
        f.write(completion.choices[0].message.content)

In [242]:
# Load a previously saved model reply and parse it as JSON.
# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open('./result/summary1/Tempo_P2.txt', 'r', encoding="utf-8") as file:
    current_page_text = file.read()
current_page_json = json.loads(current_page_text)

In [28]:
content = """
Penghargaan yang pernah diraih oleh BRI: The  Best Corporate - Malam Anugerah BUMN 2021; Tata Kelola Perusahaan (GCG) Terbaik 2021; Transformasi Bisnis & Organisasi Terbaik 2021; Dua insan BRILiaN Nur Arifin Akbar dan Nitia Rahmi sebagai Team Blockchain Center of Excellence (CoE) meraih penghargaan d alam ajang Silver Medal Finalist di UN World Innovation Day Hack 2022.; PT.BRI Raih Penghargaan Internasional, dalam katagori The Best ‘SME Banker of The Year’ untuk Direktur Utama BRI Sunarso dan PT.BRI sebagai ‘Best in Treasury and Working Capital – SMEs ’ dalam acara The Asset Triple A 2022.; PT.BRI meraih penghargaan dalam ajang 12th Annual Treasury & FX Awards 2022 dalam katogori Best FX Bank for Structured Products: Commodities, Credit, Equity, FX and Multi - Asset, Best FX Bank for Money Market Products , dan Best FX Bank for Retail Clients.; PT.BRI Meraih Penghargaan The Best State -Owned Enterprise in 2021 kategori Financial Listed Public Company with Asset Above Rp 15 Trillion; PT Bank Rakyat Indonesia Meraih Penghargaan LPS Banking Award 2022 kategori Bank Teraktif dalam Meningkatkan Literasi Keuangan Masyarakat
"""

In [29]:
# Token count of the sample text via the project helper. The recorded result (317) is above the
# 300-token limit that execute_summary_prompt uses to decide whether a chunk is split again.
prompt.count_tokens_tiktoken(content)

317

In [None]:
# Trial run of the re-split prompt on the sample text; the raw reply is saved for inspection.
messages = [
{
    "role": "user",
    # user_language is passed the same way extract_summary_fact_rec calls this helper.
    "content": prompting.summary_prompt_rec(content, user_language)
},
]
completion = api.get_completions_gpt35(messages, 0)
file_name = "./result/summary1/BPK_P1_Poin3a.txt"
# Create the output folder on first use instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Explicit UTF-8: the sample text itself contains non-ASCII quotes and dashes.
with open(file_name, 'w', encoding="utf-8") as f:
    f.write(completion.choices[0].message.content)

In [240]:
# Preview the prompt text built for the second page (index 1) of the PDF.
# NOTE(review): called without user_language, unlike extract_summary_fact — confirm the helper's current signature.
reader = PdfReader(pdf_path)
prompting.summary_prompt(reader.pages[1].extract_text())

'\n--- Page content:\nPENDAPAT KAMIS, 5 AGUSTUS 2010 A11\nSepak terjang jurnalis infotain-\nmen televisi di Tanah Air telahmenjelma layaknya paparazzi\n(jurnalis foto candid independen,\npen) yang tiada lelah memburu\npara selebritas untuk dijadikan beritadan mengejar siaran up to date . Cara\nyang digunakan untuk memperoleh ba-han pemberitaan juga semakin mirip, be-rani, nekat, dan penuh kontroversi. Tidakjarang, dalam beberapa kejadian liputan,cara jurnalis infotainmen yang kejar-ha-dang tersebut menuai keberatan danprotes.\nSetelah ditelisik lebih dalam, ternyata\nmodel kerja jurnalisme itu lebih banyakdidorong oleh tuntutan kejar-tayang siar-an mengingat infotainmen kini menjadisalah satu program andalan bagi industriTV . Berjejalnya jam tayang infotainmen diTV , mulai pagi hingga sore, membuat stokdan suplai siaran menjadi minim. Jalansatu-satunya adalah terus memaksa jur-nalis di lapangan mengejar sumber beritakapan saja, di mana saja, apa saja, gunamendapatkan bahan pemberitaan 

### Implementation

#### Function

In [63]:
def extract_summary_fact(page, user_language):
    """Ask the chat model to chunk and summarise one PDF page.

    The page text is wrapped by ``prompting.summary_prompt`` and sent as a
    single user message. The reply must be JSON with a ``chunks`` list (items
    carrying a ``chunk`` key) and a ``summary`` value.

    Args:
        page: Object exposing ``extract_text()``.
        user_language: Forwarded to ``prompting.summary_prompt``.

    Returns:
        A ``(chunks, summary)`` tuple: the list of ``chunk`` values in reply
        order, and the reply's ``summary``.
    """
    prompt_text = prompting.summary_prompt(page.extract_text(), user_language)
    completion = api.get_completions_gpt35([{"role": "user", "content": prompt_text}], 0)
    reply = json.loads(completion.choices[0].message.content)

    chunk_texts = []
    for entry in reply['chunks']:
        chunk_texts.append(entry['chunk'])
    return chunk_texts, reply['summary']

In [64]:
def extract_summary_fact_rec(page, user_language):
    """Ask the chat model to split an already extracted piece of text again.

    Unlike ``extract_summary_fact``, ``page`` is handed to
    ``prompting.summary_prompt_rec`` as-is, with no ``extract_text()`` call;
    ``execute_summary_prompt`` passes a chunk string here. The reply must be
    JSON with a ``chunks`` list (items carrying a ``chunk`` key) and a
    ``summary`` value.

    Args:
        page: Content forwarded unchanged to ``prompting.summary_prompt_rec``.
        user_language: Forwarded to ``prompting.summary_prompt_rec``.

    Returns:
        A ``(chunks, summary)`` tuple: the list of ``chunk`` values in reply
        order, and the reply's ``summary``.
    """
    user_message = {
        "role": "user",
        "content": prompting.summary_prompt_rec(page, user_language),
    }
    raw_reply = api.get_completions_gpt35([user_message], 0).choices[0].message.content
    parsed = json.loads(raw_reply)

    sub_chunks = list(map(lambda entry: entry['chunk'], parsed['chunks']))
    return sub_chunks, parsed['summary']

In [65]:
def execute_summary_prompt(pdf_path, resource_id, user_language, max_fact_tokens=300):
    """Chunk every page of a PDF with the summary prompt and store the chunks.

    Each page is processed by ``extract_summary_fact``. A chunk whose token
    count (``prompt.count_tokens_tiktoken``) exceeds ``max_fact_tokens`` is
    split once more by ``extract_summary_fact_rec``; the resulting sub-chunks
    are stored with the sub-summary as their context. Other chunks are stored
    with the page summary as context. The ``summary`` column always receives
    the page summary. Sub-chunks are not size-checked again.

    Uses the notebook-level ``conn`` and ``cursor``; the rows of one page are
    committed together.

    Args:
        pdf_path: Path of the PDF file to read.
        resource_id: Value written to ``fact.resource_id`` for every row.
        user_language: Forwarded to both extraction helpers.
        max_fact_tokens: Token count above which a chunk is split again.
            Defaults to 300, the previously hard-coded limit.

    Raises:
        Exception: Any error raised while splitting, embedding or inserting is
            re-raised after the current page's uncommitted rows are rolled
            back. Pages committed before the failure stay in the database.
    """
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    reader = PdfReader(pdf_path)
    # Page numbers stored in the `number` column are 1-based.
    for page_num, page in enumerate(reader.pages, start=1):
        fact_list, cur_summary = extract_summary_fact(page, user_language)
        try:
            for chunk in fact_list:
                if prompt.count_tokens_tiktoken(chunk) > max_fact_tokens:
                    # Oversized chunk: store its sub-chunks, using the sub-summary as context.
                    sub_facts, sub_summary = extract_summary_fact_rec(chunk, user_language)
                    pending = [(sub_summary, sub_fact) for sub_fact in sub_facts]
                else:
                    pending = [(cur_summary, chunk)]
                for context, fact in pending:
                    embed_result = api.get_embeddings_ada(fact)
                    cursor.execute(insert_query, (context, fact, resource_id, embed_result, cur_summary, page_num))
            conn.commit()
        except BaseException:
            # Also covers a kernel interrupt: without the rollback the partial
            # page would stay pending and be committed by the next commit().
            conn.rollback()
            raise

#### Execute

In [66]:
# Ingest the configured PDF with the summary prompt; this calls the model per page and writes to the `fact` table.
execute_summary_prompt(pdf_path, resource_id, user_language)

# Validate

In [7]:
# Flag the resource as ready.
# NOTE(review): this cell sits under the "Validate" heading but it writes to the database.
update_query = """
UPDATE resource SET status = 'ready' WHERE id = %s; 
"""
cursor.execute(update_query, (resource_id,))
conn.commit()

# Left disabled: the UPDATE above has no RETURNING clause, so there is no result set to fetch.
# result = cursor.fetchall()
# for res in result:
#     print(res)

In [67]:
# Case-insensitive name match on `resource`; the pattern (with any % wildcard) is supplied as a bound parameter.
select_query = "SELECT * FROM resource WHERE name ilike %s"

In [73]:
# Alternative query kept for reference (context and fact of page 1 for one resource):
# select_query = """
# SELECT context,fact FROM fact WHERE resource_id = %s and number = 1; 
# """

# Fetch every resource whose name starts with "CS Knowledges -".
cursor.execute(select_query, ('CS Knowledges -%',))
result = cursor.fetchall()

# Echo each matching row for inspection.
for res in result:
    print(res)

# Keep column 0 of every row (the recorded output shows a UUID string there).
test_resource_ids = [res[0] for res in result]

('ff450050-9dc3-4c93-8410-812a3ad1f7b2', 'CS Knowledges - Fragment', 'document', 'Dokumen ini berisi tentang kendala dan solusi pada aplikasi SenyuM Mobile, pembayaran Tabungan Emas, refund Brizzi, dan refund Online Acquiring.', 'indonesia', 'ready', datetime.datetime(2023, 12, 28, 5, 49, 1, 293868, tzinfo=datetime.timezone.utc), '603f9138-2723-4b28-bc56-c32190521e64', Decimal('100.00'), 'eb47f246-2b18-4f7b-b1fc-231d07fc7eec', 'asd', '', datetime.datetime(2023, 12, 28, 7, 34, 11, 375000, tzinfo=datetime.timezone.utc), True, '9e42a637-93ee-4a29-b883-129fceb90e4d', 'full text')
('997af1a5-487c-4970-b7ff-cfea89b79f10', 'CS Knowledges - ParentChild', 'document', 'Dokumen ini berisi tentang kendala dan solusi pada aplikasi SenyuM Mobile, pembayaran Tabungan Emas, refund Brizzi, dan refund Online Acquiring.', 'indonesia', 'ready', datetime.datetime(2024, 1, 4, 0, 10, 32, 350983, tzinfo=datetime.timezone.utc), '603f9138-2723-4b28-bc56-c32190521e64', Decimal('100.00'), '1c892819-39a6-4f0b-88c3

In [71]:
# Show the ids collected from the resource query.
# NOTE(review): the recorded output lists different ids than the rows printed by the query cell, so the two were run against different results.
print(test_resource_ids)

['3216b95d-3b70-41c0-a24d-fc88d59f178a', '24001626-6628-4fda-83f6-0217de9e57f1', 'c05d60db-fb76-499a-a696-0ef068ba138e', '7373123d-886a-4922-b5d3-bed897bffae5', 'b7fad3fa-896c-41c1-a7ee-b706af6f7948', 'ae95f7fd-ffac-4ebb-94e8-5cac892989e9', 'f35f69dc-1889-4dde-b6dd-b6351ec98a93', '6ea45d90-e798-44b0-9022-79399c9d18ce']


In [178]:
# Discard the connection's uncommitted work; presumably run by hand after a failed statement
# so the connection can be used again.
conn.rollback()