In [1]:
import os
import psycopg2
import json

from dotenv import load_dotenv

from pgvector.psycopg2 import register_vector

from importlib import reload
from utils import db, prompting, api, prompt

from pypdf import PdfReader

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Fail early with a readable message instead of handing None to psycopg2.
postgres_config = os.getenv("POSTGRES_CONFIG")
if not postgres_config:
    raise RuntimeError(
        "POSTGRES_CONFIG is not set; define it in the environment or in a .env file"
    )

# One connection/cursor pair is created here and shared by every cell below.
conn = psycopg2.connect(postgres_config)
# Register pgvector's type handling on this connection before any query runs.
register_vector(conn)
cursor = conn.cursor()

In [2]:
# Re-import the local helper modules so that edits made to them on disk are
# picked up without restarting the kernel.
reload(prompt)
reload(api)
reload(db)
# Last expression of the cell: its return value (the reloaded module) is what
# the cell displays as output.
reload(prompting)

<module 'utils.prompting' from '/Volumes/Personal/Work/Feedloop/fact_ext/from_github/AI-Research/utils/prompting.py'>

In [12]:
# PDF file that the extraction cells below open and read page by page.
pdf_path = "./pdfs/CS Knowledges - context_fact1.pdf"

In [13]:
# Fetch the id of the resource the extracted facts will be attached to.
# The string passed here equals the PDF file name without its extension.
# NOTE(review): presumably the helper looks the resource up by name/title —
# confirm in utils.db.
resource_id = db.get_resource_id(cursor, "CS Knowledges - context_fact1")
print(resource_id)

0aa4f60a-4058-4e13-8d8a-88927c4f34a6


In [19]:
# Ask the DB helper how many facts are stored for this resource.
# The value comes back as a one-element tuple (see the recorded output), not a
# bare integer.
fact_count = db.count_facts_resource(cursor, resource_id)
print(fact_count)

(60,)


In [15]:
# CAUTION: judging by its name, this helper deletes the facts stored for the
# resource; it receives the connection as well as the cursor.
# NOTE(review): whether it commits on its own is not visible here — confirm in utils.db.
db.delete_facts_resource(conn, cursor, resource_id)

# Write to DB

## Fragment Prompt

In [167]:
def execute_fragment_prompt(pdf_path, resource_id):
    """Extract fragments from every page of a PDF and store them as facts.

    For each page, the page text and the summary returned for the previous
    page (an empty string for the first page) are passed to
    ``prompting.fragment_prompt``. The model reply is parsed as JSON and must
    contain a ``fragments`` list (items with ``topic`` and ``fragment``) and a
    ``docSummary`` value. Every fragment is embedded and inserted into the
    ``fact`` table with its topic as context, the page's ``docSummary`` as
    summary and the 1-based page index as ``number``.

    The notebook-level ``conn`` and ``cursor`` are used. Rows are committed
    once per page. If anything fails while a page is processed, that page's
    uncommitted rows are rolled back and the error is re-raised, so earlier
    pages stay committed and the connection is left usable.

    Args:
        pdf_path: Path of the PDF to read.
        resource_id: Value stored in the ``resource_id`` column of each row.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised by the API helpers or the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    cur_summary = ""
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            messages = [
                {
                    "role": "user",
                    "content": prompting.fragment_prompt(page.extract_text(), cur_summary),
                },
            ]
            # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
            completion = api.get_completions_gpt35(messages, 0)
            res_json = json.loads(completion.choices[0].message.content)

            # Validate the reply shape up front, before anything is written.
            fragments = [(item['topic'], item['fragment']) for item in res_json['fragments']]
            cur_summary = res_json['docSummary']

            for topic, fragment in fragments:
                # The embedding is computed from the fragment text only.
                embed_result = api.get_embeddings_ada(fragment)
                cursor.execute(
                    insert_query,
                    (topic, fragment, resource_id, embed_result, cur_summary, page_num),
                )

            conn.commit()
        except BaseException:
            # BaseException so an interrupted run (KeyboardInterrupt) is cleaned
            # up too; without the rollback the transaction stays open/aborted.
            conn.rollback()
            raise

In [182]:
# Run the fragment pipeline on the configured PDF: every page triggers a
# completion call and one embedding call per fragment, and rows are inserted
# into the fact table for the resource.
execute_fragment_prompt(pdf_path, resource_id)

## Fact New Prompt

In [140]:
def execute_fact_prompt(pdf_path, resource_id):
    """Extract facts from every page of a PDF and store them in the database.

    For each page, the page text and the summary returned for the previous
    page (an empty string for the first page) are passed to
    ``prompting.fact_prompt``. The model reply is parsed as JSON and must
    contain a ``facts`` list (items with ``context`` and ``fact``) and a
    ``docSummary`` value. Every fact is embedded and inserted into the
    ``fact`` table together with its context, the page's ``docSummary`` and
    the 1-based page index (column ``number``).

    The notebook-level ``conn`` and ``cursor`` are used. Rows are committed
    once per page. If anything fails while a page is processed, that page's
    uncommitted rows are rolled back and the error is re-raised, so earlier
    pages stay committed and the connection is left usable.

    Args:
        pdf_path: Path of the PDF to read.
        resource_id: Value stored in the ``resource_id`` column of each row.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised by the API helpers or the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    cur_summary = ""
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            messages = [
                {
                    "role": "user",
                    "content": prompting.fact_prompt(page.extract_text(), cur_summary),
                },
            ]
            # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
            completion = api.get_completions_gpt35(messages, 0)
            res_json = json.loads(completion.choices[0].message.content)

            # Validate the reply shape up front, before anything is written.
            facts = [(item['context'], item['fact']) for item in res_json['facts']]
            cur_summary = res_json['docSummary']

            for context, fact in facts:
                # The embedding is computed from the fact text only.
                embed_result = api.get_embeddings_ada(fact)
                cursor.execute(
                    insert_query,
                    (context, fact, resource_id, embed_result, cur_summary, page_num),
                )

            conn.commit()
        except BaseException:
            # BaseException so an interrupted run (KeyboardInterrupt) is cleaned
            # up too; without the rollback the transaction stays open/aborted.
            conn.rollback()
            raise

In [150]:
# Run the fact pipeline on the configured PDF: every page triggers a
# completion call and one embedding call per fact, and rows are inserted into
# the fact table for the resource.
execute_fact_prompt(pdf_path, resource_id)

## Context Prompt

In [17]:
def execute_context_prompt(pdf_path, resource_id):
    """Extract chunked facts from every page of a PDF and store them.

    Each page's text is passed to ``prompting.context_prompt``. The model
    reply is parsed as JSON and must contain a ``chunks`` list (items with a
    ``chunk`` text and a ``facts`` list whose items have a ``fact`` key) and a
    ``main_idea`` value. Every fact is embedded and inserted into the ``fact``
    table; its context is the page's main idea followed by a newline and the
    chunk text, its summary is the main idea, and ``number`` is the 1-based
    page index.

    The notebook-level ``conn`` and ``cursor`` are used. Rows are committed
    once per page. If anything fails while a page is processed, that page's
    uncommitted rows are rolled back and the error is re-raised, so earlier
    pages stay committed and the connection is left usable.

    Args:
        pdf_path: Path of the PDF to read.
        resource_id: Value stored in the ``resource_id`` column of each row.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply lacks one of the expected keys.
        Exception: Any error raised by the API helpers or the database.
    """
    reader = PdfReader(pdf_path)
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
        """
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            messages = [
                {
                    "role": "user",
                    "content": prompting.context_prompt(page.extract_text()),
                },
            ]
            # Second argument is passed as 0 (presumably the sampling temperature — confirm in utils.api).
            completion = api.get_completions_gpt35(messages, 0)
            res_json = json.loads(completion.choices[0].message.content)

            # Validate the top-level reply shape before anything is written.
            chunks = [(item['chunk'], item['facts']) for item in res_json['chunks']]
            current_summary = res_json['main_idea']

            for chunk_text, chunk_facts in chunks:
                # Same for every fact of the chunk, so build it once per chunk.
                context_data = f"{current_summary}\n{chunk_text}"
                for fact in chunk_facts:
                    fact_text = fact['fact']
                    # The embedding is computed from the fact text only.
                    embed_result = api.get_embeddings_ada(fact_text)
                    cursor.execute(
                        insert_query,
                        (context_data, fact_text, resource_id, embed_result, current_summary, page_num),
                    )

            conn.commit()
        except BaseException:
            # BaseException so an interrupted run (KeyboardInterrupt) is cleaned
            # up too; without the rollback the transaction stays open/aborted.
            conn.rollback()
            raise

In [18]:
# Run the context pipeline on the configured PDF: every page triggers a
# completion call and one embedding call per fact, and rows are inserted into
# the fact table for the resource.
execute_context_prompt(pdf_path, resource_id)

## Summary Prompt

### Trial

In [305]:
# Trial run: send every page through the summary prompt and save each raw
# model reply to its own text file for manual inspection (nothing is written
# to the database here).
reader = PdfReader(pdf_path)
# Create the output folder up front so the first write cannot fail on a
# missing directory.
os.makedirs("./result/summary1", exist_ok=True)
for i, page in enumerate(reader.pages):
    messages = [
        {
            "role": "user",
            "content": prompting.summary_prompt(page.extract_text())
        },
    ]
    completion = api.get_completions_gpt35(messages, 0)
    # File names use the 1-based page number.
    file_name = f"./result/summary1/BPK_A_P{i+1}.txt"
    # Explicit UTF-8: the replies contain non-ASCII characters and the
    # platform default encoding is not guaranteed to handle them.
    with open(file_name, 'w', encoding="utf-8") as f:
        f.write(completion.choices[0].message.content)

In [242]:
# Load one previously saved model reply and parse it as JSON.
# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open('./result/summary1/Tempo_P2.txt', 'r', encoding='utf-8') as file:
    current_page_text = file.read()
current_page_json = json.loads(current_page_text)

In [28]:
# Sample text (Indonesian; a list of awards received by BRI) used below to try
# the token counter and the recursive summary prompt on a single long chunk.
# The text is kept exactly as extracted, including its spacing irregularities.
content = """
Penghargaan yang pernah diraih oleh BRI: The  Best Corporate - Malam Anugerah BUMN 2021; Tata Kelola Perusahaan (GCG) Terbaik 2021; Transformasi Bisnis & Organisasi Terbaik 2021; Dua insan BRILiaN Nur Arifin Akbar dan Nitia Rahmi sebagai Team Blockchain Center of Excellence (CoE) meraih penghargaan d alam ajang Silver Medal Finalist di UN World Innovation Day Hack 2022.; PT.BRI Raih Penghargaan Internasional, dalam katagori The Best ‘SME Banker of The Year’ untuk Direktur Utama BRI Sunarso dan PT.BRI sebagai ‘Best in Treasury and Working Capital – SMEs ’ dalam acara The Asset Triple A 2022.; PT.BRI meraih penghargaan dalam ajang 12th Annual Treasury & FX Awards 2022 dalam katogori Best FX Bank for Structured Products: Commodities, Credit, Equity, FX and Multi - Asset, Best FX Bank for Money Market Products , dan Best FX Bank for Retail Clients.; PT.BRI Meraih Penghargaan The Best State -Owned Enterprise in 2021 kategori Financial Listed Public Company with Asset Above Rp 15 Trillion; PT Bank Rakyat Indonesia Meraih Penghargaan LPS Banking Award 2022 kategori Bank Teraktif dalam Meningkatkan Literasi Keuangan Masyarakat
"""

In [29]:
# Token count of the sample text, per the helper's name (last expression, so
# the cell displays the returned value).
prompt.count_tokens_tiktoken(content)

317

In [None]:
# Trial run of the recursive summary prompt on the sample text; the raw model
# reply is saved to a text file for manual inspection.
messages = [
    {
        "role": "user",
        "content": prompting.summary_prompt_rec(content)
    },
]
completion = api.get_completions_gpt35(messages, 0)
# Plain string: the original f-string had no placeholders.
file_name = "./result/summary1/BPK_P1_Poin3a.txt"
# Create the output folder if needed so the write cannot fail on a missing directory.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
# Explicit UTF-8: the reply may contain non-ASCII characters and the platform
# default encoding is not guaranteed to handle them.
with open(file_name, 'w', encoding="utf-8") as f:
    f.write(completion.choices[0].message.content)

In [240]:
# Preview the exact prompt text built for one page. Index 1 is the second
# page, since indexing is zero-based.
reader = PdfReader(pdf_path)
# Last expression of the cell: the returned prompt string is displayed as output.
prompting.summary_prompt(reader.pages[1].extract_text())

'\n--- Page content:\nPENDAPAT KAMIS, 5 AGUSTUS 2010 A11\nSepak terjang jurnalis infotain-\nmen televisi di Tanah Air telahmenjelma layaknya paparazzi\n(jurnalis foto candid independen,\npen) yang tiada lelah memburu\npara selebritas untuk dijadikan beritadan mengejar siaran up to date . Cara\nyang digunakan untuk memperoleh ba-han pemberitaan juga semakin mirip, be-rani, nekat, dan penuh kontroversi. Tidakjarang, dalam beberapa kejadian liputan,cara jurnalis infotainmen yang kejar-ha-dang tersebut menuai keberatan danprotes.\nSetelah ditelisik lebih dalam, ternyata\nmodel kerja jurnalisme itu lebih banyakdidorong oleh tuntutan kejar-tayang siar-an mengingat infotainmen kini menjadisalah satu program andalan bagi industriTV . Berjejalnya jam tayang infotainmen diTV , mulai pagi hingga sore, membuat stokdan suplai siaran menjadi minim. Jalansatu-satunya adalah terus memaksa jur-nalis di lapangan mengejar sumber beritakapan saja, di mana saja, apa saja, gunamendapatkan bahan pemberitaan 

### Implementation

#### Function

In [8]:
def extract_summary_fact(page):
    """Run the summary prompt on one PDF page and unpack the model reply.

    The page text is wrapped by ``prompting.summary_prompt`` and sent as a
    single user message. The reply is parsed as JSON and is expected to hold a
    ``chunks`` list (each item with a ``chunk`` key) and a ``summary`` value.

    Args:
        page: Object exposing ``extract_text()`` (a pypdf page in this notebook).

    Returns:
        A ``(chunk_texts, summary)`` tuple: the list of ``chunk`` values in
        reply order and the reply's ``summary``.
    """
    prompt_text = prompting.summary_prompt(page.extract_text())
    user_message = {"role": "user", "content": prompt_text}
    completion = api.get_completions_gpt35([user_message], 0)

    reply = json.loads(completion.choices[0].message.content)

    chunk_texts = []
    for entry in reply['chunks']:
        chunk_texts.append(entry['chunk'])
    return chunk_texts, reply['summary']

In [9]:
def extract_summary_fact_rec(page):
    """Run the recursive summary prompt on a piece of text and unpack the reply.

    Unlike ``extract_summary_fact``, the argument is handed to
    ``prompting.summary_prompt_rec`` as-is, with no ``extract_text()`` call.
    The callers visible in this notebook pass a text chunk, despite the
    parameter being named ``page``.

    Args:
        page: Value forwarded unchanged to ``prompting.summary_prompt_rec``.

    Returns:
        A ``(chunk_texts, summary)`` tuple built from the JSON reply's
        ``chunks`` list (``chunk`` key of each item) and its ``summary`` value.
    """
    request = [{"role": "user", "content": prompting.summary_prompt_rec(page)}]
    raw_reply = api.get_completions_gpt35(request, 0).choices[0].message.content

    parsed = json.loads(raw_reply)
    pieces = list(map(lambda entry: entry['chunk'], parsed['chunks']))
    summary = parsed['summary']
    return pieces, summary

In [10]:
def count_fact_length(fact_list, max_tokens=400):
    """Tell whether any fact in the list is longer than the token limit.

    Despite its name, this returns a boolean rather than a count. Checking
    stops at the first fact that exceeds the limit.

    Args:
        fact_list: Iterable of fact texts.
        max_tokens: Limit a fact must exceed to count as too long. Defaults to
            400, the value previously hard-coded here.

    Returns:
        True if ``prompt.count_tokens_tiktoken`` reports more than
        ``max_tokens`` for at least one fact, otherwise False (including for
        an empty list).
    """
    return any(prompt.count_tokens_tiktoken(fact) > max_tokens for fact in fact_list)

In [11]:
def execute_summary_prompt(pdf_path, resource_id, max_tokens=400):
    """Summarise every page of a PDF into chunks and store them as facts.

    Each page goes through ``extract_summary_fact``, which returns the page's
    chunks and summary. A chunk within the token limit is stored with the page
    summary as its context. A chunk above the limit is split once more with
    ``extract_summary_fact_rec``; each sub-chunk is stored with the summary
    returned by that second call as its context. The sub-chunks are not
    re-checked against the limit. In both cases the ``summary`` column holds
    the page summary and ``number`` the 1-based page index.

    The notebook-level ``conn`` and ``cursor`` are used. Rows are committed
    once per page. If anything fails while a page is processed, that page's
    uncommitted rows are rolled back and the error is re-raised, so earlier
    pages stay committed and the connection is left usable.

    Args:
        pdf_path: Path of the PDF to read.
        resource_id: Value stored in the ``resource_id`` column of each row.
        max_tokens: Token count above which a chunk is split again. Defaults
            to 400, the value previously hard-coded here.

    Raises:
        Exception: Any error raised by the extraction helpers, the embedding
            helper or the database.
    """
    insert_query = """
        INSERT INTO fact (context, fact, resource_id, embeddings, summary, number)
        VALUES (%s, %s, %s, %s, %s, %s);
    """
    reader = PdfReader(pdf_path)
    for page_num, page in enumerate(reader.pages, start=1):
        try:
            fact_list, cur_summary = extract_summary_fact(page)
            for chunk in fact_list:
                if prompt.count_tokens_tiktoken(chunk) > max_tokens:
                    # Too long: split once more and store the sub-chunks instead.
                    sub_chunks, sub_summary = extract_summary_fact_rec(chunk)
                    for sub_chunk in sub_chunks:
                        embed_result = api.get_embeddings_ada(sub_chunk)
                        cursor.execute(
                            insert_query,
                            (sub_summary, sub_chunk, resource_id, embed_result, cur_summary, page_num),
                        )
                else:
                    embed_result = api.get_embeddings_ada(chunk)
                    cursor.execute(
                        insert_query,
                        (cur_summary, chunk, resource_id, embed_result, cur_summary, page_num),
                    )
            conn.commit()
        except BaseException:
            # BaseException so an interrupted run (KeyboardInterrupt) is cleaned
            # up too; without the rollback the transaction stays open/aborted.
            conn.rollback()
            raise

#### Execute

In [61]:
# Run the summary pipeline on the configured PDF: pages are summarised into
# chunks, over-long chunks are split once more, and the resulting rows are
# inserted into the fact table for the resource.
execute_summary_prompt(pdf_path, resource_id)

# Validate

In [7]:
# Flag the resource as ready. Note that this cell modifies data even though it
# sits under the "Validate" heading.
update_query = """
UPDATE resource SET status = 'ready' WHERE id = %s; 
"""
# The id is passed as a query parameter (one-element tuple), not formatted into the SQL.
cursor.execute(update_query, (resource_id,))
conn.commit()

# Nothing is fetched afterwards: an UPDATE without RETURNING produces no
# result set to read back.

In [13]:
# Spot-check what was stored: print the (context, fact) pairs saved for the
# resource with number = 1 (the insert functions above store the 1-based page
# index in that column).
select_query = """
SELECT context,fact FROM fact WHERE resource_id = %s and number = 1; 
"""
cursor.execute(select_query, (resource_id,))

# Each row comes back as a (context, fact) tuple.
result = cursor.fetchall()
for res in result:
    print(res)

('Badan Pemeriksa Keuangan Republik Indonesia (BPK RI) memiliki pimpinan yang terdiri dari Ketua, Wakil Ketua, dan beberapa anggota. Tugas dan wewenang pimpinan BPK RI meliputi pemeriksaan pengelolaan dan tanggung jawab keuangan negara, pembinaan tugas, dan memberikan pengarahan pemeriksaan investigatif. Obyek, lingkup tugas, dan portofolio pimpinan BPK RI meliputi kementerian yang terkait bidang Politik, Hukum, dan Keamanan, serta kementerian yang terkait bidang Ekonomi, Keuangan, Industri, dan Perdagangan.', 'A. Pimpinan Badan Pemeriksa Keuangan Republik Indonesia (BPK RI) saat ini adalah sebagai berikut:\nKetua BPK: Dr. Isma Yatun, CSFA., CFrA. \nWakil Ketua BPK: Dr. Agus Joko Pramono, M.Acc., Ak., CA., CSFA., CPA., CFrA., QGIA., CGCAE.\nAnggota I BPK: Nyoman Adhi Suryadnyana, SE., ME., M.Ak., CSFA., CertDA., CGCAE.\nAnggota II BPK: Ir. Daniel Lumban Tobing, CSFA., CFrA.\nAnggota III BPK: Prof. Dr. Achsanul Qosasi, CSFA., CFrA., CGCAE.\nAnggota IV BPK: Haerul Saleh, SH., CRA., CRP.,

In [178]:
# Manual recovery step: discard any uncommitted work on the shared connection.
# After a failed statement the transaction must be rolled back before the
# connection accepts further commands.
conn.rollback()