In [1]:
import sys
sys.path.append("/home/dica/Projects/syvan_projects/pdf_parser")

In [2]:
# from pdf_parser.src.config.settings import get_settings
from pdf_parser.src.config.settings import get_settings
import openai
import json
import time

In [3]:
import re

class LazyDecoder(json.JSONDecoder):
    """JSON decoder that repairs two common defects in model output before parsing.

    Repairs applied to the raw text, in order:

    1. Every lone backslash (one that is neither preceded nor followed by
       another backslash) is doubled, so LaTeX such as ``\\frac`` survives as a
       literal backslash instead of being rejected or read as a JSON escape.
       Caveat: this also doubles backslashes of valid JSON escapes such as
       ``\\n`` or ``\\"``, which therefore decode to a literal backslash plus
       the following character.
    2. A trailing comma right before a closing ``]`` is removed.
    """

    # Compiled once at class-definition time instead of on every decode() call.
    _REPAIRS = (
        # Lookarounds (rather than consuming the neighbouring characters) so that
        # adjacent lone backslashes such as ``x\y\z`` are ALL escaped; a consuming
        # pattern skips every second one because matches cannot overlap.
        (re.compile(r'(?<!\\)\\(?!\\)'), r'\\\\'),
        (re.compile(r',(\s*])'), r'\1'),
    )

    def decode(self, s, **kwargs):
        """Apply the textual repairs to ``s`` and decode the result as JSON.

        Args:
            s: JSON document as a string, possibly containing the defects above.
            **kwargs: Forwarded unchanged to ``json.JSONDecoder.decode``.

        Returns:
            The decoded Python object.

        Raises:
            json.JSONDecodeError: If the repaired text is still not valid JSON.
        """
        for regex, replacement in self._REPAIRS:
            s = regex.sub(replacement, s)
        return super().decode(s, **kwargs)

# Setup

In [6]:
# Wall-clock start of the experiment; the final cell prints the elapsed time.
start_time = time.time()
try:
    settings = get_settings(env_file_path="/home/dica/Projects/syvan_projects/pdf_parser/.env.dev")

    client = openai.OpenAI(base_url=settings.base_url, api_key=settings.api_key)

    # Context managers close the file handles deterministically instead of
    # leaving them open until garbage collection.
    with open("../pdf_parser/paper_exams/001/de-giua-ky-1-toan-12-nam-2022-2023-truong-thpt-phu-cu-hung-yen-HanhTruong.txt", "r") as pdf_file:
        pdf_base64 = pdf_file.read()

    # The prompt collection contains non-ASCII (Vietnamese) text, so the encoding
    # is pinned rather than left to the platform default. Only the first
    # collection in the file is used.
    with open("../pdf_parser/src/prompt_collections/collection_01.json", "r", encoding="utf-8") as prompt_file:
        prompt_collection = json.load(prompt_file)[0]

    # The conversation starts empty (no system prompt is seeded).
    init_messages_list = []

    print("All setups are done")
except Exception as e:
    print(e)
    print("Something went wrong")
    # Re-raise instead of calling exit(): the traceback is preserved and a
    # "Run All" stops at this cell with a visible error.
    raise

All setups are done


In [38]:
def transform_input_pdf_base64(pdf_base64: str, init_messages_list: list, prompt_collection: dict):
    """Return a new conversation ending with the "extract information" user turn.

    The user turn has two content parts: the text prompt stored under
    ``prompt_collection['extract_information']`` and the PDF, embedded as an
    ``application/pdf`` base64 data URL inside an ``image_url`` part.

    Args:
        pdf_base64: Base64 text of the PDF; inserted into the data URL as-is.
        init_messages_list: Messages that precede the new turn. Not modified.
        prompt_collection: Mapping that provides the ``extract_information`` prompt.

    Returns:
        A shallow copy of ``init_messages_list`` with the new user message appended.
    """
    prompt_part = {"type": "text", "text": prompt_collection['extract_information']}
    pdf_part = {
        "type": "image_url",
        "image_url": {"url": f"data:application/pdf;base64,{pdf_base64}"},
    }

    conversation = init_messages_list.copy()
    conversation.append({"role": "user", "content": [prompt_part, pdf_part]})
    return conversation

In [39]:
def add_messages(transformed_message: str, messages_list: list, message_type: str):
    """Return a copy of the conversation with one more message at the end.

    Args:
        transformed_message: Value stored under the new message's ``content`` key.
        messages_list: Existing conversation. Left unmodified.
        message_type: Value stored under the new message's ``role`` key.

    Returns:
        A shallow copy of ``messages_list`` followed by the new message dict.
    """
    new_turn = {"role": message_type, "content": transformed_message}
    extended = messages_list.copy()
    extended.append(new_turn)
    return extended

In [40]:
def get_response(client, messages_list):
    """Stream one chat completion and return its text plus the updated history.

    The model name and temperature are read from the module-level ``settings``
    object, so the setup cell must have run first.

    Args:
        client: Client exposing ``chat.completions.create`` (the notebook passes
            an ``openai.OpenAI`` instance).
        messages_list: Conversation sent to the model. Not modified.

    Returns:
        A ``(full_response, messages_list)`` tuple: the concatenated streamed
        text, and a copy of the input conversation with that text appended as
        an ``assistant`` message.
    """
    stream = client.chat.completions.create(
        model=settings.ai_model_name,
        messages=messages_list,
        stream=True,
        temperature=settings.temperature,
    )

    response_chunks = []
    for chunk in stream:
        # Some stream chunks carry no choices at all (e.g. usage-only or
        # provider metadata chunks); indexing [0] on them would raise IndexError.
        if not chunk.choices:
            continue
        chunk_message = chunk.choices[0].delta.content
        # The content delta can be None or empty (role-only / final chunks).
        if chunk_message:
            response_chunks.append(chunk_message)

    full_response = "".join(response_chunks)

    messages_list = add_messages(full_response, messages_list, "assistant")

    return full_response, messages_list


# Pipeline experiment

### Extract information

In [41]:
# Start the conversation: the extraction prompt plus the PDF embedded as a base64 data URL.
messages_list = transform_input_pdf_base64(pdf_base64, init_messages_list, prompt_collection)
# Echo the conversation so far (note: this includes the full base64 payload in the cell output).
messages_list

[{'role': 'user',
  'content': [{'type': 'text',
    'text': 'Trích xuất toàn bộ thông tin của đề thi ở trang đầu tiên? Trả lời dưới dạng JSON chính xác và đẩy đủ thông tin. Danh sách keys:\n- exam_code (chỉ trả lời ở dạng số),\n- number_of_questions (chỉ trả lời ở dạng số),\n- school,\n- subject,\n- grade,\n- duration (chỉ trả lời ở dạng số),\n- year (trả lời đẩy đủ năm học),\n- exam_term (chọn một trong danh sách sau: [Giữa kỳ I, Cuối kỳ I, Giữa kỳ II, Cuối kỳ II, Thi vào lớp 10, Thi THPT, Thi học sinh giỏi, Chất lượng đầu năm, Đánh giá năng lực, Ôn tập chương]),\n- exam_full_name (trả lời đẩy đủ và chính xác tên kỳ thi như trong tài liệu)'},
   {'type': 'image_url',
    'image_url': {'url': 'data:application/pdf;base64,JVBERi0xLjYNJeLjz9MNCjk0IDAgb2JqDTw8L0xpbmVhcml6ZWQgMS9MIDQzNzY1Ni9PIDk2L0UgMjM2ODkwL04gOC9UIDQzNzE1Mi9IIFsgNTQ5IDM3M10+Pg1lbmRvYmoNICAgICAgICAgICAgICAgDQoxMjYgMCBvYmoNPDwvRGVjb2RlUGFybXM8PC9Db2x1bW5zIDUvUHJlZGljdG9yIDEyPj4vRmlsdGVyL0ZsYXRlRGVjb2RlL0lEWzwzQUMwNjdERUU5

In [42]:
# Ask the model for the exam metadata. The raw reply gets its own name so that
# `information_dict` only ever refers to the parsed dict.
information_response, messages_list = get_response(client, messages_list)

# Prefer the content of a Markdown code fence when one is present, which also
# tolerates prose before or after the fence. Otherwise fall back to the plain
# fence-marker stripping.
fence_match = re.search(r'```[A-Za-z]*\s*(.*?)\s*```', information_response, re.DOTALL)
if fence_match:
    information_json = fence_match.group(1)
else:
    information_json = information_response.replace('```json\n', '').replace('\n```', '')

information_dict = json.loads(information_json)

In [43]:
# Display the parsed exam metadata for a visual sanity check.
information_dict

{'exam_code': 305,
 'number_of_questions': 50,
 'school': 'TRƯỜNG THPT PHÙ CỪ',
 'subject': 'TOÁN',
 'grade': 12,
 'duration': 90,
 'year': '2022 - 2023',
 'exam_term': 'Giữa kỳ I',
 'exam_full_name': 'ĐỀ KIỂM TRA GIỮA KỲ I KHỐI 12'}

### Get correct answer table

In [44]:
# Send the first 'get_correct_answer_table' prompt as a user turn; get_response
# returns the reply text and the history extended with the assistant message.
messages_list = add_messages(prompt_collection['get_correct_answer_table'][0], messages_list, "user")
response, messages_list = get_response(client, messages_list)
print(response)

| Câu | 305 | 306 | 307 | 308 |
|---|---|---|---|---|
| 1 | C | C | C | D |
| 2 | A | D | A | A |
| 3 | D | B | D | B |
| 4 | C | B | A | D |
| 5 | B | D | A | C |
| 6 | B | A | C | C |
| 7 | A | B | A | B |
| 8 | D | C | D | A |
| 9 | D | B | A | D |
| 10 | D | C | B | D |
| 11 | D | C | D | D |
| 12 | D | B | D | C |
| 13 | D | D | B | D |
| 14 | D | B | D | D |
| 15 | B | D | C | D |
| 16 | D | D | C | A |
| 17 | B | B | D | D |
| 18 | A | D | A | C |
| 19 | D | D | D | D |
| 20 | D | A | C | D |
| 21 | C | C | D | A |
| 22 | D | D | C | B |
| 23 | A | B | B | B |
| 24 | C | C | A | D |
| 25 | D | D | D | C |
| 26 | C | B | C | B |
| 27 | B | B | C | A |
| 28 | D | B | C | D |
| 29 | A | A | C | B |
| 30 | D | D | B | B |
| 31 | D | B | A | D |
| 32 | C | C | B | B |
| 33 | D | C | C | B |
| 34 | A | D | D | C |
| 35 | D | C | B | D |
| 36 | B | D | A | A |
| 37 | B | A | B | D |
| 38 | C | D | C | B |
| 39 | B | B | D | D |
| 40 | A | B | D | D |
| 41 | D | B | D | C |
| 42 | C | C

In [45]:
# Send the second 'get_correct_answer_table' prompt as a follow-up user turn in
# the same conversation and print the model's reply.
messages_list = add_messages(prompt_collection['get_correct_answer_table'][1], messages_list, "user")
response, messages_list = get_response(client, messages_list)
print(response)

## Đáp án Đề thi Mã 305

| Câu | Đáp án |
|---|---|
| 1 | C |
| 2 | A |
| 3 | D |
| 4 | C |
| 5 | B |
| 6 | B |
| 7 | A |
| 8 | D |
| 9 | D |
| 10 | D |
| 11 | D |
| 12 | D |
| 13 | D |
| 14 | D |
| 15 | B |
| 16 | D |
| 17 | B |
| 18 | A |
| 19 | D |
| 20 | D |
| 21 | C |
| 22 | D |
| 23 | A |
| 24 | C |
| 25 | D |
| 26 | C |
| 27 | B |
| 28 | D |
| 29 | A |
| 30 | D |
| 31 | D |
| 32 | C |
| 33 | D |
| 34 | A |
| 35 | D |
| 36 | B |
| 37 | B |
| 38 | C |
| 39 | B |
| 40 | A |
| 41 | D |
| 42 | C |
| 43 | B |
| 44 | D |
| 45 | D |
| 46 | A |
| 47 | A |
| 48 | B |
| 49 | B |
| 50 | B | 



### Provide formats and define function tags

In [46]:
# Send the 'format_and_function_tag' prompt as a user turn; the "/extract(n)"
# messages sent later continue this same conversation.
messages_list = add_messages(prompt_collection['format_and_function_tag'], messages_list, "user")
response, messages_list = get_response(client, messages_list)
print(response)

'OK\n'

### Get questions

In [47]:
# Accumulator for the per-question objects parsed from the model's replies.
# NOTE(review): the next cell re-initialises this list before filling it, so this cell looks redundant.
questions = []

In [48]:
questions = []

# Matches the content of a Markdown code fence (with or without a language tag).
# Compiled once, outside the loop.
fenced_block = re.compile(r'```[A-Za-z]*\s*(.*?)\s*```', re.DOTALL)

# Questions are requested one at a time with "/extract(n)", n starting at 1.
# Each request and reply is appended to `messages_list`, so the conversation
# keeps growing across iterations.
for question_number in range(1, information_dict['number_of_questions'] + 1):
    messages_list = add_messages(f"/extract({question_number})", messages_list, "user")
    response, messages_list = get_response(client, messages_list)

    # Prefer the fenced payload when a complete fence is present (tolerates
    # prose around it); otherwise fall back to plain fence-marker stripping.
    fence_match = fenced_block.search(response)
    if fence_match:
        response = fence_match.group(1)
    else:
        response = response.replace('```json\n', '').replace('\n```', '')

    try:
        # LazyDecoder repairs lone backslashes and trailing commas before parsing.
        questions.append(json.loads(response, cls=LazyDecoder))
    except json.JSONDecodeError:
        # Say which question broke before propagating: this loop is long-running
        # and the bare decoder error does not identify the failing request.
        print(f"Question {question_number}: response could not be parsed as JSON")
        raise

In [52]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of relying on the platform default.
with open('../pdf_parser/paper_exams/001/exam_questions.json', 'w', encoding='utf-8') as f:
    json.dump(questions, f, indent=4, ensure_ascii=False)

with open('../pdf_parser/paper_exams/001/exam_info.json', 'w', encoding='utf-8') as f:
    json.dump(information_dict, f, indent=4, ensure_ascii=False)

# Elapsed wall-clock time since `start_time` was set in the setup cell.
print("--- %s seconds ---" % (time.time() - start_time))



--- 1194.5704562664032 seconds ---
