In [1]:
%pip install tiktoken
%pip install openai
%pip install textract
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install plotly
%pip install scipy
%pip install scikit-learn

[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
[33mDEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming depende

# 라이브러리 임포트

In [2]:
import os
import numpy as np
import pandas as pd
from typing import Iterator
import tiktoken
import textract

# 논문 파일 리스트 구하기

In [3]:
import os

data_path = "data/paper"
pdf_files = [x for x in os.listdir(data_path)]

print(len(pdf_files))
print(pdf_files[:3])

20
['jkma-2023-66-3-200.pdf', 'jkma-2023-66-1-50.pdf', 'jkma-2023-66-3-160.pdf']


# 파일을 조각(chunk)로 쪼개고 임베딩해 두기

In [4]:
from openai import OpenAI

client = OpenAI(api_key="")

In [7]:
response = client.embeddings.create(input=["some", "other"], model="text-embedding-3-small")

In [14]:
import tiktoken

TEXT_EMBEDDING_CHUNK_SIZE = 2000
tokenizer = tiktoken.get_encoding("cl100k_base")

embedding_storage = []

# 긴 문자열을 최대 길이 chunk_size의 토큰들로 자르고, 토큰을 반환한다.
def chunks(text, chunk_size=TEXT_EMBEDDING_CHUNK_SIZE):
    """Yield successive chunk_size chunks from text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # 토큰 길이의 0.5 ~ 1.5 사이에서 문장의 끝을 찾는다.
        j = min(i + int(1.5 * chunk_size), len(tokens))
        while j > i + int(0.5 * chunk_size):
            # 디코딩 해서 마침표 혹은 줄 바꿈인지 확인한다.
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # 문장 끝을 못찾으면 chunk_size로 한다.
        if j == i + int(0.5 * chunk_size):
            j = min(i + chunk_size, len(tokens))
        yield tokens[i:j]
        i = j

def get_embeddings(texts):
    response = client.embeddings.create(input=texts, model="text-embedding-3-small")
    return [data.embedding for data in response.data]
    
def create_embeddings_for_text(text):
    # 최대길이 TEXT_EMBEDDING_CHUNK_SIZE의 토큰 리스트로 변환. 쪼개진 개별을 chunk라 한다.
    token_chunks = list(chunks(text))
    # 각 토큰 리스트를 text 리스트로 변환
    text_chunks = [tokenizer.decode(chunk) for chunk in token_chunks]

    # 전체 chunk들의 임베딩을 구하고
    embedding_chunks = get_embeddings(text_chunks)
    return text_chunks, embedding_chunks


def handle_file_string(filename, file_content_string):
    
    # 파일 내용에서 라인 변경, 더블 스페이스, 세미 콜론을 삭제
    clean_file_content_string = " ".join(file_content_string.replace("\n", "; ").replace(";", " ").split())
        
    try:
        text_chunks, embedding_chunks = create_embeddings_for_text(clean_file_content_string)
        print(f"[INFO] Embedded into {len(text_chunks)} chunks")
    except Exception as e:
        print("[handle_file_string] Error creating embedding: {}".format(e))

    for i, (text_chunk, embedding_chunk) in enumerate(zip(text_chunks, embedding_chunks)):
        embedding_storage.append({"embedding":embedding_chunk, "filename": filename, "chunk_index":i, "text":text_chunk })


In [15]:

# Process each PDF file and prepare for embedding
for i, pdf_file in enumerate(pdf_files):
    
    pdf_path = os.path.join(data_path, pdf_file)
    print(f"[INFO] {i}/{len(pdf_files)}. processing paper : {pdf_path}\n")
    
    text = textract.process(pdf_path, method='pdfminer')
    print(text.decode("utf-8")[0:200])
    print()

    handle_file_string(pdf_file, text.decode("utf-8"))
    print()
    print("============================================================")


[INFO] 0/20. processing paper : data/paper/jkma-2023-66-3-200.pdf

ORIGINAL ARTICLE 
J Korean Med Assoc 2023 March; 66(3):200-208

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.3.200

대한민국 육군에서 만성 코로나19 증후군의 발생률 
대한민국 육군에서 만성 코로나19 증후군의 발생률 

[INFO] Embedded into 5 chunks

[INFO] 1/20. processing paper : data/paper/jkma-2023-66-1-50.pdf

SPECIAL CONTRIBUTION 
J Korean Med Assoc 2023 January; 66(1):50-59

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.1.50

만성 코로나19 증후군 시대를 위한  
만성 코로나19 증후군 시대를 위한  
보건의료 대응 방안

[INFO] Embedded into 7 chunks

[INFO] 2/20. processing paper : data/paper/jkma-2023-66-3-160.pdf

FOCUSED ISSUE OF THIS MONTH 
J Korean Med Assoc 2023 March; 66(3):160-165

pISSN 1975-8456 / eISSN 2093-5951

https://doi.org/10.5124/jkma.2023.66.3.160

경흉부 바늘생검을 이용한 폐암의 진단
경흉부 바늘생검을 이용한 폐암의 진단

박 동

[INFO] Embedded into 4 chunks

[INFO] 3/20. processing paper : data/paper/jkma-2023-66-3-166.pdf

FOCUSED ISSUE OF THIS MONTH 
J Kor

In [16]:
from pprint import pprint

print(len(embedding_storage))
print(embedding_storage[0]["embedding"][:10])
print(len(embedding_storage[0]["embedding"]))
print(embedding_storage[0]["filename"])
print(embedding_storage[0]["chunk_index"])
print(embedding_storage[0]["text"])

110
[-0.027969228103756905, 0.02305985987186432, 0.003870846703648567, 0.048857636749744415, 0.015931839123368263, 0.008384867571294308, 0.0012243912788107991, 0.019224418327212334, -0.006054098717868328, 0.04531722888350487]
1536
jkma-2023-66-3-200.pdf
0
ORIGINAL ARTICLE J Korean Med Assoc 2023 March 66(3):200-208 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.3.200 대한민국 육군에서 만성 코로나19 증후군의 발생률 대한민국 육군에서 만성 코로나19 증후군의 발생률 기술분석기술분석 김 태 재·하 범 만·양 지 인·김 미 정·서 경 완 대한민국 육군본부 의무실 Descriptive analysis of the incidence rate of post- acute COVID-19 syndrome in the Republic of Korea Army Taejae Kim, MMSc · Beomman Ha, PhD · Ji-in Yang, MS · Mi-Jung Kim, BN · Kyung-Wan Seo, MS Office of Surgeon General, The ROK Army HQ, Gyeryong, Korea Background: This study aimed to identify the incidence rate of post-coronavirus disease-2019 (COVID-19) conditions in the Republic of Korea (ROK) Army and to investigate the trend of the incidence rate according to changes in dominant varian

# 가장 가까운 조각(chunk) 구하기

In [19]:
from scipy.spatial.distance import cosine
import pandas as pd

def find_closest_n_index(embedding_storage, embedded_query, n=3):
    embeddings = [entity["embedding"] for entity in embedding_storage]
    df = pd.DataFrame({'embeddings':embeddings})
    df["distances"] = df["embeddings"].apply(lambda x: cosine(embedded_query, x))
    sorted_index = df.sort_values(by="distances").index.to_list()
    return sorted_index[:n]

def find_closest_n_chunk(embedding_storage, query, n=3):
    embedded_query = get_embeddings(query)[0]
    closest_indexes = find_closest_n_index(embedding_storage, embedded_query, n)
    return [embedding_storage[i] for i in closest_indexes]

In [20]:
question = "코로나19 증후군의 증상은?"
queryed = find_closest_n_chunk(embedding_storage, question, n=1)[0]

In [21]:
print(queryed["filename"])
print(queryed["chunk_index"])
print(queryed["text"])

jkma-2023-66-1-50.pdf
0
SPECIAL CONTRIBUTION J Korean Med Assoc 2023 January 66(1):50-59 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.1.50 만성 코로나19 증후군 시대를 위한 만성 코로나19 증후군 시대를 위한 보건의료 대응 방안 보건의료 대응 방안 김 혜 준1·송 지 훈1·박 상 민1,2 ¹서울대학교 대학원 의과학과 헬스시스템 데이터 사이언스 연구실 ²서울대학교병원 가정의학과 Healthcare response strategies for the long- COVID era Hye Jun Kim, MS1 · Jihun Song, MS1 · Sang Min Park, MD, PhD, MPH1,2 ¹Health System Data Science Laboratory, Department of Biomedical Sciences, Seoul National University College of Medicine, Seoul, Korea ²Department of Family Medicine, Seoul National University Hospital, Seoul, Korea Background: Coronavirus disease (COVID-19), first reported at the end of 2019, is characterized by a broad spectrum of clinical manifestations ranging from asymptomatic to multi-organ dysfunction. These symptoms may persist even after the acute phase has passed. Post-acute COVID-19 syndrome (long-COVID) is a condition characterized by COVID-19 symptoms that p

# 구한 조각 내용으로 문의하기

In [25]:
prompt = f"""
You are helpful QnA engine.
You will be provided with text delimited by triple quotes and question. 
Step 1. find the text part from the given text which is used to reason the answer of question
Step 2. answer the question in Korean and show the text part with quotation mark.
If not base found just say 'could not answer'. 
"""

text_and_question = f"""
Text:
```{queryed["text"]}```

Question:
```{question}```
"""



MODEL = "gpt-3.5-turbo"
response = client.chat.completions.create(
    model=MODEL,
    messages=[
        {"role": "system", "content": prompt},        
        {"role": "user", "content": text_and_question},
    ],
)


print("질문 : ", question)
print("근거 문서 파일 이름 : ", queryed["filename"])
print("근거 문서 조각 인덱스 : ", queryed["chunk_index"])
print("근거 문서 내용 : ", queryed["text"])
print("답변 : ", response.choices[0].message.content)

질문 :  코로나19 증후군의 증상은?
근거 문서 파일 이름 :  jkma-2023-66-1-50.pdf
근거 문서 조각 인덱스 :  0
근거 문서 내용 :  SPECIAL CONTRIBUTION J Korean Med Assoc 2023 January 66(1):50-59 pISSN 1975-8456 / eISSN 2093-5951 https://doi.org/10.5124/jkma.2023.66.1.50 만성 코로나19 증후군 시대를 위한 만성 코로나19 증후군 시대를 위한 보건의료 대응 방안 보건의료 대응 방안 김 혜 준1·송 지 훈1·박 상 민1,2 ¹서울대학교 대학원 의과학과 헬스시스템 데이터 사이언스 연구실 ²서울대학교병원 가정의학과 Healthcare response strategies for the long- COVID era Hye Jun Kim, MS1 · Jihun Song, MS1 · Sang Min Park, MD, PhD, MPH1,2 ¹Health System Data Science Laboratory, Department of Biomedical Sciences, Seoul National University College of Medicine, Seoul, Korea ²Department of Family Medicine, Seoul National University Hospital, Seoul, Korea Background: Coronavirus disease (COVID-19), first reported at the end of 2019, is characterized by a broad spectrum of clinical manifestations ranging from asymptomatic to multi-organ dysfunction. These symptoms may persist even after the acute phase has passed. Post-acute COVID-19 syndrome (lon