### 의존성 설치

In [None]:
# Install the packages listed in requirements.txt using the %pip notebook magic.
%pip install -r requirements.txt

### MongoDB 연결

In [None]:
# Connection info (values are loaded from environment variables).
import os

from dotenv import load_dotenv

# Load settings via python-dotenv before reading them with os.getenv.
load_dotenv()

USERNAME = os.getenv("MONGO_USERNAME")
PASSWORD = os.getenv("MONGO_PASSWORD")
HOST = os.getenv("MONGO_HOST")
# MONGO_PORT may be unset or empty; int(None) would raise TypeError, so fall
# back to MongoDB's default port (27017) in that case.
PORT = int(os.getenv("MONGO_PORT") or 27017)

In [None]:
from urllib.parse import quote_plus

# Build the MongoDB connection URI.  The username and password are
# percent-escaped because reserved characters such as '@', ':' or '/' in a
# credential would otherwise corrupt the URI and make it unparsable.
escaped_username = quote_plus(USERNAME or "")
escaped_password = quote_plus(PASSWORD or "")
url = f"mongodb://{escaped_username}:{escaped_password}@{HOST}:{PORT}/"

In [None]:
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

# Create the client and verify the server is reachable with a ping command.
# A connection failure is reported on stdout rather than raised.
try:
    client = MongoClient(url)
    client.admin.command("ping")
except ConnectionFailure as exc:
    print(f"MongoDB connection failed: {exc}")
else:
    print("Successfully connected to MongoDB!")

### DB와 Collection 생성

In [None]:
# Handles to the database and collection used by the rest of the notebook.
db = client.get_database("s307_db")
collection = db.get_collection("s307_collection")

In [None]:
# Show the collection handle's printed representation to confirm it was created.
print(collection)

### 샘플 데이터 추가

In [None]:
# Path of the sample PDF whose pages are loaded into MongoDB below.
sample_pdf = "gpt-020-3m-sds.pdf"

In [None]:
import pdfplumber

# Store one MongoDB document per PDF page, holding the file name, the page
# number and the words extracted from that page.
# The context manager closes the PDF even if extraction or an insert raises.
with pdfplumber.open(sample_pdf) as pdf:
    for page in pdf.pages:
        page_num = page.page_number
        words = page.extract_words()

        data = {
            "file_name": sample_pdf,
            "page_num": page_num,
            "words": words,
        }
        result = collection.insert_one(data)
        print(f"Inserted File: {sample_pdf} | Page: {page_num} | ID: {result.inserted_id}")

### 샘플 데이터 조회

In [None]:
# Look up the stored document for page 2 of the sample PDF.
# Documents are inserted with the page number under the "page_num" key, so the
# filter must use that key; a "page_number" filter never matches and always
# yields None.  The file name is included so the lookup targets the sample PDF.
search_data = collection.find_one({"file_name": sample_pdf, "page_num": 2})
print(search_data)

### MongoDB 연결 종료

In [None]:
# Close the MongoDB client; a new client is created in the next section.
client.close()

### 데이터 확인 및 시각화

In [None]:
# Reconnect to MongoDB (the previous client was closed above).

import os
from urllib.parse import quote_plus

import pdfplumber
from dotenv import load_dotenv
from pymongo import MongoClient
from pymongo.errors import ConnectionFailure

# Load settings via python-dotenv before reading them with os.getenv.
load_dotenv()

USERNAME = os.getenv("MONGO_USERNAME")
PASSWORD = os.getenv("MONGO_PASSWORD")
HOST = os.getenv("MONGO_HOST")
# MONGO_PORT may be unset or empty; int(None) would raise TypeError, so fall
# back to MongoDB's default port (27017) in that case.
PORT = int(os.getenv("MONGO_PORT") or 27017)

# Percent-escape the credentials: reserved characters such as '@', ':' or '/'
# in a username or password would otherwise corrupt the connection URI.
escaped_username = quote_plus(USERNAME or "")
escaped_password = quote_plus(PASSWORD or "")
url = f"mongodb://{escaped_username}:{escaped_password}@{HOST}:{PORT}/"


# Create the client and verify the server is reachable with a ping command.
# A connection failure is reported on stdout rather than raised.
try:
    client = MongoClient(url)
    client.admin.command('ping')
    print("Successfully connected to MongoDB!")

except ConnectionFailure as e:
    print(f"MongoDB connection failed: {e}")

db = client['s307_db']
collection = db['s307_collection']

In [None]:
# Name of the PDF file to inspect.
sample_pdf = "gpt-020-3m-sds.pdf"

# Page number to look up (1-based: it is converted to a 0-based index below).
target_page_num = 1

# Fetch the stored words for this page from MongoDB.  find_one returns None
# when nothing matches, so check explicitly instead of failing with an opaque
# "'NoneType' object is not subscriptable" error.  The query runs before the
# PDF is opened so a missing document does not leave a file handle open.
document = collection.find_one({"file_name": sample_pdf, "page_num": target_page_num})
if document is None:
    raise LookupError(
        f"No MongoDB document found for file {sample_pdf!r}, page {target_page_num}"
    )
words = document["words"]

# Render the page as an image.  The PDF is intentionally left open because the
# following cells keep using `page`; it is closed in the final cleanup cell.
pdf = pdfplumber.open(sample_pdf)
page = pdf.pages[target_page_num - 1]
img = page.to_image()

In [None]:
# 조회한 데이터로 이미지에 rects 그리기
# Draw rects on the page image using the words retrieved from MongoDB.
img.draw_rects(words)

In [None]:
# page.to_image()

In [None]:
# For later use: add lines to the stored objects to improve how closely the page is matched.
# page.to_image().draw_rects(page.lines) 

In [None]:
# page.to_image().draw_rects(page.rects)

In [None]:
# Number of entries in page.objects (it exposes .keys() in the next cell, so
# this presumably counts object types rather than individual objects — confirm).
len(page.objects)

In [None]:
# Information about every object present on this page -> could probably be stored in the DB as-is?
page.objects.keys()

In [None]:
# Clean up: close the open PDF and the MongoDB client.
pdf.close()
client.close()