In [1]:
# Import packages and load API keys
import os, sys
from tqdm import tqdm
import faiss

from dotenv import load_dotenv
load_dotenv()

from rich import print as pprint

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


from langchain.indexes import VectorstoreIndexCreator
from langchain_openai import OpenAIEmbeddings

from langchain_community.vectorstores import Chroma

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough


# Add the parent directory (relative to the current working directory) to the
# import search path so the project-local `src` package can be imported.
sys.path.append(os.pardir)
from src.paper_extractor import vector_based_RAG

# Get embedding model

In [2]:
# Instantiate the OpenAI embedding model that is handed to the vector database.
embeddings_model = OpenAIEmbeddings(
    model='text-embedding-3-large',
)

# Get database

In [3]:
# Name and location of the vector database.
database_name = 'metadata'
database_path = './database/vector_db/'

# Text chunking and embedding dimension both use the defaults exported by
# the project's `vector_based_RAG` module.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=vector_based_RAG.DEFAULT_CHUNK_SIZE,
    chunk_overlap=vector_based_RAG.DEFAULT_CHUNK_OVERLAP,
)
dimension = vector_based_RAG.DEFAULT_EMBEDDIMG_DIMENSION

# Build the database wrapper and open the connection.
vector_db = vector_based_RAG.MyVectorDatabaseWithEmbeddedModel(
    database_path=database_path,
    database_name=database_name,
    embeddings_model=embeddings_model,
    text_splitter=text_splitter,
    dimension=dimension,
    verbose=True,
)
vector_db.connect()

# Report the state of the database right after connecting.
print('Is database exists    : ', vector_db.is_database_exist())
print('Is database connected : ', vector_db.is_connected())
print('Data number           : ', len(vector_db))
print('Data max id           : ', vector_db._get_max_id())
print('check datanumber      : ', vector_db.check_datanumber_in_database())

Database connected: ./database/vector_db/metadata
Is database exists    :  True
Is database connected :  True
Data number           :  1986
Data max id           :  1986
check datanumber      :  True


# Insert 1 file

In [4]:
# Choose the PDF file to insert into the database.
# Exactly one `pdf_filename` assignment is active; the commented lines are
# alternative file names that can be swapped in.
# NOTE(review): `pdf_filepath` is an absolute, machine-specific directory —
# adjust it before running on another machine.
pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'
pdf_filename = 'CCWu 2022 Analyses of emission efficiencies of white organic light-emitting diodes having multiple emitters in single emitting layer.pdf'
# pdf_filename = 'CCWu 2021 Modified distributed Bragg reflector for protecting organic light-emitting diode displays against ultraviolet light.pdf'
# pdf_filename = 'CCWu 2019 Three‐dimensional pixel configurations for optical outcoupling of OLED displays-optical simulation.pdf'
# pdf_filename = 'CCWu 2013 Analyzing nanostructures in mesogenic host–guest systems for polarized phosphorescence.pdf'

In [5]:
# Insert the selected PDF with a progress bar. The trailing semicolon keeps the
# notebook from echoing the call's return value.
# NOTE(review): `force=False` presumably skips files already stored — confirm
# against `insert_pdf_file`.
vector_db.insert_pdf_file(
    pdf_filepath,
    pdf_filename,
    progress=True,
    force=False,
);

0it [00:00, ?it/s]


# Insert multiple files into the database

In [6]:
# Disabled batch insert: uncomment to collect every `.pdf` in `pdf_filepath`
# that `is_pdf_file_in_database` does not report as present, then insert each
# one while tqdm tracks overall progress.
# pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'
# pdf_filenames = [ fname for fname in os.listdir(pdf_filepath) if fname.endswith('.pdf') and not vector_db.is_pdf_file_in_database(pdf_filepath = pdf_filepath, pdf_filename = fname) ]

# for pdf_filename in tqdm( pdf_filenames ):
#     vector_db.insert_pdf_file(pdf_filepath, pdf_filename, progress = False, force = False);
        

In [7]:
# Re-check the database state after the insert step.
print('Is database exists    : ', vector_db.is_database_exist())
print('Is database connected : ', vector_db.is_connected())
print('Data number           : ', len(vector_db))
print('Data max id           : ', vector_db._get_max_id())
print('check datanumber      : ', vector_db.check_datanumber_in_database())
# Debug aid: compare the two record counts directly.
# vector_db._get_datanumber_in_faiss_index(), vector_db._get_datanumber_in_sqlite3()

Is database exists    :  True
Is database connected :  True
Data number           :  1986
Data max id           :  1986
check datanumber      :  True


In [8]:
# Query the database for k=5 results, also requesting the distance array (D)
# and the index array (I) alongside the returned documents.
ans, D, I = vector_db.query(
    'What is the most important display technology',
    k=5,
    get_distance_and_index=True,
)
# Display the distances and indices as the cell output.
(D, I)

Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)


(array([1.0109465, 1.0704274, 1.0802474, 1.0844302, 1.0850697],
       dtype=float32),
 array([ 922,  900,  919, 1545, 1544]))

In [15]:
# Print the source file, page number and text of every retrieved result.
# The metadata keys use double quotes inside the single-quoted f-strings:
# reusing the enclosing quote character within a replacement field is only
# accepted from Python 3.12 onward and is a SyntaxError on earlier versions.
for r in ans:
    print( f'[Paper] {r.metadata["source"]}' )
    print( f'Page: {r.metadata["page"]}' )
    print( r.page_content )

[Paper] /Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu/CCWu 2018 A Vision toward Ultimate Optical Out-Coupling for Organic Light-Emitting Diode Displays- 3D Pixel Configuration.pdf
Page: 0
viewing characteristics are simultaneously achievable with optimized struc -
tures using highly transparent top electrodes. This scheme is scalable and  
wavelength insensitive, and generally applicable to all red, green, and blue pixels 
in high-resolution full-color displays. Results of this work are believed to shed 
light on the development of future generations of advanced OLED displays.Organic Light-Emitting Diodes
© 2018 The Authors. Published by WILEY-VCH Verlag GmbH & Co. KGaA, 
Weinheim. This is an open access article under the terms of the Creative 
Commons Attribution License, which permits use, distribution and re -
production in any medium, provided the original work is properly cited.Active-matrix organic light-emitting diode displays (AMOLEDs) 
have become one of the major disp