In [1]:
# Import packages and load API keys
import os, sys
from tqdm import tqdm
import faiss

from dotenv import load_dotenv
# Read environment variables from a .env file before any client is built
# (presumably this is where the OpenAI API key comes from — TODO confirm).
load_dotenv()

from rich import print as pprint
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings

# os.path.join('..') is simply '..': the parent directory is appended to the
# import search path so that the `src` package imported below can be found.
sys.path.append( os.path.join('..') )
from src.paper_extractor import vector_based_RAG

# Get embedding model

In [2]:
# Create the OpenAI embedding client that is later handed to the vector database.
embeddings_model = OpenAIEmbeddings(
    model='text-embedding-3-large',
)

# Get database

In [3]:
# --- Vector database configuration ---
# The path is relative, so it is resolved against the notebook's working directory.
database_path = './database/vector_db/'
database_name = 'metadata'
# The constant really is spelled "EMBEDDIMG" (sic) in vector_based_RAG; keep it as-is.
dimension = vector_based_RAG.DEFAULT_EMBEDDIMG_DIMENSION

# Text splitter configured with the project's default chunk size and overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = vector_based_RAG.DEFAULT_CHUNK_SIZE,
    chunk_overlap = vector_based_RAG.DEFAULT_CHUNK_OVERLAP,
)

# Build the database wrapper and open the connection.
vector_db = vector_based_RAG.MyVectorDatabaseWithEmbeddedModel(
    embeddings_model = embeddings_model,
    text_splitter = text_splitter,
    database_path = database_path,
    database_name = database_name,
    dimension = dimension,
    verbose = True,
)
vector_db.connect()

# Status report. Each probe is called right before its line is printed, so any
# message a probe emits itself stays next to the corresponding label.
status_checks = (
    ('Is database exists    : ', vector_db.is_database_exist),
    ('Is database connected : ', vector_db.is_connected),
    ('Data number           : ', lambda: len(vector_db)),
    ('Data max id           : ', vector_db._get_max_id),  # private helper of the wrapper
    ('check datanumber      : ', vector_db.check_datanumber_in_database),
)
for label, probe in status_checks:
    print(label, probe())

Database connected: ./database/vector_db/metadata
Is database exists    :  True
Is database connected :  True
Data number           :  1986
Data max id           :  1986
check datanumber      :  True


# Insert a single file

In [4]:
# Select the PDF file to ingest.
# NOTE(review): this is a machine-specific absolute path; adjust it when running elsewhere.
pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'

# Exactly one assignment is live. The alternatives are kept commented out below;
# uncomment one of them to override the active choice.
pdf_filename = 'CCWu 2022 Analyses of emission efficiencies of white organic light-emitting diodes having multiple emitters in single emitting layer.pdf'
# pdf_filename = 'CCWu 2021 Modified distributed Bragg reflector for protecting organic light-emitting diode displays against ultraviolet light.pdf'
# pdf_filename = 'CCWu 2019 Three‐dimensional pixel configurations for optical outcoupling of OLED displays-optical simulation.pdf'
# pdf_filename = 'CCWu 2013 Analyzing nanostructures in mesogenic host–guest systems for polarized phosphorescence.pdf'

In [5]:
# Ingest the selected PDF with a progress bar and without forcing.
# The trailing semicolon keeps the call's return value out of the cell output.
vector_db.insert_pdf_file(
    pdf_filepath,
    pdf_filename,
    progress = True,
    force = False,
);

0it [00:00, ?it/s]


# Insert multiple files into the database

In [7]:
# Bulk ingestion: insert every PDF in the folder that is not yet in the database.
# Disabled by default so that a full "Run All" does not re-scan the folder;
# set the flag to True to run it.
RUN_BULK_INSERT = False

if RUN_BULK_INSERT:
    pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'
    # Keep only .pdf files the database does not already report as present.
    pdf_filenames = [
        fname
        for fname in os.listdir(pdf_filepath)
        if fname.endswith('.pdf')
        and not vector_db.is_pdf_file_in_database(pdf_filepath = pdf_filepath, pdf_filename = fname)
    ]

    # The outer tqdm bar tracks files, so the per-file progress bar is switched off.
    for pdf_filename in tqdm( pdf_filenames ):
        vector_db.insert_pdf_file(pdf_filepath, pdf_filename, progress = False, force = False)
        

In [8]:
# Re-check the database state after the insert step.
# Each probe is called right before its line is printed, so any message a
# probe emits itself stays next to the corresponding label.
status_checks = (
    ('Is database exists    : ', vector_db.is_database_exist),
    ('Is database connected : ', vector_db.is_connected),
    ('Data number           : ', lambda: len(vector_db)),
    ('Data max id           : ', vector_db._get_max_id),  # private helper of the wrapper
    ('check datanumber      : ', vector_db.check_datanumber_in_database),
)
for label, probe in status_checks:
    print(label, probe())
# Optional lower-level check, kept disabled (presumably the per-store row counts — TODO confirm):
# vector_db._get_datanumber_in_faiss_index(), vector_db._get_datanumber_in_sqlite3()

Is database exists    :  True
Is database connected :  True
Data number           :  1986
Data max id           :  1986
check datanumber      :  True


In [9]:
# Candidate questions. Only the selected one is sent to the database, so no
# query is executed just to have its result overwritten.
questions = [
    'what is SpiroAC-TRZ',
    # NOTE(review): "efficency" is misspelled; kept verbatim so results stay
    # comparable with the recorded output.
    'how to achieve high efficency OLED devices?',
]
question = questions[-1]

# Retrieve the 5 best matches; get_distance_and_index=True makes the call
# return (ans, D, I) — presumably results, distances and row indices.
ans, D, I = vector_db.query(question, k = 5, get_distance_and_index = True, format = 'page')

D, I

(array([0.6226542, 0.6232054, 0.6277211, 0.6310095, 0.6457577],
       dtype=float32),
 array([1195, 1173,  989, 1931,  387]))

In [10]:
# Print every retrieved hit: source file, page number, then the text itself.
# The subscripts use double quotes inside the single-quoted f-strings so the
# cell also parses on Python versions older than 3.12.
for hit in ans:
    print( f'[Paper] {hit.metadata["source"]}' )
    print( f'Page: {hit.metadata["page"]}' )
    print( hit.page_content )
    print( '\n' )  # blank lines between hits

[Paper] /Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu/CCWu 2020 High-efficiency organic light emitting diodes using high-index transparent electrode.pdf
Page: 6
Organic Electronics 87 (2020) 105984
7and (f)) appears not so consistent with the enhanced microcavity effect, 
as one may expect more directed emission with a stronger microcavity. 
This would be true if the resonant wavelength of the microcavity is set at 
the PL peak wavelength (~480 nm). However, for devices in Fig. 3, to 
achieve optimal φout, their resonant wavelengths were designed at 
~510 nm, longer than the PL peak wavelength (~480 nm). One should 
be aware that the emission pattern of a microcavity OLED (either into air 
or into the substrate) is strongly dependent on the shape of the intrinsic 
(PL) emission spectrum of the emitter and the choice of the resonance 
wavelength relative to PL peak wavelength, since the emission pattern of 
any single wavelength varies with wavelengths and the spectrally inte-
g