In [1]:
# 匯入套件和金鑰
import os
from rich import print as pprint
from langchain_openai import ChatOpenAI
from tqdm import tqdm

from dotenv import load_dotenv
load_dotenv()

from langchain_community.document_loaders import PyPDFLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

# LLM Model

In [12]:
# load LLM model
chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", api_key=os.getenv("OPENAI_API_KEY")) # "gpt-3.5-turbo", "gpt-4o"
embeddings_model=OpenAIEmbeddings(model='text-embedding-3-large')

# To Chroma

In [3]:
# load PDF file
pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'
pdf_filenames = [\
    'CCWu 2021 Modified distributed Bragg reflector for protecting organic light-emitting diode displays against ultraviolet light.pdf',
    'CCWu 2022 Analyses of emission efficiencies of white organic light-emitting diodes having multiple emitters in single emitting layer.pdf',
    'CCWu 2019 Three‐dimensional pixel configurations for optical outcoupling of OLED displays-optical simulation.pdf',
    'CCWu 2013 Analyzing nanostructures in mesogenic host–guest systems for polarized phosphorescence.pdf',
    'CCWu 2022 Analyses of emission efficiencies of white organic light-emitting diodes having multiple emitters in single emitting layer.pdf',
    'CCWu 2023 Fully electromagnetic wave optic simulation and analyses of the cross-scale reflective 3D OLED pixel configuration.pdf',
    'CCWu 2022 Using angle-selective optical film to enhance the light extraction of a thin-film encapsulated 3D reflective pixel for OLED displays.pdf',
    'CCWu 2022 P-128 Optimizing OLED Pixel Structures for Consistently Low Ambient Light Reflection over Viewing Angles.pdf',
    'CCWu 2020 Integrating Molecular Rigidity and Chirality into TADF for Highly Efficient Sky-Blue CPEL.pdf',
    'CCWu 2020 High-efficiency organic light emitting diodes using high-index transparent electrode.pdf',
    'CCWu 2019 SID P-179 Optics of Curved OLEDs.pdf',
    'CCWu 2020 Enhance external quantum efficiency of organic light-emitting devices using thin transparent electrodes.pdf',
    'CCWu 2016 Triboluminescence and Metal Phosphor for Organic Light-Emitting Diodes Functional Pt(II) Complexes with Both 2‑Pyridylimidazol-2-ylidene and Bipyrazolate Chelates.pdf',
    'CCWu 2018 Quantitative analyses of high electroluminescence efficiency of thermally activated delayed fluorescence emitters based on acridine–triazine hybrids.pdf',
    'CCWu 2017 Achieving Nearly 30% External Quantum Efﬁciency for Orange–Red Organic Light Emitting Diodes by Employing Thermally Activated Delayed Fluorescence Emitters  Composed of 1,8-Naphthalimide-Acridine Hybrids.pdf',
    'CCWu 2016 Efﬁcient and Tunable Thermally Activated Delayed Fluorescence Emitters Having Orientation-Adjustable  CN-Substituted Pyridine and Pyrimidine Acceptor Units.pdf'
]

# pdf_filepath = '/Users/weikai/Library/CloudStorage/Dropbox/paper/CCWu'
# pdf_filenames = [ fname for fname in os.listdir(pdf_filepath) if fname.endswith('.pdf') ]


In [4]:
docs = []
for pdf_filename in tqdm(pdf_filenames):
    loader = PyPDFLoader(file_path=os.path.join(pdf_filepath, pdf_filename))
    docs += loader.load()
print(f"Loaded {len(docs)} documents")

100%|██████████| 16/16 [00:11<00:00,  1.41it/s]

Loaded 182 documents





In [5]:
database_path = './database/vector_db_chroma/'
# Chroma.from_documents(documents=docs,
#                       embedding=embeddings_model,
#                       persist_directory=database_path,
#                       collection_metadata={"hnsw:space": "cosine"})

# load Chroma
db = Chroma(persist_directory=database_path, 
            embedding_function=embeddings_model)

# Retrieve

In [6]:
question = "how to protect OLED display against UV light?"
retriever = db.as_retriever(search_type="similarity",
                            search_kwargs={"k": 5})
retrieved_docs = retriever.invoke(question)
print(f'傳回 {len(retrieved_docs)} 筆資料')
# pprint( retrieved_docs )

傳回 5 筆資料


# Question-Answer

In [13]:
str_parser = StrOutputParser()
template = (
    "請根據以下內容加上自身判斷回答問題 (請用繁體中文回答/專有名詞則使用英文):\n"
    "{context}\n"
    "問題: {question}"
    )
prompt = ChatPromptTemplate.from_template(template)
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | chat_model
    | str_parser
)

In [14]:
questions = [\
    "how to protect OLED display against UV light?", 
    "What is the key idea of 3D pixel OLED?",
    "How to achieve high emission efficient OLED?",
    "How to fabricate high emission OLED pixel device? What is the possible structure and material?",
    "What is the issue to fabricate high efficient red OLED?",
    "What is the optical issue in curve OLED?",
    "Can you summarize the outcoupling techniques for OLED?",
]

for ii, question in enumerate( questions ):
    pprint(f'問題-{ii+1}: {question}')
    pprint(f'回答-{ii+1}: {chain.invoke(question)}')
    pprint('')