In [None]:
# Install/upgrade pypdf and langchain_community; the next cell imports
# PyPDFLoader from langchain_community (presumably backed by pypdf -- confirm).
%pip install -qU pypdf langchain_community

In [None]:
from langchain_community.document_loaders import PyPDFLoader
import glob

# Directory that holds the source PDF files.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
directory_path = "D:/VSCODE/paper_code/PDF"

# Collect every PDF directly inside the directory. glob returns matches in
# filesystem-dependent order, so sort them to get the same document order
# (and therefore the same chunks and batches) on every run.
pdf_files = sorted(glob.glob(f"{directory_path}/*.pdf"))

# Fail here with a clear message instead of letting an empty document list
# flow into the later cells, where it would leave the vector store as None.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files found in {directory_path!r}")

# Load each PDF and accumulate the documents it produces in one flat list.
all_docs = []

for file_path in pdf_files:
    loader = PyPDFLoader(file_path)
    docs = loader.load()
    all_docs.extend(docs)

In [None]:
pip install -qU langchain-openai

In [None]:
import os
import textwrap

# Take the OpenAI API key from the environment rather than storing it in the
# notebook. Only when it is not already set, prompt for it without echoing
# the input, so the secret never ends up in the saved file.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

from langchain_openai import ChatOpenAI

# Chat model handed to the question-answering chain further down.
llm = ChatOpenAI(model="gpt-4o")

In [None]:
# Install the Chroma integration (Chroma) and the OpenAI integration
# (OpenAIEmbeddings) imported by the next cell.
# NOTE(review): `langchain_text_splitters` and `langchain` are imported later
# but never installed explicitly -- presumably they arrive as transitive
# dependencies; verify on a fresh environment.
%pip install langchain_chroma langchain_openai

In [None]:
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Clear both names, then rebuild them from the loaded documents.
text_splitter = splits = None

# Recursive splitter configured with a chunk size of 500 and an overlap of 10
# (measured by the splitter's length function -- characters by default).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=10,
    chunk_size=500,
)

# Chunked version of every document loaded from the PDFs.
splits = text_splitter.split_documents(all_docs)

# Define a function to process the documents in batches.
def batch_documents(documents, batch_size):
    """Yield consecutive slices of ``documents`` holding at most ``batch_size`` items.

    Args:
        documents: A sequence that supports ``len()`` and slicing.
        batch_size: Maximum number of items per batch; must be positive.

    Yields:
        Slices of ``documents`` in their original order. The final slice may
        be shorter than ``batch_size``; an empty input yields nothing.

    Raises:
        ValueError: If ``batch_size`` is zero or negative. Because this is a
            generator, the error is raised when iteration starts.
    """
    # A negative step makes range() empty, which would silently drop every
    # document; a zero step fails with an unrelated range() message.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    for start in range(0, len(documents), batch_size):
        yield documents[start:start + batch_size]

# Largest number of chunks handed to the vector store in a single call.
# NOTE(review): 5461 presumably mirrors Chroma's maximum batch size -- confirm.
max_batch_size = 5461

# Remains None if there are no splits, because the loop body then never runs.
vectorstore = None

# The first batch creates the Chroma store (embedding with OpenAIEmbeddings);
# every later batch is appended to that same store.
for batch in batch_documents(splits, max_batch_size):
    if vectorstore is not None:
        vectorstore.add_documents(documents=batch)
        continue
    vectorstore = Chroma.from_documents(
        documents=batch,
        embedding=OpenAIEmbeddings(),
    )

In [None]:
# Expose the vector store as a retriever. The name is cleared first and then
# rebound to the new retriever.
# NOTE(review): k=788 asks for up to 788 chunks per query -- presumably chosen
# to cover the whole corpus; confirm it fits the model's context window.
retriever = None
retriever = vectorstore.as_retriever(search_kwargs=dict(k=788))

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the question-answering model, built from adjacent
# string literals. Python joins adjacent literals with nothing in between, so
# every fragment must end with its own space or newline; otherwise sentences
# and list items run together in the prompt the model receives.
# "{context}" at the end is the only template placeholder in this string.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "Note that the terms 'model', 'part', 'CAD file' and 'component' are equivalent. "
    "'Planar surface', 'Flat surface', 'Level plane', 'Even plane', 'Even surface', 'Flat area' and 'flat plane' are the same. "
    "'Round hole', 'Circular hole', 'Cylinder hole' and 'cylindrical hole' are the same. "
    "'curved surface', 'Curved area', 'Curved plane', 'Curved profile', 'Curved face', 'Arched surface' and 'Curved shape' are the same. "
    "'straight edge', 'Straight boundary', 'Linear edge', 'Straight line', 'Straight border', 'Straight perimeter' and 'Direct edge' are the same."
    "\n\n"
    "When comparing multiple parameters, structure your answer as follows:\n"
    "1. Parameter A: [brief description]\n"
    "2. Parameter B: [brief description]\n"
    "3. Parameter C: [brief description]\n"
    "Conclude with a brief summary if necessary."
    "\n\n"
    "When asked to compare the size of parameters and output the maximum or minimum value, structure your answer as follows:\n"
    "1. Compare Parameter A, B, C, etc.\n"
    "2. State which parameter is the largest or smallest.\n"
    "Example: 'The largest parameter is Parameter B with a value of X.'"
    "\n\n"
    "When asked to find more than one largest parameters, structure your answer as follows:\n"
    "State which are the two largest parameters.\n"
    "Example: 'The two largest parameters are Parameter B with a value of X and Parameter C with a value of Y.'"
    "\n\n"
    "A complex model contains many curved surfaces and edges, and has many holes.\n"
    "A complex model also includes more historical designs, such as more sketches and more extrusions.\n"
    "A simple model mostly consists of straight edges and flat planes, with no holes or very few holes.\n"
    "A simple model means fewer surfaces, fewer holes, small mean Gaussian curvature and mean curvature.\n"
    "A simpler model also means less design history, such as fewer sketches and fewer extrusions.\n"
    "Example: 'The complex model is A: [brief description].'"
    "\n\n"
    "'NewBodyFeatureOperation' and 'NewBody' are the same. "
    "\n\n"
    "If an image of a model needs to be drawn, then focus on referencing the design history information for that model. "
    "\n\n"
    "Queries about model 'similarity' can be answered in terms of the similarity of the various parameters of the model, "
    "such as volume, surface area, "
    "proportion of straight edges, number of holes, and whether the design steps are similar. "
    "The response begins with an explanation of which models are more or less similar, followed by a brief analysis. "
    "Example: 'The similarity between Model A and Model B is very high: [brief description].'"
    "\n\n"
    "'SymmetricFeatureExtentType' and 'Symmetric' are the same. "
    "'OneSideFeatureExtentType' and 'OneSide' are the same. "
    "'TwoSideFeatureExtentType' and 'TwoSide' are the same. "
    "\n\n"
    "When asked about the design history of the part, which is DEEP DATA, first answer how many sketches the part contains and "
    "show the information in the sketch. Then answer how many extrudes the part contains and show the information in the extrude.\n"
    # Each line of the worked example ends with "\n" so the model sees it as
    # a list rather than one run-on line.
    "Example: 'This model contains 3 sketches, each containing the following lines:\n"
    "1. Sketch 1:\n"
    "   - Line ending at (1.1, 1.1)\n"
    "   - Line ending at (1.1, -1.1)\n"
    "   - Line ending at (-1.1, -1.1)\n"
    "   - Line ending at (-1.1, 1.1)\n"
    "Regarding the extrude operations:\n"
    "1. Extrude 1:\n"
    "   - Operation: NewBodyFeatureOperation\n"
    "   - Direction: (0, 0, 0)\n"
    "   - Origin: (0, 0, 0)\n"
    "   - Scale: 1\n"
    "   - Extent one: 0.05\n"
    "   - Extent two: 0.0\n"
    "   - Extent type: OneSideFeatureExtentType'"
    "\n\n"
    "{context}"
)

# Two-message chat prompt: the system instructions defined above, followed by
# the user's question filled in through the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)

# Chain that passes the retrieved documents to the LLM through the prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: the retriever feeds the question-answering chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [None]:
# Ask the RAG chain a question. `results` keeps the full response dict for
# later inspection, but only the generated answer is displayed: the dict also
# carries the retrieved context documents, which would flood the cell output.
results = None
results = rag_chain.invoke({"input": "Which two models are more similar?"})
results["answer"]