In [1]:

import chromadb
from chromadb.utils import embedding_functions
from langchain_community.document_loaders import PyPDFLoader 
from langchain_text_splitters import RecursiveCharacterTextSplitter


In [2]:
file_path = "data/laws_of_chess.pdf"
def load_and_split_doc(file_path):
    loader=PyPDFLoader(file_path)
    doc = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 2000,
    chunk_overlap= 200,
    length_function = len,
    is_separator_regex=False
    )
    chunks = text_splitter.split_documents(doc)
    print("# of docs: {0}".format(len(chunks)))
    print("Sample metadata: {0}".format(chunks[0].metadata))
    print("Sample content: {0}".format(chunks[0].page_content))
    return chunks



In [3]:
def get_ids(chunks):
    ids = []
    current_page = 0
    count = 1
    for chunk in chunks:
        page = chunk.metadata.get("page")
        source = chunk.metadata.get("source")
        if page > current_page:
            count = 1
            current_page = page
        ids.append(f"{source}:{current_page}:{count}")
        count += 1
    return ids

In [4]:
chunks = load_and_split_doc(file_path=file_path)
ids = get_ids(chunks)
print(f"ids: {ids}")

# of docs: 43
Sample metadata: {'producer': 'Adobe PDF Library 9.0', 'creator': 'Acrobat PDFMaker 9.1 for Word', 'creationdate': '2009-07-10T14:50:44+04:00', 'author': 'INTERUNFALL Versicherung AG', 'company': '', 'keywords': '', 'moddate': '2009-07-10T14:51:13+04:00', 'sourcemodified': 'D:20090710105007', 'subject': '', 'title': 'E', 'source': 'data/laws_of_chess.pdf', 'total_pages': 25, 'page': 0, 'page_label': '1'}
Sample content: 1 
E.I.01 
 
FIDE LAWS of CHESS 
 
Contents: 
 
 PREFACE page 2 
 
 BASIC RULES OF PLAY 
 
Article 1: The nature and objectives of the game of chess page 2 
 
Article 2: The initial position of the pieces on the chessboard page 3 
 
Article 3: The moves of the pieces page 4 
 
Article 4: The act of moving the pieces page 7 
 
Article 5: The completion of the game page 8 
 
 COMPETITION RULES 
 
Article 6: The chess clock page 9 
 
Article 7: Irregularities page 11 
 
Article 8: The recording of the moves page 11 
 
Article 9: The drawn game page 12 
 
Arti

In [5]:
# initialise chromadb
client = chromadb.Client()

# initalise embedding function
embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="all-mpnet-base-v2"
)

# create in-memory collection
collection = client.create_collection(name="chess_collection")


In [6]:
# insert / update chunks in db
collection.upsert(
    documents=[chunk.page_content for chunk in chunks],
    ids=ids)


In [22]:
# query the db
results = collection.query(
    query_texts= "how many pieces does each player start with?",
    n_results=2
)
# print(results)
# print(len(results))
for i in range(len(results['ids'][0])):
    print(f"""------------------------------------------------------------------------
result id: {results['ids'][0][i]} ; distance: {results['distances'][0][i]}
content: {results['documents'][0][i]}""")


------------------------------------------------------------------------
result id: data/laws_of_chess.pdf:17:2 ; distance: 1.0920754671096802
content: C.6 The eight ranks (from bottom to top for White and from top to bottom for Black) are 
numbered 1, 2, 3, 4, 5, 6, 7, 8, respectively. Consequently, in the initial position the white 
pieces and pawns are placed on the first and second ranks; the black pieces and pawns on 
the eighth and seventh ranks. 
 
C.7 As a consequence of the previous rules, each of the sixty -four squares is invariably indicated 
by a unique combination of a letter and a number.
------------------------------------------------------------------------
result id: data/laws_of_chess.pdf:2:1 ; distance: 1.1138403415679932
content: 3 
Article 2: The initial position of the pieces on the chessboard 
 
2.1 The chessboard is composed of an 8 x 8 grid of 64 equal squares alternately light (the 
‘white’ squares) and dark (the ‘black’ squares). 
The chessboard is placed b

In [8]:
# load doc


def load_and_split_doc(file_path):
    loader= PyPDFLoader(file_path)
    text = ""

    for page in loader.lazy_load():
        text += page.page_content
    return text

In [9]:
file_path = "data/laws_of_chess.pdf"
text = load_doc(file_path)
print(text)

NameError: name 'load_doc' is not defined

In [None]:
# instantiate in-memory client
chroma_client = chromadb.Client()

In [None]:
docs = [
        "harley davidson",
        "tesla",
        "boeing",
        "suzuki",
        "ford",
        "triumph",
        "indian", 
        "harley",
        "davidson",
        "kawasaki"
    ]

collection.upsert(
    documents= docs,
    ids= [str(x + 1) for x in range(len(docs))]
    # ["id1", "id2", "id3", "id4", "id5", "id6", "id7"]
)

In [None]:
results = collection.query(
    query_texts= ["truck"],
    # n_results=2
)

print(results)

{'ids': [['1', '5', '2', '8', '3', '4', '9', '10', '6', '7']], 'embeddings': None, 'documents': [['harley davidson', 'ford', 'tesla', 'harley', 'boeing', 'suzuki', 'davidson', 'kawasaki', 'triumph', 'indian']], 'uris': None, 'included': ['metadatas', 'documents', 'distances'], 'data': None, 'metadatas': [[None, None, None, None, None, None, None, None, None, None]], 'distances': [[0.8810828328132629, 0.9143844842910767, 1.1023205518722534, 1.2331926822662354, 1.2681713104248047, 1.319493293762207, 1.354628562927246, 1.4447104930877686, 1.492676854133606, 1.5280427932739258]]}
