# Check Current Working Directory
Use Python's os module to print the current working directory to ensure you are in the base folder.

In [2]:
import os
print(os.getcwd())

c:\Users\neilb\projects\RagDocumentLocalModel


# Create a New Jupyter Notebook File
Use Python code to create a new .ipynb file in the base folder, optionally using nbformat or by writing a minimal notebook structure.

In [None]:
import nbformat as nbf
notebook = nbf.v4.new_notebook()
notebook.cells.append(nbf.v4.new_markdown_cell("# This is a new notebook"))
with open("created_notebook.ipynb", "w") as f:
    nbf.write(notebook, f)
print("created_notebook.ipynb created")

# List Files in Base Folder
Use Python to list all files in the base folder to confirm the new notebook was created.

In [1]:
import os
print(os.listdir('.'))

['.venv', 'new_notebook.ipynb', "Sword Coast Adventurer's Guide.pdf"]


In [1]:
from langchain_community.llms import Ollama
llm = Ollama(model="phi3:mini")
response = llm.invoke("what is the sword coast") 
print(response) 

  llm = Ollama(model="phi3:mini")


Sword Coast appears to be a fictional setting from various works of fantasy, most notably in "Dungeons & Dragons." It refers to an area that could refer to regions like Sunfyre or Llywnpennar within the Forgotten Realms. The Sword Coast is known as one of the primary haunts for adventurers and storytellers, often seen in role-playing game campaigns with a mix of cultures including elves, dwarfs, giths (also called lizardfolk), Halflings, high-elf races like Darkfae or Night Elves who were once native to the region but moved away during cataclysmic events. It's characterized by its history as a hub of political intrigue and conflict before many elven societies disappeared from Sword Coast due to past catastrophes, leaving behind remnants in ruins known today as Dwarrowdyn or the Forgotten Realms for explorers.


In [43]:
# Use Poppler for pdf2image conversion (Windows)
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

poppler_path = r"C:\Poppler\Library\bin"

images = convert_from_path("Sword Coast Adventurer's Guide.pdf", poppler_path=poppler_path)

all_text = []
for img in images:
    text = pytesseract.image_to_string(img)
    all_text.append(text)

full_text = "\n".join(all_text)

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=500)
split_docs = text_splitter.create_documents([full_text])

print(split_docs[0].page_content[:1000])

RL
SWORDICOASTE

DUNGEONS & DRAGONS®

Explore the Sword: Coastiinithis campaign sourcebook
for the world’s greatest roleplaying game


SWORD COAST
ADVENTURER’S GUIDE


REDITS

This book was a collaboration between Wizards of the Coast and
Green Ronin Publishing. Members of the Green Ronin creative
team are marked with an asterisk below.

Lead Designer: Steve Kenson*

Designers: Joseph Carriker,* Brian Cortijo,* Jeremy Crawford,
Peter Lee, Jon Leitheusser,* Mike Mearls, Jack Norris,* Sean K
Reynolds, Matt Sernett, Rodney Thompson

Managing Editor: Jeremy Crawford

Editor: Kim Mohan*

Editorial Assistance: Chris Sims, Matt Sernett, Dan Helmick

Producer: Greg Bilsland

D&D Lead Designers: Mike Mearls, Jeremy Crawford

Art Directors: Kate Irwin, Hal Mangold,* Shauna Narciso

Graphic Designer: Emi Tanji

Cover Illustrator: Tyler Jacobson


In [44]:
from langchain.prompts import PromptTemplate
template = """Answer the question based on the context below. use just one sentence. it must be short and concise.
If you cant answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}
"""
prompt = PromptTemplate.from_template(template)
prompt.format(context="Here is some context", question="here is a question")
print(prompt)



input_variables=['context', 'question'] input_types={} partial_variables={} template="Answer the question based on the context below. use just one sentence. it must be short and concise.\nIf you cant answer, just say that you don't know, don't try to make up an answer.\n\nContext: {context}\nQuestion: {question}\n"


In [45]:
from langchain_core.output_parsers import StrOutputParser
parser = StrOutputParser()

In [24]:
chain = prompt | llm | parser

In [46]:
from langchain_community.vectorstores import DocArrayInMemorySearch
from langchain_ollama import OllamaEmbeddings
embeddings = OllamaEmbeddings(model="Phi3:mini")
vectorstore = DocArrayInMemorySearch.from_documents(split_docs, embedding=embeddings)

In [59]:
# Other possible search_type values for DocArrayInMemorySearch include:
# "similarity" (default), "mmr", and "similarity_score_threshold"
# Example usage:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5})



retriever.invoke("what is Calimshan?")

[Document(metadata={}, page_content='Were these gewgaws and trifles of help to the Lan-\ntanese when the whole world was ripped from under\nthem? To whom did they turn when in that other world\ntheir prayers to their favored god, Gond, went unan-\nswered? What happened in their century away, and\nnow that they are returned, are they happy to be here,\nor does it seem like their world has once more been\nripped away?'),
 Document(metadata={}, page_content='panions have in pursuit of righteousness seems to me\nsomething uniquely human. And it’s not just those few\ntouched by the gods who seek these high ideals; Eltur-\ngard’s armed forces swell with men and women who\naspire to join the Companions. They are the Hellriders,\nso named because long ago warriors of Elturel literally\nrode through a gate into the Nine Hells to pursue and\ndestroy devils that had been plaguing their people. With\nthese bright examples to look up to, is it any wonder that\nthe common people of Elturgard also te

In [68]:
from operator import itemgetter

chain = (
    {"context" : itemgetter("question") | retriever, "question" : itemgetter("question")}
    | prompt
    | llm
    | parser  
)

response = chain.invoke({"question": "tell me about nelson mandela"})
print(response)

Nelson Mandela was a South African anti-apartheid revolutionary, political leader, and philanthropist who served as President of South Africa from 1994 to 1999. He is celebrated for his fight against racial segregation in South Africa and advocated reconciliation between the country's black majority and white minority even after years of oppression under apartheid.


In [62]:
# Debugging retriever output: inspect top returned chunks for a query
query = "the gods of the dwarves"
results = retriever.invoke(query)

for i, r in enumerate(results):
    print(f"Result {i+1}:")
    # Try different possible keys/attributes for content
    if hasattr(r, "page_content"):
        print(r.page_content[:2000])
    elif hasattr(r, "content"):
        print(r.content[:2000])
    elif isinstance(r, dict) and "page_content" in r:
        print(r["page_content"][:2000])
    elif isinstance(r, dict) and "content" in r:
        print(r["content"][:2000])
    else:
        print(str(r)[:2000])
    print("---")

Result 1:
Were these gewgaws and trifles of help to the Lan-
tanese when the whole world was ripped from under
them? To whom did they turn when in that other world
their prayers to their favored god, Gond, went unan-
swered? What happened in their century away, and
now that they are returned, are they happy to be here,
or does it seem like their world has once more been
ripped away?
---
Result 2:
horand paid to their followers wavered and diminished.

Each new incarnation of Isis, Osiris, and Thoth was a little

more human and a little less divine. When the magically

changers, and smugglers. She is interested in anything
that increases trade and the flow of money, whether new
trade routes, new inventions, or the whim of changing
fashion. Those who take Waukeen as a patron can be re-

liably thought of as greedy, but the Coinmaiden is said to powerful Imaskari returned with a vengeance a little over
frown upon misers and smile upon the industrious and a century ago, they stole the scep

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')
def tiktoken_len(text):
    tokens = tokenizer.encode(
    text,
    disallowed_special=()
)
    return len(tokens)
tiktoken.encoding_for_model('gpt-3.5-turbo')

# Count tokens in each chunk
token_counts = []
for doc in split_docs:
    token_counts.append(tiktoken_len(doc.page_content))
min_token_count = min(token_counts)
avg_token_count = int(sum(token_counts) / len(token_counts))
max_token_count = max(token_counts)

# Print token count statistics
print(f"Token counts across {len(split_docs)} chunks:")
print(f"Min: {min_token_count}")
print(f"Avg: {avg_token_count}")
print(f"Max: {max_token_count}")

Min: 0
Avg: 1232
Max: 4713


In [61]:
# Inspect all chunks for keywords: 'dwarven gods', 'religion', 'deity'
keywords = ['dwarven gods', 'religion', 'deity']
for idx, doc in enumerate(split_docs):
    text = doc.page_content if hasattr(doc, 'page_content') else str(doc)
    found = any(kw.lower() in text.lower() for kw in keywords)
    prefix = '***MATCH***' if found else ''
    print(f"{prefix} Chunk {idx}: {text[:200].replace('\n', ' ')}\n")
    if found:
        print(f"--> Keywords found: {[kw for kw in keywords if kw.lower() in text.lower()]}\n")

 Chunk 0: RL SWORDICOASTE  DUNGEONS & DRAGONS®  Explore the Sword: Coastiinithis campaign sourcebook for the world’s greatest roleplaying game   SWORD COAST ADVENTURER’S GUIDE   REDITS  This book was a collabor

 Chunk 1: Designers: Joseph Carriker,* Brian Cortijo,* Jeremy Crawford, Peter Lee, Jon Leitheusser,* Mike Mearls, Jack Norris,* Sean K Reynolds, Matt Sernett, Rodney Thompson  Managing Editor: Jeremy Crawford  

 Chunk 2: Graphic Designer: Emi Tanji  Cover Illustrator: Tyler Jacobson  Interior Illustrators: Conceptopolis, Olga Drebas, Jason A. Engle, Randy Gallegos, Ilich Henriquez, David Heuso, Tyler Jacobson, McLean 

 Chunk 3: Playtesters: Adam Hennebeck, Anthony Caroselli, Arthur Wright, Bill Benham, Bryce Haley, Christopher Hackler, Claudio Pozas, Daniel Oquendo, David “Oak” Stark, Gregory L. Harris, Jason Baxter, Jason F

 Chunk 4: Leahy, Shawn Merwin, Stacy Bermes, Teos Abadia, Tom Lommel,  Travis Brock, Yan Lacharité  BIBLIOGRAPHY  Here are the Forgotten Realms works th