In [None]:
############################################################################################ 
## Step 0. install dependencies (execute only once)
##         note: use python 3.10 for maximum compatibility

# install Anaconda https://www.anaconda.com/download/success and VS Code https://code.visualstudio.com/
# then create a new conda environment with python 3.10
# > conda create -n llm python=3.10
# > conda activate llm
# then install the dependencies below

# run the following commands in Mac Terminal or Windows command prompt (without the #)
# pip install langchain chromadb sentence-transformers pymupdf
# pip install langchain-community
# pip install llama-cpp-python

## for visualizations install these dependencies too:
# pip install umap-learn pandas scikit-learn matplotlib plotly nbformat

## get free textbooks from:
# https://open.umn.edu/opentextbooks/subjects
#
## get free literature classics from:
# https://www.gutenberg.org/
#
############################################################################################

In [None]:
# Step 1. Load every document found in a folder (and its sub-folders).
# The loaders are imported from langchain_community: the legacy
# `langchain.document_loaders` path is deprecated and is not available in newer
# langchain releases, whereas langchain-community is installed in Step 0.
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

folder = 'docs_textbooks'  # change this to the folder with your documents
loader = DirectoryLoader(
    folder,
    glob="**/*.*",  # match all files in the folder and its sub-folders
    loader_cls=PyMuPDFLoader,  # every matched file is handed to the PyMuPDF loader
)
documents = loader.load()

print(f'length of documents {len(documents)}')
documents[:10]  # examine the first 10 loaded documents

In [None]:
# Step 2. Split the loaded documents into overlapping chunks.
# Prefer the dedicated langchain-text-splitters package; fall back to the
# legacy `langchain.text_splitter` path so older installations keep working.
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 1000  # size of each chunk in characters
chunk_overlap = 200  # number of characters to overlap between chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
all_splits = text_splitter.split_documents(documents)

print(f"length of all_splits {len(all_splits)}")
all_splits[:10]  # examine the first 10 chunks

In [None]:
# Step 3. Load the embeddings model.
# The GGUF model file is referenced by a relative path, so it is resolved
# against the notebook's current working directory.
from langchain_community.embeddings import LlamaCppEmbeddings

embedding_model_path = "models/all-MiniLM-L6-v2-Q6_K.gguf"
embedding_model = LlamaCppEmbeddings(model_path=embedding_model_path)


In [None]:
## Step 4. Save embeddings to a vector database, persist to disk
import os
import re

# Imported from langchain_community (installed in Step 0); the legacy
# `langchain.vectorstores` path is deprecated.
from langchain_community.vectorstores import Chroma

# The database folder is named after the chunking settings and the source
# folder. Path separators and other special characters in `folder` are replaced
# with "_" so the result is always a single, valid directory name.
folder_tag = re.sub(r'[^\w.-]+', '_', folder)
persist_folder = f'chroma_db_c{chunk_size}o{chunk_overlap}_{folder_tag}'

if os.path.isdir(persist_folder) and os.listdir(persist_folder):
    # A database for these settings already exists. Re-running from_documents
    # against it would add every chunk a second time, so reuse it instead.
    vectordb = Chroma(persist_directory=persist_folder, embedding_function=embedding_model)
    print(f'DONE!\n existing vector database loaded from {persist_folder}'
          ' (delete that folder to rebuild it)')
else:
    # First run for these settings: embed all chunks and write them to disk.
    vectordb = Chroma.from_documents(documents=all_splits, embedding=embedding_model, persist_directory=persist_folder)
    print(f'DONE!\n vector database created at {persist_folder}')

In [None]:
# the end!