In [1]:
# Load environment variables from the nearest .env file, then read the OpenAI
# API key from the environment so that no secret is stored in the notebook.

import os

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Fail here with a clear message rather than later with an opaque
    # authentication error from the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your .env file or environment."
    )

In [2]:
import markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


from pathlib import Path

def read_markdown_tree(root, pattern='**/*.md', encoding='utf-8'):
    """Concatenate every file matching ``pattern`` under ``root`` into one string.

    Args:
        root: Directory to search (``str`` or ``Path``).
        pattern: Glob pattern, relative to ``root``; defaults to all ``.md``
            files at any depth.
        encoding: Text encoding used to decode each file.

    Returns:
        The contents of all matching files, joined in sorted path order.
        Every non-empty file contributes text that ends with a newline, so
        the first line of one file can never be glued onto the last line of
        the previous one. Returns ``""`` when nothing matches.
    """
    parts = []
    # Glob order is not guaranteed; sorting keeps the result reproducible.
    for path in sorted(Path(root).glob(pattern)):
        # read_text opens and closes the file, so no handle is leaked.
        text = path.read_text(encoding=encoding)
        if text and not text.endswith("\n"):
            text += "\n"
        parts.append(text)
    return "".join(parts)


markdown_document = read_markdown_tree('docs')

# (marker, label) pairs handed to the header splitter: "#" and "##".
headers_to_split_on = [("#" * depth, f"Header {depth}") for depth in (1, 2)]

# Stage 1: split the combined Markdown text on those headers.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on
)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Stage 2: char-level splitting of each header section.
chunk_size, chunk_overlap = 250, 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=chunk_overlap,
    chunk_size=chunk_size,
)
splits = text_splitter.split_documents(md_header_splits)

In [3]:
# Give every chunk a "source" metadata entry (its position plus "-pl"),
# which is the metadata information needed for Chroma.
for idx, doc in enumerate(splits):
    doc.metadata["source"] = f"{idx}-pl"
splits

 Document(page_content='* that one\n* the other one  \nNote that --- not considering the asterisk --- the actual text\ncontent starts at 4-columns in.  \n> Block quotes are\n> written like so.\n>\n> They can span multiple paragraphs,\n> if you like.', metadata={'source': '1-pl'}),
 Document(page_content='> if you like.  \nUse 3 dashes for an em-dash. Use 2 dashes for ranges (ex., "it\'s all\nin chapters 12--14"). Three dots ... will be converted to an ellipsis.\nUnicode is supported. ☺  \nAn h2 header\n------------  \nHere\'s a numbered list:', metadata={'source': '2-pl'}),
 Document(page_content="Here's a numbered list:  \n1. first item\n2. second item\n3. third item  \nNote again how the actual text starts at 4 columns in (4 characters\nfrom the left side). Here's a code sample:", metadata={'source': '3-pl'}),
 Document(page_content='for i in 1 .. 10 { do-something(i) }  \nAs you probably guessed, indented 4 spaces. By the way, instead of\nindenting the block, you can use delimited b

In [5]:
# Set up the OpenAI embeddings client

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model_name="ada",
)

In [6]:
# Embed the text of the first chunk and print the resulting vector

first_chunk_text = splits[0].page_content
query_result = embeddings.embed_query(first_chunk_text)
print(query_result)

[0.0014113798251568187, 0.026957607747412873, -0.006647962820946935, -0.009137139012652045, -0.006051914957400711, 0.021146143522559026, 0.005648905589615431, -0.024708882753915928, -0.007457368311166952, -0.025413301643970522, 0.0029971712283332072, 0.03676530089759273, -0.023042659071028673, 0.009590947371872702, -0.0070103325299226095, -0.00023410107689617618, 0.02228405210658273, 0.005408454857068568, 0.01784078833133498, 0.0015722448502398542, -0.010471472847086154, -0.007721525860598728, -0.0032867285063133225, -0.009977024363394591, -0.00542877445364272, -0.017420845655963708, 0.031536337983348574, -0.019737303879804626, 0.004974965628760761, -0.01281502324547754, 0.010451153250512003, -0.00892039416566749, -0.020333350346366946, -0.00668521572510708, -0.006654735864584551, 0.013201100236999429, -0.013275606045319719, -0.010166675173183473, 0.016174564648447725, -0.0062043137943520515, 0.02111904948536335, 0.013580402787899796, 0.01987276847784737, -0.016174564648447725, -0.0010

In [7]:
# Import and initialize Chroma
from langchain.vectorstores import Chroma

In [8]:
# Build the Chroma store from the chunks and the embeddings client

index_name = "langchain"
search = Chroma.from_documents(documents=splits, embedding=embeddings)

In [13]:
# Run a simple vector similarity search and print the second returned hit

query = "Which styles of links exists in Markdown?"
result = search.similarity_search(query=query, k=10)

second_hit = result[1]
print(second_hit.page_content)

### Links  
Markdown supports two style of links: *inline* and *reference*.  
In both styles, the link text is delimited by [square brackets].  
To create an inline link, use a set of regular parentheses immediately
