In [2]:
!python3 -m venv venv
!source venv/bin/activate
!pip3 install markdown langchain opensearch-py


[0m

In [3]:
# Load environment variables

from dotenv import load_dotenv,find_dotenv
load_dotenv(find_dotenv())
OPENAI_API_KEY="XXXXXXXXXX"

In [4]:
import markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter


from pathlib import Path

markdown_document = ""
pathlist = Path('docs').glob('**/*.md')
for path in pathlist:
     # because path is object not string
     path_in_str = str(path)
     f = open(path_in_str, 'r')
     markdown_document += f.read()

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
]

# MD splits
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(markdown_document)

# Char-level splits

chunk_size = 250
chunk_overlap = 30
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Split
splits = text_splitter.split_documents(md_header_splits)

In [5]:
# Add metadata information needed for Chroma
for i, text in enumerate(splits): text.metadata["source"] = f"{i}-pl"
splits

 Document(page_content='* that one\n* the other one  \nNote that --- not considering the asterisk --- the actual text\ncontent starts at 4-columns in.  \n> Block quotes are\n> written like so.\n>\n> They can span multiple paragraphs,\n> if you like.', metadata={'source': '1-pl'}),
 Document(page_content='> if you like.  \nUse 3 dashes for an em-dash. Use 2 dashes for ranges (ex., "it\'s all\nin chapters 12--14"). Three dots ... will be converted to an ellipsis.\nUnicode is supported. ☺  \nAn h2 header\n------------  \nHere\'s a numbered list:', metadata={'source': '2-pl'}),
 Document(page_content="Here's a numbered list:  \n1. first item\n2. second item\n3. third item  \nNote again how the actual text starts at 4 columns in (4 characters\nfrom the left side). Here's a code sample:", metadata={'source': '3-pl'}),
 Document(page_content='for i in 1 .. 10 { do-something(i) }  \nAs you probably guessed, indented 4 spaces. By the way, instead of\nindenting the block, you can use delimited b

In [10]:
# Import and instantiate OpenAI embeddings

from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model_name="ada",openai_api_key=OPENAI_API_KEY)

                    model_name was transferred to model_kwargs.
                    Please confirm that model_name is what you intended.


In [None]:
# Turn the first text chunk into a vector with the embedding

query_result = embeddings.embed_query(splits[0].page_content)
print(query_result)

In [6]:
# Import and initialite Chroma
#from langchain.vectorstores import Chroma
# Import OpenSearch
from langchain.vectorstores import OpenSearchVectorSearch

In [None]:
# Upload vectors to Chroma

index_name = "langchain"
#search = Chroma.from_documents(splits, embeddings)
docsearch = OpenSearchVectorSearch.from_documents(
    splits, embeddings, opensearch_url="https://localhost:9200", http_auth=("admin", "admin"), verify_certs = False, index_name=index_name
)

In [13]:
# Do a simple vector similarity search
# Create docsearch in case that the vector database is already populated
docsearch = OpenSearchVectorSearch(
    index_name="langchain",
    embedding_function=embeddings,
    opensearch_url="https://localhost:9200", http_auth=("admin", "admin"), verify_certs = False
)
query = "Which styles of links exists in Markdown?"
result = docsearch.similarity_search(query, k=10)

print(result[1].page_content)



### Links  
Markdown supports two style of links: *inline* and *reference*.  
In both styles, the link text is delimited by [square brackets].  
To create an inline link, use a set of regular parentheses immediately


