# File Directory

Create the documentation from a [folder directory](https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory)

In [2]:
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Load Files 
loader = DirectoryLoader('../', glob="**/*.md", loader_cls=TextLoader, show_progress=True, use_multithreading=True)
documents = loader.load()


# Split Markdowns by titles and subtitles 
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
docs = []

for document in documents:
    docs.extend(markdown_splitter.split_text(document.page_content))

100%|██████████| 27/27 [00:00<00:00, 2472.30it/s]


# Vector Stores

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings()

query = "How install huggingface hub"

## Annoy

In [4]:
from langchain_community.vectorstores import Annoy

vector_store_from_docs = Annoy.from_documents(docs, embeddings)
vector_store_from_docs.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies'}),
 Document(page_content='Requirement:\n- npm\n- miniflare\n- wrangler  \nmore details on the version to be come.', metadata={'Header 1': 'ʕ •́؈•̀) `cron-worker`', 'Header 2': 'Getting started'})]

## AwaDB

In [5]:
from langchain_community.vectorstores import AwaDB

db = AwaDB.from_documents(docs)
db.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies'}),
 Document(page_content='Requirement:\n- npm\n- miniflare\n- wrangler  \nmore details on the version to be come.', metadata={'Header 1': 'ʕ •́؈•̀) `cron-worker`', 'Header 2': 'Getting started'})]

## Chroma

In [6]:
from langchain_community.vectorstores import Chroma

db = Chroma.from_documents(docs, embeddings)
db.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies'}),
 Document(page_content='Requirement:\n- npm\n- miniflare\n- wrangler  \nmore details on the version to be come.', metadata={'Header 1': 'ʕ •́؈•̀) `cron-worker`', 'Header 2': 'Getting started'})]

## FAISS

In [7]:
from langchain_community.vectorstores import FAISS

db = FAISS.from_documents(docs, embeddings)
db.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies'}),
 Document(page_content='Requirement:\n- npm\n- miniflare\n- wrangler  \nmore details on the version to be come.', metadata={'Header 1': 'ʕ •́؈•̀) `cron-worker`', 'Header 2': 'Getting started'})]

## LanceDB

In [8]:
from langchain.vectorstores import LanceDB

docsearch = LanceDB.from_documents(documents, embeddings)
docsearch.similarity_search(query)

[2024-02-23T00:31:43Z WARN  lance::dataset] No existing dataset at /tmp/lancedb/vectorstore.lance, it will be created


[Document(page_content='# Install\n\n```shell\npip install huggingface_hub\npip install transformers\n```\n\n# Links\n\n* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'vector': [0.0014892116887494922, -0.086399145424366, 0.017949510365724564, 0.03291476517915726, 0.028388479724526405, -0.000717666232958436, 0.008717209100723267, 0.020742187276482582, 0.025531567633152008, -0.007908790372312069, -0.009657509624958038, 0.02348226308822632, 0.036304641515016556, 0.09148702025413513, 0.011356882750988007, -0.08281201869249344, -0.02466333657503128, -0.010046933777630329, -0.07834256440401077, 0.006462138146162033, 0.044968292117118835, 0.06217135861515999

## Qdrant

In [9]:
from langchain_community.vectorstores import Qdrant

qdrant = Qdrant.from_documents(docs, embeddings, location=":memory:", collection_name="my_documents")

qdrant.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install', '_id': 'd0abd694e37b424dba6dbe6a98d858d6', '_collection_name': 'my_documents'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links', '_id': '458198b5f7494be18957731b9817b87b', '_collection_name': 'my_documents'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies', '_id': '496acc26bc2b407aa4400a375a868821', '_collection_name': 'my_

## scikit-learn

In [10]:
from langchain_community.vectorstores import SKLearnVectorStore

vector_store = SKLearnVectorStore.from_documents(documents=docs, embedding=embeddings)
vector_store.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'id': 'fcde5af8-cd1b-4f55-a660-f7b439903f51', 'Header 1': 'Install', '_id': 'd0abd694e37b424dba6dbe6a98d858d6', '_collection_name': 'my_documents'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'id': '87fec8ea-9796-4f85-ac87-3c949980ef33', 'Header 1': 'Links', '_id': '458198b5f7494be18957731b9817b87b', '_collection_name': 'my_documents'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'id': '5207198b-79ab-4609-bbd5-9433945e

## TileDB

In [11]:
from langchain_community.vectorstores import TileDB

db = TileDB.from_documents(documents, embeddings, index_uri="/tmp/tiledb_index", index_type="FLAT")
db.similarity_search(query)



[Document(page_content='# Install\n\n```shell\npip install huggingface_hub\npip install transformers\n```\n\n# Links\n\n* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'source': '../Dolly_2/ABOUT.md'}),
 Document(page_content=" # Dev Launchers Strapi Service\n\n# Getting Started\n1. Copy the `.env.example` file into `.env`\n2. Run `npm install`\n3. Run `npm run develop`\n4. Go to http://localhost:1337/admin to create an account\n\n# Running from Docker\nAlternatively, you can run it with Docker. There are 2 make targets available to do this.\n- Ensure that Docker is running. This usually means that you need to start up Docker Desktop.\n- cd to the proj

## USearch

In [12]:
from langchain_community.vectorstores import USearch

db = USearch.from_documents(docs, embeddings)
db.similarity_search(query)

[Document(page_content='```shell\npip install huggingface_hub\npip install transformers\n```', metadata={'Header 1': 'Install', '_id': 'd0abd694e37b424dba6dbe6a98d858d6', '_collection_name': 'my_documents'}),
 Document(page_content='* [Huggingface Online](https://python.langchain.com/docs/integrations/llms/huggingface_hub)\n* [get a token](https://huggingface.co/docs/api-inference/quicktour#get-your-api-token)\n* [Huggingface Offline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines)\n* [server](https://python.langchain.com/docs/integrations/llms/huggingface_textgen_inference)', metadata={'Header 1': 'Links', '_id': '458198b5f7494be18957731b9817b87b', '_collection_name': 'my_documents'}),
 Document(page_content='- The project is implemented in `Python 3.11.6`.\n- All dependencies are include in `requirements.txt` files.', metadata={'Header 1': 'Onboarding Bot Model', 'Header 2': 'Dependencies', '_id': '496acc26bc2b407aa4400a375a868821', '_collection_name': 'my_

## Vearch

In [None]:
# https://python.langchain.com/docs/integrations/vectorstores/vearch