# Visualize Vector Store

- visualize high-dimensional data

![](images/high-dimensional-data.png)

In [None]:
%pip install llama_index
#!pip install llama_hub --force-reinstall
%pip uninstall -y llama_hub
%pip install git+https://github.com/selamanse/llama-hub.git@add_extra_info_to_web
%pip install langchain
%pip install chromadb
#!pip install git+https://github.com/mtybadger/chromaviz/
%pip install -e /Users/selamanse/Documents/GITHUB/chromaviz

## Import libraries and set the API key

In [None]:
import os
import logging
import sys
import getpass

# Send INFO-level log records to stdout so they show up in the cell output.
# force=True replaces whatever handlers are already attached to the root
# logger, so each record is emitted once and re-running this cell does not
# stack additional handlers (which would duplicate every message).
logging.basicConfig(stream=sys.stdout, level=logging.INFO, force=True)

# OpenAI connection settings; the key is prompted for interactively so it is
# never stored in the notebook.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
os.environ["OPENAI_API_BASE"] = "https://api.openai.com/v1"

## load data from website via sitemap

This section uses the [llama_hub loader for sitemaps](https://llama-hub-ui.vercel.app/l/web-sitemap).

In [None]:
from llama_hub.web.sitemap.base import SitemapReader

# Needed when running inside a Jupyter notebook: nest_asyncio patches asyncio
# so that nested event loops are allowed.
import nest_asyncio
nest_asyncio.apply()

# Load the pages listed in the sitemap, restricted by the `filter` URL.
loader = SitemapReader(html_to_text=True)
documents = loader.load_data(
    sitemap_url='https://deepshore.de/sitemap.xml',
    filter='https://deepshore.de/knowledge',
)

# Quick sanity check: document count and the first document's extra info.
print(len(documents))
print(documents[0].extra_info)

## Add to Vector DB from Documents

In [None]:
from langchain.vectorstores import Chroma
from llama_index.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
import chromadb
from chromadb.config import Settings
from langchain.vectorstores import Chroma

# Chroma configuration. Anonymized telemetry is switched off, see
# https://docs.trychroma.com/telemetry#opting-out
chromadb_settings = Settings(
    anonymized_telemetry=False,
    persist_directory="./chroma",
    chroma_db_impl="duckdb+parquet",
)



In [None]:

# Chroma client plus the LangChain wrapper for the 'deepshore-sitemap' collection.
chromadb_client = chromadb.Client(chromadb_settings)
chroma_client = Chroma(
    collection_name='deepshore-sitemap',
    client=chromadb_client,
    embedding_function=OpenAIEmbeddings(),
)

# Convert every loaded document to its LangChain representation.
langchain_documents = [d.to_langchain_format() for d in documents]

# Build the vector store from the converted documents.
# NOTE(review): from_documents is called on the instance but receives
# client_settings/persist_directory rather than chromadb_client — confirm the
# resulting store is backed by the intended client.
vectordb = chroma_client.from_documents(
    langchain_documents,
    OpenAIEmbeddings(),
    collection_name='deepshore-sitemap',
    client_settings=chromadb_settings,
    persist_directory="./chroma",
)

# Flush the store to the persist directory.
vectordb.persist()

# Visualize storage

Visualize high-dimensional data with t-SNE, a statistical method that maps high-dimensional data onto a two- or three-dimensional map.

https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding#:~:text=t%2Ddistributed%20stochastic%20neighbor%20embedding%20(t%2DSNE)%20is,two%20or%20three%2Ddimensional%20map.

In [None]:
#%pip install -e /Users/selamanse/Documents/GITHUB/chromaviz
%pip install git+https://github.com/selamanse/chromaviz/

from chromaviz import visualize_collection

visualize_collection(col=vectordb._collection)

# import requests
# import json
# data = vectordb._collection.get(include=["documents", "metadatas","embeddings"])
# x = requests.post('http://lima-0:16875/import-data', data=json.dumps(data))
# print(x)


# Visualize with atlas

In [None]:
"""
Visualize the embeddings stored in the Chroma collection with Nomic Atlas.
"""
import numpy as np
from nomic import atlas
import nomic

nomic.login(getpass.getpass("Nomic API Key:"))

# NOTE(review): not referenced anywhere in this cell; kept in case another cell reads it.
num_embeddings = 999


def _document_snippet(text):
    """Return a short label for a document's text.

    The label is the 44-character window that starts 6 characters after the
    position of the first "newline + '### '" heading marker. If the text
    contains no such marker, the first 44 characters are returned instead.
    """
    idx = text.find('\n### ')
    if idx == -1:
        # str.find returns -1 on a miss; using it in the slice below would
        # silently select the unrelated window text[5:49].
        return text[:44]
    return text[idx + 6:idx + 50]


# Fetch ids, metadata, texts and embeddings with a single query instead of
# three separate lookups per id; the returned lists are index-aligned.
vectors = vectordb._collection.get(include=['metadatas', 'documents', 'embeddings'])

ids = list(vectors['ids'])
# NOTE(review): never populated in this cell; kept in case another cell reads it.
titles = []

# One record per stored document; 'id' is the field Atlas uses as identifier.
info_jsons = [
    {'id': doc_id, 'Source': metadata['Source'], 'Document': _document_snippet(text)}
    for doc_id, metadata, text in zip(vectors['ids'], vectors['metadatas'], vectors['documents'])
]
embeddings = np.array(vectors['embeddings'])

atlas.map_embeddings(embeddings=embeddings, data=info_jsons, id_field='id')


## retrieve relevant documents from vector store

- Show basic usage of what else we can do with the data stored in a vector DB

In [111]:
# Expose the vector store as a retriever and fetch the documents relevant to
# the query; the result is displayed as the cell output.
retriever = vectordb.as_retriever()

query = "Advent"
retriever.get_relevant_documents(query)


[Document(page_content='[ ![Deepshore Logo\nwhite](/user/themes/deepshore/images/Deepshore_LogoMINI_bis1000px_transparentRGB.svg)\n![Deepshore\nLogo](/user/themes/deepshore/images/Deepshore_LogoMINI_bis1000px_RGB.svg) ](/)\n\n[ Home ](/) [ Wissen ](/knowledge) [ Unternehmen ](/company) [ Chatbot\n](/chat) [ Kontakt | Impressum | Datenschutz ](/company/contact-imprint-\nprivacy) [ __   en ](/en/knowledge/2021-04-08)\n\n![](/images/d/e/e/p/s/deepshore-sap-cloud-archivierung-copyright-shutterstock-\nden-rise-b6a6dcff.jpg?g-fe3cd0e1)\n\n##\n\n###  SAP-S/4-HANA-Archivierung in der Cloud – ArchiveLink versus BC-ILM versus\nCMIS\n\n8\\. April 2021 \\-  Falk Borgmann\n\n[Beitrag](/knowledge/category:Beitrag) [Zentrale\nSysteme](/knowledge/tag:Zentrale Systeme) [Cloud](/knowledge/tag:Cloud) [ECM\nEIM](/knowledge/tag:ECM EIM)\n\nWas in der Welt der SAP-R3-Systeme notwendig, aber unbeliebt war, wird unter\nS/4 HANA nicht unbedingt beliebter. Ich spreche hier nicht von teuren SAP-\nBeratern, sonde