In [None]:
# %pip install wikipedia

In [1]:
from decouple import config
from openai import OpenAI

# Look up the OpenAI API key and the Upstash Vector REST URL/token by name
# through decouple's `config`, so no secret is hardcoded in the notebook.
OPENAI_API_KEY = config("OPENAI_API_KEY")
UPSTASH_VECTOR_REST_URL = config("UPSTASH_VECTOR_REST_URL")
UPSTASH_VECTOR_REST_TOKEN = config("UPSTASH_VECTOR_REST_TOKEN")

In [2]:
import os
# Publish the key into this process's environment; presumably the OpenAI
# clients constructed later read it from there (no key is passed explicitly).
os.environ.update({"OPENAI_API_KEY": OPENAI_API_KEY})

In [3]:
import wikipedia

In [4]:
# Fetch a single article up front as a quick check that the lookup works.
ny = wikipedia.page("New York City, New York")

In [6]:
# ny.content

In [7]:
# Upstash vector store integration docs:
# https://python.langchain.com/v0.2/docs/integrations/vectorstores/upstash/
# Required packages: pip install langchain-openai langchain langchain-community upstash-vector

In [8]:
from langchain_openai import OpenAIEmbeddings

# Embedding model handed to the vector store below.
#   "text-embedding-3-small" -> 1536-dimensional vectors (active)
#   "text-embedding-3-large" -> 3072-dimensional vectors (alternative)
# NOTE(review): the Upstash index dimension presumably has to match the
# model chosen here -- confirm before switching.
EMBEDDING_MODEL = "text-embedding-3-small"
embeddings = OpenAIEmbeddings(model=EMBEDDING_MODEL)

In [9]:
from langchain_community.vectorstores import UpstashVectorStore

# Vector store client: connects to the Upstash index with the REST URL/token
# loaded earlier and uses `embeddings` to embed documents and queries.
store = UpstashVectorStore(
    index_url=UPSTASH_VECTOR_REST_URL,
    index_token=UPSTASH_VECTOR_REST_TOKEN,
    embedding=embeddings,
)

In [10]:
# List the article titles Wikipedia returns for this query, to pick the
# exact title to fetch.
wikipedia.search(query="Boise, Idaho")

['Boise, Idaho',
 'Boise State University',
 'Boise metropolitan area',
 'Boise County, Idaho',
 'Boise Airport',
 'Boise State Broncos football',
 'List of mayors of Boise, Idaho',
 'Idaho',
 'List of people from Boise, Idaho',
 'Boise State Broncos']

In [11]:
from langchain_core.documents import Document

# Build one LangChain Document per city: the article body becomes the page
# content, and the article URL plus the requested title travel as metadata
# so search hits can be traced back to their source.
cities = ["New York City, New York", "Boise, Idaho"]
documents = []
for city in cities:
    page = wikipedia.page(title=city)
    documents.append(
        Document(
            page_content=page.content,
            metadata={"source": f"{page.url}", "title": city},
        )
    )

In [13]:
# Inspect the metadata attached to the first document (source URL and title).
documents[0].metadata

{'source': 'https://en.wikipedia.org/wiki/New_York_City',
 'title': 'New York City, New York'}

In [18]:
# Number of full-article documents collected (one per entry in `cities`).
len(documents)

2

In [14]:
# Installs tiktoken into the kernel's environment; presumably needed by the
# tiktoken-based text splitter used in the next cell.
%pip install tiktoken

Note: you may need to restart the kernel to use updated packages.


In [15]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import TokenTextSplitter

# Token-based splitter: the tiktoken encoding is selected by model name, and
# the chunk size/overlap below are passed straight to the splitter.
OPENAI_LLM_MODEL = "gpt-4o"
CHUNK_SIZE = 100
CHUNK_OVERLAP = 0

text_splitter = TokenTextSplitter.from_tiktoken_encoder(
    model_name=OPENAI_LLM_MODEL,
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [16]:
# Split the full-article documents into smaller chunk documents with the
# token-based splitter; these chunks are what gets stored in the vector index.
docs = text_splitter.split_documents(documents)

In [17]:
# Number of chunks produced by the splitter.
len(docs)

312

In [19]:
# Insert the chunks into the Upstash index under deterministic ids.
#
# Without explicit ids, each run of this cell stores every chunk again under
# newly generated ids, so re-running the notebook fills the index with
# duplicates and similarity searches start returning repeated hits. Hashing
# the source URL, the chunk position and the chunk text gives the same id for
# the same chunk on every run, so a re-run overwrites (upserts) the existing
# vectors instead of adding new ones.
# NOTE(review): ids depend on chunk position, so changing `cities` or the
# splitter settings will produce new ids for chunks that shift.
import hashlib

chunk_ids = [
    hashlib.sha256(
        f"{chunk.metadata.get('source', '')}|{position}|{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(docs)
]

# `inserted_vectors` is still the list of ids returned by the store.
inserted_vectors = store.add_documents(docs, ids=chunk_ids)

In [22]:
# result = store.similarity_search("The city with big buildings", k=5)
# result

In [26]:
# Run a scored similarity search and print, for each of the top two hits,
# the chunk's metadata followed by its score.
query = "The city named after trees"
result = store.similarity_search_with_score(query, k=2)
for match, similarity in result:
    print(f"{match.metadata} - {similarity}")

{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise, Idaho'} - 0.749708
{'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise, Idaho'} - 0.7259105
