In [1]:
import os
import wikipedia
import tiktoken
from dotenv import load_dotenv
from openai import OpenAI
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [37]:
load_dotenv()

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [4]:
ny = wikipedia.page(title='New York City, New York')

In [13]:
# dim -> 1536
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# dim -> 3072
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [14]:
# Pinecone
vectorstore = PineconeVectorStore(index_name='langchain-fastapi', embedding=embeddings)

index_name = 'langchain-fastapi'

namespace = 'Search Wikipedia'

In [41]:
# Initialize Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

# Connect to your pinecone index
pinecone_index = pc.Index('langchain-fastapi')

In [16]:
wikipedia.search('Boise, Idaho')

['Boise, Idaho',
 'Boise State University',
 'Boise County, Idaho',
 'Boise metropolitan area',
 'Boise State–Idaho football rivalry',
 'Boise State Broncos football',
 'List of people from Boise, Idaho',
 'Boise Airport',
 'Idaho',
 '2022 University of Idaho killings']

In [18]:
from langchain_core.documents import Document

documents = []
cities = ["New York City, New York", "Boise, Idaho"]
for city in cities:
    wikipedia_page_result = wikipedia.page(title=city)
    doc = Document(
        page_content=wikipedia_page_result.content,
        metadata={
            "source": f"{wikipedia_page_result.url}",
            "title": city,
        }
    )
    documents.append(doc)
    # raw_documents.append(page.content)

In [22]:
documents

[Document(metadata={'source': 'https://en.wikipedia.org/wiki/New_York_City', 'title': 'New York City, New York'}, page_content='New York, often called New York City or NYC, is the most populous city in the United States, located at the southern tip of New York State on one of the world\'s largest natural harbors. The city comprises five boroughs, each coextensive with a respective county. New York is a global center of finance and commerce, culture, technology, entertainment and media, academics and scientific output, the arts and fashion, and, as home to the headquarters of the United Nations, international diplomacy.\nWith an estimated population in 2023 of 8,258,035 distributed over 300.46 square miles (778.2 km2), the city is the most densely populated major city in the United States. New York City has more than double the population of Los Angeles, the nation\'s second-most populous city. New York is the geographical and demographic center of both the Northeast megalopolis and the

In [23]:
len(documents)

2

In [31]:
tokenizer = tiktoken.get_encoding('p50k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=200,
        chunk_overlap=0,
        length_function=tiktoken_len,
        separators=["\n\n", "\n", " ", ""]
)

In [32]:
docs = text_splitter.split_documents(documents)

In [33]:
len(docs)

258

In [51]:
inserted_vectors = vectorstore.add_documents(docs)

In [53]:
result = vectorstore.similarity_search('The city with big buildings', k=5)

result

[Document(id='67b7df8e-cca9-4f32-893b-7af6861bb196', metadata={'source': 'https://en.wikipedia.org/wiki/New_York_City', 'title': 'New York City, New York'}, page_content="New York has architecturally noteworthy buildings in a wide range of styles and from distinct time periods, from the Dutch Colonial Pieter Claesen Wyckoff House in Brooklyn, the oldest section of which dates to 1656, to the modern One World Trade Center, the skyscraper at Ground Zero in Lower Manhattan and the most expensive office tower in the world by construction cost.\nManhattan's skyline, with its many skyscrapers, has been recognized as an iconic symbol of the city, and the city has been home to several of the tallest buildings in the world. As of 2019, New York City had 6,455 high-rise buildings, the third most in the world after Hong Kong and Seoul."),
 Document(id='0a464d45-26d8-48d8-8aa0-ed8b4d2e1d1e', metadata={'source': 'https://en.wikipedia.org/wiki/New_York_City', 'title': 'New York City, New York'}, pag

In [54]:
result = vectorstore.similarity_search_with_score("The city named after trees", k=2)
for doc, score in result:
    print(f"{doc.id} - {doc.metadata} - {score}")

26bdc409-b128-413e-8c19-828682f71f1e - {'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise, Idaho'} - 0.486383885
75e33363-d711-4c85-bcfa-99da33bccf60 - {'source': 'https://en.wikipedia.org/wiki/Boise,_Idaho', 'title': 'Boise, Idaho'} - 0.437352031
