<a href="https://colab.research.google.com/github/duper203/upstage_cookbook/blob/main/(1)wiki_tutorial_upstage_weaviate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import userdata
# Read the three credentials through Colab's `userdata` helper so that no
# key has to be typed into the notebook itself.
upstage_api_key, WEAVIATE_INSTANCE_URL, Weaviate_API_Key = (
    userdata.get(secret_name)
    for secret_name in ("upstage_api_key", "WEAVIATE_INSTANCE_URL", "Weaviate_API_Key")
)

# 1. Collect & Chunk Data



In [None]:
pip install -U langchain-community

In [None]:
pip install wikipedia

In [None]:
# Import from the packages that actually ship these classes. The old
# `langchain.document_loaders` / `langchain.text_splitter` paths are
# deprecated re-export shims and are absent from newer LangChain releases.
from langchain_community.document_loaders import WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Topic to look up on Wikipedia.
search_term = "2024 Summer Olympics"

# Build the loader first, then fetch (at most 20 pages) and split in one call.
wiki_loader = WikipediaLoader(query=search_term, load_max_docs=20)
docs = wiki_loader.load_and_split()

In [None]:
# Sanity check: how many documents did `load_and_split()` return?
len(docs)

20

In [None]:
# Sanity check: look at the text of one loaded document (the fourth one)
# to see what kind of content was retrieved.
docs[3].page_content

# 2. Embedding

In [None]:
pip install -qU langchain-core langchain-upstage

In [None]:
from langchain_upstage import UpstageEmbeddings

# Embedding client used for both the document texts and the user questions.
# The API key comes from the Colab secrets loaded at the top of the notebook.
embeddings = UpstageEmbeddings(model="solar-embedding-1-large", api_key=upstage_api_key)

In [None]:
# Split the loaded documents into two parallel lists: the text of each
# document and the value of its `source` metadata field.
wiki_content = [doc.page_content for doc in docs]
wiki_source = [doc.metadata['source'] for doc in docs]

# Show a short preview instead of printing every document in full: dumping
# the whole corpus bloats the notebook and buries the rest of the narrative.
print(f"{len(wiki_content)} documents prepared")
print(wiki_content[0][:300] if wiki_content else "(no documents loaded)")
print(wiki_source[:3])

["The 2024 Summer Olympics, officially the Games of the XXXIII Olympiad and branded as Paris 2024, were an international multi-sport event that occurred from 26 July to 11 August 2024 in France, with the opening ceremony having taken place on 26 July. Paris was the host city, with events (mainly football) held in 16 additional cities spread across metropolitan France, including the sailing centre in the second-largest city of France, Marseille on the Mediterranean Sea, as well as one subsite for surfing in Tahiti, French Polynesia.\nParis was awarded the Games at the 131st IOC Session in Lima, Peru, on 13 September 2017. After multiple withdrawals that left only Paris and Los Angeles in contention, the International Olympic Committee (IOC) approved a process to concurrently award the 2024 and 2028 Summer Olympics to the two remaining candidate cities; both of the bids were praised for high technical plans and innovative ways to use a record-breaking number of existing and temporary fac

In [None]:
# Embed all document texts; `wiki_vector[i]` belongs to `wiki_content[i]`.
wiki_vector = embeddings.embed_documents(wiki_content)

# Assemble one record per document. The loop variables get their own names
# so they do not shadow the `wiki_content` / `wiki_source` lists being zipped.
wiki_json_structure = []
for chunk_text, chunk_vector, chunk_source in zip(wiki_content, wiki_vector, wiki_source):
    wiki_json_structure.append(
        {
            "wiki_content": chunk_text,
            "vector": chunk_vector,
            "wiki_source": chunk_source,
        }
    )


# 3. Store in VectorDB

In [None]:
pip install -U weaviate-client

In [None]:
import weaviate
import json

# Credentials object built from the Weaviate instance API key (Colab secret).
weaviate_auth = weaviate.auth.AuthApiKey(api_key=Weaviate_API_Key)

# Connect to the Weaviate endpoint stored in the WEAVIATE_INSTANCE_URL secret.
client = weaviate.Client(url=WEAVIATE_INSTANCE_URL, auth_client_secret=weaviate_auth)

In [None]:
# Check connection: the cell output should be True when the Weaviate
# instance is reachable and reports itself as ready.
client.is_ready()

True

In [None]:
# Class definition object. Weaviate's autoschema feature will infer properties when importing.
# "vectorizer": "none" because the vectors are computed outside Weaviate
# (by the embeddings model) and supplied explicitly at import time.
class_obj = {
    "class": "Wiki",
    "vectorizer": "none",
}

# Add the class to the schema, but only if it is not there yet. Creating a
# class that already exists is rejected, which would make this cell fail on
# every re-run of the notebook.
if not client.schema.exists(class_obj["class"]):
    client.schema.create_class(class_obj)

In [None]:
data = wiki_json_structure

# Keys of each record that are stored as object properties; the "vector"
# entry is passed to Weaviate separately.
property_keys = ("wiki_content", "wiki_source")

# Send objects to Weaviate in batches of up to 100.
client.batch.configure(batch_size=100)
with client.batch as batch:
    for i, record in enumerate(data):
        print(f"importing question: {i+1}")

        properties = {key: record[key] for key in property_keys}

        # Attach the precomputed embedding as the object's vector.
        batch.add_data_object(properties, "Wiki", vector=record["vector"])

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10
importing question: 11
importing question: 12
importing question: 13
importing question: 14
importing question: 15
importing question: 16
importing question: 17
importing question: 18
importing question: 19
importing question: 20


# 4. Retrieval & Generation

In [None]:
from langchain_upstage import ChatUpstage
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

## 1. Set up Upstage Solar

# Reuse the key loaded from Colab secrets at the top of the notebook.
# Never hardcode an API key in a notebook: anyone who can read the file can
# use it. A key that was ever committed in plain text should be treated as
# compromised and rotated.
llm = ChatUpstage(upstage_api_key=upstage_api_key)

# Prompt with two placeholders: {question} (the user's question) and
# {context} (the retrieved Wikipedia text the answer must be based on).
prompt_template = PromptTemplate.from_template(
    """
    You are a bot that answers questions about the 2024 Summer Olympics, using only the context provided.
    When you generate the answer always provide the link(wiki url) we can be searching for (provide just the link itself).
    If you don't know the answer, simply state that you don't know.
    ---
    Question: {question}
    ---
    Context: {context}
    """
)

# Pipeline: fill in the prompt -> call the chat model -> return plain text.
chain = prompt_template | llm | StrOutputParser()

In [None]:
## 2. Retrieval based on the Question and Generate Response

def ask_question(question, limit=2):
    """Answer a question using documents retrieved from the "Wiki" class.

    The question is embedded, the nearest stored documents are fetched from
    Weaviate, and their text together with their source is handed to the
    prompt/LLM chain as context.

    Args:
        question: The user's question as plain text.
        limit: Maximum number of documents to retrieve (default 2).

    Returns:
        The text produced by the chain.

    Raises:
        RuntimeError: If the Weaviate response carries an "errors" entry.
    """
    # Embed the question with the same model that embedded the documents.
    question_vector = embeddings.embed_query(question)

    # Fetch both the text and its source: the prompt asks the model to cite
    # the wiki link, which it can only do if the link is part of the context.
    response = (
        client.query
        .get("Wiki", ["wiki_content", "wiki_source"])
        .with_near_vector({"vector": question_vector})
        .with_limit(limit)
        .do()
    )

    # Surface query failures instead of feeding an error payload to the LLM.
    if response.get("errors"):
        raise RuntimeError(f"Weaviate query failed: {response['errors']}")

    # The matches are nested under data -> Get -> <class name>.
    hits = ((response.get("data") or {}).get("Get") or {}).get("Wiki") or []

    if hits:
        # Give the model clean text rather than the raw response dict.
        context = "\n\n".join(
            f"Source: {hit.get('wiki_source')}\nContent: {hit.get('wiki_content')}"
            for hit in hits
        )
    else:
        context = "No relevant context was found."

    # Use the chain to invoke the answer generation process.
    return chain.invoke({"question": question, "context": context})



# 5. Make a Chatbot

In [None]:
def chatbot():
    """Run an interactive question/answer loop on standard input/output.

    Each non-empty line typed by the user is passed to `ask_question` and the
    answer is printed. The loop ends when the user types "exit", "quit" or
    "bye" (case-insensitive, surrounding whitespace ignored), or when input
    ends or is interrupted.
    """
    exit_words = {"exit", "quit", "bye"}
    print("Welcome to the chatbot! Ask me anything.")

    while True:
        try:
            # Strip so that e.g. "exit " is still recognised as an exit word.
            question = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the cell was interrupted: end the chat
            # cleanly instead of showing a traceback.
            print("Goodbye!")
            break

        # Skip blank lines rather than spending API calls on an empty question.
        if not question:
            continue

        if question.lower() in exit_words:
            print("Goodbye!")
            break

        answer = ask_question(question)

        print(f"Chatbot: {answer}")

In [None]:
# Start the interactive chat loop; type "exit", "quit" or "bye" to stop.
chatbot()
# Example questions to try:
# How many medals have the United States won?
# Which country has the most medals?
# Tell me about the Men's Football Tournament
# How many participating nations in wrestling

Welcome to the chatbot! Ask me anything.
You: How many medals have the United States won?
Chatbot: The United States won a total of 128 medals at the 2024 Summer Olympics in Paris. You can find more information about this in the context provided.
You: Which country has the most medals?
Chatbot: The United States of America has the most medals.

Link: https://en.wikipedia.org/wiki/United_States_at_the_2024_Summer_Olympics
You: Tell me about the Men's Football Tournament
Chatbot: The men's football tournament at the 2024 Summer Olympics was held from 24 July to 9 August 2024. It was the 28th edition of the men's Olympic football tournament. Teams participating in the men's competition were restricted to under-23 players (born on or after 1 January 2001) with a maximum of three overage players allowed. Spain won their second gold medal and first since 1992, defeating hosts France 5–3 after extra time in the final, held at Parc des Princes in Paris.

Reference(s):
date: 11 August 2024
url: