In [1]:
# load OpenAI APIKEY from env
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv('/home/jovyan/.env')

In [2]:
llm_model = "gpt-3.5-turbo"

In [3]:
# imports (tins)
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [4]:
!pip install langchain-iris

Collecting langchain-iris
  Using cached langchain_iris-0.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting sqlalchemy-iris>=0.13.0 (from langchain-iris)
  Using cached sqlalchemy_iris-0.13.3-py3-none-any.whl.metadata (2.4 kB)
Using cached langchain_iris-0.1.2-py3-none-any.whl (6.8 kB)
Using cached sqlalchemy_iris-0.13.3-py3-none-any.whl (141 kB)
Installing collected packages: sqlalchemy-iris, langchain-iris
Successfully installed langchain-iris-0.1.2 sqlalchemy-iris-0.13.3


In [5]:
!pip install tiktoken

Collecting tiktoken
  Using cached tiktoken-0.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Using cached tiktoken-0.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
Installing collected packages: tiktoken
Successfully installed tiktoken-0.6.0


In [6]:
# imports (sample)
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings

from langchain_iris import IRISVector

In [7]:
loader = TextLoader("/app/data/wiki-es-cervantes.txt", encoding='utf-8')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(documents)

embeddings = OpenAIEmbeddings()


Created a chunk of size 513, which is longer than the specified 400
Created a chunk of size 602, which is longer than the specified 400
Created a chunk of size 455, which is longer than the specified 400
Created a chunk of size 1251, which is longer than the specified 400
Created a chunk of size 525, which is longer than the specified 400
Created a chunk of size 1053, which is longer than the specified 400
Created a chunk of size 639, which is longer than the specified 400
Created a chunk of size 836, which is longer than the specified 400
Created a chunk of size 931, which is longer than the specified 400
Created a chunk of size 598, which is longer than the specified 400
Created a chunk of size 694, which is longer than the specified 400
Created a chunk of size 712, which is longer than the specified 400
Created a chunk of size 626, which is longer than the specified 400
Created a chunk of size 2448, which is longer than the specified 400
Created a chunk of size 889, which is longer 

In [8]:
username = 'demo'
password = 'demo' 
hostname = 'iris'
port = '1972' 
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"

In [9]:
print(CONNECTION_STRING)

iris://demo:demo@iris:1972/USER


In [27]:
COLLECTION_NAME = "wikicervantes"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [33]:
# connect to existing collection
db = IRISVector(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [34]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

Number of docs in vector store: 86


# test

In [35]:
# create llm
llm = ChatOpenAI(temperature=0.0, model=llm_model)

rsp_schema = ResponseSchema(
    name="rsp",
    description="response to question",
)

In [36]:
# prompt response schema
response_schemas = [rsp_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [37]:
query_template = """\
Interprete and evaluate the following question in Spanish: {question}

{format_instructions}

Use the following context:
{context}

Do not use any other information.
"""

In [38]:
 # build prompt
from langchain.prompts import PromptTemplate
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "query"],
    partial_variables={"format_instructions": format_instructions},
    template=query_template,
)

In [39]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": QA_CHAIN_PROMPT
    }
)

In [45]:
result = qa_chain("dime 3 preguntas sencillas con opciones para un niño sobre Cervantes")
print(result['result'])



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mInterprete and evaluate the following question in Spanish: dime 3 preguntas sencillas con opciones para un niño sobre Cervantes

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"rsp": string  // response to question
}
```

Use the following context:
Obra de Cervantes

Novelas

Hay diversas obras que se han atribuido a Cervantes, con variado fundamento. Entre las más conocidas cabe mencionar:

El padre del escritor era Rodrigo de Cervantes (1509-1585), casado con Leonor de Cortinas, de la cual apenas se sabe nada, excepto que era natural de Arganda del Rey.​ Los hermanos de Cervantes fueron Andrés (1543), Andrea (1544), Luisa (1546), que llegó a ser priora de un convento carmelita; Rodrigo (1550), también soldado, que le acompañó en el cautiverio argelino;

In [51]:
output_dict = output_parser.parse(result["result"])
output_dict

{'rsp': '¿Cuál era el nombre del padre de Cervantes? A) Rodrigo B) Juan C) Andrés'}