In [1]:
# dependencies
!pip install pandas python-dotenv openai langchain-iris langchain tiktoken langchain-community langchain-core

Collecting pandas
  Using cached pandas-2.2.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Collecting python-dotenv
  Using cached python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai
  Downloading openai-1.30.4-py3-none-any.whl.metadata (21 kB)
Collecting langchain-iris
  Downloading langchain_iris-0.2.0-py3-none-any.whl.metadata (2.5 kB)
Collecting langchain
  Using cached langchain-0.2.1-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken
  Using cached tiktoken-0.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-community
  Using cached langchain_community-0.2.1-py3-none-any.whl.metadata (8.9 kB)
Collecting langchain-core
  Using cached langchain_core-0.2.1-py3-none-any.whl.metadata (5.9 kB)
Collecting numpy>=1.23.2 (from pandas)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using 

In [2]:
# load OpenAI APIKEY from env
import os
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv('/home/jovyan/.env')

In [3]:
# OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain

# text loading and splitting
from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# IRIS as vector store
from langchain_iris import IRISVector

# parse response from llm
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser

In [4]:
# optional. LangChain LangSmith https://www.langchain.com/langsmith
from langsmith.wrappers import wrap_openai
from langsmith import traceable

In [5]:
# open llm model
llm_model = "gpt-3.5-turbo"

# load text & split in chunks
loader = TextLoader("/app/data/wiki-es-cervantes.txt", encoding='utf-8')
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
docs = text_splitter.split_documents(documents)

# function to use to calculate vectors (embeddings) from text
embeddings = OpenAIEmbeddings()


Created a chunk of size 513, which is longer than the specified 400
Created a chunk of size 602, which is longer than the specified 400
Created a chunk of size 455, which is longer than the specified 400
Created a chunk of size 1251, which is longer than the specified 400
Created a chunk of size 525, which is longer than the specified 400
Created a chunk of size 1053, which is longer than the specified 400
Created a chunk of size 639, which is longer than the specified 400
Created a chunk of size 836, which is longer than the specified 400
Created a chunk of size 931, which is longer than the specified 400
Created a chunk of size 598, which is longer than the specified 400
Created a chunk of size 694, which is longer than the specified 400
Created a chunk of size 712, which is longer than the specified 400
Created a chunk of size 626, which is longer than the specified 400
Created a chunk of size 2448, which is longer than the specified 400
Created a chunk of size 889, which is longer 

In [6]:
# IRIS connection string
username = 'demo'
password = 'demo' 
hostname = 'iris'
port = '1972' 
namespace = 'USER'
CONNECTION_STRING = f"iris://{username}:{password}@{hostname}:{port}/{namespace}"
print(CONNECTION_STRING)

iris://demo:demo@iris:1972/USER


In [7]:
# load documents (vectors from splitted text)
# this will create the collection
COLLECTION_NAME = "wikicervantes"

db = IRISVector.from_documents(
    embedding=embeddings,
    documents=docs,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [11]:
# use the following if you are connecting to an existing collection
#db = IRISVector(
#    embedding_function=embeddings,
#    collection_name=COLLECTION_NAME,
#    connection_string=CONNECTION_STRING,
#)

In [8]:
print(f"Number of docs in vector store: {len(db.get()['ids'])}")

Number of docs in vector store: 86


# Questions & Answers using documents as context

In [9]:
# create llm
llm = ChatOpenAI(temperature=0.0, model=llm_model)

  warn_deprecated(


In [10]:
# response schema to parse response afterwards
rsp_schema = ResponseSchema(
    name="rsp",
    description="response to question",
    type="string"
)

# prompt response schema
response_schemas = [rsp_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [11]:
query_template = """\
You are a Co-Pilot history teacher that helps students to understand history lessons.
Using the context, provide a comprensible and clear response that will help a student to study History.

{format_instructions}

Use the following context:
{context}

Question:
{question}

Do not use any other information.
"""

In [12]:
# build prompt
from langchain.prompts import PromptTemplate
QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "query"],
    partial_variables={"format_instructions": format_instructions},
    template=query_template,
)

In [13]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(search_kwargs={"k": 5}),
    return_source_documents=True,
    chain_type_kwargs={
        "verbose": True,
        "prompt": QA_CHAIN_PROMPT
    }
)

In [14]:
result = qa_chain("por qué fue Cervantes relevante ?")
print(result)

  warn_deprecated(




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mYou are a Co-Pilot history teacher that helps students to understand history lessons.
Using the context, provide a comprensible and clear response that will help a student to study History.

The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
{
	"rsp": string  // response to question
}
```

Use the following context:
Cervantes y la religión

Cervantes es sumamente original. Parodiando un género que empezaba a periclitar, como el de los libros de caballerías, creó otro género sumamente vivaz, la novela polifónica, donde se superponen las cosmovisiones y los puntos de vista hasta confundirse en complejidad con la misma realidad, recurriendo incluso a juegos metaficcionales. En la época la épica podía escribirse también en prosa, y con el precedente en el teatro de

In [15]:
# extract actual response
output_dict = output_parser.parse(result["result"])
output_dict

{'rsp': 'Cervantes fue relevante por su originalidad al crear un nuevo género literario, la novela polifónica, que reflejaba la sociedad de su tiempo y el comportamiento humano. Su obra maestra, El Quijote, es considerada la primera novela moderna y una de las mejores obras de la literatura universal, lo que lo llevó a ser mundialmente conocido y a ser considerado una de las máximas figuras de la literatura española.'}