In [3]:
import weaviate, os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.vectorstores import Weaviate    
from langchain.document_loaders import PyPDFLoader
import json




AZURE_OPEN_AI_ORGANIZATION = os.getenv('OPEN_AI_ORGANIZATION')

with  weaviate.connect_to_local(
    port=8082,     
    grpc_port=50051,
) as client:
    
    print(client.collections.list_all())

{}

### Create a Schema definiton

In [3]:

class_definition= {       
    "class": "RecipeOpenAI",
    "description": "Document from github or stackoverflow",
    "vectorizer": "text2vec-openai",
    "vectorIndexConfig": {
        "distance": "cosine" # Set to "cosine" for English models; "dot" for multilingual 
    },
    "moduleConfig": {
        "text2vec-openai": {
            "resourceName":AZURE_OPEN_AI_ORGANIZATION,
            "deploymentId": "text-embedding-ada-002"
        },
        "generative-openai": {
            "resourceName":AZURE_OPEN_AI_ORGANIZATION,
            "deploymentId": "gpt-35-turbo"
            },
    },
    "properties": [
        {
            "name": "docSource",
            "description": "Type of document ('learn', 'astro', 'airflow', 'stackoverflow', 'code_samples')",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "skip": "False",
                    "vectorizePropertyName": "False"
                }
            }
        },
        {
            "name": "docLink",
            "description": "The url of source data",
            "dataType": ["text"],
            "tokenization": "field",
            "moduleConfig": {
                "text2vec-openai": {
                    "skip": "True",
                    "vectorizePropertyName": "False"
                }
            }
        },
        {

}





In [3]:
client.create_schema(class_definition)


AttributeError: 'WeaviateClient' object has no attribute 'create_schema'

In [15]:

text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
embeddings = AzureOpenAIEmbeddings(model="text-embedding-ada-002", client=client )


# import first article
loader = PyPDFLoader("brazil-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Brazil")

idx = [i*16 for i in range(int(len(docs) /16+1))] + [len(docs)]

for i in range(len(idx) - 1):
    Weaviate.from_documents(
        docs[idx[i]:idx[i+1]], 
        embeddings, 
        index_name="RecipeOpenAI", 
        client=client, 
        by_text=False
    )


# import second article
loader = PyPDFLoader("netherlands-wikipedia-article-text.pdf", extract_images=False)
docs = loader.load_and_split(text_splitter)
print(f"GOT {len(docs)} docs for Netherlands")


# divide documents in 16 chunks
idx = [i*16 for i in range(int(len(docs) /16+1))] + [len(docs)]


for i in range(len(idx) - 1):
    Weaviate.from_documents(
        docs[idx[i]:idx[i+1]], 
        embeddings, 
        index_name="RecipeOpenAI", 
        client=client, 
        by_text=False
    )





KeyboardInterrupt: 

In [5]:
response = (
    client.query
    .aggregate("RecipeOpenAI")
    .with_fields("source { count type topOccurrences { occurs value } }")
    .do()
)

print(json.dumps(response, indent=2))

# Let's query some objects
response = (
    client.query
    .get("RecipeOpenAI", "text source")
    .with_limit(4)
    .do()
)

print(json.dumps(response, indent=2))

{
  "data": {
    "Aggregate": {
      "RecipeOpenAI": [
        {
          "source": {
            "count": 521,
            "topOccurrences": [
              {
                "occurs": 274,
                "value": "netherlands-wikipedia-article-text.pdf"
              },
              {
                "occurs": 247,
                "value": "brazil-wikipedia-article-text.pdf"
              }
            ],
            "type": "text"
          }
        }
      ]
    }
  }
}
{
  "data": {
    "Get": {
      "RecipeOpenAI": [
        {
          "source": "netherlands-wikipedia-article-text.pdf",
          "text": "the larger Zuiderzee Works in which four polders totalling 2,500 square kilometres (965 sq mi) were reclaimed from the sea.\nThe Netherlands is one of the countries that may suffer most from climate change. Not only is the rising sea a problem, but erratic\nweather patterns may cause the rivers to overflow.\nDelta Works"
        },
        {
          "source": "netherla

In [6]:
# lets do a RAG directly using only Weaviate

# This is our prompt.
generateTask = "Quelle est la nourriture traditionnelle de ce pays ? Answer in Spanish"
# lets filter it out, and only use this specific file
source_file = "brazil-wikipedia-article-text.pdf"

result = (
  client.query
  .get("RecipeOpenAI", "text")
  .with_generate(grouped_task = generateTask)
  .with_where({
      "operator": "Equal",
      "path": ["source"],
      "valueText": source_file
  })
  .with_near_text({
   "concepts": ["tradicional Food"]
  })
  .with_limit(5).do()
)

print(json.dumps(result, indent=1))


{
 "data": {
  "Get": {
   "RecipeOpenAI": null
  }
 },
 "errors": [
  {
   "locations": [
    {
     "column": 6,
     "line": 1
    }
   ],
   "message": "explorer: get class: vectorize params: vectorize params: vectorize params: vectorize keywords: remote client vectorize: API Key: no api key found neither in request header: X-Azure-Api-Key nor in environment variable under AZURE_APIKEY",
   "path": [
    "Get",
    "RecipeOpenAI"
   ]
  }
 ]
}


In [7]:
db = Weaviate(client=client, index_name="RecipeOpenAI", text_key="text", embedding=embeddings)
docs = db.similarity_search("traditional food")
print(docs)

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': 'explorer: get class: vectorize params: vectorize params: vectorize params: vectorize keywords: remote client vectorize: API Key: no api key found neither in request header: X-Azure-Api-Key nor in environment variable under AZURE_APIKEY', 'path': ['Get', 'RecipeOpenAI']}]

In [8]:
from langchain.prompts import PromptTemplate

prompt_template = """Text: {context}

Question: {question}

Answer the question based on the text provided. If the text doesn't contain the answer, 
reply that the answer is not available."""


PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

chain_type_kwargs = {"prompt": PROMPT}

In [9]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI  



# Let's answer some question
#source_file = "brazil-wikipedia-article-text.pdf"
source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = {
      "operator": "Equal",
      "path": ["source"],
      "valueText": source_file
  }

# we want our retriever to filter the results
retriever = db.as_retriever(search_kwargs={"where_filter": where_filter})

openai_client = AzureChatOpenAI(
    model_name="gpt-35-turbo", 
    deployment_name = "gpt-35-turbo",
    # azure_endpoint=AZURE_OPEN_AI_BASE_URL,
    )


qa = RetrievalQA.from_chain_type(llm=openai_client, 
                                 chain_type="stuff", #map_reduce
                                 retriever=retriever, 
                                 chain_type_kwargs=chain_type_kwargs, 
                                 return_source_documents=True)
                                 
answer = qa({"query": "What is the traditional food of this country?"})
print(answer)

ValueError: Error during query: [{'locations': [{'column': 6, 'line': 1}], 'message': 'explorer: get class: vectorize params: vectorize params: vectorize params: vectorize keywords: remote client vectorize: API Key: no api key found neither in request header: X-Azure-Api-Key nor in environment variable under AZURE_APIKEY', 'path': ['Get', 'RecipeOpenAI']}]

In [11]:
answer

{'query': 'What is the traditional food of this country?',
 'result': 'The traditional food of this country includes cakes such as Vlaai, Moorkop, and Bossche Bol, as well as the savoury pastry worstenbroodje. Additionally, the traditional diet consists of potatoes, meat, and seasonal vegetables. The text also mentions Kibbeling and lekkerbek as national fast food options.',
 'source_documents': [Document(page_content='cream, custard or fruits. Cakes, such as the \nVlaai\n from Limburg and the \nMoorkop\n and \nBossche Bol\n from Brabant, are typical\npastries. Savoury pastries also occur, with the \nworstenbroodje\n (a roll with a sausage of ground beef, literally translates into sausage\nbread) being the most popular. The traditional alcoholic beverage of the region is beer. There are many local brands, ranging from'),
  Document(page_content='toppings, with cereal for breakfast as an alternative. Traditionally, dinner consists of potatoes, a portion of meat, and (seasonal)\nvegetabl

In [14]:
import json
from airflow.models.connection import Connection

c = Connection(
    conn_id='weaviate_default',
    conn_type='weaviate',
    host='http://weaviate:8080/',
)
print(f"AIRFLOW_CONN_{c.conn_id.upper()}='{c.get_uri()}'")

AIRFLOW_CONN_WEAVIATE_DEFAULT='weaviate://http://weaviate%3A8080%2F'
