In [None]:
import json
import os

import weaviate
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Weaviate




# Azure OpenAI resource name, read from the OPEN_AI_ORGANIZATION environment
# variable. The value is None when that variable is not set.
AZURE_OPEN_AI_ORGANIZATION = os.environ.get('OPEN_AI_ORGANIZATION')

# Connect to a local Weaviate instance, overriding the HTTP port (8083) and the
# gRPC port (50051). No host is passed, so the client's default host is used.
client = weaviate.connect_to_local(port=8083, grpc_port=50051)

# Print whatever `client.is_ready()` reports for this connection.
print(client.is_ready())

In [None]:

# Name of a scratch collection. None of the other visible cells reference it.
collection_name: str = "test_collection"





In [None]:
#list collections


### Create a Schema definition

In [None]:

# Weaviate class definition for the "RecipeOpenAI" class.
#
# - Objects are vectorized by the `text2vec-openai` module and the
#   `generative-openai` module is configured alongside it; both point at the
#   Azure resource named by AZURE_OPEN_AI_ORGANIZATION.
# - The per-property `skip` / `vectorizePropertyName` flags are real booleans.
#   The strings "True"/"False" are not boolean values, so a string "True" does
#   not reliably switch vectorization off for a property.
# - The `properties` list and the outer dict are explicitly closed; the previous
#   literal ended in a dangling empty `{ }` entry and was not valid Python.
class_definition = {
    "class": "RecipeOpenAI",
    "description": "Document from github or stackoverflow",
    "vectorizer": "text2vec-openai",
    "vectorIndexConfig": {
        "distance": "cosine",  # Set to "cosine" for English models; "dot" for multilingual
    },
    "moduleConfig": {
        "text2vec-openai": {
            "resourceName": AZURE_OPEN_AI_ORGANIZATION,
            "deploymentId": "text-embedding-ada-002",
        },
        "generative-openai": {
            "resourceName": AZURE_OPEN_AI_ORGANIZATION,
            "deploymentId": "gpt-35-turbo",
        },
    },
    "properties": [
        {
            # Included in the vector; the property name itself is not vectorized.
            "name": "docSource",
            "description": "Type of document ('learn', 'astro', 'airflow', 'stackoverflow', 'code_samples')",
            "dataType": ["text"],
            "moduleConfig": {
                "text2vec-openai": {
                    "skip": False,
                    "vectorizePropertyName": False,
                }
            },
        },
        {
            # URL is stored and tokenized as a single field, but excluded from the vector.
            "name": "docLink",
            "description": "The url of source data",
            "dataType": ["text"],
            "tokenization": "field",
            "moduleConfig": {
                "text2vec-openai": {
                    "skip": True,
                    "vectorizePropertyName": False,
                }
            },
        },
    ],
}





In [None]:
# Register the class definition with the Weaviate server.
# The client returned by `weaviate.connect_to_local(...)` manages classes through
# `client.collections`; it has no `create_schema` method.
# The existence check makes this cell safe to run more than once.
if not client.collections.exists(class_definition["class"]):
    client.collections.create_from_dict(class_definition)


In [None]:

# Number of document chunks handed to Weaviate per `from_documents` call.
BATCH_SIZE = 16

text_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=50)
# NOTE(review): `client` here is the Weaviate client object, passed through as the
# embeddings' `client` argument exactly as before — confirm this is intended.
embeddings = AzureOpenAIEmbeddings(model="text-embedding-ada-002", client=client)


def import_pdf(pdf_path, label, batch_size=BATCH_SIZE):
    """Load one PDF, split it into chunks and store the chunks in "RecipeOpenAI".

    Args:
        pdf_path: Path of the PDF file handed to `PyPDFLoader`.
        label: Human-readable name used only in the progress message.
        batch_size: Maximum number of chunks per `Weaviate.from_documents` call.

    Returns:
        The list of chunks produced by `load_and_split`.
    """
    loader = PyPDFLoader(pdf_path, extract_images=False)
    chunks = loader.load_and_split(text_splitter)
    print(f"GOT {len(chunks)} docs for {label}")

    # Step through the chunks in slices of `batch_size`. Using range() with a
    # step never yields an empty slice, so no call is made with zero documents
    # (which happened before whenever the chunk count was a multiple of 16 or 0).
    # NOTE(review): the LangChain `Weaviate` wrapper receives the notebook's
    # `client` object — confirm the wrapper accepts this client type.
    for start in range(0, len(chunks), batch_size):
        Weaviate.from_documents(
            chunks[start:start + batch_size],
            embeddings,
            index_name="RecipeOpenAI",
            client=client,
            by_text=False,
        )
    return chunks


# import first article
docs = import_pdf("brazil-wikipedia-article-text.pdf", "Brazil")

# import second article (`docs` ends up holding the chunks of this last import)
docs = import_pdf("netherlands-wikipedia-article-text.pdf", "Netherlands")



In [None]:
# NOTE(review): `client.query ... .do()` is a GraphQL-style query builder; confirm it
# is available on the client object created by `weaviate.connect_to_local`.

# Aggregate the "source" property of "RecipeOpenAI": count, type and top occurrences.
source_stats_query = client.query.aggregate("RecipeOpenAI").with_fields(
    "source { count type topOccurrences { occurs value } }"
)
response = source_stats_query.do()
print(json.dumps(response, indent=2))

# Fetch at most four objects, returning their "text" and "source" properties.
sample_objects_query = client.query.get("RecipeOpenAI", "text source").with_limit(4)
response = sample_objects_query.do()
print(json.dumps(response, indent=2))

In [None]:
# Retrieval-augmented generation using only Weaviate: the query retrieves
# objects and passes them to a grouped generate task in the same request.

# Prompt sent as the grouped generate task.
generateTask = "Quelle est la nourriture traditionnelle de ce pays ? Answer in Spanish"
# Only objects whose "source" equals this file name are considered.
source_file = "brazil-wikipedia-article-text.pdf"

source_equals_file = {
    "operator": "Equal",
    "path": ["source"],
    "valueText": source_file,
}
near_text_concepts = {"concepts": ["tradicional Food"]}

rag_query = client.query.get("RecipeOpenAI", "text")
rag_query = rag_query.with_generate(grouped_task=generateTask)
rag_query = rag_query.with_where(source_equals_file)
rag_query = rag_query.with_near_text(near_text_concepts)
result = rag_query.with_limit(5).do()

print(json.dumps(result, indent=1))


In [None]:
# Wrap the existing "RecipeOpenAI" index in a LangChain vector store. The chunk
# text is read from the "text" property and queries are embedded with `embeddings`.
db = Weaviate(
    client=client,
    index_name="RecipeOpenAI",
    text_key="text",
    embedding=embeddings,
)

# Run a similarity search for a free-text query and print the returned documents.
search_query = "traditional food"
docs = db.similarity_search(search_query)
print(docs)

In [None]:
from langchain.prompts import PromptTemplate

# Prompt text with two placeholders: {context} (retrieved text) and {question}.
# Written as adjacent literals so the exact line breaks and the trailing space
# after "answer," are visible.
prompt_template = (
    "Text: {context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Answer the question based on the text provided. If the text doesn't contain the answer, \n"
    "reply that the answer is not available."
)

# Template object declaring the two variables that must be supplied at format time.
prompt_variables = ["context", "question"]
PROMPT = PromptTemplate(template=prompt_template, input_variables=prompt_variables)

# Keyword arguments forwarded to the chain so it uses the custom prompt.
chain_type_kwargs = dict(prompt=PROMPT)

In [None]:
from langchain.chains import RetrievalQA
from langchain.chat_models import AzureChatOpenAI  



# Answer a question with a RetrievalQA chain restricted to one source file.
# Alternative source file: "brazil-wikipedia-article-text.pdf"
source_file = "netherlands-wikipedia-article-text.pdf"
where_filter = {
    "operator": "Equal",
    "path": ["source"],
    "valueText": source_file,
}

# The retriever forwards the filter on every search, so only chunks whose
# "source" equals `source_file` are returned.
retriever_search_kwargs = {"where_filter": where_filter}
retriever = db.as_retriever(search_kwargs=retriever_search_kwargs)

# Chat model on the "gpt-35-turbo" deployment. No azure_endpoint is passed here.
openai_client = AzureChatOpenAI(
    deployment_name="gpt-35-turbo",
    model_name="gpt-35-turbo",
)

# "stuff" chain type (the alternative noted previously was "map_reduce").
# The source documents are returned together with the answer.
qa_options = {
    "llm": openai_client,
    "chain_type": "stuff",
    "retriever": retriever,
    "chain_type_kwargs": chain_type_kwargs,
    "return_source_documents": True,
}
qa = RetrievalQA.from_chain_type(**qa_options)

question = "What is the traditional food of this country?"
answer = qa({"query": question})
print(answer)

In [None]:
# Show the `answer` object returned by the RetrievalQA call as this cell's output.
answer

In [None]:
import json
from airflow.models.connection import Connection

# Describe the Weaviate endpoint as an Airflow connection object.
c = Connection(conn_type='weaviate', conn_id='weaviate_default', host='http://weaviate:8083/')

# Print it as an `AIRFLOW_CONN_<CONN_ID>='<uri>'` assignment line.
print(f"AIRFLOW_CONN_{c.conn_id.upper()}='{c.get_uri()}'")

In [None]:
import pandas as pd

In [None]:
# NOTE(review): absolute path on one developer's machine; the notebook only
# runs elsewhere if this file exists at the same location.
splitted_parquet_path = "/home/isma/repos/book/data-pipelines-with-airflow-2nd-ed/chapter13_genai/recipe_book/notebooks/splitted (1).parquet"

# Load the parquet file into a DataFrame and display it as the cell output.
df = pd.read_parquet(splitted_parquet_path)
df

In [None]:
# Print the "chunk" value stored under index label 0.
print(df["chunk"][0])

In [None]:
# Print the "chunk" value stored under index label 1.
print(df["chunk"][1])

In [None]:
import os
# Remove "X-Azure-Api-Key" from this process's environment.
# Only report whether it was present. The name indicates an API key, and printing
# the popped value would store the secret in the notebook's saved output.
_removed_value = os.environ.pop("X-Azure-Api-Key", None)
print("X-Azure-Api-Key removed" if _removed_value is not None else "X-Azure-Api-Key was not set")
del _removed_value  # do not keep the value around in the kernel namespace


In [None]:
# Remove "X-Azure-Api-Key" from this process's environment (no-op if already gone).
# Only report whether it was present. The name indicates an API key, and printing
# the popped value would store the secret in the notebook's saved output.
_removed_value = os.environ.pop("X-Azure-Api-Key", None)
print("X-Azure-Api-Key removed" if _removed_value is not None else "X-Azure-Api-Key was not set")
del _removed_value  # do not keep the value around in the kernel namespace


In [None]:
# Remove "AZURE_API_KEY" from this process's environment.
# Only report whether it was present. The name indicates an API key, and printing
# the popped value would store the secret in the notebook's saved output.
_removed_value = os.environ.pop("AZURE_API_KEY", None)
print("AZURE_API_KEY removed" if _removed_value is not None else "AZURE_API_KEY was not set")
del _removed_value  # do not keep the value around in the kernel namespace


In [None]:
# Set the environment variable "AZURE_API_KE" to the placeholder string "22".
# NOTE(review): the name differs from "AZURE_API_KEY" popped in an earlier cell —
# confirm the spelling is intentional.
os.environ.update({"AZURE_API_KE": "22"})

In [None]:
  # Look up "X-Azure-Api-Ke" in the environment and show it as the cell output.
  # Subscript access raises KeyError when the variable is not set.
  # NOTE(review): the name differs from "X-Azure-Api-Key" popped in earlier cells —
  # confirm the spelling is intentional.
  os.environ["X-Azure-Api-Ke"]