In [1]:
import requests
import json
import os

# Download the sample Jeopardy questions.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error (404, 5xx, ...) right here instead
# of letting it show up later as a confusing JSON decoding failure.
resp = requests.get(
    'https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json',
    timeout=30,
)
resp.raise_for_status()
data = resp.json()  # Parse the response body as JSON

# Preview the parsed payload: its container type and number of entries
print(type(data), len(data))

def json_print(data):
    """Pretty-print ``data`` to stdout as JSON with a two-space indent."""
    formatted = json.dumps(data, indent=2)
    print(formatted)

# Pretty-print the first element of the downloaded data as a sample
json_print(data[0])

<class 'list'> 10
{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [2]:
import weaviate, os
from weaviate import EmbeddedOptions
import openai

# The OpenAI key comes from the environment so it never appears in the notebook.
openai.api_key = os.environ['OPENAI_API_KEY']

# Headers forwarded to Weaviate. The base URL is pinned to the public OpenAI
# endpoint here rather than being read from OPENAI_API_BASE.
openai_headers = {
    "X-OpenAI-BaseURL": 'https://api.openai.com',
    "X-OpenAI-Api-Key": openai.api_key,
}

# Start an embedded Weaviate instance and report whether it is ready.
client = weaviate.Client(
    additional_headers=openai_headers,
    embedded_options=EmbeddedOptions(),
)
print(f"Client created? {client.is_ready()}")

Started /home/pierre/.cache/weaviate-embedded: process ID 423413


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2023-12-08T14:52:35+01:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2023-12-08T14:52:35+01:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"question_2zZ9B34bq0QN","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-08T14:52:35+01:00","took":95039}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2023-12-08T14:52:35+01:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2023-12-08T14:52:35+01:00"}


Client created? True


In [3]:
# Reset the schema. CAUTION: this deletes your existing "Question" collection.
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

# OPENAI_API_BASE is optional: when it is not set, fall back to the public
# OpenAI endpoint (the same URL the client's X-OpenAI-BaseURL header uses)
# instead of failing with a KeyError.
class_obj = {
    "class": "Question",
    "vectorizer": "text2vec-openai",  # Use OpenAI as the vectorizer
    "moduleConfig": {
        "text2vec-openai": {
            "model": "ada",
            "modelVersion": "002",
            "type": "text",
            "baseURL": os.environ.get("OPENAI_API_BASE", "https://api.openai.com")
        }
    }
}

client.schema.create_class(class_obj)

{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"question_NvnSxsBzJMUj","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2023-12-08T14:52:42+01:00","took":105858}


In [4]:
# Reminder of the record structure (Category / Question / Answer keys)
# before the records are imported
json_print(data[0])

{
  "Category": "SCIENCE",
  "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
  "Answer": "Liver"
}


In [5]:
# Import every record through the batching client (batch_size=5).
with client.batch.configure(batch_size=5) as batch:
    for number, record in enumerate(data, start=1):
        print(f"importing question: {number}")

        # Lower-case the source keys to get the property names that are stored
        properties = {
            key.lower(): record[key]
            for key in ("Answer", "Question", "Category")
        }

        batch.add_data_object(class_name="Question", data_object=properties)

importing question: 1
importing question: 2
importing question: 3
importing question: 4
importing question: 5
importing question: 6
importing question: 7
importing question: 8
importing question: 9
importing question: 10


In [6]:
# Fetch one Question object together with its vector
vector_query = client.query.get("Question", ["category", "question", "answer"])
vector_query = vector_query.with_additional("vector").with_limit(1)
result = vector_query.do()

json_print(result)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "vector": [
              0.0076300455,
              0.00061342155,
              -0.02073168,
              0.0049861,
              0.01675592,
              0.024458133,
              -0.015115757,
              -0.03789436,
              -0.02800089,
              -0.02363149,
              0.0017861392,
              0.025665294,
              -0.010949738,
              -0.011684532,
              0.020154342,
              -0.014420327,
              0.0065245745,
              -0.0077153337,
              -0.010621705,
              0.014958301,
              -0.018815968,
              0.003939675,
              0.008266429,
              0.00072331255,
              -0.01007061,
              0.011461469,
              0.02582275,
              -0.015338819,
              -0.008935616,
              0.010083731,
              0.018501056,
              0.004369398,
              -

In [7]:
# Semantic search: the two questions closest to the concept "biology".
# `concepts` and the additional properties are passed as lists, matching the
# other near-text queries in this notebook.
response = (
    client.query
    .get("Question", ["question", "answer", "category"])
    .with_near_text({"concepts": ["biology"]})
    .with_additional(["distance"])
    .with_limit(2)
    .do()
)

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.19695163
          },
          "answer": "DNA",
          "category": "SCIENCE",
          "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
        },
        {
          "_additional": {
            "distance": 0.20142835
          },
          "answer": "species",
          "category": "SCIENCE",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        }
      ]
    }
  }
}


In [8]:
# Semantic search for "animals": up to 10 results, each with its distance
animals_filter = {"concepts": ["animals"]}
animals_query = client.query.get("Question", ["question", "answer"])
animals_query = animals_query.with_near_text(animals_filter)
animals_query = animals_query.with_limit(10).with_additional(["distance"])
response = animals_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.19695163
          },
          "answer": "DNA",
          "category": "SCIENCE",
          "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
        },
        {
          "_additional": {
            "distance": 0.20142835
          },
          "answer": "species",
          "category": "SCIENCE",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        }
      ]
    }
  }
}


In [9]:
# Same "animals" search, now with a distance value of 0.24 in the near-text filter
bounded_filter = {"concepts": ["animals"], "distance": 0.24}
bounded_query = client.query.get("Question", ["question", "answer"])
bounded_query = bounded_query.with_near_text(bounded_filter)
bounded_query = bounded_query.with_limit(10).with_additional(["distance"])
response = bounded_query.do()

json_print(response)

{
  "data": {
    "Get": {
      "Question": [
        {
          "_additional": {
            "distance": 0.18963969
          },
          "answer": "Elephant",
          "question": "It's the only living mammal in the order Proboseidea"
        },
        {
          "_additional": {
            "distance": 0.19144487
          },
          "answer": "the nose or snout",
          "question": "The gavial looks very much like a crocodile except for this bodily feature"
        },
        {
          "_additional": {
            "distance": 0.20419747
          },
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        },
        {
          "_additional": {
            "distance": 0.21438634
          },
          "answer": "species",
          "question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification"
        },
        {
    

In [10]:
# Create a single new Question object and keep the value returned by create()
new_question = {
    'question': "Leonardo da Vinci was born in this country.",
    'answer': "Italy",
    'category': "Culture",
}
object_uuid = client.data_object.create(
    class_name="Question",
    data_object=new_question,
)

In [11]:
# Show the value returned by client.data_object.create()
print(object_uuid)

92689faa-117f-4d63-a20e-7e88bbd526ba


In [12]:
# Read the object back by its UUID, without requesting the vector
data_object = client.data_object.get_by_id(
    uuid=object_uuid,
    class_name="Question",
)
json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1702043720310,
  "id": "92689faa-117f-4d63-a20e-7e88bbd526ba",
  "lastUpdateTimeUnix": 1702043720310,
  "properties": {
    "answer": "Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vectorWeights": null
}


In [13]:
# Same lookup, this time asking for the vector as well
lookup_options = {"class_name": 'Question', "with_vector": True}
data_object = client.data_object.get_by_id(object_uuid, **lookup_options)

json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1702043720310,
  "id": "92689faa-117f-4d63-a20e-7e88bbd526ba",
  "lastUpdateTimeUnix": 1702043720310,
  "properties": {
    "answer": "Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vector": [
    0.022491472,
    -0.013062453,
    -0.0031088893,
    -0.047720812,
    -0.0038829134,
    0.012032554,
    -0.017399546,
    -0.0040556295,
    -0.004519404,
    -0.034671154,
    0.0105100935,
    0.025689917,
    -0.0032112396,
    -0.0072348844,
    0.010337377,
    -0.001473684,
    0.01306885,
    0.0053605954,
    0.03953279,
    -0.00806648,
    0.002354056,
    0.021109743,
    0.011699915,
    -0.01088751,
    0.0051399027,
    0.0018854839,
    0.016516775,
    -0.03216357,
    0.014827995,
    -0.031702995,
    -0.007880971,
    0.0006992602,
    -0.009953563,
    -0.0133950915,
    -0.018742893,
    -0.014431388,
    -0.007682667,
    -0.028095149,
    0.00814964,
    0.015301

In [14]:
# Send a new value for the "answer" property of the object created earlier
updated_fields = {'answer': "Florence, Italy"}
client.data_object.update(
    data_object=updated_fields,
    class_name="Question",
    uuid=object_uuid,
)

In [15]:
# Re-read the object to check the result of the update
data_object = client.data_object.get_by_id(uuid=object_uuid, class_name='Question')

json_print(data_object)

{
  "class": "Question",
  "creationTimeUnix": 1702043720310,
  "id": "92689faa-117f-4d63-a20e-7e88bbd526ba",
  "lastUpdateTimeUnix": 1702043800676,
  "properties": {
    "answer": "Florence, Italy",
    "category": "Culture",
    "question": "Leonardo da Vinci was born in this country."
  },
  "vectorWeights": null
}
