In [2]:
import json
import weaviate
import requests

# Get schema

In [3]:
# Open a connection to the local Weaviate instance, then fetch and dump its schema.
client = weaviate.Client(url="http://localhost:8080")
schema = client.schema.get()
print(schema)

{'classes': [{'class': 'OpenAIDocument', 'description': 'The main class', 'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2}, 'cleanupIntervalSeconds': 60, 'stopwords': {'additions': None, 'preset': 'en', 'removals': None}}, 'properties': [{'dataType': ['string'], 'description': 'The chunk id', 'name': 'chunk_id', 'tokenization': 'word'}, {'dataType': ['string'], 'description': 'The document id', 'name': 'document_id', 'tokenization': 'word'}, {'dataType': ['text'], 'description': "The chunk's text", 'name': 'text', 'tokenization': 'word'}, {'dataType': ['string'], 'description': 'The source of the data', 'name': 'source', 'tokenization': 'word'}, {'dataType': ['string'], 'description': 'The source id', 'name': 'source_id', 'tokenization': 'word'}, {'dataType': ['string'], 'description': 'The source url', 'name': 'url', 'tokenization': 'word'}, {'dataType': ['date'], 'description': 'Creation date of document', 'name': 'created_at'}, {'dataType': ['string'], 'description': 'Document

# Retrieve properties of `OpenAIDocument` class

In [4]:
# Pull the list of class definitions out of the schema
classes = schema['classes']

# Scan for the 'OpenAIDocument' definition; stays None when the schema lacks it
openai_class = None
for class_def in classes:
    if class_def['class'] == 'OpenAIDocument':
        openai_class = class_def
        break

# Report a missing class first; otherwise list each property on its own line
if openai_class is None:
    print("Class OpenAIDocument not found in the schema.")
else:
    properties = openai_class['properties']
    print("Properties for class OpenAIDocument:")
    for prop in properties:
        prop_name, prop_types, prop_description = (
            prop['name'],
            ', '.join(prop['dataType']),
            prop['description'],
        )
        print(f"- Name: {prop_name}, Data Type: {prop_types}, Description: {prop_description}")

Properties for class OpenAIDocument:
- Name: chunk_id, Data Type: string, Description: The chunk id
- Name: document_id, Data Type: string, Description: The document id
- Name: text, Data Type: text, Description: The chunk's text
- Name: source, Data Type: string, Description: The source of the data
- Name: source_id, Data Type: string, Description: The source id
- Name: url, Data Type: string, Description: The source url
- Name: created_at, Data Type: date, Description: Creation date of document
- Name: author, Data Type: string, Description: Document author
- Name: timestamp, Data Type: text, Description: This property was generated by Weaviate's auto-schema feature on Fri Jul 28 16:00:02 2023


# Retrieve all objects and print a sample object

Another way to quickly count all `OpenAIDocument` objects is to connect to your instance at `https://console.weaviate.cloud/dashboard` and run the GraphQL query below:

```graphql
{
  Aggregate {
    OpenAIDocument {
      meta {
        count
      }
    }
  }
}
```

In [56]:
# Count all OpenAIDocument objects through the client's Aggregate query
# (the Python counterpart of the GraphQL snippet shown above).
client.query.aggregate("OpenAIDocument").with_meta_count().do()

{'data': {'Aggregate': {'OpenAIDocument': [{'meta': {'count': 2285}}]}}}

In [4]:
# Name of the Weaviate class targeted by the REST fetch and per-UUID deletion cells below.
class_name = "OpenAIDocument"

In [7]:
# Fetch objects of the class through the REST API.
# NOTE: /v1/objects is paginated. Without an explicit `limit` the server returns
# only its default page (25 objects in the recorded run, while the Aggregate
# count reports 2285), so len(objects_list) is the size of that page and NOT the
# number of objects stored in the class.
url = f"http://localhost:8080/v1/objects/?class={class_name}"
# The timeout keeps the cell from hanging forever if the instance is unreachable.
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the response JSON and extract the returned page of objects
    data = response.json()
    objects_list = data["objects"]

    # Report how many objects came back in this page and show a sample object
    print("Number of objects returned (first page only):", len(objects_list))
    if len(objects_list) > 0:
        print("Sample object:")
        print(objects_list[0])
    else:
        print("No objects found.")
else:
    print("Failed to fetch objects. Status code:", response.status_code)

Total number of objects: 25
Sample object:
{'class': 'OpenAIDocument', 'creationTimeUnix': 1690663605777, 'id': '005e5859-04a2-5e85-ab81-08e0506dfda4', 'lastUpdateTimeUnix': 1690663605777, 'properties': {'author': 'Abhijit Banerjee, Esther Duflo', 'chunk_id': 'b1e741c2-33a8-45d3-9b25-2e248e09e975_52', 'created_at': '2019-11-12T00:00:00Z', 'document_id': 'b1e741c2-33a8-45d3-9b25-2e248e09e975', 'source': 'book', 'source_id': 'Good Economics for Hard Times (Abhijit V. Banerjee  Esther Duflo).pdf', 'text': 'For a combination of these two reasons, if you plotted the wages of nonmigrants in cities against the share of migran ts in cities, you would find a nice upwa rd-sloping line; the more migrants, the higher the wages. Good news for the pro-migration view, but perhaps entirely spurious. To find out the real impact of immigration on the wages of the natives, we need to look for changes in migration that are not a direct response to the wages in that city. And even that may not be enough, b

# Deletion dry run

In [54]:
# Dry run of a filtered batch delete: the call is made with dry_run=True and
# reports the objects whose `author` equals 'Ezra Klein'.
author_filter = {
    "path": ["author"],
    "operator": "Equal",
    "valueString": "Ezra Klein",
}
result = client.batch.delete_objects(
    class_name="OpenAIDocument",
    where=author_filter,
    dry_run=True,
    output="verbose",
)

# Pretty-print the dry run report
print(json.dumps(result, indent=2))

{
  "dryRun": true,
  "match": {
    "class": "OpenAIDocument",
    "where": {
      "operands": null,
      "operator": "Equal",
      "path": [
        "author"
      ],
      "valueString": "Ezra Klein"
    }
  },
  "output": "verbose",
  "results": {
    "failed": 0,
    "limit": 10000,
    "matches": 192,
    "objects": [
      {
        "id": "419c982b-e1f3-5f57-9cc5-e3a34b1b58af",
        "status": "DRYRUN"
      },
      {
        "id": "ad5562b7-5696-5935-8abb-2ee345bd8682",
        "status": "DRYRUN"
      },
      {
        "id": "e5a2f7bb-f26e-5136-a20c-b205c2438557",
        "status": "DRYRUN"
      },
      {
        "id": "75b60afa-a2d1-503d-806f-eedf1636e550",
        "status": "DRYRUN"
      },
      {
        "id": "efea616d-bb58-5c88-87c9-c0feac17fc5f",
        "status": "DRYRUN"
      },
      {
        "id": "e321eb8d-2da3-5784-bdc6-6fdc6c779f0a",
        "status": "DRYRUN"
      },
      {
        "id": "00a930cf-b60f-569a-b92d-fd6bf4bed6a5",
        "status": "DR

# Actual deletion

In [13]:
# Real (dry_run=False) batch delete of every object whose `source` equals 'chat'.
source_filter = dict(path=["source"], operator="Equal", valueString="chat")
result = client.batch.delete_objects(
    class_name="OpenAIDocument",
    where=source_filter,
    dry_run=False,
    output="verbose",
)

# Pretty-print the deletion report
print(json.dumps(result, indent=2))

{
  "dryRun": false,
  "match": {
    "class": "OpenAIDocument",
    "where": {
      "operands": null,
      "operator": "Equal",
      "path": [
        "source"
      ],
      "valueString": "chat"
    }
  },
  "output": "verbose",
  "results": {
    "failed": 0,
    "limit": 10000,
    "matches": 1,
    "objects": [
      {
        "id": "03359ef9-dd10-539d-99fb-f34f312858b8",
        "status": "SUCCESS"
      }
    ],
    "successful": 1
  }
}


# Another bulk deletion (WARNING: no dry run!)

In [8]:
def get_all_objects(client, class_name, page_size=100,
                    base_url="http://localhost:8080", timeout=30):
    """Return every object of ``class_name`` by paging through the REST API.

    A single GET on ``/v1/objects`` only yields one page of results, so this
    function keeps requesting pages, passing the id of the last object seen as
    the ``after`` cursor, until a page comes back shorter than ``page_size``.

    Args:
        client: Unused; kept so existing ``get_all_objects(client, class_name)``
            calls keep working.
        class_name: Name of the Weaviate class to list.
        page_size: Number of objects requested per page (must be >= 1).
        base_url: Root URL of the Weaviate instance.
        timeout: Per-request timeout in seconds.

    Returns:
        A list with the object dicts of all pages, or ``[]`` if any request
        returns a non-200 status (the status code is printed in that case).

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be at least 1")

    endpoint = f"{base_url.rstrip('/')}/v1/objects"
    collected = []
    cursor = None  # id of the last object already fetched; None for the first page

    while True:
        # Let requests build the query string so the class name is URL-encoded.
        params = {"class": class_name, "limit": page_size}
        if cursor is not None:
            params["after"] = cursor

        response = requests.get(endpoint, params=params, timeout=timeout)
        if response.status_code != 200:
            print("Failed to fetch objects. Status code:", response.status_code)
            return []

        page = response.json().get("objects") or []
        collected.extend(page)

        # A short (or empty) page means there is nothing left to fetch.
        if len(page) < page_size:
            return collected
        cursor = page[-1]["id"]



# Fetch the objects of `class_name` with get_all_objects; the resulting list
# feeds the per-UUID deletion further down.
objects_list = get_all_objects(client, class_name)

In [9]:
# Inspect the fetched objects before deleting them.
objects_list

[{'class': 'OpenAIDocument',
  'creationTimeUnix': 1690663605777,
  'id': '005e5859-04a2-5e85-ab81-08e0506dfda4',
  'lastUpdateTimeUnix': 1690663605777,
  'properties': {'author': 'Abhijit Banerjee, Esther Duflo',
   'chunk_id': 'b1e741c2-33a8-45d3-9b25-2e248e09e975_52',
   'created_at': '2019-11-12T00:00:00Z',
   'document_id': 'b1e741c2-33a8-45d3-9b25-2e248e09e975',
   'source': 'book',
   'source_id': 'Good Economics for Hard Times (Abhijit V. Banerjee  Esther Duflo).pdf',
   'text': 'For a combination of these two reasons, if you plotted the wages of nonmigrants in cities against the share of migran ts in cities, you would find a nice upwa rd-sloping line; the more migrants, the higher the wages. Good news for the pro-migration view, but perhaps entirely spurious. To find out the real impact of immigration on the wages of the natives, we need to look for changes in migration that are not a direct response to the wages in that city. And even that may not be enough, becau se both cur

In [None]:
def delete_objects_by_uuid(client, class_name, uuids):
    """Delete the objects identified by ``uuids`` one at a time.

    ``client.data_object.delete`` returns ``None`` on success and raises
    ``UnexpectedStatusCodeException`` when the server answers with a non-success
    status, so the outcome is detected through the exception rather than by
    reading a ``status_code`` off the (non-existent) return value.

    Args:
        client: Connected Weaviate client.
        class_name: Class the objects belong to.
        uuids: Iterable of object UUIDs to delete.

    A failed deletion is reported and the loop moves on to the next UUID.
    """
    for uuid in uuids:
        try:
            client.data_object.delete(uuid, class_name=class_name)
        except weaviate.exceptions.UnexpectedStatusCodeException as error:
            # Fall back to a placeholder if the exception carries no status code.
            status = getattr(error, "status_code", "unknown")
            print(f"Failed to delete object with UUID: {uuid}. Status code:", status)
        else:
            print(f"Deleted object with UUID: {uuid}")

# Delete every fetched object individually, or say so when there is nothing to delete
if not objects_list:
    print("No objects found to delete.")
else:
    print("Total number of objects to delete:", len(objects_list))
    uuids_to_delete = [entry['id'] for entry in objects_list]
    delete_objects_by_uuid(client, class_name, uuids_to_delete)

# Update objects

In [37]:
# GraphQL query selecting the OpenAIDocument objects of one document_id,
# returning each object's id plus all of its properties
query = """
{
    Get {
        OpenAIDocument(where: {
            path: ["document_id"],
            operator: Equal,
            valueString: "7560d102-676a-4e23-8285-04b43fdb153f"
        }) {
            _additional {
                id
            }
            chunk_id
            document_id
            text
            source
            source_id
            url
            created_at
            author
            timestamp
        }
    }
}
"""

# Run the query and keep the raw GraphQL response
query_result = client.query.raw(query)

# The lookup succeeded only when the response carries data for the class
query_succeeded = "data" in query_result and "OpenAIDocument" in query_result["data"]["Get"]
if not query_succeeded:
    # Surface whatever errors the response reported
    print("Error in query result:", query_result.get("errors"))
else:
    # Keep the matched objects and show them
    matching_objects = query_result["data"]["Get"]["OpenAIDocument"]
    print("Objects with matching document_id:")
    print(matching_objects)

Objects with matching document_id:
[{'_additional': {'id': '419c982b-e1f3-5f57-9cc5-e3a34b1b58af'}, 'author': 'Ezra Klein', 'chunk_id': '7560d102-676a-4e23-8285-04b43fdb153f_19', 'created_at': '2023-05-26T04:00:00Z', 'document_id': '7560d102-676a-4e23-8285-04b43fdb153f', 'source': 'book', 'source_id': '../../raw_data/csv/The Ezra Klein Show/15564005923.txt', 'text': "And then we ask people questions about how blameworthy the person is, how much they should be punished, things like that. And what we find is that we go all the way from weird societies where it's all about the intention. Really people want to kind of forgive the guy who made a mistake. All the way down to there's no difference. The person is out their goods one way or another, you know. Another way to think about this if you accidentally burn someone's house down, or if you intentionally burn someone's house down, the bottom line is the person has no house. So it just turns out that how important those intentions are, tho

In [39]:
# Trial update on a single object: change the "source" of the first match only
if not ("data" in query_result and "OpenAIDocument" in query_result["data"]["Get"]):
    # The earlier query did not return data for the class; show its errors
    print("Error in query result:", query_result.get("errors"))
else:
    matching_objects = query_result["data"]["Get"]["OpenAIDocument"]

    # Report how many objects the query matched
    total_ids_to_update = len(matching_objects)
    print("Total number of ids to be updated:", total_ids_to_update)

    if total_ids_to_update == 0:
        print("No objects to update.")
    else:
        # Use the first match as the sample
        sample_object = matching_objects[0]
        print("\nSample Object:")
        print(sample_object)

        sample_object_id = sample_object["_additional"]["id"]

        # New value for the "source" property
        updated_source = "podcast"
        updated_properties = {"source": updated_source}

        # Show the old value, then write the new one
        print("\nBefore Update - Source:", sample_object["source"])
        client.data_object.update(
            updated_properties,
            class_name="OpenAIDocument",
            uuid=sample_object_id,
            consistency_level=weaviate.data.replication.ConsistencyLevel.ALL,  # default QUORUM
        )

        # Read the object back to display its stored properties
        updated_object = client.data_object.get(uuid=sample_object_id, class_name="OpenAIDocument")
        print("\nAfter Update - Updated Object:")
        print(updated_object)

Total number of ids to be updated: 84

Sample Object:
{'_additional': {'id': '419c982b-e1f3-5f57-9cc5-e3a34b1b58af'}, 'author': 'Ezra Klein', 'chunk_id': '7560d102-676a-4e23-8285-04b43fdb153f_19', 'created_at': '2023-05-26T04:00:00Z', 'document_id': '7560d102-676a-4e23-8285-04b43fdb153f', 'source': 'book', 'source_id': '../../raw_data/csv/The Ezra Klein Show/15564005923.txt', 'text': "And then we ask people questions about how blameworthy the person is, how much they should be punished, things like that. And what we find is that we go all the way from weird societies where it's all about the intention. Really people want to kind of forgive the guy who made a mistake. All the way down to there's no difference. The person is out their goods one way or another, you know. Another way to think about this if you accidentally burn someone's house down, or if you intentionally burn someone's house down, the bottom line is the person has no house. So it just turns out that how important those i

# Update all matching IDs

In [40]:
# Apply the new "source" value to every object returned by the earlier query
if not ("data" in query_result and "OpenAIDocument" in query_result["data"]["Get"]):
    # The earlier query did not return data for the class; show its errors
    print("Error in query result:", query_result.get("errors"))
else:
    matching_objects = query_result["data"]["Get"]["OpenAIDocument"]

    # Report how many objects the query matched
    total_ids_to_update = len(matching_objects)
    print("Total number of ids to be updated:", total_ids_to_update)

    if total_ids_to_update == 0:
        print("No objects to update.")
    else:
        # New value for the "source" property
        updated_source = "podcast"

        for matched in matching_objects:
            object_id = matched["_additional"]["id"]
            updated_properties = {"source": updated_source}

            # Write the new value for this object
            client.data_object.update(
                updated_properties,
                class_name="OpenAIDocument",
                uuid=object_id,
                consistency_level=weaviate.data.replication.ConsistencyLevel.ALL,  # default QUORUM
            )

            # Read the object back and display its stored properties
            updated_object = client.data_object.get(uuid=object_id, class_name="OpenAIDocument")
            print(f"\nUpdated Object with id {object_id}:")
            print(updated_object)

Total number of ids to be updated: 84

Updated Object with id 419c982b-e1f3-5f57-9cc5-e3a34b1b58af:
{'class': 'OpenAIDocument', 'creationTimeUnix': 1690660249892, 'id': '419c982b-e1f3-5f57-9cc5-e3a34b1b58af', 'lastUpdateTimeUnix': 1690662120372, 'properties': {'author': 'Ezra Klein', 'chunk_id': '7560d102-676a-4e23-8285-04b43fdb153f_19', 'created_at': '2023-05-26T04:00:00Z', 'document_id': '7560d102-676a-4e23-8285-04b43fdb153f', 'source': 'podcast', 'source_id': '../../raw_data/csv/The Ezra Klein Show/15564005923.txt', 'text': "And then we ask people questions about how blameworthy the person is, how much they should be punished, things like that. And what we find is that we go all the way from weird societies where it's all about the intention. Really people want to kind of forgive the guy who made a mistake. All the way down to there's no difference. The person is out their goods one way or another, you know. Another way to think about this if you accidentally burn someone's house do


Updated Object with id a57a184b-c086-5411-ab19-483eb342f11b:
{'class': 'OpenAIDocument', 'creationTimeUnix': 1690660250067, 'id': 'a57a184b-c086-5411-ab19-483eb342f11b', 'lastUpdateTimeUnix': 1690662120573, 'properties': {'author': 'Ezra Klein', 'chunk_id': '7560d102-676a-4e23-8285-04b43fdb153f_48', 'created_at': '2023-05-26T04:00:00Z', 'document_id': '7560d102-676a-4e23-8285-04b43fdb153f', 'source': 'podcast', 'source_id': '../../raw_data/csv/The Ezra Klein Show/15564005923.txt', 'text': "And so kind of downstream from that, I think it would be intuitive for a lot of secular, weird people to believe they're probably much better at shaping societies more profoundly and potently. Because we're free of all this superstition and Kant and weirdness. But in fact, it seems to me a lesson potentially of the book is it's actually going to be much harder for societies to evolve if they don't have the cooperative power provided by belief and rituals that emerge from belief in something higher. 

# Backup Weaviate instance to local file system

In [3]:
# Create a backup on the 'filesystem' backend under a fixed id;
# wait_for_completion=True is passed so the call is asked to wait for the backup.
backup_options = {
    "backup_id": "backup-30072023",
    "backend": "filesystem",
    "wait_for_completion": True,
}
result = client.backup.create(**backup_options)

print(result)

{'backend': 'filesystem', 'classes': ['OpenAIDocument'], 'id': 'backup-30072023', 'path': '/tmp/backups/backup-30072023', 'status': 'SUCCESS'}


# Restore backup

In [None]:
# Restore the backup with this id from the 'filesystem' backend;
# wait_for_completion=True is passed so the call is asked to wait for the restore.
restore_options = dict(
    backup_id="backup-30072023",
    backend="filesystem",
    wait_for_completion=True,
)
result = client.backup.restore(**restore_options)

print(result)

# Check whether a particular source_id was inserted

In [14]:
# GraphQL query selecting the OpenAIDocument objects whose `source` equals "chat",
# returning each object's id plus all of its properties.
# NOTE(review): the filter path is `source`, while the section heading talks about
# source_id — confirm which property is meant.
query = """
{
    Get {
        OpenAIDocument(where: {
            path: ["source"],
            operator: Equal,
            valueString: "chat"
        }) {
            _additional {
                id
            }
            chunk_id
            document_id
            text
            source
            source_id
            url
            created_at
            author
            timestamp
        }
    }
}
"""

# Run the raw GraphQL query; this rebinds `query_result`, replacing any earlier response.
query_result = client.query.raw(query)
# TODO: should take only filename and ignore path for source_id

In [15]:
# Display the raw response; an empty OpenAIDocument list means nothing matched the filter.
query_result

{'data': {'Get': {'OpenAIDocument': []}}}