# 批量导入
 

In [1]:
## 测试
import json
import os

import weaviate
from weaviate.auth import AuthApiKey

# Connect to a locally deployed Weaviate instance.
# The endpoint and API key can be overridden through the WEAVIATE_URL and
# WEAVIATE_API_KEY environment variables, so real credentials never have to be
# committed together with the notebook. The fallbacks are the original
# local-development values, which keeps the cell working unchanged when the
# variables are not set.
client = weaviate.Client(
    url=os.environ.get("WEAVIATE_URL", "http://127.0.0.1:8080"),
    auth_client_secret=AuthApiKey(
        os.environ.get("WEAVIATE_API_KEY", "WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih")
    ),
)

## 基础导入
下面的示例将对象添加到MyCollection集合中。

In [2]:
# Name of the class that receives the objects.
class_name = "MyCollection"  # Replace with your class name

# Five sample objects titled "Object 1" through "Object 5".
data_objs = [{"title": f"Object {number}"} for number in range(1, 6)]

# Set the batch size before opening the batch context.
client.batch.configure(batch_size=100)  # Configure batch

with client.batch as batch:
    for properties in data_objs:
        batch.add_data_object(
            data_object=properties,
            class_name=class_name,
            # tenant="tenantA"  # If multi-tenancy is enabled, specify the tenant to which the object will be added.
        )

> 在批量导入期间，任何失败的对象或引用都会被记录下来，并可通过 `batch.failed_objects` 和 `batch.failed_references` 获取。此外，还会维护失败对象和引用的累计计数，可在上下文管理器内通过 `batch.number_errors` 访问。此计数器可用于中止导入过程，以便排查失败的对象或引用。




In [None]:
### V4
# Five sample objects titled "Object 1" through "Object 5".
data_rows = [{"title": f"Object {number}"} for number in range(1, 6)]

# NOTE(review): this cell relies on `client.collections`, while the `client`
# created at the top of this notebook is a `weaviate.Client` — confirm that a
# client exposing the collections API is in scope before running it.
collection = client.collections.get("MyCollection")

with collection.batch.dynamic() as batch:
    for properties in data_rows:
        batch.add_object(properties=properties)
        # Stop feeding the batch once more than 10 errors have been counted.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report the objects that could not be imported, if any.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

## 使用 gRPC API 
gRPC API比 REST API 更快。使用 gRPC API 可以提高导入速度。

### V3
TypeScript 客户端v3默认使用 gRPC。         
旧版 TypeScript 客户端不支持 gRPC。            

### V4
Python 客户端默认使用 gRPC。更多批量导入配置选项请参见客户端页面。    
旧版 Python 客户端不支持 gRPC。


## 指定 ID
Weaviate 为每个对象生成一个 UUID。对象 ID 必须是唯一的。如果您设置了对象 ID，请使用以下确定性 UUID方法之一来防止 ID 重复：

- generate_uuid5（Python）
- generateUuid5（TypeScript）

In [None]:
### V3
from weaviate.util import generate_uuid5  # Generate a deterministic ID

# Name of the class that receives the objects.
class_name = "MyCollection"  # Replace with your class name

# Replace with your actual objects
data_objs = [{"title": f"Object {number}"} for number in range(1, 6)]

client.batch.configure(batch_size=100)  # Configure batch

with client.batch as batch:
    for properties in data_objs:
        # Derive the ID from the object's own content (deterministic UUID).
        object_id = generate_uuid5(properties)
        batch.add_data_object(
            data_object=properties,
            class_name=class_name,
            uuid=object_id,  # Optional: Specify an object ID
        )

In [None]:
## V4
from weaviate.util import generate_uuid5  # Generate a deterministic ID

# Five sample objects titled "Object 1" through "Object 5".
data_rows = [{"title": f"Object {number}"} for number in range(1, 6)]

collection = client.collections.get("MyCollection")

with collection.batch.dynamic() as batch:
    for properties in data_rows:
        # The ID is derived from the object's content (deterministic UUID).
        batch.add_object(properties=properties, uuid=generate_uuid5(properties))
        # Stop feeding the batch once more than 10 errors have been counted.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report the objects that could not be imported, if any.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

## 指定向量
使用 `vector` 属性为每个对象指定一个向量。



In [None]:
#### V3

# Name of the class that receives the objects.
class_name = "MyCollection"  # Replace with your class name

# Replace with your actual objects
data_objs = [{"title": f"Object {number}"} for number in range(1, 6)]

# Replace with your actual vectors
# One 10-dimensional vector per object; every component of a vector is equal.
vectors = [[0.25 + offset / 100] * 10 for offset in range(5)]

client.batch.configure(batch_size=100)  # Configure batch

with client.batch as batch:
    # Objects and vectors are paired by position.
    for properties, embedding in zip(data_objs, vectors):
        batch.add_data_object(
            data_object=properties,
            class_name=class_name,
            vector=embedding,  # Optional: Specify an object vector
        )

In [None]:
### V4
# Five sample objects and one 1536-dimensional placeholder vector for each.
data_rows = [{"title": f"Object {number}"} for number in range(1, 6)]
vectors = [[0.1] * 1536 for _ in range(5)]

collection = client.collections.get("MyCollection")

with collection.batch.dynamic() as batch:
    # Objects and vectors are paired by position.
    for properties, embedding in zip(data_rows, vectors):
        batch.add_object(properties=properties, vector=embedding)
        # Stop feeding the batch once more than 10 errors have been counted.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report the objects that could not be imported, if any.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

## 指定命名向量
### V3
不支持
### V4
当你创建一个对象时，你可以指定命名向量（如果在你的集合中配置了）。
 

In [None]:
# Five sample objects, each with a "title" and a "body" property.
data_rows = [
    {"title": f"Object {number}", "body": f"Body {number}"}
    for number in range(1, 6)
]

# One placeholder vector per object for each of the two named vectors.
title_vectors = [[0.12] * 1536 for _ in range(5)]
body_vectors = [[0.34] * 1536 for _ in range(5)]

collection = client.collections.get("MyCollection")

with collection.batch.dynamic() as batch:
    # Properties and both vector lists are paired by position.
    for properties, title_vec, body_vec in zip(data_rows, title_vectors, body_vectors):
        # Named vectors are passed as a mapping from vector name to values.
        named_vectors = {
            "title": title_vec,
            "body": body_vec,
        }
        batch.add_object(properties=properties, vector=named_vectors)
        # Stop feeding the batch once more than 10 errors have been counted.
        if batch.number_errors > 10:
            print("Batch import stopped due to excessive errors.")
            break

# Report the objects that could not be imported, if any.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"Number of failed imports: {len(failed_objects)}")
    print(f"First failed object: {failed_objects[0]}")

## 带引用导入
您可以通过交叉引用批量创建从一个对象到另一个对象的链接。

In [None]:
## V3
# Cross-reference to create: Author.wroteArticles -> Article.
reference = {
    "from_object_uuid": "36ddd591-2dee-4e7e-a3cc-eb86d30a4303",
    "from_object_class_name": "Author",
    "from_property_name": "wroteArticles",
    "to_object_uuid": "6bb06a43-e7f0-393e-9ecf-3c0f4e129064",
    "to_object_class_name": "Article",
    # "tenant": "tenantA",  # Optional; specify the tenant in multi-tenancy collections
}

with client.batch as batch:
    batch.add_reference(**reference)



In [None]:
## V4
# Collection that owns the reference property.
collection = client.collections.get("Author")

# NOTE(review): `from_uuid` and `target_uuid` are not assigned anywhere in the
# visible part of this notebook — define them before running this cell.
with collection.batch.fixed_size(batch_size=100) as batch:
    reference = {
        "from_property": "writesFor",
        "from_uuid": from_uuid,
        "to": target_uuid,
    }
    batch.add_reference(**reference)

# Report the references that could not be imported, if any.
failed_references = collection.batch.failed_references
if failed_references:
    print(f"Number of failed imports: {len(failed_references)}")
    print(f"First failed reference: {failed_references[0]}")

## 从大文件流式导入数据

```json
[{
"Air Date": "2006-11-08",
"Round": "Double Jeopardy!",
"Value": 800,
"Category": "AMERICAN HISTORY",
"Question": "Abraham Lincoln died across the street from this theatre on April 15, 1865",
"Answer": "Ford's Theatre (the Ford Theatre accepted)"
},
{
"Air Date": "2005-11-18",
"Round": "Jeopardy!",
"Value": 200,
"Category": "RHYME TIME",
"Question": "Any pigment on the wall so faded you can barely see it",
"Answer": "faint paint"
}]
```

In [None]:
import weaviate
import ijson

# Settings for displaying the import progress
counter = 0
interval = 200  # print progress every this many records; should be bigger than the batch_size

# Configure the batch once, up front, instead of once per imported object.
client.batch.configure(batch_size=100)  # Configure batch


def add_object(obj, batch=None) -> None:
    """Queue one streamed record for import into the ``JeopardyQuestion`` class.

    Increments the module-level ``counter`` and prints a progress line every
    ``interval`` records.

    Args:
        obj: Mapping with at least the ``"Question"`` and ``"Answer"`` keys.
        batch: An already opened batch to append to. Passing the batch that
            wraps the whole import loop lets objects accumulate up to the
            configured ``batch_size``. When omitted, a ``client.batch``
            context is opened for this single object, which is the previous
            behaviour of this function.

    Raises:
        KeyError: If ``obj`` lacks the ``"Question"`` or ``"Answer"`` key.
    """
    global counter
    properties = {
        "question": obj["Question"],
        "answer": obj["Answer"],
    }

    if batch is None:
        # Backward-compatible path. Leaving the `with` block flushes the batch
        # in the v3 client, so this sends the object on its own.
        with client.batch as single_object_batch:
            single_object_batch.add_data_object(
                data_object=properties,
                class_name="JeopardyQuestion",
                # If you Bring Your Own Vectors, add the `vector` parameter here
                # vector=obj.vector
            )
    else:
        # Add the object to the shared batch
        batch.add_data_object(
            data_object=properties,
            class_name="JeopardyQuestion",
            # If you Bring Your Own Vectors, add the `vector` parameter here
            # vector=obj.vector
        )

    # Calculate and display progress
    counter += 1
    if counter % interval == 0:
        print(f"Imported {counter} articles...")


print("JSON streaming, to avoid running out of memory on large files...")
# Open the batch once around the whole loop so objects are actually sent in
# batches; the batch context is exited (and flushed) before the file closes.
with open("jeopardy_1k.json", "rb") as f, client.batch as batch:
    objects = ijson.items(f, "item")
    for o in objects:
        add_object(o, batch)

print(f"Finished importing {counter} articles.")

In [None]:
### V4
import ijson

# Settings for displaying the import progress
counter = 0
interval = 200  # print progress every this many records; should be bigger than the batch_size

print("JSON streaming, to avoid running out of memory on large files...")
# The batch is opened first and the file second, so the file is closed before
# the batch context is left.
with client.batch.fixed_size(batch_size=100) as batch, open("jeopardy_1k.json", "rb") as f:
    # ijson yields the array items one at a time instead of loading the file.
    for obj in ijson.items(f, "item"):
        batch.add_object(
            collection="JeopardyQuestion",
            properties={
                "question": obj["Question"],
                "answer": obj["Answer"],
            },
            # If you Bring Your Own Vectors, add the `vector` parameter here
            # vector=obj.vector["default"]
        )

        # Calculate and display progress
        counter += 1
        if not counter % interval:
            print(f"Imported {counter} articles...")


print(f"Finished importing {counter} articles.")

## 批量矢量化
一些模型提供商提供批量矢量化API，其中每个请求可以包含多个对象。

从 Weaviate v1.25.0 开始，批量导入会自动使用模型提供商的批量矢量化 API（如果可用）。这减少了对模型提供商的请求数量，从而提高了吞吐量。
### 模型提供者配置
您可以为每个模型提供程序配置批量矢量化设置，例如每分钟请求数或每分钟令牌数。以下示例为 Cohere 和 OpenAI 集成设置速率限制，并为两者提供 API 密钥。

请注意，每个提供商都公开不同的配置选项。

In [None]:
## V4
from weaviate.classes.config import Integrations

# Each model provider may expose different parameters
# NOTE(review): `cohere_key`, `openai_key`, `rpm_embeddings` and
# `tpm_embeddings` are not assigned in the visible part of this notebook —
# define them before running this cell.
cohere_integration = Integrations.cohere(
    api_key=cohere_key,
    requests_per_minute_embeddings=rpm_embeddings,
)
openai_integration = Integrations.openai(
    api_key=openai_key,
    requests_per_minute_embeddings=rpm_embeddings,
    tokens_per_minute_embeddings=tpm_embeddings,   # e.g. OpenAI also exposes tokens per minute for embeddings
)

# Register both provider configurations on the client in one call.
integrations = [cohere_integration, openai_integration]
client.integrations.configure(integrations)

## 异步导入
从 v1.22 开始可用。这是一项实验性功能，请谨慎使用。

为了最大限度地提高导入速度，请启用异步索引。
要启用异步索引，请在 Weaviate 配置文件中将环境变量 `ASYNC_INDEXING` 设置为 `true`。


```yaml
weaviate:
  image: cr.weaviate.io/semitechnologies/weaviate:1.30.0
  ...
  environment:
    ASYNC_INDEXING: 'true'
  ...
```

### 自动添加新租户
默认情况下，如果您尝试将对象插入不存在的租户，Weaviate 会返回错误。要更改此行为以便 Weaviate 自动创建新租户，请在集合定义中将 `autoTenantCreation` 设置为 `true`。

自动租户功能自 v1.25.0 起可用于批量导入，自 v1.25.2 起也可用于单个对象插入。

在创建集合时设置autoTenantCreation，或重新配置集合以根据需要更新设置。

当您导入大量对象时，自动创建租户非常有用。如果您的数据可能存在细微的不一致或拼写错误，请谨慎操作。例如，名称TenantOne、tenantOne和TenntOne将创建三个不同的租户。

有关详细信息，请参阅自动租户。