diff --git a/docs/cn/sql-reference/20-sql-functions/17-table-functions/index.md b/docs/cn/sql-reference/20-sql-functions/17-table-functions/index.md
index c03c8f8e7f..8743eb14c3 100644
--- a/docs/cn/sql-reference/20-sql-functions/17-table-functions/index.md
+++ b/docs/cn/sql-reference/20-sql-functions/17-table-functions/index.md
@@ -41,4 +41,4 @@ title: 表函数 (Table Functions)
| 函数 | 描述 | 示例 |
|----------|-------------|---------|
-| [OBFUSCATE](obfuscate.md) | 生成匿名化的数据 | `SELECT * FROM OBFUSCATE(users)` |
+| [OBFUSCATE](../19-data-anonymization-functions/obfuscate.md) | 生成匿名化的数据 | `SELECT * FROM OBFUSCATE(users)` |
diff --git a/docs/cn/sql-reference/20-sql-functions/17-table-functions/obfuscate.md b/docs/cn/sql-reference/20-sql-functions/17-table-functions/obfuscate.md
deleted file mode 100644
index 46e0d8b0af..0000000000
--- a/docs/cn/sql-reference/20-sql-functions/17-table-functions/obfuscate.md
+++ /dev/null
@@ -1,84 +0,0 @@
-
----
-title: OBFUSCATE
----
-
-OBFUSCATE 表函数用于生成匿名化数据。这是一个快速方式,对于更复杂的场景,推荐直接使用底层函数 [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md)、[MARKOV_GENERATE](../19-data-anonymization-functions/markov_generate.md) 和 [FEISTEL_OBFUSCATE](../19-data-anonymization-functions/feistel_obfuscate.md)。该函数支持对 String、Integer 和 Float 类型的数据进行匿名化处理。
-
-:::note
-对于其他暂时不支持的类型(如 Date),该函数目前不进行处理,直接返回原值。
-:::
-
-## 语法
-
-```sql
-OBFUSCATE('
'[, seed => ])
-```
-
-## 参数
-
-| 参数 | 描述 |
-| ----------- | ----------- |
-| `` | 输入表。|
-| `seed` | 随机种子。|
-
-## 示例
-
-```sql
-create or replace table users as
-select * from (values
-(1, 'James Smith', 'james.smith@gmail.com', '123 Fake St, Anytown, CA 91234'),
-(2, 'Mary Johnson', 'mary.johnson@yahoo.com', '456 Fictional Ave, Springfield, IL 62704'),
-(3, 'John Williams', 'john.williams@outlook.com', '789 Imaginary Ln, Pleasantville, NY 10570'),
-(4, 'Patricia Brown', 'patricia.brown@hotmail.com', '101 Nonexistent Rd, Metropolis, KS 66666'),
-(5, 'Robert Jones', 'robert.jones@example.com', '222 Make Believe Dr, Smallville, OH 44688'),
-(6, 'Jennifer Garcia', 'jennifer.garcia@gmail.com', '333 Phantom Ct, Gotham, NJ 07005'),
-(7, 'Michael Miller', 'michael.miller@yahoo.com', '444 Unreal Blvd, Wonderland, TX 75001'),
-(8, 'Linda Davis', 'linda.davis@outlook.com', '555 Fabricated Way, Neverland, FL 32801'),
-(9, 'William Rodriguez', 'william.rodriguez@hotmail.com', '666 Bogus Pl, Oz, KS 67445'),
-(10, 'Elizabeth Martinez', 'elizabeth.martinez@example.com', '777 Sham Ln, Camelot, CA 90210'),
-(11, 'James Johnson', 'james.johnson@gmail.com', '888 Pretend Ave, Atlantis, GA 30303'),
-(12, 'Mary Williams', 'mary.williams@yahoo.com', '999 Simulated Rd, Utopia, MI 48009'),
-(13, 'John Brown', 'john.brown@outlook.com', '1010 Counterfeit St, El Dorado, AR 71730'),
-(14, 'Patricia Jones', 'patricia.jones@hotmail.com', '10 Counterfeit St, El Dorado, AR 71730'),
-(15, 'Robert Garcia', 'robert.garcia@example.com', '1111 Phony Ln, Shangri-La, CO 80014'),
-(16, 'Jennifer Miller', 'jennifer.miller@gmail.com', '1212 Artificial Dr, Rivendell, WA 98101'),
-(17, 'Michael Davis', 'michael.davis@yahoo.com', '1313 Spurious Ave, Narnia, TN 37201'),
-(18, 'Linda Rodriguez', 'linda.rodriguez@outlook.com', '1414 Pseudo Rd, Brigadoon, PA 19003'),
-(19, 'William Martinez', 'william.martinez@hotmail.com', '1515 Feigned St, Never Never Land, CA 90210'),
-(20, 'Elizabeth Smith', 'elizabeth.smith@example.com', '1616 Imitation Ln, Asgard, NY 10001'),
-(21, 'James Williams', 'james.williams@gmail.com', '1717 Simulated Ave, Middle Earth, OR 97006'),
-(22, 'Mary Brown', 'mary.brown@yahoo.com', '123 Fake St, Anytown, CA 91234'),
-(23, 'John Jones', 'john.jones@outlook.com', '456 Fictitious Ave, Springfield, IL 62704'),
-(24, 'Patricia Garcia', 'patricia.garcia@hotmail.com', '789 Illusion Ln, Pleasantville, NY 10570'),
-(25, 'Robert Miller', 'robert.miller@example.com', '101 Imaginary Rd, Metropolis, KS 66666'),
-(26, 'Jennifer Davis', 'jennifer.davis@gmail.com', '222 Make Believe Dr, Neverland, FL 33333'),
-(27, 'Michael Rodriguez', 'michael.rodriguez@yahoo.com', '333 Pretend Ct, Wonderland, TX 77777'),
-(28, 'Linda Martinez', 'linda.martinez@outlook.com', '444 Fabricated Blvd, Utopia, WA 98101'),
-(29, 'William Smith', 'william.smith@hotmail.com', '555 Sham Way, Mirage, AZ 85001'),
-(30, 'Elizabeth Johnson', 'elizabeth.johnson@example.com', '666 Bogus Pl, Fantasyland, GA 30303'),
-(31, 'James Brown', 'james.brown@gmail.com', '777 Unreal Ave, Dreamville, CO 80202'),
-(32, 'Mary Jones', 'mary.jones@yahoo.com', '888 Counterfeit Ln, Wishville, OH 44114'),
-(33, 'John Garcia', 'john.garcia@outlook.com', '999 Phony Rd, Delusion, MI 48075'),
-(34, 'Patricia Miller', 'patricia.miller@hotmail.com', '1010 Simulated St, Echo, NV 89109'),
-(35, 'Robert Davis', 'robert.davis@example.com', '1111 Spurious Ave, Replica, PA 19103'),
-(36, 'Jennifer Rodriguez', 'jennifer.rodriguez@gmail.com', '1212 Artificial Dr, Clone, NC 27601'),
-(37, 'Michael Martinez', 'michael.martinez@yahoo.com', '1313 Synthetic Ct, Duplicate, TN 37201'),
-(38, 'Linda Smith', 'linda.smith@outlook.com', '1414 Feigned Blvd, Imposter, IN 46204'),
-(39, 'William Johnson', 'william.johnson@hotmail.com', '1515 Pseudo Pl, Mimic, MN 55401'),
-(40, 'Elizabeth Williams', 'elizabeth.williams@example.com', '1616 Forged Way, Facsimile, AL 35203')
-) users(id, name, email, address);
-
-
-select * from obfuscate(users, seed=>10) limit 5 offset 20;
-╭────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ id │ name │ email │ address │
-│ Nullable(UInt64) │ Nullable(String) │ Nullable(String) │ Nullable(String) │
-├──────────────────┼───────────────────┼───────────────────────────┼─────────────────────────────────────────┤
-│ 21 │ William Rodriguez │ michael.davis@example.com │ 1212 Artificial Dr, Rivendell, WA 98101 │
-│ 16 │ Jennifer Garcia │ patricia.brown@gmail │ 1313 Spurious Ave, NC 27601 │
-│ 25 │ John Brown │ michael.martinez@example │ 1111 Phony Ln, Asgard, NY 10570 │
-│ 30 │ Mary Brown │ jennifer.garcia@gmail.com │ 222 Make Believe Dr, Clone, NC 27601 │
-│ 24 │ James Smith │ elizabeth.johnson@example │ 444 Fabricated St, Anytown, CA 90210 │
-╰────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
-```
diff --git a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
index 49353616e3..3d1c47e8f0 100644
--- a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
+++ b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
@@ -2,7 +2,7 @@
title: FEISTEL_OBFUSCATE
---
-FEISTEL_OBFUSCATE 函数用于对数值类型的数据进行匿名化处理。
+FEISTEL_OBFUSCATE 用于对整数(如 ID 或手机号)进行确定性混淆。它能保持数据的位宽和基数(Cardinality),从而确保混淆后的数据仍可用于 JOIN 操作。
## 语法
@@ -15,7 +15,7 @@ FEISTEL_OBFUSCATE( , )
| 参数 | 描述 |
| ----------- | ----------- |
| `` | 需要进行匿名化处理的数值。|
-| `` | 加密种子。
使用相同的种子总是会生成相同的结果,这在某些场景下很有用。但请注意,泄露种子可能会导致原始数据被还原。|
+| `<seed>` | 混淆种子。<br/>使用相同的种子将始终生成相同的结果,这对于保持数据一致性非常有用。请注意,种子泄露可能导致原始数据被还原。|
## 返回类型
@@ -23,20 +23,21 @@ FEISTEL_OBFUSCATE( , )
## 示例
+手机号示例(seed = 4242):
+
```sql
-SELECT feistel_obfuscate(10000,1561819567875);
-+------------------------------------------+
-| feistel_obfuscate(10000, 1561819567875) |
-+------------------------------------------+
-| 15669 |
-+------------------------------------------+
+SELECT 13000000000 + number AS phone,
+ feistel_obfuscate(13000000000 + number, 4242) AS masked_phone
+FROM numbers(5);
+
+-- 样例输出
++-------------+--------------+
+| phone | masked_phone |
++-------------+--------------+
+| 13000000000 | 12221668677 |
+| 13000000001 | 10245458699 |
+| 13000000002 | 15398657780 |
+| 13000000003 | 9910824758 |
+| 13000000004 | 13299971128 |
++-------------+--------------+
```
-feistel_obfuscate 会保留原始输入的位数。如果需要映射到更大的数值范围,可以在原始输入上添加一个偏移量,例如:`feistel_obfuscate(n+10000, 50)`。
-```sql
-SELECT feistel_obfuscate(10,1561819567875);
-+------------------------------------------+
-| feistel_obfuscate(10, 1561819567875) |
-+------------------------------------------+
-| 13 |
-+------------------------------------------+
-```
\ No newline at end of file
diff --git a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
index 30def444ed..84028d0e23 100644
--- a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
+++ b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
@@ -2,9 +2,26 @@
title: 数据匿名化函数
---
-本节提供用于数据匿名化的函数。
+数据匿名化(Data Anonymization)是指修改或移除数据集中的个人身份信息(PII)以保护隐私的过程。其核心目标是在消除数据与特定个人关联的同时,最大限度地保留数据的分析、研究和测试价值。
+
+### 常见脱敏数据类别
+
+为了符合 GDPR 或 CCPA 等法规要求,组织通常需要对以下几类敏感数据进行脱敏处理:
+
+* **直接标识符 (Direct Identifiers)**:能够直接识别个人身份的数据,例如全名、电子邮箱、电话号码、身份证号或社保号。
+* **准标识符 (Quasi-Identifiers)**:虽然单一属性无法直接识别个人,但组合后可能暴露身份的属性,例如出生日期、性别、邮政编码或职位。
+* **敏感业务数据**:需要在非生产环境中保护的机密信息,例如交易金额、薪资详情、内部项目名称或知识产权数据。
+
+### Databend 匿名化技术
+
+Databend 提供了一系列函数来支持多种匿名化技术,涵盖数据脱敏、假名化处理以及合成数据生成:
+
+- **数据脱敏 (Data Masking)**:使用 [`OBFUSCATE` 表函数](obfuscate.md) 自动对列应用脱敏规则,用格式逼真但无实际关联的合成数据替换原始值。
+- **假名化 (Pseudonymization)**:使用 [FEISTEL_OBFUSCATE](feistel_obfuscate.md) 将标识符替换为确定性的混淆值。该方法保留了数据的基数(Cardinality)和位宽,非常适合处理需要进行关联查询(Join)的键值字段。
+- **合成数据 (Synthetic Data)**:结合使用 [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md) 和 [MARKOV_GENERATE](markov_generate.md) 生成机器合成数据。这些数据在统计特征上与原始数据集相似,但与真实记录没有任何直接联系。
| 函数 | 描述 |
|----------|-------------|
| [MARKOV_GENERATE](markov_generate.md) | 基于马尔可夫模型生成匿名化数据 |
| [FEISTEL_OBFUSCATE](feistel_obfuscate.md) | 对数值类型进行匿名化处理 |
+| [OBFUSCATE](obfuscate.md) | 表级快速脱敏 |
diff --git a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
index 3a3b3a05f0..15d1e69c64 100644
--- a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
+++ b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
@@ -15,7 +15,7 @@ MARKOV_GENERATE( , , , )
| 参数 | 描述 |
| ----------- | ----------- |
| `model` | 由 markov_train 生成的模型。 |
-| `params`| 生成参数,为 JSON 字符串格式,例如 `{"order": 5, "sliding_window_size": 8}`。
`order`:模型上下文长度。
`sliding_window_size`:源字符串中滑动窗口的大小,其哈希值将用作模型中随机数生成器 (RNG) 的种子。 |
+| `params`| 生成参数,为 JSON 字符串格式,例如 `{"order": 5, "sliding_window_size": 8}`。<br/>`order`:马尔可夫模型的阶数(上下文长度)。<br/>`sliding_window_size`:源字符串的滑动窗口大小,其哈希值将用于初始化随机数生成器 (RNG) 的种子。 |
| `seed` | 生成种子。|
| `determinator`| 输入数据(决定因子)。 |
@@ -25,19 +25,48 @@ MARKOV_GENERATE( , , , )
## 示例
+以下是一个处理 PII(个人身份信息)的示例:分别针对姓名和邮箱训练模型,并一次性生成两列对应的匿名数据。
+
```sql
-create table model as
-select markov_train(concat('bar', number::string)) as bar from numbers(100);
-
-select markov_generate(bar,'{"order":5,"sliding_window_size":8}', 151, (number+100000)::string) as generate
-from numbers(5), model;
-+-----------+
-| generate |
-+-----------+
-│ bar95 │
-│ bar64 │
-│ bar85 │
-│ bar56 │
-│ bar95 │
-+-----------+
+-- 1) 训练姓名与邮箱的 Markov 模型
+CREATE TABLE markov_name_model AS
+SELECT markov_train(name) AS model
+FROM (
+ VALUES ('Alice Johnson'),('Bob Smith'),('Carol Davis'),('David Miller'),('Emma Wilson'),
+ ('Frank Brown'),('Grace Lee'),('Henry Clark'),('Irene Torres'),('Jack White'),
+ ('Karen Young'),('Leo Turner'),('Mia Scott'),('Noah Harris'),('Olivia Baker'),
+ ('Paul Adams'),('Quinn Foster'),('Rachel Price'),('Sam Carter'),('Tina Evans')
+) AS t(name);
+
+CREATE TABLE markov_email_model AS
+SELECT markov_train(email) AS model
+FROM (
+ VALUES ('alice.johnson@gmail.com'),('bob.smith@yahoo.com'),('carol.davis@outlook.com'),
+ ('david.miller@example.com'),('emma.wilson@example.com'),('frank.brown@gmail.com'),
+ ('grace.lee@example.com'),('henry.clark@example.com'),('irene.torres@example.com'),
+ ('jack.white@example.com'),('karen.young@example.com'),('leo.turner@example.com'),
+ ('mia.scott@example.com'),('noah.harris@example.com'),('olivia.baker@example.com'),
+ ('paul.adams@example.com'),('quinn.foster@example.com'),('rachel.price@example.com'),
+ ('sam.carter@example.com'),('tina.evans@example.com')
+) AS t(email);
+
+-- 2) 同时生成姓名和邮箱,两列数据均保持与原始样本相似的分布特征;指定 seed 仅为了复现示例结果
+SELECT
+ markov_generate(n.model, '{"order":3,"sliding_window_size":12}', 3030, CONCAT('orig_', number)) AS fake_name,
+ markov_generate(e.model, '{"order":3,"sliding_window_size":12}', 3030, CONCAT('orig_', number, '@example.com')) AS fake_email
+FROM numbers(6)
+JOIN markov_name_model n
+JOIN markov_email_model e
+LIMIT 6;
+-- 样例输出
++-------------+-------------------------+
+| fake_name | fake_email |
++-------------+-------------------------+
+| Frank Brown | henry.clark@example |
+| Grace Johnso| quinn.foster@example |
+| Rachel | paul.adams@example |
+| Carol David | olivia.baker@example |
+| Jack White | frank.brown@gmail.com |
+| Noah Harris | race.johnson@example |
++-------------+-------------------------+
```
diff --git a/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md
new file mode 100644
index 0000000000..4b46378c13
--- /dev/null
+++ b/docs/cn/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md
@@ -0,0 +1,56 @@
+---
+title: OBFUSCATE
+---
+
+OBFUSCATE 表函数提供了一种快速生成匿名化数据的方法。对于更复杂的场景,建议直接使用底层函数 [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md)、[MARKOV_GENERATE](../19-data-anonymization-functions/markov_generate.md) 和 [FEISTEL_OBFUSCATE](../19-data-anonymization-functions/feistel_obfuscate.md)。该函数支持对 String、Integer 和 Float 类型的数据进行脱敏处理。
+
+:::note
+对于暂不支持的类型(如 Date),该函数将直接返回原始值,不做处理。
+:::
+
+## 语法
+
+```sql
+OBFUSCATE('<table>'[, seed => <seed>])
+```
+
+## 参数
+
+| 参数 | 描述 |
+| ----------- | ----------- |
+| `<table>` | 输入表。|
+| `seed` | 随机种子。|
+
+## 示例
+
+```sql
+CREATE OR REPLACE TABLE demo_customers AS
+SELECT *
+FROM (
+ VALUES
+ (1,'Alice Johnson','alice.johnson@gmail.com','555-123-0001','123 Maple St, Springfield, IL'),
+ (2,'Bob Smith','bob.smith@yahoo.com','555-123-0002','456 Oak Ave, Dayton, OH'),
+ (3,'Carol Davis','carol.davis@outlook.com','555-123-0003','789 Pine Rd, Austin, TX'),
+ (4,'David Miller','david.miller@example.com','555-123-0004','321 Birch Blvd, Denver, CO'),
+ (5,'Emma Wilson','emma.wilson@example.com','555-123-0005','654 Cedar Ln, Seattle, WA'),
+ (6,'Frank Brown','frank.brown@gmail.com','555-123-0006','987 Walnut Dr, Portland, OR'),
+ (7,'Grace Lee','grace.lee@example.com','555-123-0007','159 Ash Ct, Boston, MA'),
+ (8,'Henry Clark','henry.clark@example.com','555-123-0008','753 Elm St, Phoenix, AZ')
+) AS t(id, full_name, email, phone, address);
+
+-- 一键式表级脱敏;指定 seed 仅为了复现示例结果
+SELECT * FROM obfuscate(demo_customers, seed=>2025)
+ORDER BY id;
+
+-- 样例输出
+┌────id┬───────────────┬────────────────────────────────┬──────────────┬────────────────────────────────────┐
+│ 1 │ Alice Johnson │ emma.wilson@example.com │ 555-123-0002 │ 123 Maple St, Phoenix, AZ │
+│ 2 │ Alice Johnson │ grace.lee@example.com │ 555-123-0007 │ 753 Elm St, Phoenix, AZ │
+│ 3 │ David Miller │ frank.brown@gmail.com │ 555-123-0001 │ 321 Birch Blvd, Denver, │
+│ 4 │ Alice Johnson │ emma.wilson@example.com │ 555-123-0001 │ 654 Cedar Ln, Seattle, WA │
+│ 5 │ Grace Lee │ carol.david.miller@example │ 555-123-0003 │ 123 Maple St, Phoenix, AZ │
+│ 6 │ Carol David │ emma.wilson@example.com │ 555-123-0003 │ 654 Cedar Ln, Seattle, │
+│ 7 │ Emma Wilson │ bob.smith@yahoo.com │ 555-123-0004 │ 456 Oak Ave, Dayton, MA │
+│ 9 │ Carol David │ frank.brown@gmail.com │ 555-123-0006 │ 456 Oak Ave, Dayton, MA │
+└──────┴───────────────┴────────────────────────────────┴──────────────┴────────────────────────────────────┘
+```
diff --git a/docs/en/sql-reference/20-sql-functions/17-table-functions/index.md b/docs/en/sql-reference/20-sql-functions/17-table-functions/index.md
index 01e5324d38..384e2a982f 100644
--- a/docs/en/sql-reference/20-sql-functions/17-table-functions/index.md
+++ b/docs/en/sql-reference/20-sql-functions/17-table-functions/index.md
@@ -57,4 +57,4 @@ This page provides reference information for the table functions in Databend. Ta
| Function | Description | Example |
|----------|-------------|---------|
-| [OBFUSCATE](obfuscate.md) | dataset anonymization | `SELECT * FROM OBFUSCATE(users)` |
+| [OBFUSCATE](../19-data-anonymization-functions/obfuscate.md) | dataset anonymization | `SELECT * FROM OBFUSCATE(users)` |
diff --git a/docs/en/sql-reference/20-sql-functions/17-table-functions/obfuscate.md b/docs/en/sql-reference/20-sql-functions/17-table-functions/obfuscate.md
deleted file mode 100644
index 05351203e1..0000000000
--- a/docs/en/sql-reference/20-sql-functions/17-table-functions/obfuscate.md
+++ /dev/null
@@ -1,72 +0,0 @@
----
-title: OBFUSCATE
----
-
-Dataset anonymization. This is a quick tool, and for more complex scenarios, it is recommended to directly use the underlying function [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md), [MARKOV_GENERATE](../19-data-anonymization-functions/markov_generate.md), [FEISTEL_OBFUSCATE](../19-data-anonymization-functions/feistel_obfuscate.md).
-
-## Syntax
-
-```sql
-OBFUSCATE(''[, seed => ])
-```
-
-## Examples
-
-```sql
-create or replace table users as
-select * from (values
-(1, 'James Smith', 'james.smith@gmail.com', '123 Fake St, Anytown, CA 91234'),
-(2, 'Mary Johnson', 'mary.johnson@yahoo.com', '456 Fictional Ave, Springfield, IL 62704'),
-(3, 'John Williams', 'john.williams@outlook.com', '789 Imaginary Ln, Pleasantville, NY 10570'),
-(4, 'Patricia Brown', 'patricia.brown@hotmail.com', '101 Nonexistent Rd, Metropolis, KS 66666'),
-(5, 'Robert Jones', 'robert.jones@example.com', '222 Make Believe Dr, Smallville, OH 44688'),
-(6, 'Jennifer Garcia', 'jennifer.garcia@gmail.com', '333 Phantom Ct, Gotham, NJ 07005'),
-(7, 'Michael Miller', 'michael.miller@yahoo.com', '444 Unreal Blvd, Wonderland, TX 75001'),
-(8, 'Linda Davis', 'linda.davis@outlook.com', '555 Fabricated Way, Neverland, FL 32801'),
-(9, 'William Rodriguez', 'william.rodriguez@hotmail.com', '666 Bogus Pl, Oz, KS 67445'),
-(10, 'Elizabeth Martinez', 'elizabeth.martinez@example.com', '777 Sham Ln, Camelot, CA 90210'),
-(11, 'James Johnson', 'james.johnson@gmail.com', '888 Pretend Ave, Atlantis, GA 30303'),
-(12, 'Mary Williams', 'mary.williams@yahoo.com', '999 Simulated Rd, Utopia, MI 48009'),
-(13, 'John Brown', 'john.brown@outlook.com', '1010 Counterfeit St, El Dorado, AR 71730'),
-(14, 'Patricia Jones', 'patricia.jones@hotmail.com', '10 Counterfeit St, El Dorado, AR 71730'),
-(15, 'Robert Garcia', 'robert.garcia@example.com', '1111 Phony Ln, Shangri-La, CO 80014'),
-(16, 'Jennifer Miller', 'jennifer.miller@gmail.com', '1212 Artificial Dr, Rivendell, WA 98101'),
-(17, 'Michael Davis', 'michael.davis@yahoo.com', '1313 Spurious Ave, Narnia, TN 37201'),
-(18, 'Linda Rodriguez', 'linda.rodriguez@outlook.com', '1414 Pseudo Rd, Brigadoon, PA 19003'),
-(19, 'William Martinez', 'william.martinez@hotmail.com', '1515 Feigned St, Never Never Land, CA 90210'),
-(20, 'Elizabeth Smith', 'elizabeth.smith@example.com', '1616 Imitation Ln, Asgard, NY 10001'),
-(21, 'James Williams', 'james.williams@gmail.com', '1717 Simulated Ave, Middle Earth, OR 97006'),
-(22, 'Mary Brown', 'mary.brown@yahoo.com', '123 Fake St, Anytown, CA 91234'),
-(23, 'John Jones', 'john.jones@outlook.com', '456 Fictitious Ave, Springfield, IL 62704'),
-(24, 'Patricia Garcia', 'patricia.garcia@hotmail.com', '789 Illusion Ln, Pleasantville, NY 10570'),
-(25, 'Robert Miller', 'robert.miller@example.com', '101 Imaginary Rd, Metropolis, KS 66666'),
-(26, 'Jennifer Davis', 'jennifer.davis@gmail.com', '222 Make Believe Dr, Neverland, FL 33333'),
-(27, 'Michael Rodriguez', 'michael.rodriguez@yahoo.com', '333 Pretend Ct, Wonderland, TX 77777'),
-(28, 'Linda Martinez', 'linda.martinez@outlook.com', '444 Fabricated Blvd, Utopia, WA 98101'),
-(29, 'William Smith', 'william.smith@hotmail.com', '555 Sham Way, Mirage, AZ 85001'),
-(30, 'Elizabeth Johnson', 'elizabeth.johnson@example.com', '666 Bogus Pl, Fantasyland, GA 30303'),
-(31, 'James Brown', 'james.brown@gmail.com', '777 Unreal Ave, Dreamville, CO 80202'),
-(32, 'Mary Jones', 'mary.jones@yahoo.com', '888 Counterfeit Ln, Wishville, OH 44114'),
-(33, 'John Garcia', 'john.garcia@outlook.com', '999 Phony Rd, Delusion, MI 48075'),
-(34, 'Patricia Miller', 'patricia.miller@hotmail.com', '1010 Simulated St, Echo, NV 89109'),
-(35, 'Robert Davis', 'robert.davis@example.com', '1111 Spurious Ave, Replica, PA 19103'),
-(36, 'Jennifer Rodriguez', 'jennifer.rodriguez@gmail.com', '1212 Artificial Dr, Clone, NC 27601'),
-(37, 'Michael Martinez', 'michael.martinez@yahoo.com', '1313 Synthetic Ct, Duplicate, TN 37201'),
-(38, 'Linda Smith', 'linda.smith@outlook.com', '1414 Feigned Blvd, Imposter, IN 46204'),
-(39, 'William Johnson', 'william.johnson@hotmail.com', '1515 Pseudo Pl, Mimic, MN 55401'),
-(40, 'Elizabeth Williams', 'elizabeth.williams@example.com', '1616 Forged Way, Facsimile, AL 35203')
-) users(id, name, email, address);
-
-
-select * from obfuscate(users, seed=>10) limit 5 offset 20;
-╭────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ id │ name │ email │ address │
-│ Nullable(UInt64) │ Nullable(String) │ Nullable(String) │ Nullable(String) │
-├──────────────────┼───────────────────┼───────────────────────────┼─────────────────────────────────────────┤
-│ 21 │ William Rodriguez │ michael.davis@example.com │ 1212 Artificial Dr, Rivendell, WA 98101 │
-│ 16 │ Jennifer Garcia │ patricia.brown@gmail │ 1313 Spurious Ave, NC 27601 │
-│ 25 │ John Brown │ michael.martinez@example │ 1111 Phony Ln, Asgard, NY 10570 │
-│ 30 │ Mary Brown │ jennifer.garcia@gmail.com │ 222 Make Believe Dr, Clone, NC 27601 │
-│ 24 │ James Smith │ elizabeth.johnson@example │ 444 Fabricated St, Anytown, CA 90210 │
-╰────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
-```
diff --git a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
index 445d36043c..0babcd0743 100644
--- a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
+++ b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/feistel_obfuscate.md
@@ -2,7 +2,7 @@
title: FEISTEL_OBFUSCATE
---
-Transformed numbers for anonymization
+Deterministically obfuscate integers (e.g. IDs or phone numbers) while preserving bit length and value cardinality so joins still work.
## Syntax
@@ -40,4 +40,23 @@ SELECT feistel_obfuscate(10,1561819567875);
+------------------------------------------+
| 13 |
+------------------------------------------+
-```
\ No newline at end of file
+```
+
+Phone-number style example (seed = 4242):
+
+```sql
+SELECT 13000000000 + number AS phone,
+ feistel_obfuscate(13000000000 + number, 4242) AS masked_phone
+FROM numbers(5);
+
+-- Sample output
++-------------+--------------+
+| phone | masked_phone |
++-------------+--------------+
+| 13000000000 | 12221668677 |
+| 13000000001 | 10245458699 |
+| 13000000002 | 15398657780 |
+| 13000000003 | 9910824758 |
+| 13000000004 | 13299971128 |
++-------------+--------------+
+```
diff --git a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
index bc9a12234f..839a7379bd 100644
--- a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
+++ b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/index.md
@@ -2,9 +2,26 @@
title: Data Anonymization Functions
---
-This section provides functions used for data anonymization.
+Data anonymization is the process of altering or removing personally identifiable information (PII) from data sets to protect individual privacy. Its goal is to transform data so it cannot be linked back to specific individuals, while preserving the data's utility for analysis, research, and testing.
+
+### Common Data Categories for Anonymization
+
+Effective anonymization strategies typically target specific categories of sensitive data:
+
+* **Direct Identifiers (PII)**: Information that explicitly identifies a person, such as full names, email addresses, phone numbers, and government IDs.
+* **Indirect Identifiers (Quasi-Identifiers)**: Attributes that can identify individuals when combined with other data sources, such as dates of birth, gender, zip codes, or job titles.
+* **Sensitive Business Data**: Confidential information like financial transactions, salary details, or proprietary internal records that need protection in non-production environments.
+
+### Databend Anonymization Techniques
+
+Databend provides a set of functions to implement various anonymization techniques, including data masking, pseudonymization, and synthetic data generation:
+
+- **Data Masking**: Use the [`OBFUSCATE` table function](obfuscate.md) to automatically apply masking rules to columns, replacing original values with artificial ones that appear genuine.
+- **Pseudonymization**: Use [FEISTEL_OBFUSCATE](feistel_obfuscate.md) to replace identifiers with deterministic substitutes. This preserves data integrity and cardinality, making it suitable for maintaining join keys.
+- **Synthetic Data**: Use [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md) and [MARKOV_GENERATE](markov_generate.md) to produce machine-generated data that statistically resembles the original dataset but has no direct connection to real records.
| Function | Description |
|----------|-------------|
| [MARKOV_GENERATE](markov_generate.md) | Generate anonymized strings based on a Markov model |
| [FEISTEL_OBFUSCATE](feistel_obfuscate.md) | Obfuscate numbers using a Feistel cipher |
+| [OBFUSCATE](obfuscate.md) | Table-level masking using built-in rules |
diff --git a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
index eb827230d4..6700dde730 100644
--- a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
+++ b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/markov_generate.md
@@ -7,7 +7,7 @@ Using the model trained by [MARKOV_TRAIN](../07-aggregate-functions/aggregate-ma
## Syntax
```sql
-FEISTEL_OBFUSCATE( , , , )
+MARKOV_GENERATE( <model>, <params>, <seed>, <determinator> )
```
## Arguments
@@ -25,19 +25,43 @@ String.
## Examples
+Generate multiple PII-like columns (name + email) from small seed sets:
+
```sql
-create table model as
-select markov_train(concat('bar', number::string)) as bar from numbers(100);
-
-select markov_generate(bar,'{"order":5,"sliding_window_size":8}', 151, (number+100000)::string) as generate
-from numbers(5), model;
-+-----------+
-| generate |
-+-----------+
-│ bar95 │
-│ bar64 │
-│ bar85 │
-│ bar56 │
-│ bar95 │
-+-----------+
+-- 1) Train separate models on names and emails (PII text)
+CREATE TABLE markov_name_model AS
+SELECT markov_train(name) AS model
+FROM (
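+ VALUES ('Alice Johnson'),('Bob Smith'),('Carol Davis'),('David Miller'),('Emma Wilson'),('Frank Brown'),('Grace Lee'),('Henry Clark'),('Irene Torres'),('Jack White'),
+ ('Karen Young'),('Leo Turner'),('Mia Scott'),('Noah Harris'),('Olivia Baker'),('Paul Adams'),('Quinn Foster'),('Rachel Price'),('Sam Carter'),('Tina Evans')
+) AS t(name);
+
+CREATE TABLE markov_email_model AS
+SELECT markov_train(email) AS model
+FROM (
+ VALUES ('alice.johnson@gmail.com'),('bob.smith@yahoo.com'),('carol.davis@outlook.com'),('david.miller@example.com'),('emma.wilson@example.com'),
+ ('frank.brown@gmail.com'),('grace.lee@example.com'),('henry.clark@example.com'),('irene.torres@example.com'),('jack.white@example.com'),
+ ('karen.young@example.com'),('leo.turner@example.com'),('mia.scott@example.com'),('noah.harris@example.com'),('olivia.baker@example.com'),
+ ('paul.adams@example.com'),('quinn.foster@example.com'),('rachel.price@example.com'),('sam.carter@example.com'),('tina.evans@example.com')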
+) AS t(email);
+
+-- 2) Generate synthetic name + email pairs; seed keeps it reproducible
+SELECT
+ markov_generate(n.model, '{"order":3,"sliding_window_size":12}', 3030, CONCAT('orig_', number)) AS fake_name,
+ markov_generate(e.model, '{"order":3,"sliding_window_size":12}', 3030, CONCAT('orig_', number, '@example.com')) AS fake_email
+FROM numbers(6)
+JOIN markov_name_model n
+JOIN markov_email_model e
+LIMIT 6;
+-- Sample output
++----------------+-------------------------+
+| fake_name | fake_email |
++----------------+-------------------------+
+| Frank Brown | henry.clark@example |
+| Grace Johnso | quinn.foster@example |
+| Rachel | paul.adams@example |
+| Carol David | olivia.baker@example |
+| Jack White | frank.brown@gmail.com |
+| Noah Harris | race.johnson@example |
++----------------+-------------------------+
```
diff --git a/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md
new file mode 100644
index 0000000000..3333ab98cf
--- /dev/null
+++ b/docs/en/sql-reference/20-sql-functions/19-data-anonymization-functions/obfuscate.md
@@ -0,0 +1,45 @@
+---
+title: OBFUSCATE
+---
+
+OBFUSCATE is a table function for quick dataset anonymization. For more complex scenarios, use the underlying functions [MARKOV_TRAIN](../07-aggregate-functions/aggregate-markov-train.md), [MARKOV_GENERATE](../19-data-anonymization-functions/markov_generate.md), and [FEISTEL_OBFUSCATE](../19-data-anonymization-functions/feistel_obfuscate.md) directly.
+
+## Syntax
+
+```sql
+OBFUSCATE('<table>'[, seed => <seed>])
+```
+
+## Examples
+
+```sql
+CREATE OR REPLACE TABLE demo_customers AS
+SELECT *
+FROM (
+ VALUES
+ (1,'Alice Johnson','alice.johnson@gmail.com','555-123-0001','123 Maple St, Springfield, IL'),
+ (2,'Bob Smith','bob.smith@yahoo.com','555-123-0002','456 Oak Ave, Dayton, OH'),
+ (3,'Carol Davis','carol.davis@outlook.com','555-123-0003','789 Pine Rd, Austin, TX'),
+ (4,'David Miller','david.miller@example.com','555-123-0004','321 Birch Blvd, Denver, CO'),
+ (5,'Emma Wilson','emma.wilson@example.com','555-123-0005','654 Cedar Ln, Seattle, WA'),
+ (6,'Frank Brown','frank.brown@gmail.com','555-123-0006','987 Walnut Dr, Portland, OR'),
+ (7,'Grace Lee','grace.lee@example.com','555-123-0007','159 Ash Ct, Boston, MA'),
+ (8,'Henry Clark','henry.clark@example.com','555-123-0008','753 Elm St, Phoenix, AZ')
+) AS t(id, full_name, email, phone, address);
+
+-- One-call table masking; seed keeps it reproducible
+SELECT * FROM obfuscate(demo_customers, seed=>2025)
+ORDER BY id;
+
+-- Sample output
+┌────id┬───────────────┬────────────────────────────────┬──────────────┬────────────────────────────────────┐
+│ 1 │ Alice Johnson │ emma.wilson@example.com │ 555-123-0002 │ 123 Maple St, Phoenix, AZ │
+│ 2 │ Alice Johnson │ grace.lee@example.com │ 555-123-0007 │ 753 Elm St, Phoenix, AZ │
+│ 3 │ David Miller │ frank.brown@gmail.com │ 555-123-0001 │ 321 Birch Blvd, Denver, │
+│ 4 │ Alice Johnson │ emma.wilson@example.com │ 555-123-0001 │ 654 Cedar Ln, Seattle, WA │
+│ 5 │ Grace Lee │ carol.david.miller@example │ 555-123-0003 │ 123 Maple St, Phoenix, AZ │
+│ 6 │ Carol David │ emma.wilson@example.com │ 555-123-0003 │ 654 Cedar Ln, Seattle, │
+│ 7 │ Emma Wilson │ bob.smith@yahoo.com │ 555-123-0004 │ 456 Oak Ave, Dayton, MA │
+│ 9 │ Carol David │ frank.brown@gmail.com │ 555-123-0006 │ 456 Oak Ave, Dayton, MA │
+└──────┴───────────────┴────────────────────────────────┴──────────────┴────────────────────────────────────┘
+```