In [1]:
import random
import nltk
# Download the "wordnet" and "omw-1.4" NLTK data packages.
# If the nltk_data directory does not exist yet, it is created automatically
# under the user's home directory.
nltk.download("wordnet")
nltk.download("omw-1.4")
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [1]:
import random
import re

import nltk
from nltk.corpus import wordnet

def get_synonyms(word: str) -> list:
    """Return the WordNet synonyms of ``word``, excluding the word itself.

    Every lemma of every synset returned by ``wordnet.synsets(word)`` is
    collected. Underscores in lemma names are replaced by spaces, and names
    that equal ``word`` (compared case-insensitively) are dropped.

    Args:
        word: The word to look up.

    Returns:
        A de-duplicated, alphabetically sorted list of synonym strings.
        The list is empty when no synset yields a different name.
    """
    names = {
        lemma.name().replace("_", " ")
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }
    lowered = word.lower()
    # Sort instead of returning list(set): set iteration order for strings can
    # change between interpreter runs, which would make a seeded
    # random.choice() over this list non-reproducible.
    return sorted(name for name in names if name.lower() != lowered)

def synonym_replacement(sentence: str, n: int = 1) -> str:
    """Replace up to ``n`` distinct words of ``sentence`` with a random synonym.

    The sentence is tokenized on whitespace. A token is replaceable when
    ``get_synonyms`` returns at least one synonym for it; no stop-word or
    length filtering is applied. Up to ``n`` replaceable tokens are picked at
    random, and every occurrence of a picked token is replaced by one randomly
    chosen synonym. Replacement works on whole tokens only, so a short word
    never rewrites part of a longer one, and the original whitespace is kept.

    Args:
        sentence: Whitespace-separated text. Text without whitespace (for
            example unsegmented Chinese) is treated as a single token.
        n: Maximum number of distinct words to replace. Values <= 0 leave the
            sentence unchanged.

    Returns:
        The sentence with the selected words replaced, or the original
        sentence when nothing can be replaced.
    """
    if n <= 0:
        return sentence

    # dict.fromkeys de-duplicates while keeping first-seen order, so a seeded
    # `random` shuffles the same candidate list on every run. Synonyms are
    # looked up once per distinct token and reused below.
    synonyms_by_word = {}
    for token in dict.fromkeys(sentence.split()):
        found = get_synonyms(token)
        if found:
            synonyms_by_word[token] = found

    candidates = list(synonyms_by_word)
    random.shuffle(candidates)
    replacements = {
        token: random.choice(synonyms_by_word[token]) for token in candidates[:n]
    }
    if not replacements:
        return sentence

    # A single pass over whole tokens: an inserted synonym is never itself
    # replaced, and substrings of other words are never touched.
    return re.sub(
        r"\S+",
        lambda match: replacements.get(match.group(0), match.group(0)),
        sentence,
    )


In [2]:
text = "患者出现持续性头痛和恶心"
print(synonym_replacement(text, n=3))
# NOTE(review): the recorded output of this cell is the input sentence unchanged.
# The sentence contains no whitespace, so the split() inside synonym_replacement
# presumably yields one token for which no synonym is found -- confirm.
# A replacement such as "患者发生持续性头痛和作呕" would need the text to be
# segmented into words first and a synonym source that covers Chinese.


患者出现持续性头痛和恶心


In [None]:
random.choice(seq)
📌 作用：从一个序列中随机选一个元素
import random
items = ['apple', 'banana', 'cherry']
print(random.choice(items))  # 输出可能是 'banana'

In [None]:
✅ random.randint(a, b)
📌 作用：生成一个闭区间 [a, b] 内的随机整数（a 和 b 都可能被取到）
print(random.randint(1, 10))  # 输出 1~10 之间的整数

In [None]:
✅ random.sample(seq, k)
📌 作用：从序列中无放回地随机抽取 k 个元素，返回一个新列表（同一位置的元素不会被重复抽到；k 大于序列长度时抛出 ValueError）
nums = [1, 2, 3, 4, 5]
print(random.sample(nums, 3))  # 例如 [2, 5, 1]

In [None]:
✅ random.random()
📌 作用：生成一个 [0.0, 1.0) 区间内的随机浮点数（包含 0.0，不包含 1.0）

In [None]:
✅ 2. 另外检查 eda_augment() 中是否是按空格切词
EDA 代码通常假设输入是英文词列表，通过空格分词：

words = sentence.split()
但对于中文，你需要使用更合适的分词器，比如 jieba：

import jieba

words = list(jieba.cut(sentence))
否则 "糖尿病患者需要监测血糖水平" 会被认为是一个词而非多个词，结果 len(words) == 1，就会触发上面这个错误。

In [7]:
import jieba

def random_deletion(words, p=0.1):
    """Randomly delete tokens from ``words``, each with probability ``p``.

    Args:
        words: List of tokens. It is not modified.
        p: Per-token deletion probability. ``p <= 0`` keeps every token and
            ``p >= 1`` deletes every token before the fallback below applies.

    Returns:
        A new list with the surviving tokens in their original order. Inputs
        with at most one token are returned as an unchanged copy. If every
        token of a longer input is deleted, one randomly chosen token is
        returned so the result is never empty for non-empty input.
    """
    if len(words) <= 1:
        return list(words)

    # random.random() is in [0.0, 1.0), so `>= p` deletes with probability
    # exactly p and can never delete anything when p == 0.
    kept = [token for token in words if random.random() >= p]
    if not kept:
        # Everything was dropped: fall back to a single random token rather
        # than handing an empty sentence to the caller.
        return [random.choice(words)]
    return kept

def random_swap(words, n_swaps=1):
    """Swap two randomly chosen positions of ``words``, ``n_swaps`` times.

    Args:
        words: List of tokens. It is not modified.
        n_swaps: Number of swaps to perform. Each swap picks two distinct
            positions with ``random.sample``.

    Returns:
        A new list containing the same tokens, possibly in a different order.
        Inputs with fewer than two tokens are returned as an unchanged copy.
    """
    swapped = list(words)
    size = len(swapped)
    # Two distinct positions are all random.sample needs, so a two-token
    # sentence can be swapped as well; only shorter inputs are skipped.
    if size < 2:
        return swapped
    for _ in range(n_swaps):
        first, second = random.sample(range(size), 2)
        swapped[first], swapped[second] = swapped[second], swapped[first]
    return swapped

def random_insertion(words, n_insert=1):
    """Insert up to ``n_insert`` random synonyms at random positions.

    Each round picks a random token that has at least one synonym according to
    ``get_synonyms``, picks one of its synonyms at random, and inserts that
    synonym at a random position. Tokens inserted in earlier rounds can be
    picked in later rounds. The loop stops early when no token has a synonym.

    Args:
        words: List of tokens. It is not modified.
        n_insert: Maximum number of synonyms to insert.

    Returns:
        A new list with the original tokens in their original relative order
        plus the inserted synonyms.
    """
    new_words = list(words)
    synonym_cache = {}

    def lookup(token):
        # Memoize per token: the candidate scan below runs every round, and
        # repeating the synonym lookup for unchanged tokens is wasted work.
        if token not in synonym_cache:
            synonym_cache[token] = get_synonyms(token)
        return synonym_cache[token]

    for _ in range(n_insert):
        candidates = [token for token in new_words if lookup(token)]
        if not candidates:
            break
        chosen = random.choice(candidates)
        synonym = random.choice(lookup(chosen))
        # randint is inclusive on both ends, so the synonym may also be
        # appended after the last token.
        position = random.randint(0, len(new_words))
        new_words.insert(position, synonym)
    return new_words

def eda_augment(sentence: str, alpha_sr=0.1, alpha_ri=0.1,
                alpha_rs=0.1, p_rd=0.1, num_aug=1):
    """Generate ``num_aug`` EDA-augmented variants of ``sentence``.

    The sentence is tokenized on whitespace, so text without whitespace (for
    example unsegmented Chinese) is treated as a single token. Each round
    produces four variants, one per operation: synonym replacement, random
    insertion, random swap and random deletion. Rounds are repeated until at
    least ``num_aug`` variants exist; the variants are then shuffled and the
    first ``num_aug`` are returned.

    Args:
        sentence: Input text.
        alpha_sr: Fraction of tokens to replace with synonyms. The count is
            ``max(1, int(alpha_sr * len(words)))``.
        alpha_ri: Fraction of tokens that sets the number of insertions,
            computed the same way.
        alpha_rs: Fraction of tokens that sets the number of swaps, computed
            the same way.
        p_rd: Per-token deletion probability for random deletion.
        num_aug: Number of augmented sentences to return. Values <= 0 give an
            empty list.

    Returns:
        A list of ``num_aug`` strings whose tokens are joined by single
        spaces. Variants may be identical to each other or to the input when
        an operation had nothing to change.
    """
    if num_aug <= 0:
        # A negative slice bound would otherwise return all but the last
        # variants instead of none.
        return []

    words = sentence.split()
    n_sr = max(1, int(alpha_sr * len(words)))
    n_ri = max(1, int(alpha_ri * len(words)))
    n_rs = max(1, int(alpha_rs * len(words)))

    augmented = []
    # One round yields four variants; keep going so requests for more than
    # four samples are honoured. For num_aug <= 4 exactly one round runs.
    while len(augmented) < num_aug:
        # 1) synonym replacement (re-split so spacing matches the other variants)
        augmented.append(" ".join(synonym_replacement(sentence, n_sr).split()))
        # 2) random insertion
        augmented.append(" ".join(random_insertion(words, n_ri)))
        # 3) random swap
        augmented.append(" ".join(random_swap(words, n_rs)))
        # 4) random deletion
        augmented.append(" ".join(random_deletion(words, p_rd)))

    # Shuffle so the returned subset is not biased towards one operation.
    random.shuffle(augmented)
    return augmented[:num_aug]


In [9]:
# Demo: print two augmented variants of an English sentence.
print(
    eda_augment(
        "People with diabetes need to monitor their blood sugar levels",
        alpha_sr=0.2,
        p_rd=0.1,
        num_aug=2,
    )
)

['masses with diabetes need to monitor their blood lolly levels', 'raze People with diabetes need to monitor their blood sugar levels']
