In [1]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [14]:
# 定义训练数据（假设是分词标注数据，每个句子用词列表表示，每个词有对应标签）
# 示例句子数据格式：[[(word1, label1), (word2, label2), ...], [(word1, label1), ...]]
train_sents = [
    [('我', 'O'), ('是', 'O'), ('学生', 'O')],
    [('你', 'O'), ('好', 'O')],
    [('今天', 'O'), ('天气', 'O'), ('很好', 'O')]
]

In [16]:
# 将句子中的词转化为特征
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'word': word,
        'is_first': i == 0,
        'is_last': i == len(sent) - 1,
    }
    return features

In [18]:
# 构建每个句子的特征
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

In [20]:
# 获取句子的标签
def sent2labels(sent):
    return [label for token, label in sent]

In [54]:
for s in train_sents:
    print(s)
    print(len(s))

[('我', 'O'), ('是', 'O'), ('学生', 'O')]
3
[('你', 'O'), ('好', 'O')]
2
[('今天', 'O'), ('天气', 'O'), ('很好', 'O')]
3


In [40]:
# 准备训练特征和标签
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]
print(X_train)
print(y_train)

[[{'word': '我', 'is_first': True, 'is_last': False}, {'word': '是', 'is_first': False, 'is_last': False}, {'word': '学生', 'is_first': False, 'is_last': True}], [{'word': '你', 'is_first': True, 'is_last': False}, {'word': '好', 'is_first': False, 'is_last': True}], [{'word': '今天', 'is_first': True, 'is_last': False}, {'word': '天气', 'is_first': False, 'is_last': False}, {'word': '很好', 'is_first': False, 'is_last': True}]]
[['O', 'O', 'O'], ['O', 'O'], ['O', 'O', 'O']]


In [24]:
# 初始化 CRF 模型并进行训练
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',  # 使用 L-BFGS 训练算法
    c1=0.1,  # 正则化系数 c1
    c2=0.1,  # 正则化系数 c2
    max_iterations=100,  # 最大迭代次数
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

In [25]:
# 测试数据
test_sents = [
    [('明天', 'O'), ('有', 'O'), ('雨', 'O')]
]
X_test = [sent2features(s) for s in test_sents]


In [27]:
# 进行预测
y_pred = crf.predict(X_test)
print("预测标签:", y_pred)

预测标签: [['O' 'O' 'O']]


In [28]:
# 评估模型效果
y_pred_train = crf.predict(X_train)
print("训练集 F1 分数:", metrics.flat_f1_score(y_train, y_pred_train, average='weighted'))


训练集 F1 分数: 1.0


In [60]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# 定义带有多种标签的训练数据
train_sents = [
    [('我', 'O'), ('是', 'O'), ('张三', 'B-PER'), ('的', 'O'), ('朋友', 'O')],
    [('OpenAI', 'B-ORG'), ('的', 'O'), ('模型', 'O')],
    [('今天', 'O'), ('天气', 'O'), ('很好', 'O')]
]

In [62]:
# 构建特征提取函数
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'word': word,
        'is_first': i == 0,
        'is_last': i == len(sent) - 1,
        'is_digit': word.isdigit(),
        'prefix-1': word[0],
        'suffix-1': word[-1],
    }
    if i > 0:
        features.update({
            'prev_word': sent[i-1][0],
            'prev_label': sent[i-1][1]
        })
    else:
        features['BOS'] = True  # Beginning of Sentence

    if i < len(sent)-1:
        features['next_word'] = sent[i+1][0]
    else:
        features['EOS'] = True  # End of Sentence

    return features

# 提取句子的所有词特征
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

# 提取句子的标签
def sent2labels(sent):
    return [label for token, label in sent]

In [64]:
# 准备训练特征和标签
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

# 初始化并训练 CRF 模型
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

In [73]:
print(X_train)
print(y_train)

[[{'word': '我', 'is_first': True, 'is_last': False, 'is_digit': False, 'prefix-1': '我', 'suffix-1': '我', 'BOS': True, 'next_word': '是'}, {'word': '是', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-1': '是', 'suffix-1': '是', 'prev_word': '我', 'prev_label': 'O', 'next_word': '张三'}, {'word': '张三', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-1': '张', 'suffix-1': '三', 'prev_word': '是', 'prev_label': 'O', 'next_word': '的'}, {'word': '的', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-1': '的', 'suffix-1': '的', 'prev_word': '张三', 'prev_label': 'B-PER', 'next_word': '朋友'}, {'word': '朋友', 'is_first': False, 'is_last': True, 'is_digit': False, 'prefix-1': '朋', 'suffix-1': '友', 'prev_word': '的', 'prev_label': 'O', 'EOS': True}], [{'word': 'OpenAI', 'is_first': True, 'is_last': False, 'is_digit': False, 'prefix-1': 'O', 'suffix-1': 'I', 'BOS': True, 'next_word': '的'}, {'word': '的', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-

In [75]:
# 测试数据
test_sents = [
    [('李四', 'B-PER'), ('去', 'O'), ('了', 'O'), ('北京', 'B-LOC')]
]
X_test = [sent2features(s) for s in test_sents]
print(X_test)
# 进行预测
y_pred = crf.predict(X_test)
print("预测标签:", y_pred)

[[{'word': '李四', 'is_first': True, 'is_last': False, 'is_digit': False, 'prefix-1': '李', 'suffix-1': '四', 'BOS': True, 'next_word': '去'}, {'word': '去', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-1': '去', 'suffix-1': '去', 'prev_word': '李四', 'prev_label': 'B-PER', 'next_word': '了'}, {'word': '了', 'is_first': False, 'is_last': False, 'is_digit': False, 'prefix-1': '了', 'suffix-1': '了', 'prev_word': '去', 'prev_label': 'O', 'next_word': '北京'}, {'word': '北京', 'is_first': False, 'is_last': True, 'is_digit': False, 'prefix-1': '北', 'suffix-1': '京', 'prev_word': '了', 'prev_label': 'O', 'EOS': True}]]
预测标签: [['O' 'O' 'O' 'O']]


In [77]:
y_pred_train = crf.predict(X_train)
print(y_pred_train)
print("训练集 F1 分数:", metrics.flat_f1_score(y_train, y_pred_train, average='weighted'))


[list(['O', 'O', 'B-PER', 'O', 'O']) list(['B-ORG', 'O', 'O'])
 list(['O', 'O', 'O'])]
训练集 F1 分数: 1.0
