In [9]:
#纯使用BERT模型版本

import tkinter as tk
from tkinter import filedialog, messagebox
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import numpy as np

# Load the pretrained BERT tokenizer and model (both from the 'bert-base-uncased' checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

def extract_pooled_keywords():
    """Button callback: run ``extract_keywords`` with the "pooled_output" method."""
    extract_keywords("pooled_output")

def extract_sequence_keywords():
    """Button callback: run ``extract_keywords`` with the "sequence_output" method."""
    extract_keywords("sequence_output")

def extract_keywords(method):
    """Score every token of a user-selected text file with BERT and list the results.

    The user picks a ``.txt`` file through a file dialog. The text is tokenized
    (truncated to at most 512 tokens), passed through the module-level BERT
    ``model``, and each non-special token is paired with a score. The pairs are
    written to ``result_text`` in descending score order.

    Args:
        method: ``"pooled_output"`` or ``"sequence_output"``.
            * ``"pooled_output"``: the token at position ``p`` receives component
              ``p % len(pooled)`` of the pooled output vector.
            * ``"sequence_output"``: the token at position ``p`` receives the mean
              of its last-hidden-state vector.

    Raises:
        ValueError: if ``method`` is not one of the two supported names. This is
            checked before any dialog is shown, so a typo in a caller fails fast
            instead of silently clearing the result box.

    Errors while reading or processing the file are reported in an error dialog.
    """
    if method not in ("pooled_output", "sequence_output"):
        raise ValueError(f"unknown extraction method: {method!r}")

    # Ask the user which file to process.
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])

    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Convert the text into BERT inputs and run the model.
        inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
        outputs = model(inputs)

        # Token bookkeeping shared by both scoring methods: remember the position
        # and text of every token that is not [CLS] / [SEP] / [PAD].
        token_ids = inputs['input_ids'][0].numpy()
        special_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
        kept = [
            (position, tokenizer.convert_ids_to_tokens([token_id])[0])
            for position, token_id in enumerate(token_ids)
            if token_id not in special_ids
        ]

        if method == "pooled_output":
            pooled_output = outputs.pooler_output.numpy()[0]
            # NOTE(review): this is a positional lookup into the pooled vector,
            # not a per-token relevance measure -- confirm this is intended.
            scores = [pooled_output[position % len(pooled_output)] for position, _ in kept]
        else:
            sequence_output = outputs.last_hidden_state.numpy()[0]
            word_scores = np.mean(sequence_output, axis=1)
            scores = [word_scores[position] for position, _ in kept]

        # Sort by score, highest first (stable, so ties keep document order).
        keywords = sorted(
            zip((token for _, token in kept), scores),
            key=lambda x: x[1],
            reverse=True,
        )

        # Replace the previous results with the new ranking.
        result_text.delete('1.0', tk.END)
        for keyword, score in keywords:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x400")  # set the window size

# Frame that holds the buttons
button_frame = tk.Frame(root)
button_frame.pack(pady=10)

# One button per extraction method
pooled_button = tk.Button(button_frame, text="使用池化输出提取", command=extract_pooled_keywords, width=20)
pooled_button.grid(row=0, column=0, padx=5)

sequence_button = tk.Button(button_frame, text="使用序列输出提取", command=extract_sequence_keywords, width=20)
sequence_button.grid(row=0, column=1, padx=5)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [4]:
pip install keybert

Collecting keybert
  Downloading keybert-0.8.4.tar.gz (29 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting sentence-transformers>=0.3.8 (from keybert)
  Downloading sentence_transformers-2.7.0-py3-none-any.whl.metadata (11 kB)
Collecting torch>=1.11.0 (from sentence-transformers>=0.3.8->keybert)
  Downloading torch-2.3.0-cp39-cp39-win_amd64.whl.metadata (26 kB)
Collecting sympy (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading sympy-1.12-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading networkx-3.2.1-py3-none-any.whl.metadata (5.2 kB)
Collecting mkl<=2021.4.0,>=2021.1.1 (from torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  Downloading mkl-2021.4.0-py2.py3-none-win_amd64.whl.metadata (1.4 kB)
Collecting intel-openmp==2021.* (from mkl<=2021.4.0,>=2021.1.1->torch>=1.11.0->sentence-transformers>=0.3.8->keybert)
  

In [10]:
#BERT模型仿照KeyBERT提取的改进版

import tkinter as tk
from tkinter import filedialog, messagebox
from transformers import BertTokenizer, TFBertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained BERT tokenizer and model (both from the 'bert-base-uncased' checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertModel.from_pretrained('bert-base-uncased')

def extract_keywords():
    """Rank the BERT tokens of a user-selected file by similarity to the document.

    Reads the two settings entries, asks the user for a ``.txt`` file, runs the
    text (truncated to at most 512 tokens) through the module-level BERT
    ``model`` and scores every non-special token by the cosine similarity
    between its last-hidden-state vector and the mean of all token vectors.
    The best-scoring tokens are written to ``result_text``, limited to the
    requested number of keywords.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        # NOTE(review): the n-gram entry is only validated here; its value is
        # never used by the ranking below.
        int(ngram_entry.get())
        top_n = int(num_keywords_entry.get())
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    chosen_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
    if not chosen_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(chosen_path, 'r', encoding='utf-8') as handle:
            document = handle.read()

        # Tokenize and run BERT; keep the token vectors of the single input.
        encoded = tokenizer(document, return_tensors='tf', truncation=True, padding=True, max_length=512)
        hidden_states = model(encoded).last_hidden_state.numpy()[0]

        # Collect the text and vector of every token that is not
        # [CLS] / [SEP] / [PAD] and whose position has a vector.
        skip_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
        tokens, vectors = [], []
        for position, token_id in enumerate(encoded['input_ids'][0].numpy()):
            if token_id in skip_ids or position >= len(hidden_states):
                continue
            tokens.append(tokenizer.convert_ids_to_tokens([token_id])[0])
            vectors.append(hidden_states[position])

        # Document embedding = mean over all token vectors.
        centroid = hidden_states.mean(axis=0).reshape(1, -1)
        scores = cosine_similarity(np.array(vectors), centroid).flatten()

        # Highest similarity first.
        ranked = list(zip(tokens, scores))
        ranked.sort(key=lambda pair: pair[1], reverse=True)

        # Show at most `top_n` entries (none when the limit is <= 0).
        result_text.delete('1.0', tk.END)
        for keyword, score in ranked[:max(top_n, 0)]:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x500")  # set the window size

# Frame that holds the settings widgets
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

# Entry fields for the n-gram length and the number of keywords
ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "1")
ngram_entry.grid(row=0, column=1, padx=5)

num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [3]:
#使用Sentence-transformer库版本

import tkinter as tk
from tkinter import filedialog, messagebox
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load the pretrained Sentence Transformer model ('all-MiniLM-L6-v2')
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_keywords():
    """Extract the top keyphrases of a user-selected file with Sentence Transformers.

    Candidate phrases are the 1..n-gram vocabulary of the file (English stop
    words removed) produced by ``CountVectorizer``. Each candidate and the whole
    document are embedded with the module-level ``model``; candidates are ranked
    by cosine similarity to the document embedding and the best
    ``num_keywords`` are written to ``result_text``.

    Both settings must be integers >= 1. They are checked before the file
    dialog opens, so an unusable value is reported immediately rather than as a
    generic processing error after the user has already picked a file.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        ngram_range = int(ngram_entry.get())
        num_keywords = int(num_keywords_entry.get())
        if ngram_range < 1 or num_keywords < 1:
            # Route non-positive values through the same error path as
            # non-numeric input.
            raise ValueError("settings must be positive integers")
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    # Ask the user which file to process.
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])

    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Candidate phrases: every 1..n-gram of the document.
        vectorizer = CountVectorizer(ngram_range=(1, ngram_range), stop_words='english').fit([text])
        candidates = vectorizer.get_feature_names_out()

        # Embed the candidates and the document itself (the query vector).
        candidate_embeddings = model.encode(candidates)
        doc_embedding = model.encode([text]).reshape(1, -1)

        # Rank candidates by cosine similarity to the document, best first.
        similarities = cosine_similarity(candidate_embeddings, doc_embedding).flatten()
        keywords = sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)

        # Show only the requested number of keywords.
        result_text.delete('1.0', tk.END)
        for keyword, score in keywords[:num_keywords]:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x500")  # set the window size

# Frame that holds the settings widgets
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

# Entry fields for the n-gram length and the number of keywords
ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "2")  # default: 2 words
ngram_entry.grid(row=0, column=1, padx=5)

num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


In [8]:
#同时使用两种模型的版本

import tkinter as tk
from tkinter import filedialog, messagebox
from transformers import BertTokenizer, TFBertModel
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained BERT tokenizer and model (both from the 'bert-base-uncased' checkpoint)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Load the Sentence Transformer model ('all-MiniLM-L6-v2')
sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_keywords_with_bert(text, num_keywords):
    """Rank the BERT tokens of ``text`` by similarity to the mean token vector.

    The text is tokenized (truncated to at most 512 tokens) and passed through
    the module-level ``bert_model``. Every token other than [CLS] / [SEP] /
    [PAD] is scored by the cosine similarity between its last-hidden-state
    vector and the mean of all token vectors.

    Args:
        text: document to analyse.
        num_keywords: upper bound applied by slicing the ranked list.

    Returns:
        A list of ``(token, score)`` pairs, highest score first.
    """
    encoded = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    hidden_states = bert_model(encoded).last_hidden_state.numpy()[0]

    # Gather the text and vector of every non-special token with a vector.
    skip_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id}
    tokens, vectors = [], []
    for position, token_id in enumerate(encoded['input_ids'][0].numpy()):
        if token_id in skip_ids or position >= len(hidden_states):
            continue
        tokens.append(tokenizer.convert_ids_to_tokens([token_id])[0])
        vectors.append(hidden_states[position])

    # Document embedding = mean over all token vectors.
    centroid = hidden_states.mean(axis=0).reshape(1, -1)
    scores = cosine_similarity(np.array(vectors), centroid).flatten()

    ranked = list(zip(tokens, scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:num_keywords]

def extract_keywords_with_sentence_transformer(text, num_keywords):
    """Rank the sentences of ``text`` by similarity to the mean sentence embedding.

    The text is split on ``'. '``; each segment is stripped of surrounding
    whitespace and blank segments are discarded so they are neither embedded
    nor returned. The remaining segments are embedded with the module-level
    ``sentence_model`` and ranked by cosine similarity to the mean of all
    segment embeddings.

    Args:
        text: document to analyse.
        num_keywords: upper bound applied by slicing the ranked list.

    Returns:
        A list of ``(sentence, score)`` pairs, highest score first. Empty when
        ``text`` contains no non-blank segment.
    """
    stripped = (segment.strip() for segment in text.split('. '))
    sentences = [segment for segment in stripped if segment]
    if not sentences:
        # Nothing to embed; avoid averaging over an empty set of vectors.
        return []

    embeddings = sentence_model.encode(sentences)

    doc_vector = np.mean(embeddings, axis=0).reshape(1, -1)
    similarities = cosine_similarity(embeddings, doc_vector).flatten()

    keywords = sorted(zip(sentences, similarities), key=lambda x: x[1], reverse=True)
    return keywords[:num_keywords]

def extract_keywords():
    """Button callback: extract keywords from a chosen file with the selected model.

    Reads the keyword count from ``num_keywords_entry``, asks the user for a
    ``.txt`` file, dispatches to the BERT or Sentence Transformer extractor
    according to ``extraction_method`` and writes the returned
    ``(keyword, score)`` pairs to ``result_text``.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        top_n = int(num_keywords_entry.get())
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    chosen_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
    if not chosen_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(chosen_path, 'r', encoding='utf-8') as handle:
            document = handle.read()

        # Anything other than "BERT" falls through to the Sentence Transformer path.
        if extraction_method.get() == "BERT":
            ranked = extract_keywords_with_bert(document, top_n)
        else:
            ranked = extract_keywords_with_sentence_transformer(document, top_n)

        result_text.delete('1.0', tk.END)
        for keyword, score in ranked:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("600x600")

# Frame that holds the settings widgets
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

# Shared variable for the radio buttons; "BERT" is selected initially
extraction_method = tk.StringVar(value="BERT")

bert_radio = tk.Radiobutton(settings_frame, text="BERT", variable=extraction_method, value="BERT")
bert_radio.grid(row=0, column=0, padx=5)

sentence_transformer_radio = tk.Radiobutton(settings_frame, text="Sentence Transformer", variable=extraction_method, value="Sentence Transformer")
sentence_transformer_radio.grid(row=0, column=1, padx=5)

# Entry field for the number of keywords (pre-filled with 5)
num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=20, width=70, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [11]:
#将Sentence-Transformer库包装后使用
import tkinter as tk
from tkinter import filedialog, messagebox
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load the pretrained Sentence Transformer model ('all-MiniLM-L6-v2')
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_keywords():
    """Extract the top keyphrases of a user-selected file with Sentence Transformers.

    Steps:
      1. Read the n-gram length and keyword count from the settings entries.
         Both must be integers >= 1; they are checked before the file dialog
         opens so that an unusable value is reported immediately instead of
         surfacing as a processing error after a file has been chosen.
      2. Let the user choose a ``.txt`` file and read it as UTF-8.
      3. Build candidate phrases with ``CountVectorizer`` (1..n-grams, English
         stop words removed).
      4. Embed candidates and document with the module-level ``model`` and rank
         candidates by cosine similarity to the document embedding.
      5. Write the best ``num_keywords`` entries to ``result_text``.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        ngram_range = int(ngram_entry.get())
        num_keywords = int(num_keywords_entry.get())
        if min(ngram_range, num_keywords) < 1:
            # Treat zero/negative values like non-numeric input.
            raise ValueError("settings must be positive integers")
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    # Ask the user which file to process.
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])

    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Candidate phrases: every 1..n-gram of the document.
        vectorizer = CountVectorizer(ngram_range=(1, ngram_range), stop_words='english').fit([text])
        candidates = vectorizer.get_feature_names_out()

        # Embed the candidates and the document itself (the query vector).
        candidate_embeddings = model.encode(candidates)
        doc_embedding = model.encode([text]).reshape(1, -1)

        # Rank candidates by cosine similarity to the document, best first.
        similarities = cosine_similarity(candidate_embeddings, doc_embedding).flatten()
        keywords = sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)

        # Show only the requested number of keywords.
        result_text.delete('1.0', tk.END)
        for keyword, score in keywords[:num_keywords]:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x500")  # set the window size

# Frame that holds the settings widgets
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

# Entry fields for the n-gram length and the number of keywords
ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "2")  # default: 2 words
ngram_entry.grid(row=0, column=1, padx=5)

num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


In [12]:
pip install tensorflow tensorflow-hub


Note: you may need to restart the kernel to use updated packages.


In [15]:
import tkinter as tk
from tkinter import filedialog, messagebox
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the pretrained Universal Sentence Encoder model from TF Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def extract_keywords():
    """Extract the top keyphrases of a user-selected file with the Universal Sentence Encoder.

    Candidate phrases are the 1..n-gram vocabulary of the file (English stop
    words removed) produced by ``CountVectorizer``. Candidates and the whole
    document are embedded with the module-level ``embed`` model, candidates are
    ranked by cosine similarity to the document embedding, and the best
    ``num_keywords`` are written to ``result_text``.

    Both settings must be integers >= 1. They are checked before the file
    dialog opens, so an unusable value is reported immediately rather than as a
    generic processing error after the user has already picked a file.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        ngram_range = int(ngram_entry.get())
        num_keywords = int(num_keywords_entry.get())
        if ngram_range < 1 or num_keywords < 1:
            # Route non-positive values through the same error path as
            # non-numeric input.
            raise ValueError("settings must be positive integers")
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])
    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Candidate phrases: every 1..n-gram of the document.
        vectorizer = CountVectorizer(ngram_range=(1, ngram_range), stop_words='english').fit([text])
        candidates = vectorizer.get_feature_names_out()

        # Convert the encoder outputs to NumPy arrays for scikit-learn.
        candidate_embeddings = embed(candidates).numpy()
        doc_embedding = embed([text]).numpy().reshape(1, -1)

        # Rank candidates by cosine similarity to the document, best first.
        similarities = cosine_similarity(candidate_embeddings, doc_embedding).flatten()
        keywords = sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)

        # Show only the requested number of keywords.
        result_text.delete('1.0', tk.END)
        for keyword, score in keywords[:num_keywords]:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x500")

# Settings: n-gram length (pre-filled with 2) and number of keywords (pre-filled with 5)
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)
ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "2")
ngram_entry.grid(row=0, column=1, padx=5)
num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)
# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)
# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)
# Start the Tk main loop
root.mainloop()


In [None]:
#使用Universal Sentence Encoder的模型

import tkinter as tk
from tkinter import filedialog, messagebox
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Load the pretrained Universal Sentence Encoder model from TF Hub
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

def extract_keywords():
    """Extract the top keyphrases of a user-selected file with the Universal Sentence Encoder.

    Candidate phrases are the 1..n-gram vocabulary of the file (English stop
    words removed) produced by ``CountVectorizer``. Candidates and the whole
    document are embedded with the module-level ``embed`` model, candidates are
    ranked by cosine similarity to the document embedding, and the best
    ``num_keywords`` are written to ``result_text``.

    Both settings must be integers >= 1. They are checked before the file
    dialog opens, so an unusable value is reported immediately rather than as a
    generic processing error after the user has already picked a file.

    Invalid settings, a cancelled dialog and processing errors are each
    reported through a message box; nothing is returned.
    """
    try:
        ngram_range = int(ngram_entry.get())
        num_keywords = int(num_keywords_entry.get())
        if ngram_range < 1 or num_keywords < 1:
            # Route non-positive values through the same error path as
            # non-numeric input.
            raise ValueError("settings must be positive integers")
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    # Ask the user which file to process.
    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])

    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        # Candidate phrases: every 1..n-gram of the document.
        vectorizer = CountVectorizer(ngram_range=(1, ngram_range), stop_words='english').fit([text])
        candidates = vectorizer.get_feature_names_out()

        # Hand scikit-learn plain NumPy arrays rather than the encoder's raw
        # output objects.
        candidate_embeddings = embed(candidates).numpy()
        doc_embedding = embed([text]).numpy().reshape(1, -1)

        # Rank candidates by cosine similarity to the document, best first.
        similarities = cosine_similarity(candidate_embeddings, doc_embedding).flatten()
        keywords = sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)

        # Show only the requested number of keywords.
        result_text.delete('1.0', tk.END)
        for keyword, score in keywords[:num_keywords]:
            result_text.insert(tk.END, f"{keyword}: {score:.4f}\n")
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Create the main window
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("500x500")  # set the window size

# Frame that holds the settings widgets
settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

# Entry fields for the n-gram length and the number of keywords
ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "2")  # default: 2 words
ngram_entry.grid(row=0, column=1, padx=5)

num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# Button that triggers keyword extraction
extract_button = tk.Button(root, text="提取关键词", command=extract_keywords, width=20)
extract_button.pack(pady=10)

# Result text box with an attached vertical scrollbar
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar.set)
scrollbar.config(command=result_text.yview)
scrollbar.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()


In [None]:
#终版前的最后一版，已经完成了三项模型的整合，并实现了高亮标注功能

import tkinter as tk
from tkinter import filedialog, messagebox
from sentence_transformers import SentenceTransformer
from transformers import BertTokenizer, TFBertModel
import tensorflow_hub as hub
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# Load the three pretrained models (Sentence Transformer, BERT, Universal Sentence Encoder)
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')
use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Module-level variable holding the text of the most recently loaded file
original_text = ""

def highlight_keywords(keywords):
    """Highlight every occurrence of each keyword in the ``result_text`` widget.

    Existing "highlight" tags are removed first. Each keyword is then searched
    case-insensitively from the start of the widget to its end, and every match
    is tagged with "highlight" (yellow background, black foreground).

    Args:
        keywords: iterable of ``(keyword, score)`` pairs; only the keyword text
            is used. Empty keywords are skipped.
    """
    result_text.tag_remove("highlight", "1.0", tk.END)  # clear previous highlights
    for keyword, _ in keywords:
        if not keyword:
            # A zero-length keyword makes `end` equal to `start`, so the search
            # position could never advance and the loop below would not finish.
            continue
        start = "1.0"
        while True:
            start = result_text.search(keyword, start, stopindex=tk.END, nocase=True)
            if not start:
                break
            # The match spans len(keyword) characters from its start index.
            end = f"{start}+{len(keyword)}c"
            result_text.tag_add("highlight", start, end)
            start = end  # continue searching after this match
    result_text.tag_config("highlight", background="yellow", foreground="black")

def _rank_tokens_with_bert(text):
    """Return the BERT tokens of ``text`` ranked by similarity to the mean token vector."""
    inputs = bert_tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=512)
    sequence_output = bert_model(inputs).last_hidden_state.numpy()[0]

    # Keep the position and text of every token that is not [CLS] / [SEP] / [PAD]
    # and whose position has a vector.
    special_ids = {bert_tokenizer.cls_token_id, bert_tokenizer.sep_token_id, bert_tokenizer.pad_token_id}
    kept = [
        (position, bert_tokenizer.convert_ids_to_tokens([token_id])[0])
        for position, token_id in enumerate(inputs['input_ids'][0].numpy())
        if token_id not in special_ids and position < len(sequence_output)
    ]

    word_vectors = np.array([sequence_output[position] for position, _ in kept])
    doc_vector = np.mean(sequence_output, axis=0).reshape(1, -1)
    similarities = cosine_similarity(word_vectors, doc_vector).flatten()
    return sorted(zip((token for _, token in kept), similarities), key=lambda x: x[1], reverse=True)


def _rank_candidates_by_embedding(text, ngram_range, method):
    """Return the 1..n-gram candidates of ``text`` ranked by similarity to the document embedding."""
    vectorizer = CountVectorizer(ngram_range=(1, ngram_range), stop_words='english').fit([text])
    candidates = vectorizer.get_feature_names_out()

    if method == 'sentence_transformers':
        candidate_embeddings = sentence_transformer_model.encode(candidates)
        doc_embedding = sentence_transformer_model.encode([text]).reshape(1, -1)
    else:
        # Universal Sentence Encoder: convert its outputs to NumPy arrays
        # before handing them to scikit-learn.
        candidate_embeddings = use_model(candidates).numpy()
        doc_embedding = use_model([text]).numpy().reshape(1, -1)

    similarities = cosine_similarity(candidate_embeddings, doc_embedding).flatten()
    return sorted(zip(candidates, similarities), key=lambda x: x[1], reverse=True)


def _show_keywords(keywords, num_keywords):
    """Show the loaded text with the top keywords highlighted and list their scores."""
    top_keywords = keywords[:num_keywords]

    # Upper pane: the original text with highlights.
    result_text.delete('1.0', tk.END)
    result_text.insert(tk.END, original_text)
    highlight_keywords(top_keywords)

    # Lower pane: one "keyword: score" line per result.
    keyword_results.delete('1.0', tk.END)
    for keyword, score in top_keywords:
        keyword_results.insert(tk.END, f"{keyword}: {score:.4f}\n")


def extract_keywords(method):
    """Extract keywords from a user-selected file with the chosen model and display them.

    Args:
        method: ``'sentence_transformers'``, ``'universal_sentence_encoder'``
            or ``'bert'``. The first two rank ``CountVectorizer`` n-gram
            candidates by embedding similarity to the document; ``'bert'``
            ranks individual BERT tokens and does not use the n-gram
            candidates at all, so they are not computed on that path.

    Both settings entries must hold integers >= 1; this is checked before the
    file dialog opens. The file text is stored in the module-level
    ``original_text``. The text is shown in ``result_text`` with the top
    keywords highlighted, and the scores are listed in ``keyword_results``.

    Invalid settings, a cancelled dialog and processing errors (including an
    unknown ``method``) are each reported through a message box; nothing is
    returned.
    """
    global original_text
    try:
        ngram_range = int(ngram_entry.get())
        num_keywords = int(num_keywords_entry.get())
        if ngram_range < 1 or num_keywords < 1:
            # Route non-positive values through the same error path as
            # non-numeric input.
            raise ValueError("settings must be positive integers")
    except ValueError:
        messagebox.showerror("错误", "请输入有效的整数")
        return

    file_path = filedialog.askopenfilename(filetypes=[("Text files", "*.txt")])

    if not file_path:
        messagebox.showwarning("警告", "未选择任何文件")
        return

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            original_text = file.read()

        if method == 'bert':
            keywords = _rank_tokens_with_bert(original_text)
        elif method in ('sentence_transformers', 'universal_sentence_encoder'):
            keywords = _rank_candidates_by_embedding(original_text, ngram_range, method)
        else:
            # Fail with a clear message instead of an unbound-variable error.
            raise ValueError(f"unknown extraction method: {method!r}")

        _show_keywords(keywords, num_keywords)
    except Exception as e:
        messagebox.showerror("错误", f"处理文件时出现问题：{e}")

# Main window and settings widgets
root = tk.Tk()
root.title("关键词提取工具")
root.geometry("600x800")

settings_frame = tk.Frame(root)
settings_frame.pack(pady=10)

ngram_label = tk.Label(settings_frame, text="关键词长度 (n-gram):")
ngram_label.grid(row=0, column=0, padx=5)
ngram_entry = tk.Entry(settings_frame, width=5)
ngram_entry.insert(0, "2")
ngram_entry.grid(row=0, column=1, padx=5)

num_keywords_label = tk.Label(settings_frame, text="提取关键词数量:")
num_keywords_label.grid(row=1, column=0, padx=5)
num_keywords_entry = tk.Entry(settings_frame, width=5)
num_keywords_entry.insert(0, "5")
num_keywords_entry.grid(row=1, column=1, padx=5)

# One button per extraction method
tk.Button(root, text="Sentence Transformers", command=lambda: extract_keywords('sentence_transformers'), width=20).pack(pady=5)
tk.Button(root, text="Universal Sentence Encoder", command=lambda: extract_keywords('universal_sentence_encoder'), width=20).pack(pady=5)
tk.Button(root, text="BERT", command=lambda: extract_keywords('bert'), width=20).pack(pady=5)

# Display area for the original text
text_frame = tk.Frame(root)
text_frame.pack(pady=10)
scrollbar1 = tk.Scrollbar(text_frame, orient="vertical")
result_text = tk.Text(text_frame, height=15, width=60, yscrollcommand=scrollbar1.set)
scrollbar1.config(command=result_text.yview)
scrollbar1.pack(side="right", fill="y")
result_text.pack(side="left", fill="both", expand=True)

# Display area for the extracted keywords
keyword_frame = tk.Frame(root)
keyword_frame.pack(pady=10)
scrollbar2 = tk.Scrollbar(keyword_frame, orient="vertical")
keyword_results = tk.Text(keyword_frame, height=10, width=60, yscrollcommand=scrollbar2.set)
scrollbar2.config(command=keyword_results.yview)
scrollbar2.pack(side="right", fill="y")
keyword_results.pack(side="left", fill="both", expand=True)

# Start the Tk main loop
root.mainloop()
