In [1]:
import os
import jieba
import time
from collections import defaultdict
input_dir = r'D:\\code\\natural_language_processing\\lab3\\dataset\\article'
output_file = r'D:\\code\\natural_language_processing\\lab3\\output.txt'
inverted_index = defaultdict(list)

def build_inverted_index():
    for filename in os.listdir(input_dir):
        if filename.endswith('.txt'):
            file_path = os.path.join(input_dir, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                words = jieba.lcut(content)
                # 去除标点、数字、单字
                filtered_words = [word for word in words if len(word) > 1 and word.isalpha()]
                # 统计词频
                word_count = defaultdict(int)
                for word in filtered_words:
                    word_count[word] += 1
                for word, count in word_count.items():
                    inverted_index[word].append((int(filename[:-4]), count))

def write_inverted_index_to_file():
    with open(output_file, 'w', encoding='utf-8') as file:
        for word, doc_counts in inverted_index.items():
            sorted_doc_counts = sorted(doc_counts, key=lambda x: (-x[1], x[0]))
            file.write(f"{word}: {', '.join(f'({doc_id},{count})' for doc_id, count in sorted_doc_counts)}\n")

def search_word(word):
    start_time = time.perf_counter()  
    if word in inverted_index:
        sorted_doc_counts = sorted(inverted_index[word], key=lambda x: (-x[1], x[0]))
        end_time = time.perf_counter()  
        print(f"查询“{word}”花费时间: {end_time - start_time:.6f} 秒")
        for doc_id, count in sorted_doc_counts:
            print(f"文档{doc_id}共出现{count}次")
    else:
        end_time = time.perf_counter()  
        print(f"查询“{word}”花费时间: {end_time - start_time:.6f} 秒")
        print("未找到该词")


def search_multiple_words(query):
    start_time = time.perf_counter()
    if '&' in query and '|' in query:
        print("输入错误")
        return
    
    if '&' in query:
        words = query.split('&')
        operator = '&'
    elif '|' in query:
        words = query.split('|')
        operator = '|'
    else:
        search_word(query)
        return
    
    # 获取每个词的倒排索引集合，只提取 doc_id
    word_indices = {word: {doc_id for doc_id, _ in inverted_index[word]} for word in words if word in inverted_index}
    
    if not word_indices:  # 如果没有找到任何词
        print(f"查询“{query}”花费时间: {time.perf_counter() - start_time:.6f} 秒")
        print("未找到该词")
        return
    
    # 根据操作符计算结果
    if operator == '&':
        # 找到所有词都存在的文档ID
        common_doc_ids = set.intersection(*word_indices.values())
        if not common_doc_ids:
            print(f"查询“{query}”花费时间: {time.perf_counter() - start_time:.6f} 秒")
            print("未找到该词")
            return
        
        # 计算每个文档的总词频
        doc_count = defaultdict(int)
        for doc_id in common_doc_ids:
            total_count = sum(count for word in words for doc_id2, count in inverted_index[word] if doc_id2 == doc_id)
            doc_count[doc_id] = total_count
    
    elif operator == '|':
        # 找到所有词存在的文档ID
        all_doc_ids = set.union(*word_indices.values())
        
        # 统计每个文档的总词频
        doc_count = defaultdict(int)
        for word in words:
            for doc_id, count in inverted_index[word]:
                doc_count[doc_id] += count
    
    # 按照词频降序排序，如果词频相同则按文档 ID 升序排序
    sorted_doc_counts = sorted(doc_count.items(), key=lambda x: (-x[1], x[0]))
    
    end_time = time.perf_counter()
    print(f"查询“{query}”花费时间: {end_time - start_time:.6f} 秒")
    if not sorted_doc_counts:  # 如果没有匹配的文档
        print("未找到该词")
    else:
        for doc_id, count in sorted_doc_counts:
            print(f"文档{doc_id}共出现{count}次")

if __name__ == "__main__":
    try:
        build_inverted_index()
        print("倒排索引构建完成")
        write_inverted_index_to_file()
        print("结果已写入输出文件")
    
        while True:
            user_input = input("请输入要检索的词或字（输入 'exit' 退出）: ").strip()
            if user_input.lower() == 'exit':
                print("退出检索")
                break
            if not user_input:
                print("输入不能为空，请重新输入。")
                continue
            search_multiple_words(user_input)
    except Exception as e:
        print(f"发生错误: {e}")

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\TANGYI~1\AppData\Local\Temp\jieba.cache
Loading model cost 0.454 seconds.
Prefix dict has been built successfully.


倒排索引构建完成
结果已写入输出文件
查询“走廊&建成”花费时间: 0.000028 秒
文档1共出现5次
查询“走廊”花费时间: 0.000008 秒
文档1共出现4次
文档4共出现2次
查询“走廊|建成”花费时间: 0.000028 秒
文档1共出现5次
文档4共出现2次
文档5共出现1次
查询“走廊&唐屹”花费时间: 0.000029 秒
文档1共出现4次
文档4共出现2次
退出检索
