In [59]:
from openai import OpenAI
import re
import json
from tqdm import tqdm
import os

In [60]:
# Make sure the directory that the report writers below save into exists;
# exist_ok=True makes re-running this cell harmless.
os.makedirs("./report", exist_ok=True)

In [61]:
# Read the whole .tex file into `content`; every later cleaning step starts from it.
# NOTE(review): no explicit encoding is passed, so the platform default is used —
# confirm the file's encoding (the reports below are written as UTF-8).
tex_path = "../localvariables/Du.tex"
with open(tex_path) as f:
    content = f.read()

In [62]:
def extract_newcommands(text):
    r"""Collect ``\newcommand{name}{value}`` definitions found in *text*.

    Only the exact form ``\newcommand{`` immediately followed by a braced
    name and then, with nothing in between, a braced value is recorded.
    Nested braces inside either group are balanced. A name group that is
    not directly followed by ``{`` (for example an argument count such as
    ``[1]``) is skipped without recording anything.

    Args:
        text: LaTeX source to scan.

    Returns:
        dict mapping the text between the name braces (e.g. ``\foo``) to the
        text between the value braces. Later definitions overwrite earlier
        ones with the same name.
    """
    marker = '\\newcommand{'
    size = len(text)

    def end_of_group(pos):
        """Return the index just past the brace group whose content starts at *pos*."""
        depth = 1
        while pos < size and depth > 0:
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            pos += 1
        return pos

    definitions = {}
    cursor = 0
    while cursor < size:
        if not text.startswith(marker, cursor):
            cursor += 1
            continue

        name_start = cursor + len(marker)
        name_end = end_of_group(name_start)

        if name_end < size and text[name_end] == '{':
            value_end = end_of_group(name_end + 1)
            # Both slices stop one short of the group end to drop the closing '}'.
            definitions[text[name_start:name_end - 1]] = text[name_end + 1:value_end - 1]
            cursor = value_end
        else:
            cursor = name_end

    return definitions

def extract_with_brace(text, brace):
    r"""Return the body of every ``\begin{brace}...\end{brace}`` environment.

    Args:
        text: LaTeX source to search.
        brace: Environment name, inserted into the regular expression as-is.
            It is treated as a regex fragment, so metacharacters must already
            be escaped by the caller (e.g. ``table\*`` for ``table*``).

    Returns:
        List of the texts between each ``\begin`` / ``\end`` pair, in order
        of appearance. Matching is non-greedy and spans newlines.
    """
    opening = r'\\begin\{' + brace + r'\}'
    closing = r'\\end\{' + brace + r'\}'
    environment = re.compile(opening + r'(.*?)' + closing, re.DOTALL)
    return environment.findall(text)

def extract_with_head(text, head='section'):
    r"""Split *text* into ``{title: body}`` pairs at each ``\head{title}``.

    Args:
        text: LaTeX source to split.
        head: Heading command name without the backslash; it is regex-escaped.

    Returns:
        dict mapping each heading title to the stripped text that follows it,
        up to the next heading of the same kind or the end of the text. When
        no heading is found, the original *text* is returned unchanged
        (a ``str``, not a dict).
    """
    marker = r'\\' + re.escape(head) + r'\{'
    splitter = re.compile(marker + r'(.*?)\}\s*(.*?)(?=' + marker + r'|$)', re.DOTALL)
    # Strip surrounding whitespace from each body.
    found = {title: body.strip() for title, body in splitter.findall(text)}
    return found or text

def extract_command_firt_parameter(text, command):
    r"""Return the first braced argument of the first ``\command{...}`` in *text*.

    The argument is matched non-greedily up to the first ``}`` on the same
    line, so nested braces are not balanced.

    Args:
        text: LaTeX source to search.
        command: Command name without the backslash; it is regex-escaped.

    Returns:
        The text between the braces of the first occurrence.

    Raises:
        IndexError: If *text* contains no ``\command{...}``.
    """
    first_hit = re.search(rf'\\{re.escape(command)}{{(.*?)}}', text)
    if first_hit is None:
        # Same exception type and message as indexing an empty result list.
        raise IndexError('list index out of range')
    return first_hit.group(1)

def extract_tables(text):
    r"""Collect every ``table`` and ``table*`` environment, keyed by its label.

    Args:
        text: LaTeX source to search.

    Returns:
        dict mapping the first ``\label{...}`` found inside each table to the
        table body. ``table`` environments are processed before ``table*``
        ones; a repeated label overwrites the earlier entry. Tables without
        any ``\label`` are skipped, because they cannot be looked up by label.
    """
    tables = extract_with_brace(text, "table")
    # Raw string: the backslash must reach the regex so '*' is matched
    # literally, and a raw literal avoids the invalid-escape warning.
    tables.extend(extract_with_brace(text, r"table\*"))

    table_dict = {}
    for table_body in tables:
        try:
            label = extract_command_firt_parameter(table_body, 'label')
        except IndexError:
            # No \label in this table: skip it rather than abort the run.
            continue
        table_dict[label] = table_body
    return table_dict

In [63]:
def remove_annotation(str):
    r"""Strip LaTeX ``%`` comments from *str*.

    Everything from an unescaped ``%`` to the end of that line is removed;
    the newline itself is kept. A ``%`` preceded by an odd number of
    backslashes (``\%``) is a literal percent sign and is left alone, while
    one preceded by an even number (e.g. the line break ``\\`` followed by
    ``%``) still starts a comment.

    Args:
        str: LaTeX source. The parameter name shadows the builtin but is kept
            so keyword callers keep working.

    Returns:
        The text with comments removed.
    """
    # Group 1 holds any run of paired backslashes directly before the '%';
    # they are not part of the comment and are written back.
    return re.sub(r'(?<!\\)((?:\\\\)*)%.*', r'\1', str)


def remove_newcommands(text):
    r"""Delete ``\newcommand{name}{value}`` definitions from *text*.

    After the keyword, whitespace is skipped; a braced name group and, if it
    follows immediately, a braced value group are then dropped (nested braces
    are balanced). A keyword preceded by a backslash is left untouched. If the
    keyword is not followed by ``{``, only the keyword and the whitespace
    after it are dropped and the rest of the text is kept.

    Args:
        text: LaTeX source to clean.

    Returns:
        The text without the matched definitions.
    """
    keyword = '\\newcommand'
    width = len(keyword)
    size = len(text)

    def skip_group(pos):
        """Return the index just past the brace group whose content starts at *pos*."""
        depth = 1
        while pos < size and depth:
            ch = text[pos]
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
            pos += 1
        return pos

    kept = []
    cursor = 0
    while cursor < size:
        starts_definition = (
            cursor + width < size
            and text.startswith(keyword, cursor)
            and (cursor == 0 or text[cursor - 1] != '\\')
        )
        if not starts_definition:
            kept.append(text[cursor])
            cursor += 1
            continue

        cursor += width
        while cursor < size and text[cursor].isspace():
            cursor += 1

        if cursor >= size or text[cursor] != '{':
            # No braced name follows: resume normal copying from here.
            continue

        cursor = skip_group(cursor + 1)          # the {name} group
        if cursor < size and text[cursor] == '{':
            cursor = skip_group(cursor + 1)      # the {value} group

    return ''.join(kept)

def remove_command_with_name(text, name):
    r"""Remove every ``\name{...}`` call, including its braced argument.

    The argument is delimited by balanced braces. If the braces of an
    occurrence never close, that occurrence and everything after it are kept
    verbatim.

    Args:
        text: LaTeX source to clean.
        name: Command name without the backslash (matched literally, and only
            when directly followed by ``{``).

    Returns:
        The text with the matched commands removed.
    """
    opener = '\\' + name + '{'
    size = len(text)
    pieces = []
    cursor = 0

    while cursor < size:
        hit = text.find(opener, cursor)
        if hit < 0:
            pieces.append(text[cursor:])
            break

        pieces.append(text[cursor:hit])

        # Walk to the brace that closes the argument opened by `opener`.
        depth = 1
        cursor = hit + len(opener)
        while depth > 0 and cursor < size:
            ch = text[cursor]
            depth += (ch == '{') - (ch == '}')
            cursor += 1

        if depth:
            # Unterminated argument: put the raw text back.
            pieces.append(text[hit:cursor])

    return ''.join(pieces)


def remove_single_line_command(text):
    r"""Remove argument-less commands that sit at the end of a line.

    A backslash followed by one or more ASCII letters and then a newline
    (e.g. ``\centering``) is replaced by just the newline. The command needs
    at least one letter and must not be preceded by another backslash, so a
    ``\\`` line break at the end of a line stays intact. Without those two
    conditions the second backslash of ``\\`` would be removed.

    Args:
        text: LaTeX source to clean.

    Returns:
        The text with the matched commands removed.
    """
    pattern = r'(?<!\\)\\[a-zA-Z]+\n'
    return re.sub(pattern, '\n', text)

def remove_redundant_newlines(text):
    """Collapse runs of three or more newlines into one blank line.

    Args:
        text: Text to normalise.

    Returns:
        The text with each such run replaced by exactly two newlines, and
        with leading and trailing whitespace stripped.
    """
    collapsed = re.sub(r'\n{3,}', '\n\n', text)
    return collapsed.strip()


def remove_with_brace(text, brace):
    r"""Delete every ``\begin{brace}...\end{brace}`` environment from *text*.

    Args:
        text: LaTeX source to clean.
        brace: Environment name, inserted into the regular expression as-is
            (regex metacharacters must already be escaped by the caller).

    Returns:
        The text with each matched environment, delimiters included, removed.
        Matching is non-greedy and spans newlines.
    """
    opening = r'\\begin\{' + brace + r'\}'
    closing = r'\\end\{' + brace + r'\}'
    environment = re.compile(opening + r'.*?' + closing, flags=re.DOTALL)
    return environment.sub('', text)



In [64]:
def translate_newcommands(text, newcommands):
    r"""Expand macro usages written as ``name{}`` into their definitions.

    Only occurrences of a macro name immediately followed by an empty brace
    pair are replaced; a bare ``\name`` is left alone. Replacements are
    applied one macro at a time, in the mapping's iteration order.

    Args:
        text: LaTeX source in which to expand macros.
        newcommands: Mapping of macro name (e.g. ``\foo``) to replacement text.

    Returns:
        The text with the matching usages replaced.
    """
    expanded = text
    for macro in newcommands:
        expanded = expanded.replace(f"{macro}{{}}", newcommands[macro])
    return expanded

def process_AAAI_content(text, abstract):
    r"""Split cleaned LaTeX into paragraphs and index them by (sub)section.

    *text* is split on blank lines. A segment containing ``\section{...}``
    or ``\subsection{...}`` only updates the current position and is not
    stored as a paragraph. Any other non-empty segment that does not start
    with a backslash becomes a paragraph. The abstract is always paragraph 0
    and belongs to the pseudo-section ``"abstract"``.

    Args:
        text: Document body, with paragraphs separated by blank lines.
        abstract: Abstract text, stored as paragraph 0.

    Returns:
        dict with the keys ``paragraphs``, ``sections``, ``subsections``,
        ``para_to_section``, ``section_to_para``, ``para_to_subsection``,
        ``subsection_to_para``, ``section_to_subsection`` and
        ``subsection_to_section``. Paragraphs are identified by their index
        in ``paragraphs``.
    """
    section_re = re.compile(r'\\section\{(.*?)\}')
    subsection_re = re.compile(r'\\subsection\{(.*?)\}')

    paragraphs = [abstract]
    sections, subsections = [], []
    para_to_section = {0: "abstract"}
    section_to_para = {"abstract": [0]}
    para_to_subsection, subsection_to_para = {}, {}
    section_to_subsection, subsection_to_section = {}, {}

    active_section = "abstract"
    active_subsection = None

    for chunk in text.split('\n\n'):
        heading = section_re.search(chunk)
        if heading is not None:
            # A new section also ends the current subsection.
            active_section = heading.group(1)
            sections.append(active_section)
            active_subsection = None
            continue

        heading = subsection_re.search(chunk)
        if heading is not None:
            active_subsection = heading.group(1)
            subsections.append(active_subsection)
            section_to_subsection.setdefault(active_section, []).append(active_subsection)
            subsection_to_section[active_subsection] = active_section
            continue

        stripped = chunk.strip()
        if not stripped or stripped.startswith('\\'):
            # Blank segments and segments that open with a command are not paragraphs.
            continue

        index = len(paragraphs)
        paragraphs.append(chunk)

        # Record the paragraph under the current section.
        section_to_para.setdefault(active_section, []).append(index)
        para_to_section[index] = active_section

        # Record it under the current subsection, if there is one.
        if active_subsection:
            subsection_to_para.setdefault(active_subsection, []).append(index)
            para_to_subsection[index] = active_subsection

    # Assemble the result dictionary.
    return {
        "paragraphs": paragraphs,
        "sections": sections,
        "subsections": subsections,
        "para_to_section": para_to_section,
        "section_to_para": section_to_para,
        "para_to_subsection": para_to_subsection,
        "subsection_to_para": subsection_to_para,
        "section_to_subsection": section_to_subsection,
        "subsection_to_section": subsection_to_section,
    }

def inser_space_after_section(text):
    r"""Insert a blank line between a heading and text that directly follows it.

    Rewrites ``\section{title}\nX``, ``\subsection{title}\nX`` and any other
    ``\...section{title}\nX`` (X being an ASCII letter) to
    ``\...section{title}\n\nX``, so the heading and the text under it are
    separated by a blank line.

    Fixes relative to the previous version:
      * plain ``\section`` is now matched (the prefix before ``section`` may
        be empty; it used to require at least one letter);
      * a candidate whose title has unbalanced braces is left untouched. The
        regex then stopped at an inner ``}``, and the old attempt to extend
        the title duplicated text or braces.

    Args:
        text: LaTeX source to adjust.

    Returns:
        The adjusted text.
    """
    # Pattern: \xxxsection{yyy}\nz, where xxx may be empty.
    heading = re.compile(r'\\([a-zA-Z]*section)\{(.*?)\}\n([a-zA-Z])')

    def _braces_balanced(fragment):
        """Return True when every '{' in *fragment* has a matching later '}'."""
        depth = 0
        for ch in fragment:
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth < 0:
                    return False
        return depth == 0

    def replace_match(match):
        section_cmd, title, char_after = match.groups()
        if not _braces_balanced(title):
            # The matched '}' is not the heading's real closing brace.
            return match.group(0)
        return f'\\{section_cmd}{{{title}}}\n\n{char_after}'

    return heading.sub(replace_match, text)

In [65]:
# Cleaning pipeline: each step takes the previous step's output, so the order matters.
# Strip % comments first so commented-out LaTeX is ignored by every later step.
content_1 = remove_annotation(content)
# Tables keyed by their \label, taken from the comment-free source.
table_dict = extract_tables(content_1)
# Macro definitions are collected before they are removed from the text.
newcommands = extract_newcommands(content_1)
content_2 = remove_newcommands(content_1)
# Keep only the body of the first document environment
# ([0] raises IndexError when there is none).
content_3 = extract_with_brace(content_2, "document")[0].strip()
content_4 = content_3.replace("\\maketitle", "")
# Drop \bibliography{...} and \input{...} calls together with their arguments.
content_5 = remove_command_with_name(content_4, "bibliography")
content_6 = remove_command_with_name(content_5, 'input')
# Drop argument-less commands at line ends, then collapse the blank lines left behind.
content_7 = remove_single_line_command(content_6)
content_8 = remove_redundant_newlines(content_7)
# Body of the first abstract environment ([0] raises IndexError when there is none).
abstract = extract_with_brace(content_8, 'abstract')[0].strip()
content_9 = remove_command_with_name(content_8, 'cite')
# Expand usages of the collected macros that are written as name{}.
content_10 = translate_newcommands(content_9, newcommands)
# Remove the newline directly before \begin{equation}
# (presumably so an equation stays attached to the preceding text — confirm intent).
content_11 = content_10.replace("\n\\begin{equation}", "\\begin{equation}")
# The abstract was captured above, so remove its environment from the body.
content_12 = remove_with_brace(content_11, "abstract").strip()
# Put a blank line after headings that are directly followed by text.
content_13 = inser_space_after_section(content_12)

In [66]:
# Split the cleaned body into paragraphs (the abstract is paragraph 0) and
# build the paragraph/section/subsection lookup tables.
para_tree = process_AAAI_content(content_13, abstract)

In [67]:
# Load local settings (including the API key) from a JSON file. The context
# manager closes the file handle, and JSON text is decoded explicitly as UTF-8.
with open("../localvariables/localvariables.json", encoding="utf-8") as config_file:
    localvariables = json.load(config_file)
api_key = localvariables['deepseek_api']
base_url = "https://api.deepseek.com"

# Client object passed to every check_* function below.
client = OpenAI(api_key=api_key, base_url=base_url)


def chat_with_llm(client, system_prompt, user_prompt):
    """Send one system + user message pair to the chat model and return the reply.

    Args:
        client: Object exposing ``chat.completions.create`` (the module-level
            ``OpenAI`` client in this file).
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The ``message.content`` of the first choice in the non-streaming response.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        stream=False,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [68]:
# chat_with_llm(client, "You are a helpful assistant", "Hello world")

In [80]:
def check_deductive_structure(client, paragraphs):
    """Ask the LLM about each paragraph's structure and write a Markdown report.

    Every paragraph is sent in its own request, with a prompt asking whether
    it follows a general-to-specific structure (with or without a closing
    summary). Results are written to ``./report/deductive_structure.md``,
    overwriting any existing file.

    Args:
        client: Chat client forwarded to ``chat_with_llm``.
        paragraphs: Sequence of paragraph strings to analyse.

    Returns:
        None.
    """
    system_prompt = "你是一个文本结构分析专家，需要判断段落是否符合总分或总分总结构。请用中文回答。"
    user_prompt_template = "请分析以下段落是否符合总分或总分总结构：\n\n{}"

    analyses = []
    for paragraph in tqdm(paragraphs, total=len(paragraphs)):
        reply = chat_with_llm(client, system_prompt, user_prompt_template.format(paragraph))
        analyses.append((paragraph, reply))

    with open("./report/deductive_structure.md", 'w', encoding='utf-8') as report:
        for paragraph, reply in analyses:
            # One fenced copy of the paragraph followed by the model's answer.
            report.write(
                "---\n"
                "```\n"
                f"{paragraph}\n"
                "```\n\n"
                "LLM输出如下：\n"
                f"{reply}\n"
                "---\n\n"
            )

def check_grammar(client, paragraphs):
    """Run an LLM grammar check on each paragraph and write a Markdown report.

    Each non-blank paragraph is sent in its own request; blank paragraphs are
    recorded with a fixed "skipped" note and no request is made. Results are
    written to ``./report/grammar.md``, overwriting any existing file.

    Fix: the progress bar is created with ``tqdm(...)``. The file imports the
    name via ``from tqdm import tqdm``, so the previous ``tqdm.tqdm(...)``
    raised AttributeError before any paragraph was processed.

    Args:
        client: Chat client forwarded to ``chat_with_llm``.
        paragraphs: Sequence of paragraph strings to check.

    Returns:
        None.
    """
    # System prompt: fixes the model's role and the required answer format.
    system_prompt = """你是一位专业的英语语法检查专家。你的任务是：
    1. 仔细分析用户提供的英文段落
    2. 识别并指出所有语法错误、拼写错误、标点错误和表达不自然的地方
    3. 对每个错误提供详细的解释和修正建议
    4. 最后给出一个修正后的完整段落版本
    5. 如果段落没有错误，请明确指出该段落语法正确
    
    请使用以下格式进行回复：
    - 总体评价: [简要评价段落的语法质量]
    - 错误分析: 
      * [错误位置]: [错误描述] → [修正建议]
      * (如果没有错误，写"未发现语法错误")
    - 修正后的段落: [提供完整的修正后段落]
    """
    
    # User prompt template; the paragraph is substituted for the placeholder.
    user_prompt_template = """请仔细检查以下英文段落的语法、拼写、标点和表达是否准确自然：
    
    {}
    
    请按照要求的格式提供详细的语法分析。"""
    
    results = []
    for para in tqdm(paragraphs, total=len(paragraphs)):
        # Skip blank paragraphs without calling the model.
        if not para.strip():
            results.append({
                'paragraph': para,
                'analysis': "段落为空，跳过检查"
            })
            continue
            
        user_prompt = user_prompt_template.format(para)
        para_result = chat_with_llm(client, system_prompt, user_prompt)
        results.append({
            'paragraph': para,
            'analysis': para_result
        })
    
    # Write one numbered report section per paragraph.
    with open("./report/grammar.md", 'w', encoding='utf-8') as f:
        f.write("# 语法检查报告\n\n")
        f.write("本文档包含对各个段落的语法检查结果。\n\n")
        
        for i, result in enumerate(results, 1):
            f.write(f"## 段落 {i}\n")
            f.write("---\n")
            f.write("### 原始段落:\n")
            f.write("```\n")
            f.write(f"{result['paragraph']}\n")
            f.write("```\n\n")
            f.write("### 语法分析:\n")
            f.write(f"{result['analysis']}\n")
            f.write("---\n\n")


def check_consistence_between_table_and_paragraph(
    client,
    paragraphs,
    table_dict,
    report_path="./report/table_paragraph_consistency.md",
):
    """Compare each table with the paragraphs that mention it, via the LLM.

    A paragraph is paired with a table when the table's label occurs in the
    paragraph text as a substring; a paragraph mentioning several labels is
    checked once per table. Each pair is sent in its own request and the
    answers are written to a Markdown report.

    Changes relative to the previous version:
      * an earlier definition of this function in the same cell, with empty
        prompts, was immediately shadowed by this one and has been removed;
      * the report is no longer written to ``./report/deductive_structure.md``,
        where it overwrote the output of ``check_deductive_structure``. It now
        has its own file, configurable through *report_path*.

    Args:
        client: Chat client forwarded to ``chat_with_llm``.
        paragraphs: Sequence of paragraph strings.
        table_dict: Mapping of table label to table source text.
        report_path: Destination Markdown file; its parent directory is
            created if missing. Any existing file is overwritten.

    Returns:
        None.
    """
    # System prompt.
    system_prompt = """你是一个数据一致性检测专家。你的任务是比较表格数据和段落描述，找出所有不一致的地方。
    请仔细分析表格中的数据和段落中的描述，指出任何数字、事实或细节上的差异。
    对于每个不一致之处，请明确指出：1) 表格中的值是什么 2) 段落中的描述是什么 3) 为什么不一致
    如果完全一致，请明确指出"表格数据与段落描述完全一致"。
    请保持客观、准确，只基于提供的数据进行分析。"""
    
    # User prompt template: first placeholder is the table, second the paragraph.
    user_prompt_template = """请分析以下表格数据与段落描述之间的一致性：
    
    表格数据：
    {}
    
    段落描述：
    {}
    
    请指出任何不一致的地方，包括但不限于数字、统计数据等方面的差异。
    如果存在不一致，请具体说明哪些部分不一致，并指出表格和段落中的相应值。
    如果完全一致，请明确说明。"""
    
    # Pair every paragraph with each table whose label it contains.
    check_paragraphs = [
        (table_source, para)
        for para in paragraphs
        for label, table_source in table_dict.items()
        if label in para
    ]

    results = []
    for table, para in tqdm(check_paragraphs, total=len(check_paragraphs), desc="检查一致性"):
        user_prompt = user_prompt_template.format(table, para)
        para_result = chat_with_llm(client, system_prompt, user_prompt)
        results.append({
            'table_data': table,
            'paragraph': para,
            'analysis': para_result
        })
    
    # Save the results as Markdown.
    report_dir = os.path.dirname(report_path)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write("# 表格与段落一致性检查报告\n\n")
        
        for i, result in enumerate(results, 1):
            f.write(f"## 检查项 {i}\n\n")
            f.write("### 表格数据\n```json\n")
            f.write(f"{result['table_data']}\n")
            f.write("```\n\n")
            f.write("### 段落描述\n```\n")
            f.write(f"{result['paragraph']}\n")
            f.write("```\n\n")
            f.write("### 一致性分析\n")
            f.write(f"{result['analysis']}\n\n")
            f.write("---\n\n")

In [70]:
# check_deductive_structure(client, para_tree['paragraphs'])

In [81]:
# Run the table/paragraph consistency check over all parsed paragraphs;
# this makes one LLM request per (table, paragraph) pair.
check_consistence_between_table_and_paragraph(client, para_tree['paragraphs'], table_dict)

检查一致性: 100%|██████████| 5/5 [01:11<00:00, 14.28s/it]
