In [1]:
import os

def read_file_content(file_path):
    """
    读取文件内容并返回字符串，同时进行简单的内容清洗。
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()
        
    # 简单清洗: 去掉空行和注释行
    cleaned_content = []
    for line in content:
        stripped_line = line.strip('\n')
        if not stripped_line.startswith('#') and stripped_line:  # 忽略注释和空行
            cleaned_content.append(stripped_line)
    
    return "\n".join(cleaned_content)

def summarize_file_content(content):
    """
    根据文件内容生成一个简单的介绍描述。
    """
    lines = content.split("\n")
    if len(lines) == 0:
        return "The file is empty or contains only comments."
    
    intro = []
    if len(lines) > 0:
        intro.append(f"The file has {len(lines)} lines of code.")
        intro.append(f"First non-comment line: {lines[0][:50]}...")
    
    return " ".join(intro)

def process_folder(directory_path):
    """
    遍历指定文件夹中的所有文件，读取并清洗内容，并生成简单的介绍。
    """
    all_summaries = {}
    
    for root, _, files in os.walk(directory_path):
        for file_name in files:
            if file_name.endswith(('.py', '.txt', '.html', '.css')):  # 可根据需要扩展支持的文件类型
                file_path = os.path.join(root, file_name)
                
                # 读取文件并清洗内容
                content = read_file_content(file_path)
                
                # 生成简单介绍
                summary = summarize_file_content(content)
                
                # 保存介绍和完整内容
                all_summaries[file_path] = {
                    "content": content,
                    "summary": summary
                }
    
    return all_summaries


In [10]:
directory = "web"  # 替换成你希望扫描的文件夹路径
summaries = process_folder(directory)

print('This is the dir of my web project: \n\n')

# 打印结果
for file_name, details in summaries.items():
    print(f"--- {file_name} ---")
    print(f"Summary: {details['summary']}")
    print(f"Full Content:\n{details['content']}")
    print("\n" + "-" * 40 + "\n")

This is the dir of my web project: 


--- web\app.py ---
Summary: The file has 114 lines of code. First non-comment line: from flask import Flask, render_template, request,...
Full Content:
from flask import Flask, render_template, request, redirect, url_for, session
import pandas as pd
from utils import return_display_df
from model import generate_logical_operators, generate_physical_operators, generate_final_code
app = Flask(__name__)
app.secret_key = 'your_secret_key'  # 设置一个安全的密钥
@app.route('/', methods=['GET', 'POST'])
def upload_file():
    if request.method == 'POST':
        user_question = request.form.get('question') or request.form.get('question_datalake')
        
        if 'data_source' in request.form:  # Data Lake option
            data_lake_url = request.form.get('data_lake_url')
            top_k = int(request.form.get('top_k'))
            # Add your data lake retrieval logic here
            # For now, we'll just create a sample DataFrame
            uploaded_data 