In [25]:
import os
import shutil
from pprint import pprint 
import warnings

warnings.filterwarnings("ignore")

from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import Language
from langchain.schema.output_parser import StrOutputParser
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnableLambda, RunnablePassthrough
from operator import itemgetter

def process_repository(source_repo, new_repo):
    """Mirror ``source_repo`` into ``new_repo``, annotating code files on the way.

    Every file found by walking ``source_repo`` is reproduced at the same
    relative location under ``new_repo``. Files whose name ends in one of the
    code suffixes are passed through ``process`` and the returned text is
    written out; every other file is copied unchanged with ``shutil.copy``.

    Args:
        source_repo: Path of the directory tree to read.
        new_repo: Path of the directory tree to create or fill.
    """
    code_suffixes = ('.py', '.java', '.cpp', '.c')
    # A single memory object is shared by all files so the conversation
    # history accumulates across the whole run. It is created on first use,
    # so a tree without code files never needs it.
    memory = None

    for root, dirs, files in os.walk(source_repo):
        # Derive the destination directory from the path *relative* to the
        # source root. Plain string replacement would rewrite every occurrence
        # of the source path text and breaks when only one of the two roots
        # carries a trailing separator.
        rel_dir = os.path.relpath(root, source_repo)
        new_file_dir = new_repo if rel_dir == os.curdir else os.path.join(new_repo, rel_dir)

        for file in files:
            file_path = os.path.join(root, file)
            pprint(file_path)
            new_file_path = os.path.join(new_file_dir, file)

            if file_path.endswith(code_suffixes):
                if memory is None:
                    memory = ConversationBufferMemory(return_messages=True)
                processed_code = process(memory, root, file)
                pprint(f"new_file_dir is {new_file_dir}")
                os.makedirs(new_file_dir, exist_ok=True)
                # The annotated code contains non-ASCII comments, so the
                # encoding is fixed rather than left to the platform default.
                with open(new_file_path, 'w', encoding='utf-8') as new_file:
                    new_file.write(processed_code)
            else:
                # Non-code files are copied as they are.
                os.makedirs(new_file_dir, exist_ok=True)
                shutil.copy(file_path, new_file_path)

# Locations of the repository to read and of the annotated copy to produce.
source_repo_path, new_repo_path = './data/RiseGPT', './data/new_risegpt'

# Create the destination root up front when it is missing.
if not os.path.exists(new_repo_path):
    os.makedirs(new_repo_path)

# Walk the source repository and populate the destination.
process_repository(source_repo_path, new_repo_path)

pprint("仓库处理完成。原仓库文件已复制并处理到新仓库。")


'./data/RiseGPT/requirements.txt'
'./data/RiseGPT/utils.py'
the current file is utils.py
def load_pdf(directory):
    documents = []
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            print(f"Begin loading {filename}")
            filepath = os.path.join(directory, filename)
            loader = PyPDFLoader(filepath)
            document = loader.load()
            documents.extend(document)
            print(f"{filename} load successfully")
    return documents
def delete_space(path):
    import os

    folder_path = path

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.endswith('.txt'):
                file_path = os.path.join(root, file)

                with open(file_path, 'r') as f:
                    content = f.read()

                processed_content = content.replace(' ', '').replace('\n', '')

                with open(file_path, 'w') as f:
                    f.write(processed_cont

KeyboardInterrupt: 

In [24]:

def process(memory, root, file):
    """Load one source file, split it into chunks and annotate each chunk.

    The file is loaded through ``GenericLoader`` with a ``LanguageParser``,
    split with the splitter returned by ``get_splitter(file)``, and every
    chunk is sent to ``get_comment``. The results are joined with a blank
    line between chunks.

    Args:
        memory: Conversation memory object forwarded to ``get_comment``.
        root: Directory that contains the file.
        file: File name, used as the glob pattern under ``root``.

    Returns:
        The concatenated text of all processed chunks.
    """
    loader = GenericLoader.from_filesystem(
        root,
        glob=file,
        suffixes=[".py", ".cpp", ".java", ".c", ".js"],
        parser=LanguageParser(),
    )
    documents = loader.load()

    splitter = get_splitter(file)
    # get_splitter returns None when it has no splitter for the suffix; in
    # that case the loaded documents are used as chunks without splitting.
    texts = splitter.split_documents(documents) if splitter is not None else documents

    print(f"the current file is {file}")
    print("\n".join([text.page_content for text in texts]))

    annotated_chunks = []
    for text in texts:
        annotated = get_comment(memory, text.page_content)
        # get_comment may yield None when no code could be extracted from the
        # reply; keep the original chunk so no source code is lost and the
        # join below never receives a non-string.
        annotated_chunks.append(annotated if annotated is not None else text.page_content)
    return "\n\n".join(annotated_chunks)

In [15]:
def get_splitter(file):
    """Return a language-aware text splitter chosen by the file's suffix.

    Args:
        file: File name; only the text after the last ``.`` is inspected.

    Returns:
        A ``RecursiveCharacterTextSplitter`` built with ``from_language``
        (``chunk_size=2000``, ``chunk_overlap=0``) for ``py``, ``java``,
        ``cpp``, ``c`` and ``js`` files, or ``None`` for any other suffix.
    """
    language_by_suffix = {
        'py': Language.PYTHON,
        'java': Language.JAVA,
        'cpp': Language.CPP,
        # NOTE(review): C files reuse the C++ separators so that this does not
        # depend on a dedicated C entry in the Language enum - confirm this is
        # acceptable for the installed LangChain version.
        'c': Language.CPP,
        'js': Language.JS,
    }
    # Take the text after the LAST dot so names such as "a.b.py" resolve too;
    # a name without any dot yields the whole name, which is not a key.
    suffix = file.rpartition('.')[2]
    language = language_by_suffix.get(suffix)
    if language is None:
        return None
    # Only the splitter that is actually needed gets constructed.
    return RecursiveCharacterTextSplitter.from_language(
        language=language, chunk_size=2000, chunk_overlap=0
    )

In [20]:
def get_comment(memory, code_str):
    """Ask the chat model to annotate ``code_str`` and return the annotated code.

    The prompt is made of a system instruction, the message history loaded
    from ``memory`` and ``code_str`` as the human message. After the call the
    exchange is stored back into ``memory``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable. The
    endpoint can be overridden with ``OPENAI_API_BASE``; otherwise the
    endpoint previously configured here is used.

    Args:
        memory: Conversation memory providing ``load_memory_variables`` and
            ``save_context``.
        code_str: Source code to annotate.

    Returns:
        The code extracted from the model reply by ``extract_code``, or the
        stripped raw reply when no code could be extracted.

    Raises:
        RuntimeError: If ``OPENAI_API_KEY`` is not set.
    """
    code_template = """你现在是一个程序员，你的任务是给用户的代码进行详细中文注释。并且只输出注释后的代码。
    """
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", code_template),
            MessagesPlaceholder(variable_name="history"),
            ("human", "{input}")
        ]
    )

    # Credentials must never live in the notebook: they end up in version
    # control and in every shared copy of the file.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY is not set; export it before running this notebook."
        )
    llm = ChatOpenAI(
        openai_api_base=os.environ.get("OPENAI_API_BASE", "https://aiapi.xing-yun.cn/v1"),
        openai_api_key=api_key,
        model_name="gpt-3.5-turbo-1106",
        temperature=0
    )
    chain = (
        RunnablePassthrough.assign(
            history=RunnableLambda(memory.load_memory_variables) | itemgetter("history")
        )
        | prompt
        | llm
    )
    inputs = {"input": code_str}
    response = chain.invoke(inputs)
    memory.save_context(inputs, {"output": response.content})

    rt = extract_code(response.content)
    if rt is None:
        # The system prompt asks for code only, so a reply without extractable
        # code is returned as it is instead of handing None to the caller.
        return response.content.strip()
    return rt

In [19]:
import re

def extract_code(text):
    """Return the contents of the first fenced code block found in ``text``.

    The opening fence may carry any language tag (``python``, ``java``,
    ``cpp`` ...) or none at all. A ``python`` tag followed directly by code on
    the same line is still accepted.

    Args:
        text: Text that may contain a block delimited by triple backticks.

    Returns:
        The block contents with surrounding whitespace stripped, or ``None``
        when ``text`` is empty or contains no fenced block.
    """
    if not text:
        return None
    # Optional prefix after the opening fence: either a language tag that ends
    # its line, or the literal "python" with the code following on the same line.
    pattern = r"```(?:[\w+#-]+[ \t]*\n|python)?(.*?)```"
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None