#### 获取paddle所有repo的commits

方法1：使用git log获取，然后转换为json

In [None]:
%%bash
#!/bin/bash
# Clone every repository listed in ./data/paddle_repo.json into ./repos and
# export three views of its history next to the clone:
#   ./repos/<owner>_<repo>_commits.json  one JSON object per commit, each line
#                                        ending in a comma except the last
#   ./repos/<owner>_<repo>_commits1.log  git log --name-status output
#   ./repos/<owner>_<repo>_commits2.log  git log --numstat output
#
# NOTE(review): author/committer names are interpolated into the JSON line
# without escaping, so a name containing '"' or '\' yields an invalid line.

# Directory that receives the clones and the exported logs.
DEST_DIR="./repos"
mkdir -p -- "$DEST_DIR"

# jq emits one compact JSON object per repo. `read -r` keeps backslashes in
# that JSON intact, and IFS= preserves leading/trailing whitespace.
jq -c '.[]' ./data/paddle_repo.json | while IFS= read -r repo_info; do
    # A here-string avoids the word-splitting and glob expansion that an
    # unquoted `echo $repo_info` performs on the repo description text.
    FULL_NAME=$(jq -r '.full_name' <<<"$repo_info")
    REPO_URL="https://github.com/$FULL_NAME.git"
    # owner/repo -> owner_repo, used for the clone directory and file names.
    FILE_NAME=${FULL_NAME//\//_}
    CLONE_PATH="$DEST_DIR/$FILE_NAME"

    # Clone unless a previous run already left a clone here. stdin is detached
    # so git cannot consume the repo list feeding this loop.
    if [[ ! -d "$CLONE_PATH/.git" ]]; then
        if ! git clone "$REPO_URL" "$CLONE_PATH" </dev/null; then
            printf 'warning: failed to clone %s, skipping\n' "$REPO_URL" >&2
            continue
        fi
    fi

    # `git -C` runs inside the clone without changing this shell's directory,
    # so a failure can never leave the loop in the wrong working directory.

    # Basic commit info as JSON lines; sed drops the comma on the last line.
    git -C "$CLONE_PATH" log \
        --pretty=format:'{"repo": "'"$FULL_NAME"'", "sha": "%H", "created_at": "%ad", "author": "%an", "committer": "%cn", "message": "%f"},' \
        --date=iso \
        | sed '$ s/,$//' > "$DEST_DIR/${FILE_NAME}_commits.json"

    # git does not expand "\n" in a format string (its newline placeholder is
    # %n), so each marker is the single literal line
    # "STARTOFTHECOMMIT\nsha: <sha>". The Python parsers in this notebook split
    # that line on ": ", so the format is kept exactly as it is.
    git -C "$CLONE_PATH" log --name-status \
        --pretty=format:'STARTOFTHECOMMIT\nsha: %H' \
        > "$DEST_DIR/${FILE_NAME}_commits1.log"
    git -C "$CLONE_PATH" log --numstat \
        --pretty=format:'STARTOFTHECOMMIT\nsha: %H' \
        > "$DEST_DIR/${FILE_NAME}_commits2.log"
done

In [None]:
import json

def load_commit_objects(file_path):
    """
    Load the basic commit records exported by ``git log`` as Python dicts.

    Every non-blank line of ``file_path`` is parsed as one JSON object after
    surrounding whitespace and any trailing commas have been removed.

    Returns:
        list: the decoded objects, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Strip first so blank lines can be filtered out before parsing.
        stripped_lines = (raw.strip() for raw in handle)
        return [json.loads(text.rstrip(',')) for text in stripped_lines if text]

def extract_new_filename(change_str):
    """
    Return the new path from a git rename description.

    Handles the forms git prints for renamed files:
      * ``demo/components/{pir_translate.py => pir_program_test.py}``
        -> ``demo/components/pir_program_test.py``
      * ``src/{old => new}/file.py`` -> ``src/new/file.py``
        (the part after the closing brace is kept)
      * ``src/{old => }/file.py`` -> ``src/file.py``
        (an empty new side does not leave a doubled slash)
      * ``old.py => new.py`` -> ``new.py`` (no braces)

    A string without ``=>`` is returned unchanged apart from stripping
    surrounding whitespace.
    """
    change_str = change_str.strip()
    if '=>' not in change_str:
        return change_str

    open_idx = change_str.find('{')
    close_idx = change_str.find('}', open_idx + 1) if open_idx != -1 else -1

    # Brace-less rename: everything after the arrow is the new path.
    if close_idx == -1 or '=>' not in change_str[open_idx + 1:close_idx]:
        return change_str.split('=>', 1)[1].strip()

    prefix = change_str[:open_idx]
    suffix = change_str[close_idx + 1:]
    new_part = change_str[open_idx + 1:close_idx].split('=>', 1)[1].strip()

    # When the new side is empty the braces sat between two separators (or at
    # the start of the path); drop one so the result has no stray '/'.
    if not new_part and suffix.startswith('/') and (not prefix or prefix.endswith('/')):
        suffix = suffix[1:]

    return f"{prefix}{new_part}{suffix}"

def parse_commit_logs1(file_path):
    """
    Parse the output of ``git log --name-status`` into per-commit file lists.

    The log is expected to contain one marker line per commit that starts with
    ``STARTOFTHECOMMIT`` and carries the sha after ``": "``, followed by one
    line per changed file.

    File lines are split on tabs, which is how git separates the status from
    the path(s), so paths containing spaces stay intact. A line without any
    tab falls back to whitespace splitting.

    Status mapping:
      * two fields: ``A`` -> ``added``, ``M`` -> ``modified``, anything else
        -> ``removed``
      * three fields (status, old path, new path): recorded under the new
        path as ``modified``
    Lines with any other field count are ignored, as are lines that appear
    before the first commit marker.

    Returns:
        dict: ``{sha: {'files': [{'filename': str, 'status': str}, ...]}}``
    """
    status_names = {'A': 'added', 'M': 'modified'}
    commit_data = {}
    current_sha = None
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith("STARTOFTHECOMMIT"):
                current_sha = line.split(": ", 1)[1]
                commit_data[current_sha] = {'files': []}
                continue
            if current_sha is None:
                # Nothing to attach this line to yet.
                continue

            # Tab is the real field separator; splitting on any whitespace
            # would break paths that contain spaces.
            parts = line.split('\t') if '\t' in line else line.split()
            if len(parts) == 2:
                status, filename = parts
                commit_data[current_sha]['files'].append({
                    'filename': filename,
                    'status': status_names.get(status, 'removed'),
                })
            elif len(parts) == 3:
                # Rename/copy style line: keep the new path.
                commit_data[current_sha]['files'].append({
                    'filename': parts[2],
                    'status': 'modified',
                })
    return commit_data

def parse_commit_logs2(file_path):
    """
    Parse the output of ``git log --numstat`` into per-commit change counts.

    A line starting with ``STARTOFTHECOMMIT`` opens a new commit; its sha is
    the text after ``": "``. Every other non-blank line contributes one entry
    built from its first two whitespace-separated fields. A field that is not
    purely digits (for example ``-``) counts as 0. The file name on the line
    is not recorded.

    Returns:
        dict: ``{sha: {'files': [{'additions': int, 'deletions': int,
        'changes': int}, ...]}}``
    """
    def _to_count(token):
        # Non-numeric placeholders are treated as zero.
        return int(token) if token.isdigit() else 0

    per_commit = {}
    sha = None
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            if text.startswith("STARTOFTHECOMMIT"):
                sha = text.split(": ")[1]
                per_commit[sha] = {'files': []}
                continue

            fields = text.split()
            added = _to_count(fields[0])
            removed = _to_count(fields[1])
            per_commit[sha]['files'].append({
                'additions': added,
                'deletions': removed,
                'changes': added + removed,
            })
    return per_commit


import os

# Merge the three git-log exports of every repository into one JSON file per
# repository under data/paddle_commits/.
with open("data/tmp.json", "r", encoding="utf-8") as f:
    repos = json.load(f)

# Create the output directory up front so the first run does not fail on open().
os.makedirs("data/paddle_commits", exist_ok=True)

for repo in repos:
    repo_owner_repo = repo['full_name']
    print(f"Processing repository: {repo_owner_repo}")

    commits_file = f"repos/{repo_owner_repo.replace('/', '_')}_commits.json"
    commits1_file = f"repos/{repo_owner_repo.replace('/', '_')}_commits1.log"
    commits2_file = f"repos/{repo_owner_repo.replace('/', '_')}_commits2.log"
    commits = load_commit_objects(commits_file)
    commit_details_1 = parse_commit_logs1(commits1_file)
    commit_details_2 = parse_commit_logs2(commits2_file)

    # Merge the per-file details into each commit record.
    final_commit_details = []
    for commit in commits:
        sha = commit['sha']
        files_1 = commit_details_1.get(sha, {}).get('files', [])
        files_2 = commit_details_2.get(sha, {}).get('files', [])

        # The two logs are matched by position: the i-th name-status entry
        # receives the i-th numstat counts. Entries without a counterpart are
        # left as they are.
        for file_1, file_2 in zip(files_1, files_2):
            file_1.update(file_2)

        commit['files'] = files_1
        final_commit_details.append(commit)

    repo_owner, repo_name = repo['full_name'].split('/')
    output_filename = f"data/paddle_commits/{repo_owner}_{repo_name}_commits.json"
    with open(output_filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII names readable, matching the
        # GitHub API export written elsewhere in this notebook.
        json.dump(final_commit_details, f, indent=4, ensure_ascii=False)

    # print(f"Merged data for {repo_owner_repo} into {output_filename}")

Processing repository: PaddlePaddle/Paddle-Lite
Processing repository: PaddlePaddle/PaddleSpeech
Processing repository: PaddlePaddle/VisualDL
Processing repository: PaddlePaddle/continuous_evaluation
Processing repository: PaddlePaddle/Paddle2ONNX
Processing repository: PaddlePaddle/paddle-ce-latest-kpis
Processing repository: PaddlePaddle/PARL
Processing repository: PaddlePaddle/Anakin
Processing repository: PaddlePaddle/docs
Processing repository: PaddlePaddle/tape
Processing repository: PaddlePaddle/PaddleFleetX
Processing repository: PaddlePaddle/PaddleFormers
Processing repository: PaddlePaddle/X2Paddle
Processing repository: PaddlePaddle/AutoDL
Processing repository: PaddlePaddle/benchmark
Processing repository: PaddlePaddle/ERNIE
Processing repository: PaddlePaddle/Serving
Processing repository: PaddlePaddle/any
Processing repository: PaddlePaddle/PGL
Processing repository: PaddlePaddle/MetaGym
Processing repository: PaddlePaddle/Paddle-Lite-Demo
Processing repository: PaddlePad

方法2：使用github api获取，相对更慢

In [None]:
from utils.request_github import request_github
from tqdm import tqdm
import logging
import json
from github import Github
# import requests

def get_repo_commits(gh, repo):
    """
    Collect every commit of one repository through the GitHub API.

    Args:
        gh: the GitHub client used to look up the repository.
        repo: mapping with a ``full_name`` key (``owner/name``).

    Returns:
        list: one dict per commit with ``repo``, ``sha``, ``message``,
        ``created_at``, ``author``, ``committer`` and ``files``. A commit
        whose processing raised is recorded as ``{'repo', 'sha', 'error'}``
        instead.

    NOTE(review): ``request_github`` is a project helper not visible here;
    presumably it wraps the API call with retry/rate-limit handling - confirm.
    """
    full_name = repo['full_name']
    commits = request_github(
        gh, lambda r: gh.get_repo(r).get_commits(),
        (full_name, )
    )
    logger.info(f"Fetching commits for repository: {repo['full_name']}, total: {commits.totalCount}")

    commit_list = []
    for commit in tqdm(commits):
        try:
            entry = dict(
                repo=full_name,
                sha=commit.sha,
                message=commit.commit.message,
                created_at=commit.commit.author.date.isoformat(),
                author=commit.author.login if commit.author else None,
                committer=commit.committer.login if commit.committer else None,
            )
            changed = commit.files
            # A falsy file collection yields an empty list rather than being iterated.
            entry['files'] = [
                dict(
                    filename=changed_file.filename,
                    status=changed_file.status,
                    additions=changed_file.additions,
                    deletions=changed_file.deletions,
                    changes=changed_file.changes,
                )
                for changed_file in changed
            ] if changed else []
            commit_list.append(entry)
        except Exception as e:
            # Keep a placeholder so the failed commit is still visible in the output.
            logger.error(f"Error processing commit {commit.sha} in repository {repo['full_name']}: {e}")
            commit_list.append(dict(repo=full_name, sha=commit.sha, error=str(e)))
    return commit_list


import os

logging.basicConfig(
    format="%(asctime)s (PID %(process)d) [%(levelname)s] %(filename)s:%(lineno)d %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Read the GitHub token from the GITHUB_TOKEN environment variable instead of
# pasting it into the notebook, where it could be committed by accident.
# Falls back to an empty string when the variable is not set.
token = os.environ.get("GITHUB_TOKEN", "")
gh = Github(token)

with open("data/paddle_repos.json", "r", encoding="utf-8") as f:
    repos = json.load(f)

# Create the output directory up front so the first run does not fail on open().
os.makedirs("data/paddle_commits", exist_ok=True)

for repo in repos:
    # Fetch the commit information of each repository and write it to its own file.
    commits = get_repo_commits(gh, repo)
    repo_owner, repo_name = repo['full_name'].split('/')
    with open(f"data/paddle_commits/{repo_owner}_{repo_name}_commits.json", "w", newline="", encoding="utf-8") as f:
        json.dump(commits, f, indent=4, ensure_ascii=False)