In [1]:
import pandas as pd
import yaml
import os

In [2]:
# Top-level keys of a pre-commit config that are counted for every file.
# Stored alphabetically so the generated `<field>_count` columns come out
# in a stable, predictable order.
top_level_fields = sorted([
    'repos',
    'default_install_hook_types',
    'default_language_version',
    'default_stages',
    'files',
    'exclude',
    'fail_fast',
    'minimum_pre_commit_version',
])

In [3]:

def create_pre_commit_row(pre_commit_yaml, fields=None):
    """Flatten one parsed pre-commit config into a flat summary row.

    Parameters
    ----------
    pre_commit_yaml : dict
        The parsed top-level mapping of a config file. It must contain a
        ``'repos'`` list; every entry needs a ``'repo'`` key and a
        ``'hooks'`` list, and every hook needs an ``'id'``.
    fields : iterable of str, optional
        Top-level keys to count. Defaults to the module-level
        ``top_level_fields`` list.

    Returns
    -------
    dict
        ``'<field>_count'`` for every requested field (list length, ``1``
        for any other present value, ``0`` when absent), followed by
        ``'local_hooks'``, ``'local_hooks_raw'``, ``'remote_repoes'``,
        ``'remote_hooks'``, ``'remote_hooks_raw'``, ``'is_valid'`` (always
        ``True``) and ``'raw_yaml'`` (the input object itself).

    Raises
    ------
    TypeError
        If ``pre_commit_yaml`` is not a dict, or a repo's ``'hooks'`` is
        not a list.
    KeyError
        If ``'repos'``, ``'repo'``, ``'hooks'`` or a hook ``'id'`` is missing.
    """
    if not isinstance(pre_commit_yaml, dict):
        raise TypeError(
            f'expected a mapping at the top level, got {type(pre_commit_yaml).__name__}'
        )
    if fields is None:
        fields = top_level_fields

    pre_commit_row = {}

    for field in fields:
        if field in pre_commit_yaml:
            value = pre_commit_yaml[field]
            # Scalars and mappings count as a single entry.
            pre_commit_row[f'{field}_count'] = len(value) if isinstance(value, list) else 1
        else:
            pre_commit_row[f'{field}_count'] = 0

    local_hooks = []
    remote_repoes = []
    remote_hooks = []
    for repo in pre_commit_yaml['repos']:
        hooks = repo['hooks']
        if not isinstance(hooks, list):
            # Keep rejecting malformed configs instead of silently iterating
            # over the keys of a mapping or the characters of a string.
            raise TypeError(f"'hooks' must be a list, got {type(hooks).__name__}")
        if repo['repo'] == 'local':
            # extend() avoids rebuilding the accumulator for every repo.
            local_hooks.extend(hooks)
        else:
            remote_repoes.append(repo['repo'])
            remote_hooks.extend(hooks)

    pre_commit_row['local_hooks'] = [hook['id'] for hook in local_hooks]
    pre_commit_row['local_hooks_raw'] = local_hooks

    pre_commit_row['remote_repoes'] = remote_repoes
    pre_commit_row['remote_hooks'] = [hook['id'] for hook in remote_hooks]
    pre_commit_row['remote_hooks_raw'] = remote_hooks
    pre_commit_row['is_valid'] = True
    pre_commit_row['raw_yaml'] = pre_commit_yaml
    return pre_commit_row
        


In [4]:
# Parse every collected config file into exactly one summary row.
pre_commit_rows = []
folders = ['pre-commit-hooks', 'pre-commit-hooks-300-700', 'pre-commit-hooks-800', 'found_hooks']

for folder in folders:
    for file_name in os.listdir(folder):
        path = os.path.join(folder, file_name)
        with open(path) as f:
            try:
                pre_commit_yaml = yaml.safe_load(f)
                if isinstance(pre_commit_yaml, dict):
                    row = create_pre_commit_row(pre_commit_yaml)
                else:
                    # A list, scalar or empty document is not a usable config.
                    row = {'is_valid': False}
            except Exception:
                # Covers YAML syntax errors as well as structurally broken
                # configs (missing keys, wrong types). `Exception` rather than
                # a bare `except` so Ctrl-C / kernel interrupts still stop
                # the loop.
                row = {'is_valid': False}
        # Single append per file: previously a list-shaped document was
        # appended twice, inflating the number of invalid rows.
        pre_commit_rows.append(row)

In [5]:
# One DataFrame row per collected dict; columns a row does not provide
# (e.g. rows that only carry 'is_valid') are left as missing values.
# NOTE(review): the name is spelled `pre_comit_df` (single "m"); kept as is
# because other cells may reference it.
pre_comit_df = pd.DataFrame(pre_commit_rows)

In [6]:
# Show the frame as the cell's output (long frames are rendered truncated).
pre_comit_df

Unnamed: 0,default_install_hook_types_count,default_language_version_count,default_stages_count,exclude_count,fail_fast_count,files_count,minimum_pre_commit_version_count,repos_count,local_hooks,local_hooks_raw,remote_repoes,remote_hooks,remote_hooks_raw,is_valid
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,[],[],[https://github.com/ambv/black],[black],"[{'id': 'black', 'language_version': 'python'}]",True
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,[],[],[https://github.com/ambv/black],[black],[{'id': 'black'}],True
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,[],[],[https://github.com/commitizen-tools/commitizen],[commitizen],"[{'id': 'commitizen', 'stages': ['commit-msg']}]",True
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,[],[],[https://github.com/pre-commit/pre-commit-hooks],[trailing-whitespace],[{'id': 'trailing-whitespace'}],True
4,,,,,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,"[pylint, pylint, check-newsfragments]","[{'id': 'pylint', 'name': 'pylint', 'entry': '...",[https://github.com/pre-commit/pre-commit-hook...,"[trailing-whitespace, end-of-file-fixer, ruff,...","[{'id': 'trailing-whitespace', 'exclude': 'tes...",True
7788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,"[copyright_checker, clang-format-with-version-...","[{'id': 'copyright_checker', 'name': 'copyrigh...",[https://github.com/PaddlePaddle/mirrors-yapf....,"[yapf, check-merge-conflict, check-symlinks, e...","[{'id': 'yapf', 'files': '\.py$'}, {'id': 'che...",True
7789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,[],[],[https://github.com/charliermarsh/ruff-pre-com...,"[ruff, black]","[{'id': 'ruff', 'args': ['--select=I001', '--f...",True
7790,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,[],[],"[https://github.com/PyCQA/flake8, https://gith...","[flake8, isort, yapf, trailing-whitespace, che...","[{'id': 'flake8'}, {'id': 'isort'}, {'id': 'ya...",True
