From fd795ba3c0aee8b38b37a86a6b812e736c83c82c Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 16:32:48 -0400 Subject: [PATCH 1/7] initial changes --- .gitignore | 3 +- docs/analysis.md | 6 + docs/render_submissions.py | 334 +++++++++++++++++++++++++++++++++++++ mkdocs.yml | 1 + 4 files changed, 343 insertions(+), 1 deletion(-) create mode 100644 docs/analysis.md create mode 100644 docs/render_submissions.py diff --git a/.gitignore b/.gitignore index 0459882..e0582af 100644 --- a/.gitignore +++ b/.gitignore @@ -166,4 +166,5 @@ repos/ config.yml hydra_outputs/ .commit0* -.agent* \ No newline at end of file +.agent* +docs/analysis_*.md \ No newline at end of file diff --git a/docs/analysis.md b/docs/analysis.md new file mode 100644 index 0000000..bc2a8f9 --- /dev/null +++ b/docs/analysis.md @@ -0,0 +1,6 @@ + +| | Name | Summary | | +|--|--------|----------|--| +||[reference](/analysis_reference)|3628 / 33 ; duration: 18.66s|| +||[test-save-commit0](/analysis_test-save-commit0)|0 / 0 ; duration: 0.00s|| +||[model_name-claude-3-5-sonnet-20240620__run_tests-0__use_lint_info-0__use_spec_info-0](/analysis_model_name-claude-3-5-sonnet-20240620__run_tests-0__use_lint_info-0__use_spec_info-0)|0 / 0 ; duration: 0.00s|| \ No newline at end of file diff --git a/docs/render_submissions.py b/docs/render_submissions.py new file mode 100644 index 0000000..8ab9be2 --- /dev/null +++ b/docs/render_submissions.py @@ -0,0 +1,334 @@ +import re +import os +import glob +import ast +from datasets import load_dataset +import subprocess +import json +import shutil +import sys +import argparse +from transformers import AutoTokenizer +import commit0.harness.setup +from commit0.harness.constants import SPLIT, SPLIT_ALL +from commit0.harness.utils import clone_repo +from commit0.cli import write_commit0_dot_file +import pypdf +# from render_utils import _find_files_to_edit + +import logging + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) + +analysis_files_path = "/share/rush/commit0_analysis_temp" + +def get_pytest_info(path_to_logs, repo_name, branch_name): + pytest_info = {} + for pytest_hash in os.listdir(path_to_logs): + eval_script = open(os.path.join(path_to_logs, pytest_hash, "eval.sh")).read() + testname = re.search(r'([\S]+) > test_output', eval_script).group(1) + patch_diff = open(os.path.join(path_to_logs, pytest_hash, "patch.diff")).read() + pytest_info[testname] = {'hash': pytest_hash, 'patch_diff': patch_diff, 'summary': pytest_summary, 'failures': {}, 'duration': pytest_report['duration']} + report_file_path = os.path.join(path_to_logs, pytest_hash, "report.json") + if not os.path.exists(report_file_path): + reason_for_failure = open(os.path.join(path_to_logs, pytest_hash, "test_output.txt")).read() + pytest_info[testname]["failed_to_run"] = reason_for_failure + return pytest_info + pytest_report = json.load(report_file_path) + pytest_summary = pytest_report['summary'] + if 'passed' not in pytest_summary: pytest_summary['passed'] = 0 + for test in pytest_report["tests"]: + if test['outcome'] == "passed": continue + if 'longrepr' in test: + failure_string = test['longrepr'] + elif '???' 
in test: + failure_string = test['???']['longrepr'] + elif test['outcome'] == 'error': + failure_string = test['setup']['longrepr'] + elif 'setup' in test and 'longrepr' in test['setup']: + failure_string = test['setup']['longrepr'] + elif 'call' in test and 'longrepr' in test['call']: + failure_string = test['call']['longrepr'] + # could use test['call']['traceback'] information and test['call']['crash'] for more info + else: + import pdb; pdb.set_trace() + duration = 0. + for action_key in ["setup", "call", "teardown"]: + if action_key not in test: continue + if "duration" in test: duration += test["duration"] + pytest_info[testname]['failures'][test['nodeid']] = {"failure_string": failure_string, "duration": duration} + return pytest_info + +def get_coverage_info(path_to_logs, repo_name, branch_name): + # for filename, file_coverage in json.load(open(os.path.join(path_to_logs, pytest_hash, "coverage.json")))["files"].items(): + # if not any(relevant_function.startswith(filename) for relevant_function in relevant_functions): continue + # for funcname, func_coverage in file_coverage["functions"].items(): + # if f"{filename}::{funcname}" not in relevant_functions: continue + # pycov_info[testname][f"{filename}::{funcname}"] = { + # "implementation": submission_info["function_impls"][f"{filename}::{funcname}"], + # "executed_lines": func_coverage["executed_lines"], + # "executed_branches": func_coverage["executed_branches"] + # } + raise NotImplementedError + +def get_blank_repo_metrics(blank_source_code_folder, spec_filename, tokenizer, code_file_filter=lambda filename:filename): + blank_repo_metrics = { + "functions_to_edit": [], + } + + for subdir, _, files in os.walk(blank_source_code_folder): + for file in files: + if not code_file_filter(file): continue + filename = os.path.join(subdir, file) + splitted = filename.split('/') + hidden = False + for one in splitted: + if one.startswith('.'): + hidden = True + break + if hidden: + continue + try: + code = open(filename, encoding='utf-8').read() + except: + print(f"Trouble opening {filename}") + continue + + filename = filename[len(blank_source_code_folder):].lstrip(" /") + try: + code_tree = ast.parse(code) + except: + print(f"Trouble parsing {os.path.join(blank_source_code_folder, filename)}") + continue + for node in ast.walk(code_tree): + if isinstance(node, ast.ClassDef): + for child in node.body: + child.parent_class = node.name + elif isinstance(node, ast.FunctionDef) and len(node.body) > 0: + classname = "" + if hasattr(node, "parent_class"): + classname = f"{node.parent_class}." 
+ for child in node.body: + child.parent_function = f"{classname}{node.name}" + elif isinstance(node, ast.Pass): + if hasattr(node, "parent_function"): + blank_repo_metrics["functions_to_edit"].append(f"{filename}::{node.parent_function}") + elif hasattr(node, "parent_class"): + blank_repo_metrics["functions_to_edit"].append(f"{filename}::{node.parent_class}") + + # Get spec metrics + concatted_spec = "" + reader = pypdf.PdfReader(spec_filename) + for p_idx, page in enumerate(reader.pages): + try: + concatted_spec += page.extract_text() + except pypdf.errors.PdfReadError as e: + print(f"Could not load page {p_idx} of {spec_filename}, excluding") + blank_repo_metrics["no_tokens_in_spec"] = tokenizer(concatted_spec, return_tensors='pt').input_ids.shape[-1] + + return blank_repo_metrics + +def render_mds(subfolder="docs"): + all_submissions = {} + + method_repo_pytests = {} + for branch_name in glob.glob(os.path.join(analysis_files_path, '*')): + branch_name = os.path.basename(branch_name) + if branch_name in {"blank", "repos", "submission_repos"}: continue + all_submissions[branch_name] = {} + for repo_file in glob.glob(os.path.join(analysis_files_path, branch_name, '*.json')): + + repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, repo_file) + repo_metrics = json.load(open(repo_metrics_output_file)) + repo_name = os.path.basename(repo_file[:-len(".json")]) + + all_submissions[branch_name][repo_name] = {} + + method_repo_pytests[f"{branch_name}_{repo_name}"] = f"# Submission Name: {branch_name}\n# Repository: {repo_name}" + if 'pytest_results' in repo_metrics: repo_metrics = repo_metrics['pytest_results'] + for pytest_group, pytest_info in repo_metrics.items(): + pytest_group = os.path.basename(pytest_group.strip("/")) + patch_diff = f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```""" + if 'reason_for_failure' in pytest_info: + all_submissions[branch_name][repo_name][pytest_group] = {"reason_for_failure": pytest_info["failed_to_run"]} + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""\n## Failed to run pytests\n{ pytest_info['failed_to_run']}""" + else: + all_submissions[branch_name][repo_name][pytest_group] = {"summary": pytest_info['summary'], "duration": pytest_info["duration"]} + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""\n## Pytest Summary: {pytest_group} + | status | count | + |:---------|:-----:| + """ + for category, count in pytest_info['summary'].items(): + if category not in {'duration'}: + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""| {category} | {count} |\n""" + else: + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""| {category} | {float(count):.2f}s |\n""" + + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"\n## Failed pytest outputs: {pytest_group}\n\n" + for testname, failure in pytest_info['failures'].items(): + shortened_testname = os.path.basename(testname) + method_repo_pytests[f"{branch_name}_{repo_name}"] += f"### {shortened_testname}\n\n
<details><summary> <pre>{shortened_testname}</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>
\n" + + back_button = f"[back to {branch_name} summary]({os.path.join('/', f'analysis_{branch_name}')})\n\n" + with open(os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), 'w') as wf: + wf.write(back_button + method_repo_pytests[f"{branch_name}_{repo_name}"] + patch_diff) + + + # Render general page. Has buttons to all methods + leaderboard = """ +| | Name | Summary | | +|--|--------|----------|--|""" + # Render method page. Per method, buttons to all repos. + method_to_repos = {} + # Render method & repo page. Has "back" button. + for branch_name, branch_info in all_submissions.items(): + cum_pytests = {'passed': 0} + method_to_repos[branch_name] = """ +| | Repository | Summary | | +|-|------------|---------|-|""" + total_tests = 0 # better info is probably broken down by split lol TODO + total_duration = 0. + for repo_name, repo_test_info in branch_info.items(): + for testname, test_info in repo_test_info.items(): + total_duration += test_info['duration'] + if "reason_for_failure" in test_info: + summary_pytests_string = "failure" + else: + summary_pytests_string = f"{testname}: {test_info['summary']['passed']} / {test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" + for category, count in test_info["summary"].items(): + if category not in cum_pytests: + cum_pytests[category] = 0 + if isinstance(count, int): cum_pytests[category] += int(count) + elif isinstance(count, float): cum_pytests[category] += float(count) + total_tests += 1 + method_to_repos[branch_name] += f"\n||[{repo_name}]({os.path.join('/', f'analysis_{branch_name}_{repo_name}')})|{summary_pytests_string}||" + break # assume we ran all tests. will add functionality for checking diff tests later, as we need it. + summary_pytests_string = f"{cum_pytests['passed']} / {total_tests} ; duration: {total_duration:.2f}s" + leaderboard += f"\n||[{branch_name}]({os.path.join('/', f'analysis_{branch_name}')})|{summary_pytests_string}||" + with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), 'w') as wf: + wf.write( method_to_repos[branch_name]) + with open(os.path.join(subfolder, "analysis.md"), 'w') as wf: + wf.write(leaderboard) + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--do_setup", action="store_true") + parser.add_argument("--get_blank_details", action="store_true") + parser.add_argument("--get_reference_details", action="store_true") + parser.add_argument("--keep_previous_eval", action="store_true") + parser.add_argument("--analyze_submissions", action="store_true") + parser.add_argument("--render_webpages", action="store_true") + + parser.add_argument("--split", type=str, default='lite') + + parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct") + + return parser.parse_args() + +def main(args): + global analysis_files_path + + commit0_dataset_name = "wentingzhao/commit0_combined" + submissions_dataset_name = "celinelee/commit0_submissions" + dataset = load_dataset(commit0_dataset_name, split="test") # type: ignore + submission_dataset = load_dataset(submissions_dataset_name, split="train") + + + if args.get_blank_details: + if args.do_setup: + os.system(f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + branch_name = "blank" + if not args.keep_previous_eval: + if os.path.exists(os.path.join(analysis_files_path, branch_name)): + shutil.rmtree(os.path.join(analysis_files_path, branch_name)) + 
os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) + for example in dataset: + repo_name = example["repo"].split('/')[-1] + if args.split != "all" and repo_name not in SPLIT[args.split]: + continue + + repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") + blank_source_code_folder = os.path.join(analysis_files_path, "repos", repo_name, example["src_dir"]) + spec_filepath = os.path.join(analysis_files_path, "repos", repo_name, "spec.pdf") + + repo_metrics = get_blank_repo_metrics( + blank_source_code_folder, + spec_filepath, + tokenizer, + code_file_filter=lambda filename: re.fullmatch(r'.*\.py', filename) is not None, + ) + json.dump(repo_metrics, open(repo_metrics_output_file, "w"), indent=4) + + if args.get_reference_details: + if args.do_setup: + os.system(f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + branch_name = "reference" + os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) + if not args.keep_previous_eval: + for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"): + if os.path.exists(os.path.join(repo_log_path, branch_name)): + shutil.rmtree(os.path.join(repo_log_path, branch_name)) + os.system(f"commit0 evaluate --reference --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + + # get coverage and pytest info for each repo + for example in dataset: + repo_name = example["repo"].split('/')[-1] + if args.split != "all" and repo_name not in SPLIT[args.split]: + continue + + repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") + + path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}" + pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name) + json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4) + + if args.analyze_submissions: + commit0_dot_file_path = os.path.join(analysis_files_path, "submission_repos", ".commit0.yaml") + for submission in submission_dataset: + branch_name = submission['name'] + os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) + if not args.keep_previous_eval: + for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"): + if os.path.exists(os.path.join(repo_log_path, branch_name)): + shutil.rmtree(os.path.join(repo_log_path, branch_name)) + for example in dataset: + repo_name = example["repo"].split('/')[-1] + if args.split != "all" and repo_name not in SPLIT[args.split]: + continue + clone_url = f"https://github.com/test-save-commit0/{repo_name}.git" + clone_dir = os.path.abspath(os.path.join(analysis_files_path, "submission_repos", repo_name)) + repo = clone_repo(clone_url, clone_dir, branch_name, logger) + # after successfully setup, write the commit0 dot file + write_commit0_dot_file( + commit0_dot_file_path, + { + "dataset_name": commit0_dataset_name, + "dataset_split": "test", + "repo_split": args.split, + "base_dir": os.path.join(analysis_files_path, "submission_repos"), + }, + ) + # run pytests + os.system(f"commit0 evaluate --branch {branch_name} --commit0-dot-file-path {commit0_dot_file_path}") + for example in dataset: + repo_name = example["repo"].split('/')[-1] + if args.split != "all" and repo_name not in SPLIT[args.split]: + continue + + repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") + + path_to_logs = 
f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}" + pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name) + json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4) + + if not args.keep_previous_eval: + for analysis_file in glob.glob("docs/analysis*.md"): + os.unlink(analysis_file) + if args.render_webpages: render_mds() + + +main(get_args()) diff --git a/mkdocs.yml b/mkdocs.yml index 67cf1fb..7e87ed1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -6,6 +6,7 @@ nav: - Setup: setup.md - Extending: repos.md - About: about.md + - Submission Analysis: analysis.md theme: name: material logo: "logo2.webp" From f01bb43d064556d9b442362064688a5dedaa518d Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 16:55:02 -0400 Subject: [PATCH 2/7] flaked --- docs/render_submissions.py | 359 ++++++++++++++++++++++++------------- 1 file changed, 234 insertions(+), 125 deletions(-) diff --git a/docs/render_submissions.py b/docs/render_submissions.py index 8ab9be2..3214f35 100644 --- a/docs/render_submissions.py +++ b/docs/render_submissions.py @@ -3,18 +3,14 @@ import glob import ast from datasets import load_dataset -import subprocess import json import shutil -import sys import argparse from transformers import AutoTokenizer -import commit0.harness.setup -from commit0.harness.constants import SPLIT, SPLIT_ALL +from commit0.harness.constants import SPLIT from commit0.harness.utils import clone_repo from commit0.cli import write_commit0_dot_file import pypdf -# from render_utils import _find_files_to_edit import logging @@ -25,46 +21,67 @@ analysis_files_path = "/share/rush/commit0_analysis_temp" -def get_pytest_info(path_to_logs, repo_name, branch_name): + +def get_pytest_info(path_to_logs, repo_name, branch_name): pytest_info = {} for pytest_hash in os.listdir(path_to_logs): eval_script = open(os.path.join(path_to_logs, pytest_hash, "eval.sh")).read() - testname = re.search(r'([\S]+) > test_output', eval_script).group(1) + testname = re.search(r"([\S]+) > test_output", eval_script).group(1) patch_diff = open(os.path.join(path_to_logs, pytest_hash, "patch.diff")).read() - pytest_info[testname] = {'hash': pytest_hash, 'patch_diff': patch_diff, 'summary': pytest_summary, 'failures': {}, 'duration': pytest_report['duration']} + pytest_info[testname] = { + "hash": pytest_hash, + "patch_diff": patch_diff, + "failures": {}, + } report_file_path = os.path.join(path_to_logs, pytest_hash, "report.json") - if not os.path.exists(report_file_path): - reason_for_failure = open(os.path.join(path_to_logs, pytest_hash, "test_output.txt")).read() + if not os.path.exists(report_file_path): + reason_for_failure = open( + os.path.join(path_to_logs, pytest_hash, "test_output.txt") + ).read() pytest_info[testname]["failed_to_run"] = reason_for_failure return pytest_info - pytest_report = json.load(report_file_path) - pytest_summary = pytest_report['summary'] - if 'passed' not in pytest_summary: pytest_summary['passed'] = 0 + pytest_report = json.load(open(report_file_path)) + pytest_summary = pytest_report["summary"] + pytest_info[testname]["summary"] = pytest_summary + pytest_info[testname]["duration"] = pytest_report["duration"] + if "passed" not in pytest_summary: + pytest_summary["passed"] = 0 for test in pytest_report["tests"]: - if test['outcome'] == "passed": continue - if 'longrepr' in test: - failure_string = test['longrepr'] - elif '???' 
in test: - failure_string = test['???']['longrepr'] - elif test['outcome'] == 'error': - failure_string = test['setup']['longrepr'] - elif 'setup' in test and 'longrepr' in test['setup']: - failure_string = test['setup']['longrepr'] - elif 'call' in test and 'longrepr' in test['call']: - failure_string = test['call']['longrepr'] + if test["outcome"] == "passed": + continue + if "longrepr" in test: + failure_string = test["longrepr"] + elif "???" in test: + failure_string = test["???"]["longrepr"] + elif test["outcome"] == "error": + failure_string = test["setup"]["longrepr"] + elif "setup" in test and "longrepr" in test["setup"]: + failure_string = test["setup"]["longrepr"] + elif "call" in test and "longrepr" in test["call"]: + failure_string = test["call"]["longrepr"] # could use test['call']['traceback'] information and test['call']['crash'] for more info else: - import pdb; pdb.set_trace() - duration = 0. + import pdb + + pdb.set_trace() + duration = 0.0 for action_key in ["setup", "call", "teardown"]: - if action_key not in test: continue - if "duration" in test: duration += test["duration"] - pytest_info[testname]['failures'][test['nodeid']] = {"failure_string": failure_string, "duration": duration} + if action_key not in test: + continue + if "duration" in test: + duration += test["duration"] + pytest_info[testname]["failures"][test["nodeid"]] = { + "failure_string": failure_string, + "duration": duration, + } return pytest_info + def get_coverage_info(path_to_logs, repo_name, branch_name): - # for filename, file_coverage in json.load(open(os.path.join(path_to_logs, pytest_hash, "coverage.json")))["files"].items(): - # if not any(relevant_function.startswith(filename) for relevant_function in relevant_functions): continue + # coverage_fp = open(os.path.join(path_to_logs, pytest_hash, "coverage.json")) + # for filename, file_coverage in json.load(coverage_fp)["files"].items(): + # if not any(relevant_function.startswith(filename) for relevant_function in relevant_functions): + # continue # for funcname, func_coverage in file_coverage["functions"].items(): # if f"{filename}::{funcname}" not in relevant_functions: continue # pycov_info[testname][f"{filename}::{funcname}"] = { @@ -74,50 +91,63 @@ def get_coverage_info(path_to_logs, repo_name, branch_name): # } raise NotImplementedError -def get_blank_repo_metrics(blank_source_code_folder, spec_filename, tokenizer, code_file_filter=lambda filename:filename): + +def get_blank_repo_metrics( + blank_source_code_folder, + spec_filename, + tokenizer, + code_file_filter=lambda filename: filename, +): blank_repo_metrics = { "functions_to_edit": [], } - + for subdir, _, files in os.walk(blank_source_code_folder): - for file in files: - if not code_file_filter(file): continue + for file in files: + if not code_file_filter(file): + continue filename = os.path.join(subdir, file) - splitted = filename.split('/') + splitted = filename.split("/") hidden = False for one in splitted: - if one.startswith('.'): + if one.startswith("."): hidden = True break if hidden: continue try: - code = open(filename, encoding='utf-8').read() - except: - print(f"Trouble opening {filename}") + code = open(filename, encoding="utf-8").read() + except Exception as e: + print(f"{e}: Trouble opening {filename}") continue filename = filename[len(blank_source_code_folder):].lstrip(" /") try: code_tree = ast.parse(code) - except: - print(f"Trouble parsing {os.path.join(blank_source_code_folder, filename)}") + except Exception as e: + print( + f"{e}: Trouble parsing 
{os.path.join(blank_source_code_folder, filename)}" + ) continue - for node in ast.walk(code_tree): + for node in ast.walk(code_tree): if isinstance(node, ast.ClassDef): for child in node.body: child.parent_class = node.name elif isinstance(node, ast.FunctionDef) and len(node.body) > 0: classname = "" - if hasattr(node, "parent_class"): - classname = f"{node.parent_class}." + if hasattr(node, "parent_class"): + classname = f"{node.parent_class}." for child in node.body: child.parent_function = f"{classname}{node.name}" elif isinstance(node, ast.Pass): - if hasattr(node, "parent_function"): - blank_repo_metrics["functions_to_edit"].append(f"{filename}::{node.parent_function}") - elif hasattr(node, "parent_class"): - blank_repo_metrics["functions_to_edit"].append(f"{filename}::{node.parent_class}") + if hasattr(node, "parent_function"): + blank_repo_metrics["functions_to_edit"].append( + f"{filename}::{node.parent_function}" + ) + elif hasattr(node, "parent_class"): + blank_repo_metrics["functions_to_edit"].append( + f"{filename}::{node.parent_class}" + ) # Get spec metrics concatted_spec = "" @@ -126,56 +156,92 @@ def get_blank_repo_metrics(blank_source_code_folder, spec_filename, tokenizer, c try: concatted_spec += page.extract_text() except pypdf.errors.PdfReadError as e: - print(f"Could not load page {p_idx} of {spec_filename}, excluding") - blank_repo_metrics["no_tokens_in_spec"] = tokenizer(concatted_spec, return_tensors='pt').input_ids.shape[-1] - + print(f"{e}: Could not load page {p_idx} of {spec_filename}, excluding...") + blank_repo_metrics["no_tokens_in_spec"] = tokenizer( + concatted_spec, return_tensors="pt" + ).input_ids.shape[-1] + return blank_repo_metrics + def render_mds(subfolder="docs"): all_submissions = {} method_repo_pytests = {} - for branch_name in glob.glob(os.path.join(analysis_files_path, '*')): + for branch_name in glob.glob(os.path.join(analysis_files_path, "*")): branch_name = os.path.basename(branch_name) - if branch_name in {"blank", "repos", "submission_repos"}: continue + if branch_name in {"blank", "repos", "submission_repos"}: + continue all_submissions[branch_name] = {} - for repo_file in glob.glob(os.path.join(analysis_files_path, branch_name, '*.json')): - - repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, repo_file) + for repo_file in glob.glob( + os.path.join(analysis_files_path, branch_name, "*.json") + ): + + repo_metrics_output_file = os.path.join( + analysis_files_path, branch_name, repo_file + ) repo_metrics = json.load(open(repo_metrics_output_file)) - repo_name = os.path.basename(repo_file[:-len(".json")]) - + repo_name = os.path.basename(repo_file[: -len(".json")]) + all_submissions[branch_name][repo_name] = {} - method_repo_pytests[f"{branch_name}_{repo_name}"] = f"# Submission Name: {branch_name}\n# Repository: {repo_name}" - if 'pytest_results' in repo_metrics: repo_metrics = repo_metrics['pytest_results'] + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] = f"# Submission Name: {branch_name}\n# Repository: {repo_name}" + if "pytest_results" in repo_metrics: + repo_metrics = repo_metrics["pytest_results"] for pytest_group, pytest_info in repo_metrics.items(): pytest_group = os.path.basename(pytest_group.strip("/")) - patch_diff = f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```""" - if 'reason_for_failure' in pytest_info: - all_submissions[branch_name][repo_name][pytest_group] = {"reason_for_failure": pytest_info["failed_to_run"]} - method_repo_pytests[f"{branch_name}_{repo_name}"] += 
f"""\n## Failed to run pytests\n{ pytest_info['failed_to_run']}""" + patch_diff = ( + f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```""" + ) + if "reason_for_failure" in pytest_info: + all_submissions[branch_name][repo_name][pytest_group] = { + "reason_for_failure": pytest_info["failed_to_run"] + } + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] += f"""\n## Failed to run pytests\n{ pytest_info['failed_to_run']}""" else: - all_submissions[branch_name][repo_name][pytest_group] = {"summary": pytest_info['summary'], "duration": pytest_info["duration"]} - method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""\n## Pytest Summary: {pytest_group} + all_submissions[branch_name][repo_name][pytest_group] = { + "summary": pytest_info["summary"], + "duration": pytest_info["duration"], + } + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] += f"""\n## Pytest Summary: {pytest_group} | status | count | |:---------|:-----:| """ - for category, count in pytest_info['summary'].items(): - if category not in {'duration'}: - method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""| {category} | {count} |\n""" - else: - method_repo_pytests[f"{branch_name}_{repo_name}"] += f"""| {category} | {float(count):.2f}s |\n""" - - method_repo_pytests[f"{branch_name}_{repo_name}"] += f"\n## Failed pytest outputs: {pytest_group}\n\n" - for testname, failure in pytest_info['failures'].items(): + for category, count in pytest_info["summary"].items(): + if category not in {"duration"}: + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] += f"""| {category} | {count} |\n""" + else: + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] += f"""| {category} | {float(count):.2f}s |\n""" + + method_repo_pytests[ + f"{branch_name}_{repo_name}" + ] += f"\n## Failed pytest outputs: {pytest_group}\n\n" + for testname, failure in pytest_info["failures"].items(): shortened_testname = os.path.basename(testname) - method_repo_pytests[f"{branch_name}_{repo_name}"] += f"### {shortened_testname}\n\n
<details><summary> <pre>{shortened_testname}</pre></summary><pre>\n{failure['failure_string']}\n</pre>\n</details>
\n" + method_repo_pytests[f"{branch_name}_{repo_name}"] += ( + f"### {shortened_testname}\n\n
{shortened_testname}"
+                            "
\n{failure['failure_string']}\n
\n
\n" + ) back_button = f"[back to {branch_name} summary]({os.path.join('/', f'analysis_{branch_name}')})\n\n" - with open(os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), 'w') as wf: - wf.write(back_button + method_repo_pytests[f"{branch_name}_{repo_name}"] + patch_diff) - + with open( + os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), "w" + ) as wf: + wf.write( + back_button + + method_repo_pytests[f"{branch_name}_{repo_name}"] + + patch_diff + ) # Render general page. Has buttons to all methods leaderboard = """ @@ -185,33 +251,46 @@ def render_mds(subfolder="docs"): method_to_repos = {} # Render method & repo page. Has "back" button. for branch_name, branch_info in all_submissions.items(): - cum_pytests = {'passed': 0} - method_to_repos[branch_name] = """ + cum_pytests = {"passed": 0} + method_to_repos[ + branch_name + ] = """ | | Repository | Summary | | |-|------------|---------|-|""" - total_tests = 0 # better info is probably broken down by split lol TODO - total_duration = 0. + total_tests = 0 # better info is probably broken down by split lol TODO + total_duration = 0.0 for repo_name, repo_test_info in branch_info.items(): for testname, test_info in repo_test_info.items(): - total_duration += test_info['duration'] if "reason_for_failure" in test_info: summary_pytests_string = "failure" else: - summary_pytests_string = f"{testname}: {test_info['summary']['passed']} / {test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" + total_duration += test_info["duration"] + summary_pytests_string = ( + f"{testname}: {test_info['summary']['passed']} / " + "{test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" + ) for category, count in test_info["summary"].items(): if category not in cum_pytests: cum_pytests[category] = 0 - if isinstance(count, int): cum_pytests[category] += int(count) - elif isinstance(count, float): cum_pytests[category] += float(count) + if isinstance(count, int): + cum_pytests[category] += int(count) + elif isinstance(count, float): + cum_pytests[category] += float(count) total_tests += 1 - method_to_repos[branch_name] += f"\n||[{repo_name}]({os.path.join('/', f'analysis_{branch_name}_{repo_name}')})|{summary_pytests_string}||" - break # assume we ran all tests. will add functionality for checking diff tests later, as we need it. - summary_pytests_string = f"{cum_pytests['passed']} / {total_tests} ; duration: {total_duration:.2f}s" + method_to_repos[branch_name] += ( + f"\n||[{repo_name}]({os.path.join('/', f'analysis_{branch_name}_{repo_name}')})|" + f"{summary_pytests_string}||" + ) + break # assume we ran all tests. will add functionality for checking diff tests later, as we need it. 
+ summary_pytests_string = ( + f"{cum_pytests['passed']} / {total_tests} ; duration: {total_duration:.2f}s" + ) leaderboard += f"\n||[{branch_name}]({os.path.join('/', f'analysis_{branch_name}')})|{summary_pytests_string}||" - with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), 'w') as wf: - wf.write( method_to_repos[branch_name]) - with open(os.path.join(subfolder, "analysis.md"), 'w') as wf: - wf.write(leaderboard) + with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), "w") as wf: + wf.write(method_to_repos[branch_name]) + with open(os.path.join(subfolder, "analysis.md"), "w") as wf: + wf.write(leaderboard) + def get_args(): parser = argparse.ArgumentParser() @@ -222,12 +301,15 @@ def get_args(): parser.add_argument("--analyze_submissions", action="store_true") parser.add_argument("--render_webpages", action="store_true") - parser.add_argument("--split", type=str, default='lite') + parser.add_argument("--split", type=str, default="lite") - parser.add_argument("--tokenizer_name", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct") + parser.add_argument( + "--tokenizer_name", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct" + ) return parser.parse_args() + def main(args): global analysis_files_path @@ -235,73 +317,94 @@ def main(args): submissions_dataset_name = "celinelee/commit0_submissions" dataset = load_dataset(commit0_dataset_name, split="test") # type: ignore submission_dataset = load_dataset(submissions_dataset_name, split="train") - if args.get_blank_details: - if args.do_setup: - os.system(f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + if args.do_setup: + os.system( + f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos " + "--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" + ) branch_name = "blank" if not args.keep_previous_eval: if os.path.exists(os.path.join(analysis_files_path, branch_name)): shutil.rmtree(os.path.join(analysis_files_path, branch_name)) os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) - tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) + tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name) for example in dataset: - repo_name = example["repo"].split('/')[-1] + repo_name = example["repo"].split("/")[-1] if args.split != "all" and repo_name not in SPLIT[args.split]: continue - repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") - blank_source_code_folder = os.path.join(analysis_files_path, "repos", repo_name, example["src_dir"]) - spec_filepath = os.path.join(analysis_files_path, "repos", repo_name, "spec.pdf") + repo_metrics_output_file = os.path.join( + analysis_files_path, branch_name, f"{repo_name}.json" + ) + blank_source_code_folder = os.path.join( + analysis_files_path, "repos", repo_name, example["src_dir"] + ) + spec_filepath = os.path.join( + analysis_files_path, "repos", repo_name, "spec.pdf" + ) repo_metrics = get_blank_repo_metrics( - blank_source_code_folder, - spec_filepath, - tokenizer, - code_file_filter=lambda filename: re.fullmatch(r'.*\.py', filename) is not None, - ) + blank_source_code_folder, + spec_filepath, + tokenizer, + code_file_filter=lambda filename: re.fullmatch(r".*\.py", filename) + is not None, + ) json.dump(repo_metrics, open(repo_metrics_output_file, "w"), indent=4) if args.get_reference_details: - if args.do_setup: - os.system(f"commit0 setup {args.split} --base-dir 
{analysis_files_path}/repos --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + if args.do_setup: + os.system( + f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos " + "--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" + ) branch_name = "reference" os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) if not args.keep_previous_eval: for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"): if os.path.exists(os.path.join(repo_log_path, branch_name)): shutil.rmtree(os.path.join(repo_log_path, branch_name)) - os.system(f"commit0 evaluate --reference --commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml") + os.system( + "commit0 evaluate --reference " + f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" + ) # get coverage and pytest info for each repo for example in dataset: - repo_name = example["repo"].split('/')[-1] + repo_name = example["repo"].split("/")[-1] if args.split != "all" and repo_name not in SPLIT[args.split]: continue - repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") + repo_metrics_output_file = os.path.join( + analysis_files_path, branch_name, f"{repo_name}.json" + ) path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}" pytest_results = get_pytest_info(path_to_logs, repo_name, branch_name) json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4) if args.analyze_submissions: - commit0_dot_file_path = os.path.join(analysis_files_path, "submission_repos", ".commit0.yaml") + commit0_dot_file_path = os.path.join( + analysis_files_path, "submission_repos", ".commit0.yaml" + ) for submission in submission_dataset: - branch_name = submission['name'] + branch_name = submission["name"] os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) if not args.keep_previous_eval: for repo_log_path in glob.glob(f"{os.getcwd()}/logs/pytest/*"): if os.path.exists(os.path.join(repo_log_path, branch_name)): shutil.rmtree(os.path.join(repo_log_path, branch_name)) for example in dataset: - repo_name = example["repo"].split('/')[-1] + repo_name = example["repo"].split("/")[-1] if args.split != "all" and repo_name not in SPLIT[args.split]: continue clone_url = f"https://github.com/test-save-commit0/{repo_name}.git" - clone_dir = os.path.abspath(os.path.join(analysis_files_path, "submission_repos", repo_name)) - repo = clone_repo(clone_url, clone_dir, branch_name, logger) + clone_dir = os.path.abspath( + os.path.join(analysis_files_path, "submission_repos", repo_name) + ) + clone_repo(clone_url, clone_dir, branch_name, logger) # after successfully setup, write the commit0 dot file write_commit0_dot_file( commit0_dot_file_path, @@ -313,22 +416,28 @@ def main(args): }, ) # run pytests - os.system(f"commit0 evaluate --branch {branch_name} --commit0-dot-file-path {commit0_dot_file_path}") + os.system( + f"commit0 evaluate --branch {branch_name} " + "--commit0-dot-file-path {commit0_dot_file_path}" + ) for example in dataset: - repo_name = example["repo"].split('/')[-1] + repo_name = example["repo"].split("/")[-1] if args.split != "all" and repo_name not in SPLIT[args.split]: continue - repo_metrics_output_file = os.path.join(analysis_files_path, branch_name, f"{repo_name}.json") + repo_metrics_output_file = os.path.join( + analysis_files_path, branch_name, f"{repo_name}.json" + ) path_to_logs = f"{os.getcwd()}/logs/pytest/{repo_name}/{branch_name}" pytest_results = get_pytest_info(path_to_logs, repo_name, 
branch_name) json.dump(pytest_results, open(repo_metrics_output_file, "w"), indent=4) - - if not args.keep_previous_eval: + + if not args.keep_previous_eval: for analysis_file in glob.glob("docs/analysis*.md"): os.unlink(analysis_file) - if args.render_webpages: render_mds() + if args.render_webpages: + render_mds() main(get_args()) From 1400b835857024a6d3e5b8c06d347fd6539d29ff Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 17:29:36 -0400 Subject: [PATCH 3/7] formatting and diffs --- docs/render_submissions.py | 46 +++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/docs/render_submissions.py b/docs/render_submissions.py index 3214f35..811d3ec 100644 --- a/docs/render_submissions.py +++ b/docs/render_submissions.py @@ -61,9 +61,7 @@ def get_pytest_info(path_to_logs, repo_name, branch_name): failure_string = test["call"]["longrepr"] # could use test['call']['traceback'] information and test['call']['crash'] for more info else: - import pdb - - pdb.set_trace() + breakpoint() duration = 0.0 for action_key in ["setup", "call", "teardown"]: if action_key not in test: @@ -195,13 +193,13 @@ def render_mds(subfolder="docs"): patch_diff = ( f"""\n\n### Patch diff\n```diff\n{pytest_info['patch_diff']}```""" ) - if "reason_for_failure" in pytest_info: + if "failed_to_run" in pytest_info: all_submissions[branch_name][repo_name][pytest_group] = { - "reason_for_failure": pytest_info["failed_to_run"] + "failed_to_run": pytest_info["failed_to_run"] } method_repo_pytests[ f"{branch_name}_{repo_name}" - ] += f"""\n## Failed to run pytests\n{ pytest_info['failed_to_run']}""" + ] += f"""\n## Failed to run pytests\n```\n{pytest_info['failed_to_run']}\n```""" else: all_submissions[branch_name][repo_name][pytest_group] = { "summary": pytest_info["summary"], @@ -210,9 +208,9 @@ def render_mds(subfolder="docs"): method_repo_pytests[ f"{branch_name}_{repo_name}" ] += f"""\n## Pytest Summary: {pytest_group} - | status | count | - |:---------|:-----:| - """ +| status | count | +|:---------|:-----:| +""" for category, count in pytest_info["summary"].items(): if category not in {"duration"}: method_repo_pytests[ @@ -230,10 +228,10 @@ def render_mds(subfolder="docs"): shortened_testname = os.path.basename(testname) method_repo_pytests[f"{branch_name}_{repo_name}"] += ( f"### {shortened_testname}\n\n
{shortened_testname}"
-                            "
\n{failure['failure_string']}\n
\n
\n" + f"
\n{failure['failure_string']}\n
\n\n" ) - back_button = f"[back to {branch_name} summary]({os.path.join('/', f'analysis_{branch_name}')})\n\n" + back_button = f"[back to {branch_name} summary]({f'analysis_{branch_name}'})\n\n" with open( os.path.join(subfolder, f"analysis_{branch_name}_{repo_name}.md"), "w" ) as wf: @@ -252,22 +250,20 @@ def render_mds(subfolder="docs"): # Render method & repo page. Has "back" button. for branch_name, branch_info in all_submissions.items(): cum_pytests = {"passed": 0} - method_to_repos[ - branch_name - ] = """ + method_to_repos[branch_name] = """ | | Repository | Summary | | |-|------------|---------|-|""" total_tests = 0 # better info is probably broken down by split lol TODO total_duration = 0.0 for repo_name, repo_test_info in branch_info.items(): for testname, test_info in repo_test_info.items(): - if "reason_for_failure" in test_info: + if "failed_to_run" in test_info: summary_pytests_string = "failure" else: total_duration += test_info["duration"] summary_pytests_string = ( f"{testname}: {test_info['summary']['passed']} / " - "{test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" + f"{test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" ) for category, count in test_info["summary"].items(): if category not in cum_pytests: @@ -278,14 +274,14 @@ def render_mds(subfolder="docs"): cum_pytests[category] += float(count) total_tests += 1 method_to_repos[branch_name] += ( - f"\n||[{repo_name}]({os.path.join('/', f'analysis_{branch_name}_{repo_name}')})|" + f"\n||[{repo_name}]({f'analysis_{branch_name}_{repo_name}'})|" f"{summary_pytests_string}||" ) break # assume we ran all tests. will add functionality for checking diff tests later, as we need it. summary_pytests_string = ( f"{cum_pytests['passed']} / {total_tests} ; duration: {total_duration:.2f}s" ) - leaderboard += f"\n||[{branch_name}]({os.path.join('/', f'analysis_{branch_name}')})|{summary_pytests_string}||" + leaderboard += f"\n||[{branch_name}]({f'analysis_{branch_name}'})|{summary_pytests_string}||" with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), "w") as wf: wf.write(method_to_repos[branch_name]) with open(os.path.join(subfolder, "analysis.md"), "w") as wf: @@ -322,7 +318,7 @@ def main(args): if args.do_setup: os.system( f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos " - "--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" + f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" ) branch_name = "blank" if not args.keep_previous_eval: @@ -358,7 +354,7 @@ def main(args): if args.do_setup: os.system( f"commit0 setup {args.split} --base-dir {analysis_files_path}/repos " - "--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" + f"--commit0-dot-file-path {analysis_files_path}/repos/.commit0.yaml" ) branch_name = "reference" os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) @@ -389,6 +385,14 @@ def main(args): commit0_dot_file_path = os.path.join( analysis_files_path, "submission_repos", ".commit0.yaml" ) + if not args.keep_previous_eval: + for subfolder in glob.glob(os.path.join(analysis_files_path, "*")): + if os.path.basename(subfolder) not in {"blank", "reference", "repos", "submission_repos"}: + try: + shutil.rmtree(analysis_files_path, subfolder) + except Exception as e: + print(f"{e}: when removing {subfolder}") + for submission in submission_dataset: branch_name = submission["name"] os.makedirs(os.path.join(analysis_files_path, branch_name), exist_ok=True) @@ -418,7 +422,7 
@@ def main(args): # run pytests os.system( f"commit0 evaluate --branch {branch_name} " - "--commit0-dot-file-path {commit0_dot_file_path}" + f"--commit0-dot-file-path {commit0_dot_file_path}" ) for example in dataset: repo_name = example["repo"].split("/")[-1] From d28e0826b113be83e75433d65e0f5a5176b10abf Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 17:31:21 -0400 Subject: [PATCH 4/7] minor formatting then --- docs/render_submissions.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/render_submissions.py b/docs/render_submissions.py index 811d3ec..0971c75 100644 --- a/docs/render_submissions.py +++ b/docs/render_submissions.py @@ -262,8 +262,8 @@ def render_mds(subfolder="docs"): else: total_duration += test_info["duration"] summary_pytests_string = ( - f"{testname}: {test_info['summary']['passed']} / " - f"{test_info['summary']['collected']} ; duration: { test_info['duration']:.2f}s" + f"`{testname}`: {test_info['summary']['passed']} / " + f"{test_info['summary']['collected']} ; duration: {test_info['duration']:.2f}s" ) for category, count in test_info["summary"].items(): if category not in cum_pytests: From 7ffc9a53664b78bb7969d972f315ebe267e538c8 Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 17:40:02 -0400 Subject: [PATCH 5/7] remove breakpoints --- docs/render_submissions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/render_submissions.py b/docs/render_submissions.py index 0971c75..81b7409 100644 --- a/docs/render_submissions.py +++ b/docs/render_submissions.py @@ -61,7 +61,7 @@ def get_pytest_info(path_to_logs, repo_name, branch_name): failure_string = test["call"]["longrepr"] # could use test['call']['traceback'] information and test['call']['crash'] for more info else: - breakpoint() + failure_string = "" duration = 0.0 for action_key in ["setup", "call", "teardown"]: if action_key not in test: From 22b1f98869920fc6819ea85f45cacd4ebe1043dd Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 17:41:14 -0400 Subject: [PATCH 6/7] typo environments --- docs/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/index.md b/docs/index.md index 24db8f3..3d56f78 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,7 +19,7 @@ pass their unit tests. All libraries have: Commit-0 is an interactive environment that makes it easy to design and test new agents. You can: -* Efficiently run tests in isolated environemnts +* Efficiently run tests in isolated environments * Distribute testing and development across cloud systems * Track and log all changes made throughout. 
From 967a55ba19b417155f5409572662deff93b88da4 Mon Sep 17 00:00:00 2001 From: Celine Lee Date: Wed, 25 Sep 2024 18:02:25 -0400 Subject: [PATCH 7/7] fix some counting --- docs/render_submissions.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/render_submissions.py b/docs/render_submissions.py index 81b7409..799bb16 100644 --- a/docs/render_submissions.py +++ b/docs/render_submissions.py @@ -253,7 +253,6 @@ def render_mds(subfolder="docs"): method_to_repos[branch_name] = """ | | Repository | Summary | | |-|------------|---------|-|""" - total_tests = 0 # better info is probably broken down by split lol TODO total_duration = 0.0 for repo_name, repo_test_info in branch_info.items(): for testname, test_info in repo_test_info.items(): @@ -272,18 +271,18 @@ def render_mds(subfolder="docs"): cum_pytests[category] += int(count) elif isinstance(count, float): cum_pytests[category] += float(count) - total_tests += 1 method_to_repos[branch_name] += ( f"\n||[{repo_name}]({f'analysis_{branch_name}_{repo_name}'})|" f"{summary_pytests_string}||" ) break # assume we ran all tests. will add functionality for checking diff tests later, as we need it. summary_pytests_string = ( - f"{cum_pytests['passed']} / {total_tests} ; duration: {total_duration:.2f}s" + f"{cum_pytests['passed']} / {cum_pytests['collected']} ; duration: {total_duration:.2f}s" ) leaderboard += f"\n||[{branch_name}]({f'analysis_{branch_name}'})|{summary_pytests_string}||" + back_button = f"[back to all submissions]({f'analysis'})\n\n" with open(os.path.join(subfolder, f"analysis_{branch_name}.md"), "w") as wf: - wf.write(method_to_repos[branch_name]) + wf.write(back_button + "\n" + method_to_repos[branch_name]) with open(os.path.join(subfolder, "analysis.md"), "w") as wf: wf.write(leaderboard) @@ -387,9 +386,10 @@ def main(args): ) if not args.keep_previous_eval: for subfolder in glob.glob(os.path.join(analysis_files_path, "*")): - if os.path.basename(subfolder) not in {"blank", "reference", "repos", "submission_repos"}: + if os.path.basename(subfolder.rstrip("/")) not in {"blank", "reference", "repos", "submission_repos"}: try: - shutil.rmtree(analysis_files_path, subfolder) + print(f"Clearing {subfolder}") + shutil.rmtree(subfolder) except Exception as e: print(f"{e}: when removing {subfolder}")
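The per-repository rows that `render_mds` writes (and the aggregated leaderboard rows such as `3628 / 33 ; duration: 18.66s` in `docs/analysis.md`) are derived from the `summary` and `duration` fields of each run's `report.json`, which `get_pytest_info` collects per pytest hash. A minimal sketch of that per-report mapping, assuming the reports were produced with `pytest-json-report`; the path in the comment is only illustrative of the `logs/pytest/<repo>/<branch>/<hash>/` layout the script reads from:

```python
import json
from pathlib import Path


def summarize_report(report_path: str) -> str:
    """Reduce a pytest-json-report file to a leaderboard-style summary string."""
    report = json.loads(Path(report_path).read_text())
    summary = report["summary"]              # counts such as "passed", "failed", "collected"
    passed = summary.get("passed", 0)        # the script likewise defaults a missing "passed" to 0
    collected = summary.get("collected", 0)
    duration = report.get("duration", 0.0)   # total run time in seconds
    return f"{passed} / {collected} ; duration: {duration:.2f}s"


# Hypothetical path following the script's log layout:
# print(summarize_report("logs/pytest/<repo_name>/<branch_name>/<pytest_hash>/report.json"))
```

A typical invocation would combine the flags defined in `get_args`, e.g. `python docs/render_submissions.py --get_reference_details --analyze_submissions --render_webpages --split lite`.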