## 预备工作

In [2]:
from IPython import get_ipython
import os
from pathlib import Path
# Directory the IPython session was started from.
script_dir = get_ipython().starting_dir
# Change the working directory to the parent of the start directory.
# NOTE(review): presumably this is what lets the project-local `util` import below
# and the relative data paths resolve -- confirm against the repo layout.
os.chdir(Path(script_dir) / '..')
from collections import defaultdict
import json
from util.conflict_util import Conflict, conflict2file
from tqdm.notebook import tqdm
from typing import List, Dict, Any, Tuple
import re
# All dataset/output paths in later cells are built relative to this directory.
work_dir = Path(os.getcwd())
print(work_dir)

class ConflictChunk:
    def __init__(self, m_start, m_end, a_content, b_content, 
                 o_content, r_content, label: str | None, chunk_idx):
        self.m_start = m_start
        self.m_end = m_end
        self.a_content: 'str' = a_content
        self.b_content: 'str' = b_content
        self.o_content: 'str' = o_content
        self.r_content: 'str' = r_content
        self.label = label
        self.chunk_idx = chunk_idx

    def to_dict(self):
        return {
            "m_start": self.m_start,
            "m_end": self.m_end,
            "a_content": self.a_content,
            "b_content": self.b_content,
            "o_content": self.o_content,
            "r_content": self.r_content,
            "label": self.label,
        }
    
    def getJSONstr(self):
        return json.dumps(self, default=lambda o: o.__dict__, indent=4)


class ConflictFile:
    """A conflicting file: its five full-text versions plus its conflict chunks.

    Attributes:
        path: path of the file as recorded by the dataset (may be None).
        repo_url: repository URL (may be None when the dataset lacks it).
        file_a_content, file_b_content, file_o_content: side A, side B and base.
        file_r_content: resolved version of the file.
        file_m_content: merged version of the file (with conflict regions).
        commit_hash: commit hash recorded by the dataset (may be None).
        conflict_chunks: chunk objects added through `add_conflict_chunk`.
    """

    def __init__(self, path, repo_url, file_a_content, file_b_content, file_o_content, file_r_content, file_m_content, commit_hash):
        self.path = path
        self.repo_url = repo_url
        self.file_a_content = file_a_content
        self.file_b_content = file_b_content
        self.file_o_content = file_o_content
        self.file_r_content = file_r_content
        self.file_m_content = file_m_content
        self.commit_hash = commit_hash
        self.conflict_chunks = []

    def add_conflict_chunk(self, conflict_chunk_obj):
        """Append one chunk object to this file's chunk list."""
        self.conflict_chunks.append(conflict_chunk_obj)

    def to_dict(self):
        """Return the file's fields as a plain dict.

        `commit_hash` is included so that the dict carries the same fields as
        the JSON produced by `getJSONstr`. Each chunk is converted with its
        own `to_dict()`.
        """
        return {
            "path": self.path,
            "repo_url": self.repo_url,
            "file_a_content": self.file_a_content,
            "file_b_content": self.file_b_content,
            "file_o_content": self.file_o_content,
            "file_r_content": self.file_r_content,
            "file_m_content": self.file_m_content,
            "commit_hash": self.commit_hash,
            "conflict_chunks": [chunk.to_dict() for chunk in self.conflict_chunks],
        }
    
    def getJSONstr(self):
        """Serialize the instance (and, recursively, its chunks) to indented JSON.

        Objects are dumped through their `__dict__`.
        """
        return json.dumps(self, default=lambda o: o.__dict__, indent=4)
    
class ConflictFileCollector:
    """Base class for dataset converters that produce `ConflictFile` objects.

    Subclasses implement `collect()`; this class provides batching, saving to
    JSON, chunk labelling and a few small helpers.
    """

    def __init__(self, dataset_path):
        self.dataset_path = dataset_path
    
    @staticmethod
    def sample(output_dir, n, random_seed=0, label=None):
        """Yield up to `n` conflict chunks (dicts) read from saved batch files.

        Chunks are yielded in the order in which they are encountered while
        walking the JSON files under `output_dir`; no shuffling is performed
        and `random_seed` is currently unused (kept for interface
        compatibility).

        Args:
            output_dir: directory containing the JSON batches written by
                `collect_and_save`.
            n: maximum number of chunks to yield.
            random_seed: unused.
            label: if not None, only chunks whose 'label' equals it are yielded.
        """
        jsons = list(ConflictFileCollector.getAllJsonsUnder(output_dir))
        print(f"Found {len(jsons)} JSON files in {output_dir}")
        if n <= 0:
            return
        cnt = 0
        for json_file in jsons:
            with open(json_file, encoding='utf-8') as f:
                data = json.load(f)
            for conflict_file in data:
                for chunk in conflict_file['conflict_chunks']:
                    # `.get` so that chunks saved without a label are skipped
                    # instead of raising KeyError when a label filter is set.
                    if label is None or chunk.get('label') == label:
                        cnt += 1
                        yield chunk
                        # Stop as soon as the quota is reached so that no
                        # further batch file is loaded needlessly.
                        if cnt >= n:
                            return


    def collect(self):
        '''
        Return an iterator that yields one ConflictFile object per iteration.
        Must be implemented by subclasses.
        '''
        raise NotImplementedError
        
    def collect_in_batches(self, batch_size=10000):
        """Group the items produced by `collect()` into lists of `batch_size`.

        The last batch may be shorter; no empty batch is yielded.
        """
        batch = []
        for conflict_file in self.collect():
            batch.append(conflict_file)
            if len(batch) >= batch_size:
                yield batch
                batch = []
        if batch:
            yield batch

    def collect_and_save(self, output_dir, batch_size=10000):
        """Write every batch to `<output_dir>/<batch index>.json`.

        The directory (and its parents) is created when missing. Each item is
        serialized through its own `getJSONstr()`.
        """
        output_dir = Path(output_dir)  # accept str as well as Path
        output_dir.mkdir(parents=True, exist_ok=True)
        for i, batch in enumerate(self.collect_in_batches(batch_size)):
            target = output_dir / f"{i}.json"
            with open(target, 'w', encoding='utf-8') as f:
                print(f"Saving batch {i} to {target}")
                json.dump([json.loads(x.getJSONstr()) for x in batch], f)
    
    @staticmethod
    def preprocessContent(content: str):
        """Normalize whitespace so that formatting-only differences compare equal.

        Blank input becomes ''. Otherwise the text is stripped, a newline is
        appended and every whitespace run is collapsed to one space, so a
        non-empty result always ends with a single space (which keeps the
        concatenation checks in `getLabel` well-formed).
        """
        return '' if content.strip() == '' else re.sub(r'\s+', ' ', content.strip() + '\n')
    
    @staticmethod
    def getLabel(a, b, o, r):
        """Classify how resolution `r` relates to sides `a`, `b` and base `o`.

        Returns one of: "same modification, formatting maybe different",
        "A", "B", "O", "AB", "BA", "newline" (the resolution contains a
        non-blank line found in none of a/b/o) or "mixline" (every non-blank
        resolution line occurs in a, b or o). The checks are applied in that
        order on whitespace-normalized text; the line-level check compares
        raw lines.
        """
        r_processed = ConflictFileCollector.preprocessContent(r)
        a_processed = ConflictFileCollector.preprocessContent(a)
        b_processed = ConflictFileCollector.preprocessContent(b)
        o_processed = ConflictFileCollector.preprocessContent(o)
        if a_processed == b_processed:
            return "same modification, formatting maybe different"
        if r_processed == a_processed:
            return "A"
        if r_processed == b_processed:
            return "B"
        if r_processed == o_processed:
            return "O"
        if r_processed == a_processed + b_processed:
            return "AB"
        if r_processed == b_processed + a_processed:
            return "BA"

        known_lines = set(a.split('\n')) | set(b.split('\n')) | set(o.split('\n'))
        for rl in set(r.split('\n')):
            # Blank lines never count as new content. `strip()` is used rather
            # than `isspace()` because ''.isspace() is False, which would make
            # an empty line look like a newly written line.
            if rl.strip() and rl not in known_lines:
                return 'newline'
        return 'mixline'

    @staticmethod
    def getAllJsonsUnder(dirPath: str):
        """Yield the path of every `*.json` file found recursively under `dirPath`."""
        for root, _, files in os.walk(dirPath):
            for file in files:
                if file.endswith(".json"):
                    yield os.path.join(root, file)
    
    @staticmethod
    def list2str(l):
        """Join a list of lines into newline-terminated text.

        `[]` and `['']` both map to the empty string.
        """
        if l == [] or l == ['']:
            return ''
        return '\n'.join(l) + '\n'


/root/projects/dataset_collect_analysis


## 观察样本

In [None]:
# Pull up to 10 chunks labelled 'mixline' from the saved batches for manual inspection.
chunks = list(ConflictFileCollector.sample(work_dir / 'data_collect_analysis' / 'output' / '100+stars_4GB-_multidev_org_lang', n=10, label='mixline'))

Found 68 JSON files in /root/projects/dataset_collect_analysis/data_collect_analysis/output/100+stars_4GB-_multidev_org_lang


### 观察不能被 es 解决的 mixline

In [4]:
# Dataset directories that analyze_edit_script() is run on at the end of this cell.
dirs = []
dirs.append(work_dir / "data_collect_analysis" / "output" / "100+_recollect")
# dirs.append('/Volumes/urine_bag/100+stars_4GB-_multidev_org_lang/recollected_sample')


# Module-level sink: analyze_edit_script() appends the raw chunk dicts it selects here.
chunks4debug = []

from util.edit_script import compute, SequenceDiff
class EditScriptLabel:
    """An edit script tagged with the side it came from and an accept flag.

    Attributes:
        edit_script: the SequenceDiff itself.
        _from: origin tag of the script (callers pass 'ours' or 'theirs').
        accept: whether the script is currently marked as accepted.
    """

    def __init__(self, sd: SequenceDiff, _from: str, accept: bool):
        self.edit_script, self._from, self.accept = sd, _from, accept
def analyze_edit_script(dir2analyze):
    """Scan a collected dataset and gather conflict chunks for debugging.

    Only the first JSON batch found under `dir2analyze` is read, and of that
    batch only the first 1000 conflict files after a seeded shuffle. Every
    chunk labelled 'newline' is appended (as the raw dict) to the module-level
    list `chunks4debug`. Several counters are maintained along the way; the
    code that reports them is currently commented out below this function.

    The nested `cc_check` decides whether a chunk's resolution can be rebuilt
    by accepting/rejecting line-level edit scripts (base->ours, base->theirs);
    it is only referenced from a commented-out condition at the moment.

    Args:
        dir2analyze: directory containing the JSON batches to analyze.

    Raises:
        FileNotFoundError: if no JSON file exists under `dir2analyze`.
    """
    import random
    import traceback

    dataset_name = os.path.basename(dir2analyze)
    print(f'在 {dataset_name} 下统计')
    accept_mark_cnt = defaultdict(int)
    es_cnt = defaultdict(int)
    cc_with_es_intersects = 0
    resolvable_cc_cnt = 0
    all_cc_cnt = 0
    too_many_lines_cnt = 0
    label_cnt = defaultdict(int)
    label_resolvable_cnt = defaultdict(int)
    def cc_check(chunk: ConflictChunk) -> bool:
        '''
        Return True when the chunk can be resolved by combining edit scripts.

        Also updates the statistics: resolvable chunks, accepted/rejected
        scripts, and the distribution of edit-script counts (chunks with too
        many scripts are skipped). The final comparison is meant to be
        whitespace-insensitive; blank lines and indentation are removed
        before the edit scripts are generated.
        '''
        nonlocal accept_mark_cnt
        nonlocal es_cnt
        nonlocal resolvable_cc_cnt
        nonlocal too_many_lines_cnt
        nonlocal label_resolvable_cnt

        def es_gen_str2list(content: str) -> List[str]:
            '''
            Pre-processing for edit-script generation: split into lines,
            strip each line and drop the blank ones.
            '''
            return [line.strip() for line in content.split('\n') if line.strip() != '']

        # Skip chunks whose raw text is too long (checked in characters).
        if len(chunk.a_content) > 5000 or len(chunk.b_content) > 5000 or len(chunk.o_content) > 5000 or len(chunk.r_content) > 5000:
            too_many_lines_cnt += 1
            return False
            
        a_contents = es_gen_str2list(chunk.a_content)
        b_contents = es_gen_str2list(chunk.b_content)
        o_contents = es_gen_str2list(chunk.o_content)
        r_contents = es_gen_str2list(chunk.r_content)

        def compareInToken(a_ls: List[str], b_ls: List[str]) -> bool:
            '''
            Compare two line lists while ignoring whitespace differences.
            '''
            def toUnifiedStr(ls: List[str]) -> str:
                return '' if ls == [] or ls == [''] else re.sub(r'\s+', ' ', '\n'.join(ls).strip() + '\n')
            a_processed = toUnifiedStr(a_ls)
            b_processed = toUnifiedStr(b_ls)
            return a_processed == b_processed

        def bt(generated, i, last_end, all_edit_scripts: List[EditScriptLabel]) -> bool:
            '''
            Backtracking over the edit scripts: try every accept/reject
            combination and return True as soon as one reproduces the
            resolution.

            generated: lines built so far; i: index of the script under
            consideration; last_end: end (in base lines) of the last accepted
            script.
            '''
            nonlocal cc_with_es_intersects
            # All scripts decided: append the untouched tail of the base and compare.
            if i == len(all_edit_scripts):
                whole_generated = generated + o_contents[last_end:]
                if compareInToken(whole_generated, r_contents):
                    # This conflict can be resolved by combining edit scripts.
                    return True
                return False

            # Option 1: reject this script.
            all_edit_scripts[i].accept = False
            if bt(generated, i + 1, last_end, all_edit_scripts):
                return True

            # The script starts before last_end, i.e. it overlaps the previously
            # accepted script, so it cannot be accepted.
            if all_edit_scripts[i].edit_script.seq1Range.start < last_end:
                cc_with_es_intersects += 1
                # Original author's note: because the comparison is strict (<),
                # pseudo-conflicts can still be resolved.
                return False

            # Option 2: accept this script.
            start = all_edit_scripts[i].edit_script.seq2Range.start
            end = all_edit_scripts[i].edit_script.seq2Range.end
            if all_edit_scripts[i]._from == 'ours':
                curr_content = a_contents[start:end]
            else:
                curr_content = b_contents[start:end]
            all_edit_scripts[i].accept = True
            if bt(generated
                    + o_contents[last_end:all_edit_scripts[i].edit_script.seq1Range.start]
                    + curr_content,
                    i + 1,
                    all_edit_scripts[i].edit_script.seq1Range.end,
                    all_edit_scripts
                ):
                return True


            # Option 3: there is a next script that targets the same base range.
            if (
                i + 1 < len(all_edit_scripts) and
                all_edit_scripts[i].edit_script.seq1Range == all_edit_scripts[i + 1].edit_script.seq1Range
            ):
                start = all_edit_scripts[i + 1].edit_script.seq2Range.start
                end = all_edit_scripts[i + 1].edit_script.seq2Range.end
                if all_edit_scripts[i + 1]._from == 'ours':
                    next_content = a_contents[start:end]
                else:
                    next_content = b_contents[start:end]

                # The other concatenation order (an empty seq1Range means both
                # sides inserted at the same position).
                if len(all_edit_scripts[i].edit_script.seq1Range) == 0:
                    all_edit_scripts[i + 1].accept = True
                    if bt(generated
                            + o_contents[last_end:all_edit_scripts[i].edit_script.seq1Range.start]
                            + next_content
                            + curr_content,
                        i + 2,
                        all_edit_scripts[i].edit_script.seq1Range.end,
                        all_edit_scripts
                        ):
                        return True

            # Every option failed (made explicit instead of falling off the end).
            return False


        kind = chunk.label
        
        # 'newline' conflicts are skipped outright.
        if kind == 'newline':
            return False
            
        # Skip when any version has too many (non-blank) lines.
        if any([len(content) > 1000 for content in [a_contents, b_contents, o_contents, r_contents]]):
            too_many_lines_cnt += 1
            return False
        from_ours = compute(o_contents, a_contents)
        from_theirs = compute(o_contents, b_contents)
        # Tag every script with the side it came from.
        from_ours = [EditScriptLabel(sd, 'ours', False) for sd in from_ours]
        from_theirs = [EditScriptLabel(sd, 'theirs', False) for sd in from_theirs]
        all_edit_scripts = from_ours + from_theirs
        es_cnt[len(all_edit_scripts)] += 1
        
        
        # Cap the number of scripts to keep the backtracking tractable.
        if len(all_edit_scripts) > 20:
            return False

        all_edit_scripts.sort(key=lambda editScriptLabel: editScriptLabel.edit_script.seq1Range)

        if bt([], 0, 0, all_edit_scripts):  # this conflict is resolvable
            resolvable_cc_cnt += 1
            label_resolvable_cnt[kind] += 1
            # Record the accept/reject marks of the successful combination.
            for i, es in enumerate(all_edit_scripts):
                accept_mark_cnt[es.accept] += 1
            return True

        return False


    # Walk the dataset.
    jsonPaths = [path for path in ConflictFileCollector.getAllJsonsUnder(dir2analyze)]
    if len(jsonPaths) == 0:
        raise FileNotFoundError("No metadata json files found in the dataset path")
    for jsonPath in tqdm(jsonPaths[:1], desc="Processing files", position=0, leave=True, dynamic_ncols=True):
        try:
            with open(jsonPath, 'r') as f:
                cfs = json.load(f)
        except Exception as e:
            print(f"Error reading {jsonPath}: {e} (type: {type(e).__name__})")
            traceback.print_exc()
            # Without this, `cfs` below would be unbound (first file) or still
            # hold the previous file's data.
            continue
        # Shuffle the conflict files with a fixed seed.
        random.seed(43)
        random.shuffle(cfs)
        for cf in tqdm(cfs[:1000], desc=f"Process items", position=1, leave=False, dynamic_ncols=True):
            for cc in cf['conflict_chunks']:
                all_cc_cnt += 1
                label_cnt[cc['label']] += 1
                cc_obj = ConflictChunk(cc['m_start'], cc['m_end'], cc['a_content'], cc['b_content'], cc['o_content'], cc['r_content'], cc['label'], cc['chunk_idx'])

                if cc_obj.label == 'newline':
                # if cc_obj.label == 'mixline' and not cc_check(cc_obj):
                    chunks4debug.append(cc)
    
    # def print_res_to_file(file=os.sys.stdout):
    #     print(f'在 {dataset_name} 下统计结果:', file=file) 
    #     print(f'共有 {all_cc_cnt} 个冲突块，其中 {resolvable_cc_cnt} 个可以用编辑脚本解决，占比 {resolvable_cc_cnt / all_cc_cnt * 100:.2f}%', file=file)
    #     print(f'有 {cc_with_es_intersects} 个冲突块的编辑脚本有交集', file=file)
    #     print(f'有 {too_many_lines_cnt} 个冲突块的行数过大，无法处理', file=file)
    #     print(f'编辑脚本数量分布: {es_cnt}', file=file)
    #     print(f'接受标记分布: {accept_mark_cnt}', file=file)
    #     print(f'类型分布: {label_cnt}', file=file)
    #     print(f'可解决类型分布: {label_resolvable_cnt}', file=file)
    #     for k, v in label_cnt.items():
    #         print(f'{k}: {v}, 可解决: {label_resolvable_cnt[k]}，占比: {label_resolvable_cnt[k] / v * 100:.2f}%', file=file)

    # # 新建文件夹
    # os.makedirs(work_dir / 'data_collect_analysis' / 'bt_log', exist_ok=True)
    # print_res_to_file(file=open(work_dir / 'data_collect_analysis' / 'bt_log' / f'{dataset_name}.log', 'w'))

# Run the analysis on every configured dataset directory.
for dir2analyze in dirs:
    analyze_edit_script(dir2analyze)

在 100+_recollect 下统计


Processing files:   0%|          | 0/1 [00:00<?, ?it/s]

Process items:   0%|          | 0/1000 [00:00<?, ?it/s]

In [18]:
# Number of chunks gathered by analyze_edit_script().
print(len(chunks4debug))
import requests
# POST a hand-picked subset (items 0-7 and 15-19) as JSON to a service on localhost:3000.
# NOTE(review): presumably a local viewer for inspecting the chunks -- confirm.
requests.post('http://localhost:3000/api/versions', json=chunks4debug[:8] + chunks4debug[15:20])

# chunk = chunks4debug[0]
# print(chunk)

188


<Response [200]>

## 收集数据

统一将数据格式化为 conflictMap
```json
{
    "path": , // 文件相对路径
    "repo_url": , // 仓库地址
    "file_a_content": , // 文件 A 内容
    "file_b_content": , // 文件 B 内容
    "file_o_content": , // 文件 base 内容
    "file_r_content": , // 文件 Resolved 内容
    "file_m_content": , // 文件 Merged 内容
    "commit_hash": ,    // commit hash
    "conflict_chunks": [
        {
            "m_start": , // merge 起始行
            "m_end": , // merge 结束行
            "a_content": , // A 内容
            "b_content": , // B 内容
            "o_content": , // base 内容
            "r_content": , // resolved 内容
            "label": , // conflict 类型
            "chunk_idx": , // chunk 在文件中是第几个 chunk     // 有可能有的 chunk 没有 resolution
        }
    ]
    
}
```

In [None]:
# Input: directory searched recursively for conflictFilesMetadata.json files.
data_dir = "/root/projects/gitMergeScenario/collect_output/output/conflictFiles"
# Output: directory the converted JSON batches are written to.
output_dir = work_dir / "data_collect_analysis" / "output" / "100+stars_4GB-_multidev_org_lang"

class GraphQLFilteredRepoCollector(ConflictFileCollector):
    '''
    Collector for the dataset described as:
    100+ stars, non_fork, 10+devs, org, 4GB- repos on GitHub

    Every JSON file under the dataset path must be named
    conflictFilesMetadata.json and contain a list of per-file metadata entries.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)
    
    def collect(self):
        '''
        Yield one ConflictFile per metadata entry of every metadata file.

        Raises:
            FileNotFoundError: no JSON file exists under the dataset path.
            ValueError: a JSON file is not named conflictFilesMetadata.json.

        Any other error while reading or converting a metadata file is printed
        with its traceback; the entries of that file converted before the
        error are still yielded and processing continues with the next file.
        '''
        import traceback

        metadata_jsonPaths = [path for path in self.getAllJsonsUnder(self.dataset_path)]
        if len(metadata_jsonPaths) == 0:
            raise FileNotFoundError("No metadata json files found in the dataset path")

        for jsonPath in tqdm(metadata_jsonPaths):
            basename = os.path.basename(jsonPath)
            if basename != 'conflictFilesMetadata.json':
                raise ValueError("conflictFilesMetadata.json file name error")

            ret = []
            try:
                with open(jsonPath, 'r') as f:
                    metadata_list = json.load(f)
                for metadata in metadata_list:
                    # The source data records no repo URL (only the repo name).
                    repo_url = None
                    path = metadata['filePath']
                    conflictChunks = metadata['conflictChunks']
                    commit_hash = metadata['resolvedCommitHash']

                    # No trailing '\n' is appended here: per the original
                    # author's note the lines were produced with
                    # String.split('\n', -1), so joining restores the text.
                    a_content = '\n'.join(metadata['oursContent'])
                    b_content = '\n'.join(metadata['theirsContent'])
                    base_content = '\n'.join(metadata['baseContent'])
                    merged_content = '\n'.join(metadata['mergedContent'])
                    r_content = '\n'.join(metadata['resolvedContent'])

                    # Build the ConflictFile for THIS entry. This must stay
                    # inside the loop, otherwise only the last entry of each
                    # metadata file would be collected.
                    conflict_file = ConflictFile(path, repo_url, a_content, b_content, base_content, r_content, merged_content, commit_hash)
                    for chunk in conflictChunks:
                        # Skip chunks for which no resolution was found upstream.
                        if 'resolution' not in chunk or chunk['resolution'] is None:
                            continue
                        # chunk_idx is not available and is left as None.
                        # list2str appends the final '\n' because the chunk
                        # texts are stored as arrays of lines.
                        cc = ConflictChunk(
                                chunk['startLine'], 
                                chunk['endLine'], 
                                self.list2str(chunk['ours']), 
                                self.list2str(chunk['theirs']), 
                                self.list2str(chunk['base']), 
                                self.list2str(chunk['resolution']), 
                                None, None)
                        cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                        conflict_file.add_conflict_chunk(cc)
                    ret.append(conflict_file)
            except Exception as e:
                print(f"Error reading {jsonPath}: {e} (type: {type(e).__name__})")
                traceback.print_exc()  # print the full stack trace
            for conflict_file in ret:
                yield conflict_file

# Convert the dataset and write it to output_dir in JSON batches.
collector = GraphQLFilteredRepoCollector(data_dir)
collector.collect_and_save(output_dir)

# TODO: why do so many chunks have no mergedContent?

# The conflict-chunk ranges in the file-level merged content collected here are generated by
# jgit's formatter; one problem is that it excludes the lines common to A and B.
# A separate script can generate the conflict chunks on top of this without excluding
# the common lines (it uses git merge-file).

In [8]:
# Input dataset directory and output directory for the "2000repos" conversion.
data_dir = work_dir / "data" / "2000repos"
output_dir = work_dir / "data_collect_analysis" / "output" / "2000repos"

class MergeNatureRepoCollector(ConflictFileCollector):
    '''
    Converts the "2000 repos" dataset into the conflictMap format.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)

    def collect(self):
        '''
        Yield one ConflictFile per metadata.json found under the dataset path.

        Layout expected by the code: each metadata.json sits next to the files
        ours/theirs/base/conflict/resolve + <filetype>. Entries whose version
        files cannot be read are skipped silently.
        '''
        found = list(self.getAllJsonsUnder(self.dataset_path))
        if not found:
            raise FileNotFoundError("No metadata json files found in the dataset path")

        for jsonPath in tqdm(found):
            if os.path.basename(jsonPath) != 'metadata.json':
                raise ValueError("metadata.json file name error")

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            repo_url = None
            path = metadata['path']
            suffix = metadata['filetype']
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = metadata['commitID']

            # Sibling files holding side A, side B, base, merged and resolved text.
            folder = os.path.dirname(jsonPath)
            version_paths = [
                os.path.join(folder, stem + suffix)
                for stem in ('ours', 'theirs', 'base', 'conflict', 'resolve')
            ]

            texts = []
            try:
                for version_path in version_paths:
                    with open(version_path, 'r') as f:
                        texts.append(f.read())
            except Exception as e:
                # Some files do not exist; skip the whole entry.
                continue
            a_content, b_content, base_content, merged_content, r_content = texts

            conflict_file = ConflictFile(path, repo_url, a_content, b_content, base_content, r_content, merged_content, commit_hash)
            for chunk in conflict_chunks:
                # Chunks without a resolution are ignored.
                if chunk.get('resolve') is None:
                    continue
                # m_start, m_end and chunk_idx are not available; placeholders are used.
                cc = ConflictChunk(
                    -1, -1,
                    chunk['a_contents'], chunk['b_contents'],
                    chunk['base_contents'], chunk['resolve'],
                    None, None,
                )
                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                conflict_file.add_conflict_chunk(cc)
            yield conflict_file


# Convert the "2000repos" dataset and write it to output_dir in JSON batches.
collector = MergeNatureRepoCollector(data_dir)
collector.collect_and_save(output_dir)

  0%|          | 0/103013 [00:00<?, ?it/s]

Saving batch 0 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/0.json
Saving batch 1 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/1.json
Saving batch 2 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/2.json
Saving batch 3 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/3.json
Saving batch 4 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/4.json
Saving batch 5 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/5.json
Saving batch 6 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/6.json
Saving batch 7 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/7.json
Saving batch 8 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/2000repos/8.json
Saving batch 9 to /root/projects/dataset_collect_analysis/data_c

In [2]:
# Input dataset directory and output directory for the "top50" conversion.
data_dir = work_dir / "data" / "top50"
output_dir = work_dir / "data_collect_analysis" / "output" / "top50"

class MergeNatureRepoTop50Collector(ConflictFileCollector):
    '''
    Converts the top50/2000 repos dataset into the conflictMap format.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)
    
    def collect(self):
        '''
        Yield one ConflictFile per JSON metadata file under the dataset path.

        For a metadata file <id>_metadata.json the five versions are read from
        <id>_a, <id>_b, <id>_base, <id>_merged and <id>_resolved + <filetype>
        in the same directory. When any of them cannot be read, the metadata
        path and the error are printed and the entry is skipped.
        '''
        found = list(self.getAllJsonsUnder(self.dataset_path))
        if not found:
            raise FileNotFoundError("No metadata json files found in the dataset path")

        for jsonPath in tqdm(found):
            basename = os.path.basename(jsonPath)
            dirname = os.path.dirname(jsonPath)

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            # This dataset provides neither repo URL, file path nor commit hash.
            repo_url = None
            path = None
            suffix = metadata['filetype']
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = None

            # Derive the sibling file names from the metadata file name.
            version_paths = [
                os.path.join(dirname, basename.replace('_metadata.json', tag + suffix))
                for tag in ('_a', '_b', '_base', '_merged', '_resolved')
            ]

            texts = []
            try:
                for version_path in version_paths:
                    with open(version_path, 'r') as f:
                        texts.append(f.read())
            except Exception as e:
                print(jsonPath)
                print(e)
                continue
            a_content, b_content, base_content, merged_content, resolved_content = texts

            conflict_file = ConflictFile(path, repo_url, a_content, b_content, base_content, resolved_content, merged_content, commit_hash)
            for chunk in conflict_chunks:
                # Chunks without a resolution are ignored.
                if chunk.get('resolve') is None:
                    continue
                # m_start, m_end and chunk_idx are not available; placeholders are used.
                cc = ConflictChunk(
                    -1, -1,
                    chunk['a_contents'], chunk['b_contents'],
                    chunk['base_contents'], chunk['resolve'],
                    None, None,
                )
                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                conflict_file.add_conflict_chunk(cc)
            yield conflict_file

# Convert the "top50" dataset and write it to output_dir in JSON batches.
collector = MergeNatureRepoTop50Collector(data_dir)
collector.collect_and_save(output_dir)

  0%|          | 0/26369 [00:00<?, ?it/s]

/root/projects/dataset_collect_analysis/data/top50/alg-vis/218_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/105_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/78_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/104_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/103_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/77_metadata.json
'utf-8' codec can't decode byte 0xe1 in position 112: invalid continuation byte
/root/projects/dataset_collect_analysis/data/top50/alg-vis/2

In [3]:
# Alternative configuration (TypeScript subset only), kept for reference:
# data_dir = work_dir / "data" / "mergebert_data" / "automated-analysis-data" / "TypeScript"
# output_dir = work_dir / "data_collect_analysis" / "output" / "mergebert_ts"

# Active configuration: the whole automated-analysis-data directory.
data_dir = work_dir / "data" / "mergebert_data" / "automated-analysis-data"
output_dir = work_dir / "data_collect_analysis" / "output" / "mergebert_all_lang"

class MergeBERTConflictFileCollector(ConflictFileCollector):
    '''
    Converts the MergeBERT dataset into the conflictMap format.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)
    
    def collect(self):
        '''
        Yield one ConflictFile per JSON metadata file under the dataset path.

        For a metadata file <id>_metadata.json the five versions are read from
        <id>_a.<ext>, <id>_b.<ext>, <id>_base.<ext>, <id>_merged.<ext> and
        <id>_resolved.<ext>, where <ext> is the text after the last '.' of the
        metadata's 'fname'. Chunks whose 'res_region' is None are counted and
        skipped; the totals are printed once iteration is exhausted.
        '''
        chunk_cnt = 0
        chunk_no_r_cnt = 0

        found = list(self.getAllJsonsUnder(self.dataset_path))
        if not found:
            raise FileNotFoundError("No metadata json files found in the dataset path")

        for jsonPath in tqdm(found):
            basename = os.path.basename(jsonPath)
            dirname = os.path.dirname(jsonPath)

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            repo_url = metadata['repo']
            path = metadata['fname']
            suffix = path.split('.')[-1]
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = metadata['commitHash']

            # Derive the sibling file names from the metadata file name.
            version_paths = [
                os.path.join(dirname, basename.replace('_metadata.json', tag + suffix))
                for tag in ('_a.', '_b.', '_base.', '_merged.', '_resolved.')
            ]

            # Unlike the sibling collectors, read errors are not caught here.
            texts = []
            for version_path in version_paths:
                with open(version_path, 'r') as f:
                    texts.append(f.read())
            a_content, b_content, base_content, merged_content, resolved_content = texts

            conflict_file = ConflictFile(path, repo_url, a_content, b_content, base_content, resolved_content, merged_content, commit_hash)
            for chunk in conflict_chunks:
                chunk_cnt += 1
                if chunk['res_region'] is None:
                    chunk_no_r_cnt += 1
                    continue
                # m_start, m_end and chunk_idx are not available; placeholders are used.
                cc = ConflictChunk(
                    -1, -1,
                    chunk['a_contents'], chunk['b_contents'],
                    chunk['base_contents'], chunk['res_region'],
                    None, None,
                )
                # Keep the dataset's own label next to the one computed below.
                # Values listed by the original author: 'A', 'AB', 'B', 'BA',
                # 'BASE', None, 'OTHER', 'REM-BASE-A', 'REM-BASE-AB',
                # 'REM-BASE-B', 'REM-BASE-BA', 'RES_EMPTY', 'RES_FILE_EMPTY'.
                cc.mergebert_label = chunk.get('label', None) # type: ignore

                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)

                conflict_file.add_conflict_chunk(cc)
            yield conflict_file
        print(f"Total chunk count: {chunk_cnt}, chunk without r: {chunk_no_r_cnt}")

# Convert the MergeBERT dataset and write it to output_dir in JSON batches.
collector = MergeBERTConflictFileCollector(data_dir)
collector.collect_and_save(output_dir)

  0%|          | 0/48785 [00:00<?, ?it/s]

Saving batch 0 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/mergebert_all_lang/0.json
Saving batch 1 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/mergebert_all_lang/1.json
Saving batch 2 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/mergebert_all_lang/2.json
Saving batch 3 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/mergebert_all_lang/3.json
Total chunk count: 193870, chunk without r: 42444
Saving batch 4 to /root/projects/dataset_collect_analysis/data_collect_analysis/output/mergebert_all_lang/4.json


## 分析冲突块的类型分布

In [None]:
# Read every JSON file under the directory and count the label distribution of the
# ConflictChunks inside the ConflictFiles.
dir2analyze = work_dir / "data_collect_analysis" / "output" / "100+_recollect"
# dir2analyze = work_dir / "data_collect_analysis" / "output" / "mergebert_ts"

# 输入存放 ConflictFiles 的目录，输出 类型分布 map，同时绘制饼图
def analyze_label_distribution(dir2analyze):
    """Count conflict-chunk labels under ``dir2analyze`` and plot them as a pie chart.

    Every JSON file returned by ``ConflictFileCollector.getAllJsonsUnder`` is
    expected to hold a list of conflict-file dicts, each with a
    ``conflict_chunks`` list. Chunks without a ``label`` key are ignored.

    Args:
        dir2analyze: Directory that contains the conflict-file JSON batches.

    Returns:
        A ``defaultdict(int)`` mapping each label to its number of chunks.

    Raises:
        FileNotFoundError: If no JSON file is found under ``dir2analyze``.
    """
    import traceback

    counts = defaultdict(int)
    json_files = list(ConflictFileCollector.getAllJsonsUnder(dir2analyze))
    if not json_files:
        raise FileNotFoundError("No metadata json files found in the dataset path")

    for json_file in tqdm(json_files, position=0, leave=True, dynamic_ncols=True):
        with open(json_file, 'r') as handle:
            try:
                conflict_files = tqdm(json.load(handle), position=1, leave=False, dynamic_ncols=True)
                for conflict_file in conflict_files:
                    # Temporary workaround: because of a collector bug some chunks
                    # whose resolution was never found were stored without a
                    # label; leave them out of the statistics.
                    labelled = (
                        chunk['label']
                        for chunk in conflict_file['conflict_chunks']
                        if 'label' in chunk
                    )
                    for label in labelled:
                        counts[label] += 1
            except Exception as e:
                # Best effort: report the broken file and move on to the next one.
                print(f"Error reading {json_file}: {e} (type: {type(e).__name__})")
                traceback.print_exc()

    import plotly.graph_objects as go
    from pprint import pprint

    # Pie chart with one slice per label.
    pie = go.Pie(labels=list(counts), values=list(counts.values()))
    figure = go.Figure(data=[pie])
    figure.update_layout(title_text="各类型冲突占比", width=600, height=400)
    figure.show()

    pprint(counts)
    return counts


# Load the label distribution from the JSON files under `dir2analyze`.
label_cnt = analyze_label_distribution(dir2analyze)

  0%|          | 0/68 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

{'commit_hash': '0b668ee7880f495959846d98e8972d392cec65ae',
 'conflict_chunks': [{'a_content': '#define '
                                   'EXPECTED_MANGOS_CLIENT_BUILD        '
                                   '{11159, 0}\n',
                      'b_content': '#define EXPECTED_REALMD_CLIENT_BUILD    '
                                   '\\\n'
                                   '{                                       '
                                   '\\\n'
                                   '    10505,  /* 3.2.2a and higher */     '
                                   '\\\n'
                                   '    8606,   /* 2.4.3  */                '
                                   '\\\n'
                                   '    6005,   /* 1.12.2 */                '
                                   '\\\n'
                                   '    5875,   /* 1.12.1 */                '
                                   '\\\n'
                                   '    0        

## 回溯分析解决上界

In [None]:
# Dataset output directories that the backtracking analysis below iterates over.
dirs = [
    work_dir / "data_collect_analysis" / "output" / "mergebert_ts",
    work_dir / "data_collect_analysis" / "output" / "mergebert_all_lang",
    work_dir / "data_collect_analysis" / "output" / "100+stars_4GB-_multidev_org_lang",
    # Currently disabled datasets:
    # work_dir / "data_collect_analysis" / "output" / "2000repos",
    # work_dir / "data_collect_analysis" / "output" / "top50",
]



from util.edit_script import compute, SequenceDiff

class EditScriptLabel:
    """An edit script tagged with the side it came from and an accept mark.

    Attributes are plain and mutable; the backtracking search flips ``accept``
    while it explores accept/reject combinations.
    """

    def __init__(self, sd: SequenceDiff, _from: str, accept: bool):
        # edit_script: the wrapped diff entry
        # _from:       origin tag; callers in this file pass 'ours' or 'theirs'
        # accept:      whether the script is currently selected
        self.edit_script, self._from, self.accept = sd, _from, accept

def analyze_edit_script(dir2analyze):
    """Measure how many conflict chunks can be resolved by combining edit scripts.

    For every conflict chunk stored in the JSON files under ``dir2analyze`` the
    edit scripts base->ours and base->theirs are obtained from ``compute`` and a
    backtracking search tries accept/reject combinations (including both
    concatenation orders for scripts that touch the same base range). A chunk
    counts as resolvable when one combination reproduces the recorded
    resolution, ignoring blank lines and whitespace differences.

    The statistics are written to
    ``<work_dir>/data_collect_analysis/bt_log/<dataset name>.log``.

    Args:
        dir2analyze: Directory that contains the conflict-file JSON batches.

    Raises:
        FileNotFoundError: If no JSON file is found under ``dir2analyze``.
    """
    dataset_name = os.path.basename(dir2analyze)
    print(f'在 {dataset_name} 下统计')
    accept_mark_cnt = defaultdict(int)
    es_cnt = defaultdict(int)
    cc_with_es_intersects = 0
    resolvable_cc_cnt = 0
    all_cc_cnt = 0
    too_many_lines_cnt = 0
    label_cnt = defaultdict(int)
    label_resolvable_cnt = defaultdict(int)

    def cc_check(chunk: ConflictChunk) -> None:
        '''
        Check one conflict chunk and update the statistics.

        Counts the chunks that can be resolved with edit scripts and how many
        scripts were accepted / rejected in the solution that was found.
        Records the number of edit scripts and skips the search when there are
        too many. The final comparison is whitespace-insensitive; when the edit
        scripts are generated, blank lines and indentation are stripped.
        '''
        nonlocal cc_with_es_intersects
        nonlocal resolvable_cc_cnt
        nonlocal too_many_lines_cnt

        def es_gen_str2list(content: str) -> List[str]:
            '''
            Pre-processing for edit-script generation: split into lines,
            strip each line and drop the empty ones.
            '''
            return [line.strip() for line in content.split('\n') if line.strip() != '']

        # Skip chunks whose raw text is very long (measured in characters).
        if len(chunk.a_content) > 5000 or len(chunk.b_content) > 5000 or len(chunk.o_content) > 5000 or len(chunk.r_content) > 5000:
            too_many_lines_cnt += 1
            return

        a_contents = es_gen_str2list(chunk.a_content)
        b_contents = es_gen_str2list(chunk.b_content)
        o_contents = es_gen_str2list(chunk.o_content)
        r_contents = es_gen_str2list(chunk.r_content)

        # Set by bt() as soon as one overlapping pair of scripts is met, so the
        # chunk is counted once no matter how often the search hits the overlap.
        has_intersect = False

        def compareInToken(a_ls: List[str], b_ls: List[str]) -> bool:
            '''
            Final comparison that ignores whitespace differences.
            '''
            def toUnifiedStr(ls: List[str]) -> str:
                return '' if ls == [] or ls == [''] else re.sub(r'\s+', ' ', '\n'.join(ls).strip() + '\n')
            return toUnifiedStr(a_ls) == toUnifiedStr(b_ls)

        def side_lines(es: EditScriptLabel) -> List[str]:
            '''
            Lines that `es` inserts, taken from the side it originates from.
            '''
            start = es.edit_script.seq2Range.start
            end = es.edit_script.seq2Range.end
            source = a_contents if es._from == 'ours' else b_contents
            return source[start:end]

        def bt(generated, i, last_end, all_edit_scripts: List[EditScriptLabel]) -> bool:
            '''
            Backtracking over accept/reject decisions for scripts i, i+1, ...

            `generated` holds the lines produced so far and `last_end` is the
            base index up to which the base has been consumed. Returns True as
            soon as one combination equals the resolution.
            '''
            nonlocal has_intersect
            # All scripts decided: append the rest of the base and compare.
            if i == len(all_edit_scripts):
                whole_generated = generated + o_contents[last_end:]
                return compareInToken(whole_generated, r_contents)

            current = all_edit_scripts[i]
            base_range = current.edit_script.seq1Range

            # Option 1: reject this script.
            current.accept = False
            if bt(generated, i + 1, last_end, all_edit_scripts):
                return True

            # A script starting before `last_end` overlaps the previously
            # accepted one and cannot be accepted as well. The comparison is
            # strict, so scripts that merely touch (pseudo conflicts) are fine.
            if base_range.start < last_end:
                has_intersect = True
                return False

            # Option 2: accept this script.
            curr_content = side_lines(current)
            prefix = generated + o_contents[last_end:base_range.start]
            current.accept = True
            if bt(prefix + curr_content, i + 1, base_range.end, all_edit_scripts):
                return True

            # Option 3: the next script targets exactly the same base range, so
            # both can be accepted together by concatenating their contents.
            if (
                i + 1 < len(all_edit_scripts) and
                base_range == all_edit_scripts[i + 1].edit_script.seq1Range
            ):
                following = all_edit_scripts[i + 1]
                next_content = side_lines(following)

                # Empty base range (both sides insert at the same position):
                # curr + next is already reachable through option 2, so only
                # the swapped order has to be added here.
                following.accept = True
                if bt(prefix + next_content + curr_content, i + 2, base_range.end, all_edit_scripts):
                    return True
                # Non-empty base range: option 2 cannot reach curr + next (the
                # next script would overlap), so try that order explicitly too.
                if len(base_range) > 0:
                    following.accept = True
                    if bt(prefix + curr_content + next_content, i + 2, base_range.end, all_edit_scripts):
                        return True

            return False

        # Start collecting statistics for this chunk.
        kind = chunk.label

        # 'newline' conflicts are skipped entirely.
        if kind == 'newline':
            return

        # Skip chunks with too many (non-empty) lines.
        if any(len(content) > 1000 for content in (a_contents, b_contents, o_contents, r_contents)):
            too_many_lines_cnt += 1
            return
        # Tag every script with the side it came from.
        from_ours = [EditScriptLabel(sd, 'ours', False) for sd in compute(o_contents, a_contents)]
        from_theirs = [EditScriptLabel(sd, 'theirs', False) for sd in compute(o_contents, b_contents)]
        all_edit_scripts = from_ours + from_theirs
        es_cnt[len(all_edit_scripts)] += 1

        # Cap the number of scripts to keep the search tractable.
        if len(all_edit_scripts) > 20:
            return

        all_edit_scripts.sort(key=lambda editScriptLabel: editScriptLabel.edit_script.seq1Range)

        solved = bt([], 0, 0, all_edit_scripts)
        if has_intersect:
            cc_with_es_intersects += 1
        if solved:  # this conflict can be resolved
            resolvable_cc_cnt += 1
            label_resolvable_cnt[kind] += 1
            # Record the accept marks of the solution that was found.
            for es in all_edit_scripts:
                accept_mark_cnt[es.accept] += 1

    # Walk over the dataset.
    jsonPaths = list(ConflictFileCollector.getAllJsonsUnder(dir2analyze))
    if not jsonPaths:
        raise FileNotFoundError("No metadata json files found in the dataset path")
    for jsonPath in tqdm(jsonPaths, desc="Processing files", position=0, leave=True, dynamic_ncols=True):
        try:
            with open(jsonPath, 'r') as f:
                cfs = json.load(f)
        except Exception as e:
            print(f"Error reading {jsonPath}: {e} (type: {type(e).__name__})")
            import traceback
            traceback.print_exc()
            # Without this, `cfs` would be undefined (first file) or still hold
            # the previous file's data, which would then be counted twice.
            continue
        for cf in tqdm(cfs, desc="Process items", position=1, leave=False, dynamic_ncols=True):
            for raw_chunk in cf['conflict_chunks']:
                # Chunks stored without a label (no resolution found) are
                # ignored, as analyze_label_distribution does.
                if 'label' not in raw_chunk:
                    continue
                all_cc_cnt += 1
                label_cnt[raw_chunk['label']] += 1
                chunk = ConflictChunk(
                    raw_chunk['m_start'], raw_chunk['m_end'],
                    raw_chunk['a_content'], raw_chunk['b_content'],
                    raw_chunk['o_content'], raw_chunk['r_content'],
                    raw_chunk['label'],
                    # ConflictChunk.to_dict does not emit chunk_idx, so it may be absent.
                    raw_chunk.get('chunk_idx'),
                )
                cc_check(chunk)

    def print_res_to_file(file=None):
        '''
        Print the collected statistics to `file` (stdout when None).
        '''
        # Guard against an empty dataset instead of raising ZeroDivisionError.
        resolvable_pct = resolvable_cc_cnt / all_cc_cnt * 100 if all_cc_cnt else 0.0
        print(f'在 {dataset_name} 下统计结果:', file=file)
        print(f'共有 {all_cc_cnt} 个冲突块，其中 {resolvable_cc_cnt} 个可以用编辑脚本解决，占比 {resolvable_pct:.2f}%', file=file)
        print(f'有 {cc_with_es_intersects} 个冲突块的编辑脚本有交集', file=file)
        print(f'有 {too_many_lines_cnt} 个冲突块的行数过大，无法处理', file=file)
        print(f'编辑脚本数量分布: {es_cnt}', file=file)
        print(f'接受标记分布: {accept_mark_cnt}', file=file)
        print(f'类型分布: {label_cnt}', file=file)
        print(f'可解决类型分布: {label_resolvable_cnt}', file=file)
        for k, v in label_cnt.items():
            solved_cnt = label_resolvable_cnt.get(k, 0)
            print(f'{k}: {v}, 可解决: {solved_cnt}，占比: {solved_cnt / v * 100:.2f}%', file=file)

    # Make sure the log directory exists, then write the report and close the
    # file deterministically.
    log_dir = work_dir / 'data_collect_analysis' / 'bt_log'
    os.makedirs(log_dir, exist_ok=True)
    with open(log_dir / f'{dataset_name}.log', 'w') as log_file:
        print_res_to_file(file=log_file)

# Run the backtracking analysis once for every selected dataset directory.
for dir2analyze in dirs:
    analyze_edit_script(dir2analyze)


In [25]:
# Read a log file and plot its results.
log_path = work_dir / "data_collect_analysis" / "bt_log" / "100+stars_4GB-_multidev_org_lang.log"
out_dir = work_dir / "data_collect_analysis" / "bt_log"
# Example of the two log lines that are parsed below:
# 类型分布: defaultdict(<class 'int'>, {'mixline': 4715, 'AB': 1485, 'B': 1590, 'A': 1969, 'newline': 3063, 'BA': 426, 'same modification, formatting maybe different': 68, 'O': 188})
# 可解决类型分布: defaultdict(<class 'int'>, {'AB': 1382, 'B': 1558, 'A': 1932, 'mixline': 3452, 'BA': 391, 'same modification, formatting maybe different': 67, 'O': 188})
# Dataset name = log file name up to its first dot.
dataset_name = log_path.stem.split('.')[0]

# Takes the label distribution and the resolvable-label distribution and draws a bar chart.
def paint_bt_result(kind_counter, kind_resolvable, out_dir, dataset_name):
    """Draw resolvable vs. unresolvable chunk counts per label and save as HTML.

    Args:
        kind_counter: Mapping label -> total number of chunks.
        kind_resolvable: Mapping label -> number of resolvable chunks; labels
            missing here count as 0.
        out_dir: Directory (path object) the HTML file is written into.
        dataset_name: Used in the chart title and the output file name.
    """
    import plotly.graph_objects as go

    labels = list(kind_counter.keys())
    resolvable = [kind_resolvable.get(label, 0) for label in labels]
    non_resolvable = [
        kind_counter[label] - solved for label, solved in zip(labels, resolvable)
    ]

    solved_bar = go.Bar(x=labels, y=resolvable, name='可解决')
    # The unresolved part is drawn on top of the resolved part.
    unsolved_bar = go.Bar(x=labels, y=non_resolvable, name='无法解决', base=resolvable)

    fig = go.Figure(data=[solved_bar, unsolved_bar])
    fig.update_layout(
        barmode='stack',
        title=f'{dataset_name} 回溯上界统计',
        xaxis_title='冲突类型',
        yaxis_title='数量',
    )

    # Persist the chart as a standalone HTML file.
    fig.write_html(out_dir / f'{dataset_name}_bt_result.html')

def read_bt_log(log_path):
    """Parse the label distributions out of a backtracking log file.

    The log is expected to contain one line starting with
    ``类型分布: defaultdict(<class 'int'>, {...})`` and one starting with
    ``可解决类型分布: defaultdict(<class 'int'>, {...})``. If a line occurs
    several times, the last occurrence wins.

    Args:
        log_path: Path of the log file. It is opened with the platform default
            encoding.

    Returns:
        Tuple ``(kind_counter, kind_resolvable)`` of plain dicts mapping a
        label to its count.

    Raises:
        ValueError: If one of the two lines is missing from the log.
    """
    # Local import: literal_eval only accepts Python literals, so nothing in the
    # log file can execute code the way eval() would.
    import ast

    def parse_counts(line, key):
        # Return the dict embedded in `line`, or None if the line is not the
        # `key` entry.
        prefix = f"{key}: defaultdict(<class 'int'>, "
        if not line.startswith(prefix):
            return None
        payload = line.rstrip()[len(prefix):]
        # Drop the ')' closing the defaultdict repr; works with or without a
        # trailing newline on the line.
        if payload.endswith(')'):
            payload = payload[:-1]
        return ast.literal_eval(payload)

    kind_counter = None
    kind_resolvable = None
    with open(log_path, 'r') as f:
        for line in f:
            parsed = parse_counts(line, '类型分布')
            if parsed is not None:
                kind_counter = parsed
                continue
            parsed = parse_counts(line, '可解决类型分布')
            if parsed is not None:
                kind_resolvable = parsed

    if kind_counter is None or kind_resolvable is None:
        raise ValueError(f"Label distribution lines not found in {log_path}")
    return kind_counter, kind_resolvable

# Parse the two label distributions from the log and save the bar chart into `out_dir`.
kind_counter, kind_resolvable = read_bt_log(log_path)
paint_bt_result(kind_counter, kind_resolvable, out_dir, dataset_name)