## 收集数据

In [5]:
from IPython import get_ipython
import os
from pathlib import Path
# Directory the IPython session was started from
script_dir = get_ipython().starting_dir
# Change the working directory to its parent
os.chdir(Path(script_dir) / '..')
print(os.getcwd())
from collections import defaultdict
import json
from util.conflict_util import Conflict, conflict2file
from tqdm import tqdm
from util.edit_script import compute
work_dir = Path(os.getcwd())

/Users/foril/projects/conflict_resolve/my_work/dataset_collect_analysis_script


统一将数据格式化为 conflictMap
```json
{
    "path": , // 文件相对路径
    "repo_url": , // 仓库地址
    "file_a_content": , // 文件 A 内容
    "file_b_content": , // 文件 B 内容
    "file_o_content": , // 文件 base 内容
    "file_r_content": , // 文件 Resolved 内容
    "file_m_content": , // 文件 Merged 内容
    "commitHash": ,     // commit hash
    "conflict_chunks": [
        {
            "m_start": , // merge 起始行
            "m_end": , // merge 结束行
            "a_content": , // A 内容
            "b_content": , // B 内容
            "o_content": , // base 内容
            "r_content": , // resolved 内容
            "label": , // conflict 类型
            "chunk_idx": , // chunk 在文件中是第几个 chunk     // 有可能有的 chunk 没有 resolution
        }
    ]
    
}
```

In [None]:
import re

class ConflictChunk:
    def __init__(self, m_start, m_end, a_content, b_content, 
                 o_content, r_content, label: str | None, chunk_idx):
        self.m_start = m_start
        self.m_end = m_end
        self.a_content = a_content
        self.b_content = b_content
        self.o_content = o_content
        self.r_content = r_content
        self.label = label
        self.chunk_idx = chunk_idx

    def to_dict(self):
        return {
            "m_start": self.m_start,
            "m_end": self.m_end,
            "a_content": self.a_content,
            "b_content": self.b_content,
            "o_content": self.o_content,
            "r_content": self.r_content,
            "label": self.label,
        }
    
    def getJSONstr(self):
        return json.dumps(self, default=lambda o: o.__dict__, indent=4)


class ConflictFile:
    """A conflicting file: its five versions plus the conflict chunks found in it.

    The ``file_*_content`` attributes hold, in constructor order, side A,
    side B, the base (``o``), the resolved (``r``) and the merged (``m``)
    version of the file.
    """

    def __init__(self, path, repo_url, file_a_content, file_b_content, file_o_content, file_r_content, file_m_content, commit_hash):
        self.path = path
        self.repo_url = repo_url
        self.file_a_content = file_a_content
        self.file_b_content = file_b_content
        self.file_o_content = file_o_content
        self.file_r_content = file_r_content
        self.file_m_content = file_m_content
        self.commit_hash = commit_hash
        self.conflict_chunks = []

    def add_conflict_chunk(self, conflict_chunk_obj):
        """Append one chunk object to ``conflict_chunks``."""
        self.conflict_chunks.append(conflict_chunk_obj)

    def to_dict(self):
        """Return the file as a plain dict; chunks are converted with their ``to_dict``."""
        return {
            "path": self.path,
            "repo_url": self.repo_url,
            "file_a_content": self.file_a_content,
            "file_b_content": self.file_b_content,
            "file_o_content": self.file_o_content,
            "file_r_content": self.file_r_content,
            "file_m_content": self.file_m_content,
            # Listed as "commitHash" in the conflictMap schema; it was missing from the dict.
            "commitHash": self.commit_hash,
            "conflict_chunks": [chunk.to_dict() for chunk in self.conflict_chunks],
        }

    def getJSONstr(self):
        """Serialize every instance attribute (``__dict__``, recursively) as indented JSON."""
        return json.dumps(self, default=lambda o: o.__dict__, indent=4)
    
class ConflictFileCollector:
    """Base class for dataset-specific collectors.

    Subclasses implement ``collect``; the static helpers here normalise chunk
    text, derive a resolution label and enumerate JSON files.
    """

    def __init__(self, dataset_path):
        self.dataset_path = dataset_path

    def collect(self):
        """Build the collected objects for the dataset; must be overridden."""
        raise NotImplementedError

    @staticmethod
    def preprocessContent(content: str):
        """Collapse every whitespace run into one space.

        Whitespace-only input yields ``''``.  Otherwise the stripped text gets a
        newline appended before collapsing, so the result ends with a single
        space and two processed texts can be concatenated directly.
        """
        stripped = content.strip()
        if stripped == '':
            return ''
        return re.sub(r'\s+', ' ', stripped + '\n')

    @staticmethod
    def getLabel(a, b, o, r):
        """Classify resolution ``r`` relative to side ``a``, side ``b`` and base ``o``.

        Returns one of ``"same modification, formatting maybe different"``,
        ``"A"``, ``"B"``, ``"O"``, ``"AB"``, ``"BA"``, ``"newline"`` (the
        resolution contains a non-blank line found in none of the inputs) or
        ``"mixline"`` (every non-blank resolution line comes from an input).
        """
        normalize = ConflictFileCollector.preprocessContent
        r_processed = normalize(r)
        a_processed = normalize(a)
        b_processed = normalize(b)
        o_processed = normalize(o)
        if a_processed == b_processed:
            return "same modification, formatting maybe different"
        if r_processed == a_processed:
            return "A"
        if r_processed == b_processed:
            return "B"
        if r_processed == o_processed:
            return "O"
        if r_processed == a_processed + b_processed:
            return "AB"
        if r_processed == b_processed + a_processed:
            return "BA"

        known_lines = set(a.split('\n')) | set(b.split('\n')) | set(o.split('\n'))
        for rl in set(r.split('\n')):
            # ``rl.strip()`` skips empty lines as well as whitespace-only ones;
            # ``str.isspace()`` is False for '' and used to let an empty line
            # (e.g. the tail of a trailing '\n') count as newly written code.
            if rl.strip() and rl not in known_lines:
                return 'newline'
        return 'mixline'

    @staticmethod
    def getAllJsonsUnder(dirPath: str):
        """Yield the path of every ``*.json`` file found by walking ``dirPath``."""
        for root, _, files in os.walk(dirPath):
            for file in files:
                if file.endswith(".json"):
                    yield os.path.join(root, file)


In [None]:
# Input directory and output path for the "100+stars_4GB-_multidev_org_lang" dataset
data_dir = work_dir / "data" / "100+stars_4GB-_multidev_org_lang"
output_file = work_dir / "output" / "100+stars_4GB-_multidev_org_lang.csv.json"

class GraphQLFilteredRepoCollector(ConflictFileCollector):
    '''
    Collector for the "100+ stars, non_fork, 10+devs, org, 4GB- repos on GitHub" dataset.

    Every ``conflictFilesMetadata.json`` under ``dataset_path`` becomes one
    ``ConflictFile``; each chunk that carries a resolution becomes a labelled
    ``ConflictChunk``.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)

    def collect(self):
        """Return a list of ``ConflictFile`` objects built from the metadata files.

        Raises:
            FileNotFoundError: no JSON file exists under ``dataset_path``.
            ValueError: a JSON file is not named ``conflictFilesMetadata.json``.
        """
        # 1. Find every JSON file: /xxx/conflictFiles/hash/conflictFilesMetadata.json
        # 2. Read it and build a ConflictFile object
        ret = []

        metadata_jsonPaths = list(self.getAllJsonsUnder(self.dataset_path))
        if not metadata_jsonPaths:
            raise FileNotFoundError("No metadata json files found in the dataset path")
        for jsonPath in tqdm(metadata_jsonPaths):
            if os.path.basename(jsonPath) != 'conflictFilesMetadata.json':
                raise ValueError("conflictFilesMetadata.json file name error")

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            repo_url = None     # the metadata records only the repo name, no URL or author/repoName
            path = metadata['filePath']
            conflictChunks = metadata['conflictChunks']
            commit_hash = metadata['resolvedCommitHash']

            # No trailing '\n' is appended: per the dataset author the line lists
            # were produced with String.split('\n', -1), so joining restores the text.
            a_content = '\n'.join(metadata['oursContent'])
            b_content = '\n'.join(metadata['theirsContent'])
            base_content = '\n'.join(metadata['baseContent'])
            merged_content = '\n'.join(metadata['mergedContent'])

            # Keyword arguments keep each version in its own slot.  Previously the
            # merged text landed in file_r_content and the base text in file_m_content.
            # NOTE(review): the resolved file content is not loaded here, hence None.
            conflict_file = ConflictFile(
                path, repo_url, a_content, b_content, base_content,
                file_r_content=None,
                file_m_content=merged_content,
                commit_hash=commit_hash,
            )
            for chunk in conflictChunks:
                # Chunks for which no resolution was found are skipped.
                if chunk.get('resolution') is None:
                    continue
                # chunk_idx is not available yet.  The chunk sides are line lists, so a
                # final '\n' is added to match the resolution, which is extracted as lines.
                cc = ConflictChunk(chunk['startLine'], chunk['endLine'],
                                   '\n'.join(chunk['ours']) + '\n',
                                   '\n'.join(chunk['theirs']) + '\n',
                                   '\n'.join(chunk['base']) + '\n',
                                   chunk['resolution'], None, None)
                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                conflict_file.add_conflict_chunk(cc)
            ret.append(conflict_file)
        return ret
    

# Run the collector over data_dir and report how many conflict files were built
collector = GraphQLFilteredRepoCollector(data_dir)
conflict_files = collector.collect()
print("Total conflict files: ", len(conflict_files))

In [9]:
# Input directory and output path for the "2000repos" dataset
data_dir = work_dir / "data" / "2000repos"
output_file = work_dir / "output" / "2000repos.json"

class MergeNatureRepoCollector(ConflictFileCollector):
    '''
    Converts the "2000 repos" dataset into conflictMap objects.

    Each ``metadata.json`` under ``dataset_path`` is combined with the
    ``ours``/``theirs``/``base``/``conflict`` files next to it.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)

    def collect(self):
        """Return a list of ``ConflictFile`` objects; entries with unreadable files are skipped.

        Raises:
            FileNotFoundError: no JSON file exists under ``dataset_path``.
            ValueError: a JSON file is not named ``metadata.json``.
        """
        # 1. Find every JSON file: /xxx/repo_name/hash/relativePath/filename/metadata.json
        # 2. Locate ours.xxx theirs.xxx base.xxx conflict.xxx in the same directory
        # 3. Read metadata.json for the file type, commit id and conflict chunks
        ret = []

        metadata_jsonPaths = list(self.getAllJsonsUnder(self.dataset_path))
        if not metadata_jsonPaths:
            raise FileNotFoundError("No metadata json files found in the dataset path")
        # Use the collector's own dataset path rather than the notebook-global
        # ``data_dir``, which later cells reassign.
        root_len = len(str(self.dataset_path))
        for jsonPath in tqdm(metadata_jsonPaths):
            if os.path.basename(jsonPath) != 'metadata.json':
                raise ValueError("metadata.json file name error")
            dirname = os.path.dirname(jsonPath)

            # Path components after the dataset root, minus the first three and the file name.
            relativePath = os.sep.join(str(jsonPath)[root_len:].split(os.sep)[3:-1])

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            repo_url = None     # not recorded in this dataset
            path = relativePath
            suffix = metadata['filetype']
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = metadata['commitID']

            # Versions of the file stored next to the metadata.
            a_path = os.path.join(dirname, 'ours' + suffix)
            b_path = os.path.join(dirname, 'theirs' + suffix)
            base_path = os.path.join(dirname, 'base' + suffix)
            merged_path = os.path.join(dirname, 'conflict' + suffix)

            try:
                with open(a_path, 'r') as f:
                    a_content = f.read()
                with open(b_path, 'r') as f:
                    b_content = f.read()
                with open(base_path, 'r') as f:
                    base_content = f.read()
                with open(merged_path, 'r') as f:
                    merged_content = f.read()
            except (OSError, UnicodeError):
                # Some files are missing (or cannot be decoded); skip the entry.
                continue

            # Keyword arguments keep each version in its own slot.  Previously the
            # merged text landed in file_r_content and the base text in file_m_content.
            # NOTE(review): the resolved file is never read here, hence None.
            conflict_file = ConflictFile(
                path, repo_url, a_content, b_content, base_content,
                file_r_content=None,
                file_m_content=merged_content,
                commit_hash=commit_hash,
            )
            for chunk in conflict_chunks:
                if chunk.get('resolve') is None:
                    continue
                # m_start, m_end and chunk_idx are not available yet.
                cc = ConflictChunk(-1, -1, chunk['a_contents'], chunk['b_contents'],
                                   chunk['base_contents'], chunk['resolve'], None, None)
                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                conflict_file.add_conflict_chunk(cc)
            ret.append(conflict_file)
        return ret
    

# Run the collector over data_dir and report how many conflict files were built
collector = MergeNatureRepoCollector(data_dir)
conflict_files = collector.collect()
print("Total conflict files: ", len(conflict_files))

100%|██████████| 103013/103013 [02:01<00:00, 847.65it/s] 


Total conflict files:  102695


In [None]:
# Input directory and output path for the "top50" dataset
data_dir = work_dir / "data" / "top50"
output_file = work_dir / "output" / "top50_2000repos.json"

class MergeNatureRepoTop50Collector(ConflictFileCollector):
    '''
    Converts the "top50 / 2000 repos" dataset into conflictMap objects.

    Each ``<id>_metadata.json`` under ``dataset_path`` is combined with the
    ``<id>_a``/``<id>_b``/``<id>_base``/``<id>_merged`` files next to it.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)

    def collect(self):
        """Return a list of ``ConflictFile`` objects; entries with unreadable files are reported and skipped.

        Raises:
            FileNotFoundError: no JSON file exists under ``dataset_path``.
        """
        # 1. Find every JSON file and take its "<id>_" prefix
        # 2. Locate <id>_a.xxx, <id>_b.xxx, <id>_base.xxx, <id>_merged.xxx next to it
        # 3. Read the metadata for the file type and conflict chunks
        ret = []

        metadata_jsonPaths = list(self.getAllJsonsUnder(self.dataset_path))
        if not metadata_jsonPaths:
            raise FileNotFoundError("No metadata json files found in the dataset path")
        for jsonPath in tqdm(metadata_jsonPaths):
            basename = os.path.basename(jsonPath)
            dirname = os.path.dirname(jsonPath)

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            # This dataset records neither repo URL, file path nor commit hash.
            repo_url = None
            path = None
            suffix = metadata['filetype']
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = None

            # Sibling files share the metadata file's prefix.
            a_path = os.path.join(dirname, basename.replace('_metadata.json', '_a' + suffix))
            b_path = os.path.join(dirname, basename.replace('_metadata.json', '_b' + suffix))
            base_path = os.path.join(dirname, basename.replace('_metadata.json', '_base' + suffix))
            merged_path = os.path.join(dirname, basename.replace('_metadata.json', '_merged' + suffix))

            try:
                with open(a_path, 'r') as f:
                    a_content = f.read()
                with open(b_path, 'r') as f:
                    b_content = f.read()
                with open(base_path, 'r') as f:
                    base_content = f.read()
                with open(merged_path, 'r') as f:
                    merged_content = f.read()
            except (OSError, UnicodeError) as e:
                # Report the entry whose files are missing or undecodable, then skip it.
                print(jsonPath)
                print(e)
                continue

            # Keyword arguments keep each version in its own slot.  Previously the
            # merged text landed in file_r_content and the base text in file_m_content.
            # NOTE(review): the resolved file is never read here, hence None.
            conflict_file = ConflictFile(
                path, repo_url, a_content, b_content, base_content,
                file_r_content=None,
                file_m_content=merged_content,
                commit_hash=commit_hash,
            )
            for chunk in conflict_chunks:
                if chunk.get('resolve') is None:
                    continue
                # m_start, m_end and chunk_idx are not available yet.
                cc = ConflictChunk(-1, -1, chunk['a_contents'], chunk['b_contents'],
                                   chunk['base_contents'], chunk['resolve'], None, None)
                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)
                conflict_file.add_conflict_chunk(cc)
            ret.append(conflict_file)
        return ret
    

# Run the collector over data_dir and report how many conflict files were built
collector = MergeNatureRepoTop50Collector(data_dir)
conflict_files = collector.collect()
print("Total conflict files: ", len(conflict_files))

In [3]:
# Alternative: restrict the run to the TypeScript subset
# data_dir = work_dir / "data" / "mergebert_data" / "automated-analysis-data" / "TypeScript"
# output_file = work_dir / "output" / "mergebert_ts.json"

# Input directory and output path for the MergeBERT data of all languages
data_dir = work_dir / "data" / "mergebert_data" / "automated-analysis-data"
output_file = work_dir / "output" / "mergebert_all_lang.json"

class MergeBERTConflictFileCollector(ConflictFileCollector):
    '''
    Converts the MergeBERT dataset into conflictMap objects.

    Each ``<id>_metadata.json`` under ``dataset_path`` is combined with the
    ``<id>_a``/``<id>_b``/``<id>_base``/``<id>_merged`` files next to it.
    '''
    def __init__(self, dataset_path):
        super().__init__(dataset_path)

    def collect(self):
        """Return a list of ``ConflictFile`` objects and print chunk statistics.

        Raises:
            FileNotFoundError: no JSON file exists under ``dataset_path``.
        """
        # 1. Find every JSON file, e.g. .../automated-analysis-data/TypeScript/55743_metadata.json
        # 2. Locate <id>_a.xxx, <id>_b.xxx, <id>_base.xxx, <id>_merged.xxx next to it
        # 3. Read the metadata for repo, file name, commit hash and conflict chunks
        chunk_cnt = 0
        chunk_no_r_cnt = 0
        ret = []

        metadata_jsonPaths = list(self.getAllJsonsUnder(self.dataset_path))
        if not metadata_jsonPaths:
            raise FileNotFoundError("No metadata json files found in the dataset path")
        for jsonPath in tqdm(metadata_jsonPaths):
            basename = os.path.basename(jsonPath)
            dirname = os.path.dirname(jsonPath)

            with open(jsonPath, 'r') as f:
                metadata = json.load(f)
            repo_url = metadata['repo']
            path = metadata['fname']
            suffix = path.split('.')[-1]    # text after the last '.' in the file name
            conflict_chunks = metadata['conflicting_chunks']
            commit_hash = metadata['commitHash']

            # Sibling files share the metadata file's prefix.
            a_path = os.path.join(dirname, basename.replace('_metadata.json', '_a.' + suffix))
            b_path = os.path.join(dirname, basename.replace('_metadata.json', '_b.' + suffix))
            base_path = os.path.join(dirname, basename.replace('_metadata.json', '_base.' + suffix))
            merged_path = os.path.join(dirname, basename.replace('_metadata.json', '_merged.' + suffix))

            with open(a_path, 'r') as f:
                a_content = f.read()
            with open(b_path, 'r') as f:
                b_content = f.read()
            with open(base_path, 'r') as f:
                base_content = f.read()
            with open(merged_path, 'r') as f:
                merged_content = f.read()

            # Keyword arguments keep each version in its own slot.  Previously the
            # merged text landed in file_r_content and the base text in file_m_content.
            # NOTE(review): the resolved file is never read here, hence None.
            conflict_file = ConflictFile(
                path, repo_url, a_content, b_content, base_content,
                file_r_content=None,
                file_m_content=merged_content,
                commit_hash=commit_hash,
            )
            for chunk in conflict_chunks:
                chunk_cnt += 1
                if chunk['res_region'] is None:
                    chunk_no_r_cnt += 1
                    continue
                # m_start, m_end and chunk_idx are not available yet.
                cc = ConflictChunk(-1, -1, chunk['a_contents'], chunk['b_contents'],
                                   chunk['base_contents'], chunk['res_region'], None, None)
                # Keep the label shipped with the dataset next to the computed one.
                # Values seen: 'A', 'AB', 'B', 'BA', 'BASE', None, 'OTHER',
                # 'REM-BASE-A', 'REM-BASE-AB', 'REM-BASE-B', 'REM-BASE-BA',
                # 'RES_EMPTY', 'RES_FILE_EMPTY'
                cc.mergebert_label = chunk.get('label', None) # type: ignore

                cc.label = self.getLabel(cc.a_content, cc.b_content, cc.o_content, cc.r_content)

                conflict_file.add_conflict_chunk(cc)
            ret.append(conflict_file)
        print(f"Total chunk count: {chunk_cnt}, chunk without r: {chunk_no_r_cnt}")
        return ret

# Run the collector over data_dir and report how many conflict files were built
collector = MergeBERTConflictFileCollector(data_dir)
conflict_files = collector.collect()
print("Total conflict files: ", len(conflict_files))

100%|██████████| 48785/48785 [01:17<00:00, 628.50it/s]

Total chunk count: 193870, chunk without r: 42444
Total conflict files:  48785





In [None]:
# 不单独存储了，拆分成多个文件对内存友好

# with open(output_file, "w") as f:
#     # 使用 tqdm 包裹列表，显示进度条
#     json.dump([json.loads(x.getJSONstr()) for x in tqdm(conflict_files, desc="Processing conflict files")], f)

KeyboardInterrupt: 

# 分析冲突块的类型分布

In [10]:
# 输入 conflict_files，输出 类型分布 map，同时绘制饼图
def analyze_label_distribution(conflict_files):
    """Count chunk labels over ``conflict_files``, show them as a pie chart and return the counts.

    Each element must expose ``conflict_chunks`` whose items expose ``label``.
    The counts are also pretty-printed; the returned object is the
    ``defaultdict(int)`` mapping label -> number of chunks.
    """
    label_cnt = defaultdict(int)
    all_chunks = (chunk for conflict_file in conflict_files
                  for chunk in conflict_file.conflict_chunks)
    for chunk in all_chunks:
        label_cnt[chunk.label] += 1

    import plotly.graph_objects as go
    # One pie slice per label, in first-seen order.
    pie = go.Pie(labels=list(label_cnt.keys()), values=list(label_cnt.values()))
    fig = go.Figure(data=[pie])
    fig.update_layout(title_text="各类型冲突占比", width=600, height=400)
    fig.show()

    from pprint import pprint
    pprint(label_cnt)
    return label_cnt

# Run one of the collection cells first,
# or add a step that loads the conflict files from disk
label_cnt = analyze_label_distribution(conflict_files)

defaultdict(<class 'int'>,
            {'A': 89376,
             'AB': 5479,
             'B': 52780,
             'BA': 2127,
             'O': 695,
             'mixline': 23787,
             'newline': 25166,
             'same modification, formatting maybe different': 7107})


# 编辑分析没有交集的情况下直接合并的正确率

In [9]:
def analyze_data(data, print_result=True):
    '''
    Measure how often a conflict is resolved by simply applying both sides' edits.

    For every conflict the edit scripts base->ours and base->theirs are computed
    with ``compute``.  When no edit of one side intersects an edit of the other
    on the base range, all edits are applied in base order and the generated
    text is compared with the recorded resolution.

    Args:
        data: iterable of dicts with the keys 'ours', 'theirs', 'base',
            'resolve' and 'resolution_kind'.
            # presumably each side is a list of lines — TODO confirm against Conflict
        print_result: when True, show a progress bar and print a summary.

    Returns:
        dict with the keys 'resolution_offerable', 'conflict_sum', 'correct',
        'correct_with_no_empty_line', 'kind_counter', 'correct_counter',
        'too_many_edit_script', 'too_many_lines' and 'whitespace_conflict'.
    '''
    def is_blank_line(line):
        # True when the line is empty or consists only of whitespace.
        return re.match(r'^\s*$', line) is not None

    def percentage(part, whole):
        # Guarded ratio: an empty data set (or zero offerable conflicts) reports
        # 0.0 instead of raising ZeroDivisionError in the summary.
        return part / whole * 100 if whole else 0.0

    conflicts = [Conflict(conflict['ours'], conflict['theirs'],
                        conflict['base'], conflict['resolve'], conflict['resolution_kind']) for conflict in data]
    whitespace_conflict = 0     # reported in the summary but never incremented
    correct_with_no_empty_line = 0
    resolution_offerable_kind = defaultdict(int)
    kind_counter = defaultdict(int)
    correct_counter = defaultdict(int)
    too_many_edit_script = 0
    too_many_lines = 0
    conflict_iter = tqdm(conflicts, desc="Processing conflicts") if print_result else conflicts
    for conflict in conflict_iter:
        kind_counter[conflict.resolution_kind] += 1

        # Very large conflicts are counted and skipped.
        if len(conflict.base) > 1000 or len(conflict.ours) > 1000 or len(conflict.theirs) > 1000:
            too_many_lines += 1
            continue

        from_ours = compute(conflict.base, conflict.ours)
        from_theirs = compute(conflict.base, conflict.theirs)

        edit_script_num = len(from_ours) + len(from_theirs)
        if edit_script_num > 10:  # too many edits
            too_many_edit_script += 1
            continue

        # Order both scripts by where they start in the base.
        from_ours = sorted(from_ours, key=lambda x: x.seq1Range.start)
        from_theirs = sorted(from_theirs, key=lambda x: x.seq1Range.start)

        generated = []
        i, j = 0, 0     # cursors into from_ours / from_theirs
        end = 0         # first base index not yet copied into ``generated``
        overlapping = False
        # Two-pointer merge of both scripts; only valid while no edits intersect.
        while i < len(from_ours) and j < len(from_theirs):
            ours_edit, theirs_edit = from_ours[i], from_theirs[j]
            ours_base_start, ours_base_end = ours_edit.seq1Range.start, ours_edit.seq1Range.end
            theirs_base_start, theirs_base_end = theirs_edit.seq1Range.start, theirs_edit.seq1Range.end
            if ours_edit.seq1Range.intersect(theirs_edit.seq1Range) is not None:
                overlapping = True
                break
            if ours_base_start < theirs_base_start:
                generated.extend(conflict.base[end:ours_base_start])
                generated.extend(conflict.ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
                end = ours_base_end
                i += 1
            elif ours_base_start > theirs_base_start:
                generated.extend(conflict.base[end:theirs_base_start])
                generated.extend(conflict.theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
                end = theirs_base_end
                j += 1
            else:
                # Same start: apply the edit with the smaller base end first.
                generated.extend(conflict.base[end:ours_base_start])
                if ours_base_end <= theirs_base_end:
                    generated.extend(conflict.ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
                    generated.extend(conflict.theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
                    end = theirs_base_end
                else:
                    generated.extend(conflict.theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
                    generated.extend(conflict.ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
                    end = ours_base_end
                i += 1
                j += 1

        if overlapping:     # edits intersect: no suggestion can be offered
            continue
        resolution_offerable_kind[conflict.resolution_kind] += 1

        # Apply whatever remains of either script (at most one of them is non-empty).
        for diff_ours in from_ours[i:]:
            generated.extend(conflict.base[end:diff_ours.seq1Range.start])
            end = diff_ours.seq1Range.end
            generated.extend(
                conflict.ours[diff_ours.seq2Range.start:diff_ours.seq2Range.end])
        for diff_theirs in from_theirs[j:]:
            generated.extend(conflict.base[end:diff_theirs.seq1Range.start])
            end = diff_theirs.seq1Range.end
            generated.extend(
                conflict.theirs[diff_theirs.seq2Range.start:diff_theirs.seq2Range.end])
        generated.extend(conflict.base[end:])  # copy the untouched tail of the base
        if generated == conflict.resolution:    # exact match
            correct_counter[conflict.resolution_kind] += 1

        if [x for x in generated if not is_blank_line(x)] == [x for x in conflict.resolution if not is_blank_line(x)]:
            # todo: also ignore whitespace differences inside lines
            correct_with_no_empty_line += 1

    correct = sum(correct_counter.values())
    resolution_offerable = sum(resolution_offerable_kind.values())
    conflict_sum = sum(kind_counter.values())

    if print_result:
        print()
        print(f'编辑脚本过多的冲突数量：{too_many_edit_script}')
        print(f'行数过多的冲突数量：{too_many_lines}')
        print(f'空白符冲突数量：{whitespace_conflict}')
        print()
        print(f'没有重叠，可以提供推荐的数量: {resolution_offerable}/{conflict_sum} = {percentage(resolution_offerable, conflict_sum)}%')
        print(f'过滤空行之前 准确率: {correct}/{conflict_sum} = {percentage(correct, conflict_sum)}%，占可以提供推荐的 {percentage(correct, resolution_offerable)}%')
        print(f'过滤空行之后 准确率: {correct_with_no_empty_line}/{conflict_sum} = {percentage(correct_with_no_empty_line, conflict_sum)}%，占可以提供推荐的 {percentage(correct_with_no_empty_line, resolution_offerable)}%')

        print()
        print('各种类型的冲突数量占比：')
        for k, v in kind_counter.items():
            print(f'{k}:{v} , {v/conflict_sum*100:.2f}%')

        print()
        print('使用规则，各种类型的冲突规则正确率：')
        for k, v in kind_counter.items():
            print(f'{k}:{correct_counter[k]}/{v} , {correct_counter[k]/v*100:.2f}%')
    return { 
        'resolution_offerable': resolution_offerable, 
        'conflict_sum': conflict_sum, 
        'correct': correct, 
        'correct_with_no_empty_line': correct_with_no_empty_line, 
        'kind_counter': kind_counter, 
        'correct_counter': correct_counter,
        'too_many_edit_script': too_many_edit_script,
        'too_many_lines': too_many_lines,
        'whitespace_conflict': whitespace_conflict
    }

In [3]:
# 忽略 import 语句的冲突
# 读取冲突数据集
# Load the conflict data set written to output_file
with open(output_file, 'r') as f:
    data = json.load(f)
# NOTE(review): import_related_conflict is not defined in the visible part of this
# notebook — confirm where it comes from before running this cell.
no_import_data = filter(lambda c: not import_related_conflict(c), data)
analyze_data(no_import_data)

# 分析各仓库类型分布

尝试过滤一些分布特别不均匀的  
以及冲突很特殊的

In [47]:
repo_dir = work_dir / "data" / "2000repos" / "conflictFiles"       # holds the conflict files of every repository

# Every sub-directory of repo_dir is treated as one repository
repos = [repo for repo in os.listdir(repo_dir) if os.path.isdir(repo_dir / repo)]
conflicts_from_repos = {}
for repo in tqdm(repos):
    # NOTE(review): getAllJsonsUnder (as a free function) and
    # collect_conflict_from_jsonPaths are not defined in the visible part of this
    # notebook — confirm they are available before running this cell.
    jsonPaths = getAllJsonsUnder(str(repo_dir / repo))
    results, kind_counter = collect_conflict_from_jsonPaths(jsonPaths, show_progress=False)
    conflicts_from_repos[repo] = results

100%|██████████| 2473/2473 [01:21<00:00, 30.47it/s] 


In [34]:
# 打印 XCoLab 的冲突
# Conflicts collected for the XCoLab repository
xcolab_conflicts = conflicts_from_repos['XCoLab']
print(len(xcolab_conflicts))
# Pick one of them at random
import random
random_conflict = random.choice(xcolab_conflicts)

view_dir = '/Users/foril/projects/conflict_resolve/test_with_vscode/tmp'
def print_conflict(conflict):
    # Wrap the dict in a Conflict and pass it to conflict2file together with view_dir.
    # NOTE(review): Conflict receives four arguments here but five in analyze_data —
    # confirm the fifth parameter has a default.
    conflict2file(Conflict(conflict['ours'], conflict['theirs'], conflict['base'], conflict['resolve']), Path(view_dir))

print_conflict(random_conflict)

9468


In [35]:
# 删除特殊仓库，存储一份数据
# Repositories to drop from the per-repository map
deleted_repos = ['XCoLab']
for repo in deleted_repos:
    del conflicts_from_repos[repo]
# Flatten the conflicts of all remaining repositories into one list
remained_conflicts = [conflict for conflicts in conflicts_from_repos.values() for conflict in conflicts]

In [38]:
# 保存结果
# Write the remaining conflicts to a JSON file under output/
remained_file = work_dir / "output" / "2000repos_remained.json"
with open( remained_file, "w") as f:
    json.dump(remained_conflicts, f)

In [48]:
# for conflicts in conflicts_from_repos.values():
# Only XCoLab is analysed here; the commented line above iterates over every repository.
for conflicts in [conflicts_from_repos['XCoLab']]:
    res = analyze_data(conflicts, print_result=False)
    kind_counter = res['kind_counter']
    # Draw kind_counter as a pie chart with a fixed label order
    import plotly.graph_objects as go
    _labels = ['newline', 'mixline', 'accept_ours', 'accept_theirs', 'concat_ours_theirs', 'concat_theirs_ours', 'delete_all', 'accept_base']
    _values = [kind_counter[label] for label in _labels]
    fig = go.Figure(data=[go.Pie(labels=_labels, values=_values)])
    fig.show()

## 处理所有数据

分析各类别占比，以及采用规则能够解决的数量占比

In [10]:
# 读取冲突数据集
# Load the conflict data set written to output_file
with open(output_file, 'r') as f:
    results = json.load(f)

In [None]:
# Analyse the whole data set and print the summary
all_result = analyze_data(results)

In [24]:
view_dir = '/Users/foril/projects/conflict_resolve/test_with_vscode/tmp'

def print_conflict(conflict):
    # Wrap the dict in a Conflict and pass it to conflict2file together with view_dir.
    conflict2file(Conflict(conflict['ours'], conflict['theirs'], conflict['base'], conflict['resolve']), Path(view_dir))

import random
# random.randrange excludes its upper bound.  random.randint(0, len(results)) is
# inclusive and could return len(results), which raises IndexError below.
i = random.randrange(len(results))

conflict = results[i]
while conflict['resolution_kind'] != 'newline': # keep drawing until a 'newline' conflict is found
    i = random.randrange(len(results))
    conflict = results[i]
print_conflict(conflict)
print('冲突类型：', conflict['resolution_kind'])
print('project_name:', conflict['project_name'])
print(work_dir)
# NOTE(review): the first 32 characters of conflict['path'] are cut off and replaced
# with the local dataset directory — confirm that prefix length still matches the data.
print('path:', str(work_dir / "data/2000repos/conflictFiles") + conflict['path'][32:] + "/metadata.json") 

冲突类型： newline
project_name: XCoLab
/Users/foril/projects/conflict_resolve/dataset_collect_analysis_script
path: /Users/foril/projects/conflict_resolve/dataset_collect_analysis_script/data/2000repos/conflictFiles/XCoLab/1aeca074997e86fb0ca919d4eb61ff65fc8fe691_1408574682/services/plansProposalsFacade/plansProposalsFacade-portlet/src/main/java/com/ext/portlet/service/base/ActivitySubscriptionLocalServiceClpInvoker.java/metadata.json


## 结果可视化

In [6]:
# 绘制所有类型冲突的占比饼图
import plotly.graph_objects as go

# Fixed label order, chosen to line up with the MergeBERT data set figure
# NOTE(review): kind_counter must already exist as a notebook global (analyze_data
# returns it under the 'kind_counter' key) — confirm which cell sets it.
_labels = ['newline', 'mixline', 'accept_ours', 'accept_theirs', 'concat_ours_theirs', 'concat_theirs_ours', 'delete_all', 'accept_base']
_values = [kind_counter[label] for label in _labels]

# sort=False keeps the slices in the order of _labels
fig = go.Figure(data=[go.Pie(labels=_labels, values=_values, sort=False)])

# Updating the layout for better visualization
fig.update_layout(title_text='解决类型分布', width=600)

# Showing the pie chart
fig.show()

In [7]:
import pandas as pd
# Share of pseudo-conflicts (conflicts whose edits do not overlap) per resolution kind

# NOTE(review): resolution_offerable_kind is a local of analyze_data and is not part
# of its return value — confirm how it becomes a notebook global.
suggestion_given_rate = {
    k: resolution_offerable_kind[k]/v*100 for k, v in kind_counter.items()
}
suggestion_given_rate_df_dic = {'类别':suggestion_given_rate.keys(), '伪冲突占比':suggestion_given_rate.values()}
falsepositive_rate = pd.DataFrame(suggestion_given_rate_df_dic)
# Sort while the column is still numeric; sorting the formatted "…%" strings
# would order them lexicographically (e.g. '9.5%' before '56.0%').
falsepositive_rate = falsepositive_rate.sort_values(by='伪冲突占比', ascending=False)
falsepositive_rate['伪冲突占比'] = falsepositive_rate['伪冲突占比'].round(2)
falsepositive_rate['伪冲突占比'] = falsepositive_rate['伪冲突占比'].apply(lambda x: str(x)+'%')
falsepositive_rate



Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        



Unnamed: 0,类别,伪冲突占比
6,concat_theirs_ours,95.93%
4,concat_ours_theirs,94.96%
3,mixline,74.85%
5,accept_base,69.56%
7,delete_all,56.0%
2,newline,25.24%
1,accept_theirs,23.41%
0,accept_ours,22.65%


In [8]:
# 所有语言数据中冲突的解决比例
# 这个数据中不能提供 BA 这样的类型，不能算上界，上界应该看回溯算法给出的结果
import pandas as pd
# Per resolution kind: percentage of conflicts whose generated text equals the resolution
# NOTE(review): kind_counter and correct_counter must already exist as notebook globals
# (analyze_data returns them under those keys) — confirm which cell sets them.
correct_rate = {
    k: correct_counter[k]/v*100 for k, v in kind_counter.items()
}
correct_rate_df_dic = {'类别':correct_rate.keys(), '正确率':correct_rate.values()}
correct_rate_df = pd.DataFrame(correct_rate_df_dic)
# Round to two decimals, then render as "<value>%" strings
correct_rate_df['正确率'] = correct_rate_df['正确率'].round(2)
correct_rate_df['正确率'] = correct_rate_df['正确率'].apply(lambda x: str(x)+'%')
correct_rate_df

Unnamed: 0,类别,正确率
0,accept_ours,0.0%
1,accept_theirs,0.01%
2,newline,0.0%
3,mixline,52.12%
4,concat_ours_theirs,89.14%
5,accept_base,1.3%
6,concat_theirs_ours,0.0%
7,delete_all,37.76%


In [9]:
# 根据 伪冲突解决正确率 降序排列
# 所有语言数据中各类别伪冲突的正确率
import pandas as pd
# Per resolution kind: correct_rate as a percentage of suggestion_given_rate
# NOTE(review): raises ZeroDivisionError for any kind whose suggestion_given_rate is 0.
correct_out_of_givin_dic = {'类别':correct_rate.keys(), '伪冲突解决正确率':[correct_rate[i]/suggestion_given_rate[i] * 100 for i in correct_rate.keys()]}
correct_out_of_givin_df = pd.DataFrame(correct_out_of_givin_dic)
# Sort in descending order while the column is still numeric
correct_out_of_givin_df = correct_out_of_givin_df.sort_values(by='伪冲突解决正确率', ascending=False)
# Round to two decimals, then render as "<value>%" strings
correct_out_of_givin_df['伪冲突解决正确率'] = correct_out_of_givin_df['伪冲突解决正确率'].round(2)
correct_out_of_givin_df['伪冲突解决正确率'] = correct_out_of_givin_df['伪冲突解决正确率'].apply(lambda x: str(x)+'%')
correct_out_of_givin_df

Unnamed: 0,类别,伪冲突解决正确率
4,concat_ours_theirs,93.87%
3,mixline,69.63%
7,delete_all,67.42%
5,accept_base,1.86%
1,accept_theirs,0.05%
0,accept_ours,0.02%
2,newline,0.0%
6,concat_theirs_ours,0.0%


In [10]:
# 使用 plotly 绘制条形图，横坐标是各个类型，
# 纵坐标高度由三部分组成：灰色表示总数减去给出建议的部分，绿色部分表示正确的部分、蓝色表示给出建议但不正确的部分

import plotly.graph_objects as go

# Stacked bar chart with one bar per resolution kind.  Bottom to top:
#   green - conflicts whose generated text equals the resolution,
#   blue  - conflicts with a suggestion that is not an exact match,
#   grey  - conflicts for which no suggestion could be offered.
# The three segments add up to the number of conflicts of that kind.
kinds = list(kind_counter.keys())

fig = go.Figure()
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * correct_rate[k] / 100 for k in kinds],
    name='接受所有编辑脚本就能解决',
    marker_color='green',
    # Show the success rate (one decimal) as the text of the green segment.
    customdata=[round(correct_rate[k], 1) for k in kinds],
    texttemplate="%{customdata}%",
))
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (suggestion_given_rate[k] - correct_rate[k]) / 100 for k in kinds],
    name='伪冲突，但不能接受全部脚本解决',
    marker_color='rgb(52,130,198)',
))
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (100 - suggestion_given_rate[k]) / 100 for k in kinds],
    name='该类型冲突总数',
    marker_color='#aaa'
))

fig.update_layout(
    title='各类别冲突伪冲突解决情况',
    xaxis_tickfont_size=14,
    width=1000,
    yaxis=dict(
        # ``title.font`` replaces the deprecated ``titlefont`` attribute
        # (previously spelled ``titlefont_size=16``).
        title=dict(text='冲突数量', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0.8,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='stack',
    bargap=0.15,
    bargroupgap=0.1
)

fig.show()