## 收集数据

最初计划从 MergeBERT 数据集中收集所有语言（Java/C#/JavaScript/TypeScript）的冲突数据；
现已改为从 merge_nature 的 top 50 仓库（repos）中收集数据。

In [1]:
from collections import defaultdict
import os
import json
from util import Conflict, conflict2file
from tqdm import tqdm
from pathlib import Path

# Absolute path of the current working directory; all data/output paths are built from it.
file_path = Path(os.path.abspath(''))


def getAllSubFiles(dirPath: str):
    """Walk *dirPath* recursively and yield the path of every ``.json`` file.

    Each yielded path is the directory reported by ``os.walk`` joined with the
    file name.
    """
    for parent, _, names in os.walk(dirPath):
        yield from (
            os.path.join(parent, name) for name in names if name.endswith(".json")
        )


# Every *.json file found below data/index_conflict_files.
paths = list(getAllSubFiles(file_path / "data" / "index_conflict_files"))

# Keys of each stored record, in the order the four versions are read from a chunk.
fields = ["ours", "theirs", "base", "resolve"]
results = []

# Not updated anywhere in this cell; the analysis cell further down prints it.
whitespace_conflict = 0

kind_counter = defaultdict(int)


def _to_lines(content):
    """Split one side of a conflict chunk into a list of lines.

    Leading and trailing newlines are stripped first: without stripping,
    almost every concatenation gets classified as ``mixline``.  The price is
    that a side consisting only of newlines collapses to an empty list.
    A missing side (``None``) yields an empty list instead of the literal
    line ``"None"`` that ``str(None)`` would produce.
    """
    text = "" if content is None else str(content)
    lines = text.strip("\n").split("\n")
    # An empty side splits to [''], which would otherwise be matched as a one-line edit.
    return [] if lines == [''] else lines


for jsonPath in tqdm(paths):
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(jsonPath, encoding="utf-8") as jsonFile:
        jsonData = json.load(jsonFile)

    for chunk in jsonData["conflicting_chunks"]:
        # Chunks without a resolution are not part of this collection.
        if 'resolve' not in chunk or chunk['resolve'] is None:
            continue

        codes_origin = [chunk['a_contents'], chunk['b_contents'],
                        chunk['base_contents'], chunk['resolve']]
        conflict_map = {field: _to_lines(code) for field, code in zip(fields, codes_origin)}

        conflict = Conflict(conflict_map[fields[0]], conflict_map[fields[1]],
                            conflict_map[fields[2]], conflict_map[fields[3]])

        # Filter out CRLF/LF conflicts and whitespace conflicts: one side equals base.
        if conflict.base == conflict.ours or conflict.base == conflict.theirs:
            continue

        ours, base, theirs, resolution = conflict.ours, conflict.base, conflict.theirs, conflict.resolution
        if resolution == ours:    # accept ours
            kind = 'accept_ours'
        elif resolution == theirs:    # accept theirs
            kind = 'accept_theirs'
        elif resolution == base:    # accept base
            kind = 'accept_base'
        elif resolution in [[''], []]:
            # NOTE(review): in the MergeBert dataset a None resolution probably
            # means "lack of resolution" rather than "delete everything" - confirm.
            kind = 'delete_all'
        else:
            # Build the pool of known lines once instead of once per resolution line.
            known_lines = ours + theirs + base
            if any(resolveline not in known_lines for resolveline in resolution):
                # At least one resolution line appears in none of the three versions.
                kind = 'newline'
            elif resolution == ours + theirs:
                kind = 'concat_ours_theirs'
            elif resolution == theirs + ours:
                kind = 'concat_theirs_ours'
            elif all(resolveline in known_lines for resolveline in conflict_map[fields[3]]):
                kind = 'mixline'
            else:
                kind = 'others'
        conflict_map['resolution_kind'] = kind

        kind_counter[kind] += 1

        results.append(conflict_map)

print(f'共收集到 {len(results)} 个冲突块')

for k, v in kind_counter.items():
    print(f'{k}:{v} , {v/len(results)*100:.2f}%')

# Make sure the output directory exists before writing into it.
output_dir = file_path / "output"
output_dir.mkdir(parents=True, exist_ok=True)
with open(output_dir / "self_collected_most_50.json", "w", encoding="utf-8") as f:
    json.dump(results, f)


100%|██████████| 26369/26369 [00:12<00:00, 2157.34it/s]


共收集到 60035 个冲突块
accept_ours:27973 , 46.59%
mixline:6944 , 11.57%
accept_theirs:13171 , 21.94%
newline:10301 , 17.16%
delete_all:252 , 0.42%
concat_ours_theirs:914 , 1.52%
accept_base:146 , 0.24%
concat_theirs_ours:334 , 0.56%


In [14]:
# Reload the collected conflicts from the JSON file under output/.
with open(file_path / 'output' / 'self_collected_most_50.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Write every line of every side into one raw text file used to train the tokenizer.
corpus_dir = file_path / 'tokenizers' / 'data'
# Create the directory (including a missing parent) once, not once per write.
os.makedirs(corpus_dir, exist_ok=True)

# Append mode is kept on purpose for compatibility: re-running this cell
# appends the whole corpus to the file a second time.
with open(corpus_dir / 'self_collected_most_50.raw', 'a', encoding='utf-8') as f:
    for conflict in data:
        for field in fields:
            f.write('\n'.join(conflict[field]))
            f.write('\n')


## 处理数据

分析各类别占比，以及采用规则能够解决的数量占比

In [2]:
import re
from collections import defaultdict
import os
import json
from util import Conflict
from tqdm import tqdm
from pathlib import Path
from dpbt import compute

file_path = Path(os.path.abspath(''))

# Load the collected conflict dataset.
with open(file_path / 'output' / 'self_collected_most_50.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

conflicts = [Conflict(conflict['ours'], conflict['theirs'],
                      conflict['base'], conflict['resolve'], conflict['resolution_kind']) for conflict in data]

correct_with_no_empty_line = 0
resolution_offerable_kind = defaultdict(int)
kind_counter = defaultdict(int)
correct_counter = defaultdict(int)
too_many_edit_script = 0
too_many_lines = 0

# Compiled once; the pattern matches lines that are empty or whitespace-only.
_BLANK_LINE = re.compile(r'^\s*$')


def if_empty_line(line):
    """Return True when *line* is empty or consists only of whitespace."""
    return _BLANK_LINE.match(line) is not None


def _without_blank_lines(lines):
    """Return *lines* with every blank (empty or whitespace-only) line removed."""
    return [x for x in lines if not if_empty_line(x)]


def _percent(part, whole):
    """Return ``part / whole * 100``, or 0.0 when *whole* is zero."""
    return part / whole * 100 if whole else 0.0


def _merge_edit_scripts(conflict, from_ours, from_theirs):
    """Apply both edit scripts to ``conflict.base`` and return the merged lines.

    *from_ours* and *from_theirs* must be sorted by ``seq1Range.start``.  In
    each edit, ``seq1Range`` addresses lines of base and ``seq2Range``
    addresses the replacement lines in ours / theirs.

    Returns ``None`` as soon as an edit from ours intersects an edit from
    theirs on the base side, because the two scripts can then not simply be
    applied one after the other.
    """
    generated = []
    i = j = 0
    end = 0  # index into base up to which lines have already been emitted or replaced
    # Two-pointer walk over both scripts in base order.
    while i < len(from_ours) and j < len(from_theirs):
        ours_edit, theirs_edit = from_ours[i], from_theirs[j]
        if ours_edit.seq1Range.intersect(theirs_edit.seq1Range) is not None:
            return None

        ours_base_start, ours_base_end = ours_edit.seq1Range.start, ours_edit.seq1Range.end
        theirs_base_start, theirs_base_end = theirs_edit.seq1Range.start, theirs_edit.seq1Range.end
        ours_lines = conflict.ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end]
        theirs_lines = conflict.theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end]

        if ours_base_start < theirs_base_start:
            generated.extend(conflict.base[end:ours_base_start])
            generated.extend(ours_lines)
            end = ours_base_end
            i += 1
        elif ours_base_start > theirs_base_start:
            generated.extend(conflict.base[end:theirs_base_start])
            generated.extend(theirs_lines)
            end = theirs_base_end
            j += 1
        else:
            # Same start on base: emit the edit whose base range ends first, then the other.
            generated.extend(conflict.base[end:ours_base_start])
            if ours_base_end <= theirs_base_end:
                generated.extend(ours_lines)
                generated.extend(theirs_lines)
                end = theirs_base_end
            else:
                generated.extend(theirs_lines)
                generated.extend(ours_lines)
                end = ours_base_end
            i += 1
            j += 1

    # At most one of the two scripts still has edits left.
    for diff_ours in from_ours[i:]:
        generated.extend(conflict.base[end:diff_ours.seq1Range.start])
        end = diff_ours.seq1Range.end
        generated.extend(
            conflict.ours[diff_ours.seq2Range.start:diff_ours.seq2Range.end])
    for diff_theirs in from_theirs[j:]:
        generated.extend(conflict.base[end:diff_theirs.seq1Range.start])
        end = diff_theirs.seq1Range.end
        generated.extend(
            conflict.theirs[diff_theirs.seq2Range.start:diff_theirs.seq2Range.end])
    generated.extend(conflict.base[end:])  # append whatever is left of base
    return generated


for conflict in tqdm(conflicts, desc="Processing conflicts"):
    kind_counter[conflict.resolution_kind] += 1

    if len(conflict.base) > 1000 or len(conflict.ours) > 1000 or len(conflict.theirs) > 1000:
        too_many_lines += 1
        continue

    from_ours = compute(conflict.base, conflict.ours)
    from_theirs = compute(conflict.base, conflict.theirs)

    edit_script_num = len(from_ours) + len(from_theirs)
    if edit_script_num > 10:  # too many edit scripts
        too_many_edit_script += 1
        continue

    from_ours = sorted(from_ours, key=lambda x: x.seq1Range.start)
    from_theirs = sorted(from_theirs, key=lambda x: x.seq1Range.start)

    generated = _merge_edit_scripts(conflict, from_ours, from_theirs)
    if generated is None:    # the two scripts overlap on base
        continue
    resolution_offerable_kind[conflict.resolution_kind] += 1

    if generated == conflict.resolution:    # exact match
        correct_counter[conflict.resolution_kind] += 1

    if _without_blank_lines(generated) == _without_blank_lines(conflict.resolution):
        # TODO: consider ignoring differences in spaces as well
        correct_with_no_empty_line += 1

correct = sum(correct_counter.values())
resolution_offerable = sum(resolution_offerable_kind.values())
conflict_sum = sum(kind_counter.values())

print()
print(f'编辑脚本过多的冲突数量：{too_many_edit_script}')
print(f'行数过多的冲突数量：{too_many_lines}')
# whitespace_conflict is not assigned in this cell; it comes from the collection cell.
print(f'空白符冲突数量：{whitespace_conflict}')
print()
print(
    f'没有重叠，可以提供推荐的数量: {resolution_offerable}/{conflict_sum} = {_percent(resolution_offerable, conflict_sum)}%')
print(
    f'过滤空行之前 准确率: {correct}/{conflict_sum} = {_percent(correct, conflict_sum)}%，占可以提供推荐的 {_percent(correct, resolution_offerable)}%')
print(
    f'过滤空行之后 准确率: {correct_with_no_empty_line}/{conflict_sum} = {_percent(correct_with_no_empty_line, conflict_sum)}%，占可以提供推荐的 {_percent(correct_with_no_empty_line, resolution_offerable)}%')

print()
print('各种类型的冲突数量：')
for k, v in kind_counter.items():
    print(f'{k}:{v} , {v/conflict_sum*100:.2f}%')

print()
print('使用规则，各种类型的冲突规则正确率：')
for k, v in kind_counter.items():
    print(f'{k}:{correct_counter[k]}/{v} , {correct_counter[k]/v*100:.2f}%')

Processing conflicts: 100%|██████████| 60035/60035 [01:43<00:00, 577.37it/s]  


编辑脚本过多的冲突数量：630
行数过多的冲突数量：118
空白符冲突数量：0

没有重叠，可以提供推荐的数量: 14504/60035 = 24.15924044307487%
过滤空行之前 准确率: 4965/60035 = 8.270175730823686%，占可以提供推荐的 34.231936017650305%
过滤空行之后 准确率: 5875/60035 = 9.785958191055217%，占可以提供推荐的 40.50606729178158%

各种类型的冲突数量：
accept_ours:27973 , 46.59%
mixline:6944 , 11.57%
accept_theirs:13171 , 21.94%
newline:10301 , 17.16%
delete_all:252 , 0.42%
concat_ours_theirs:914 , 1.52%
accept_base:146 , 0.24%
concat_theirs_ours:334 , 0.56%

使用规则，各种类型的冲突规则正确率：
accept_ours:0/27973 , 0.00%
mixline:4016/6944 , 57.83%
accept_theirs:1/13171 , 0.01%
newline:0/10301 , 0.00%
delete_all:109/252 , 43.25%
concat_ours_theirs:838/914 , 91.68%
accept_base:1/146 , 0.68%
concat_theirs_ours:0/334 , 0.00%





## 结果可视化

In [3]:
# Pie chart with the share of every resolution kind.
import plotly.graph_objects as go

# Label order mirrors the corresponding figure for the MergeBERT dataset.
_labels = [
    'newline',
    'mixline',
    'accept_ours',
    'accept_theirs',
    'concat_ours_theirs',
    'concat_theirs_ours',
    'delete_all',
    'accept_base',
]
_values = []
for _label in _labels:
    _values.append(kind_counter[_label])

# sort=False keeps the slices in the order of _labels instead of ordering them by size.
_pie = go.Pie(labels=_labels, values=_values, sort=False)
fig = go.Figure(data=[_pie])

# Title and fixed width for a more readable figure.
fig.update_layout(title_text='解决类型分布', width=600)

# Render the chart.
fig.show()

In [4]:
import pandas as pd
# Share of pseudo-conflicts per resolution kind, over the data of all languages.

# Per kind: percentage of conflicts counted in resolution_offerable_kind.
suggestion_given_rate = {
    k: resolution_offerable_kind[k]/v*100 for k, v in kind_counter.items()
}
suggestion_given_rate_df_dic = {'类别':suggestion_given_rate.keys(), '伪冲突占比':suggestion_given_rate.values()}
falsepositive_rate = pd.DataFrame(suggestion_given_rate_df_dic)
falsepositive_rate['伪冲突占比'] = falsepositive_rate['伪冲突占比'].round(2)
# Sort while the column is still numeric: once the values are "xx%" strings the
# sort is lexicographic (e.g. "9.5%" would rank above "12.64%").
falsepositive_rate = falsepositive_rate.sort_values(by='伪冲突占比', ascending=False)
falsepositive_rate['伪冲突占比'] = falsepositive_rate['伪冲突占比'].apply(lambda x: str(x)+'%')
falsepositive_rate

Unnamed: 0,类别,伪冲突占比
7,concat_theirs_ours,97.6%
5,concat_ours_theirs,97.16%
1,mixline,76.25%
6,accept_base,65.75%
4,delete_all,55.16%
2,accept_theirs,19.09%
3,newline,16.6%
0,accept_ours,12.64%


In [5]:
# Share of conflicts resolved correctly per resolution kind, over the data of all languages.
import pandas as pd

# Per kind: percentage of conflicts counted in correct_counter.
correct_rate = {
    kind: correct_counter[kind]/total*100
    for kind, total in kind_counter.items()
}
correct_rate_df_dic = {
    '类别': correct_rate.keys(),
    '正确率': correct_rate.values(),
}
correct_rate_df = pd.DataFrame(correct_rate_df_dic)
# Round to two decimals, then render each value as a percentage string.
correct_rate_df['正确率'] = correct_rate_df['正确率'].round(2).apply(lambda pct: f'{pct}%')
correct_rate_df

Unnamed: 0,类别,正确率
0,accept_ours,0.0%
1,mixline,57.83%
2,accept_theirs,0.01%
3,newline,0.0%
4,delete_all,43.25%
5,concat_ours_theirs,91.68%
6,accept_base,0.68%
7,concat_theirs_ours,0.0%


In [6]:
# Per resolution kind, over the data of all languages: correct rate relative to
# the suggestion-given rate, shown in descending order.
import pandas as pd
correct_out_of_givin_dic = {
    '类别': correct_rate.keys(),
    '伪冲突解决正确率': [
        # A kind with a suggestion rate of 0 would divide by zero; report 0.0 for it.
        correct_rate[i]/suggestion_given_rate[i] * 100 if suggestion_given_rate[i] else 0.0
        for i in correct_rate.keys()
    ],
}
correct_out_of_givin_df = pd.DataFrame(correct_out_of_givin_dic)
# Sort in descending order while the column is still numeric.
correct_out_of_givin_df = correct_out_of_givin_df.sort_values(by='伪冲突解决正确率', ascending=False)
correct_out_of_givin_df['伪冲突解决正确率'] = correct_out_of_givin_df['伪冲突解决正确率'].round(2)
correct_out_of_givin_df['伪冲突解决正确率'] = correct_out_of_givin_df['伪冲突解决正确率'].apply(lambda x: str(x)+'%')
correct_out_of_givin_df

Unnamed: 0,类别,伪冲突解决正确率
5,concat_ours_theirs,94.37%
4,delete_all,78.42%
1,mixline,75.85%
6,accept_base,1.04%
2,accept_theirs,0.04%
0,accept_ours,0.0%
3,newline,0.0%
7,concat_theirs_ours,0.0%


In [7]:
# Stacked bar chart drawn with plotly; the x axis lists the resolution kinds.
# Every bar is made of three parts: grey is the total minus the conflicts for
# which a suggestion is given, green is the correct part, blue is the part
# where a suggestion is given but is not correct.

import plotly.graph_objects as go

# Same category order for all three traces.
kinds = list(kind_counter.keys())

fig = go.Figure()
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * correct_rate[k] / 100 for k in kinds],
    name='接受所有编辑脚本就能解决',
    marker_color='green',
    # Correct rate in percent, shown as the text of the green segment.
    customdata=[round(correct_rate[k], 1) for k in kinds],
    texttemplate="%{customdata}%",
))
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (suggestion_given_rate[k] - correct_rate[k]) / 100 for k in kinds],
    name='伪冲突，但不能接受全部脚本解决',
    marker_color='rgb(52,130,198)',
))
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (100 - suggestion_given_rate[k]) / 100 for k in kinds],
    name='该类型冲突总数',
    marker_color='#aaa'
))

fig.update_layout(
    title='各类别冲突伪冲突解决情况',
    xaxis_tickfont_size=14,
    width=1000,
    yaxis=dict(
        # title.font replaces the deprecated `titlefont` attribute, which
        # newer plotly releases no longer accept.
        title=dict(
            text='冲突数量',
            font=dict(size=16),
        ),
        tickfont_size=14,
    ),
    legend=dict(
        x=0.8,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='stack',
    bargap=0.15,
    bargroupgap=0.1
)

fig.show()