## 收集数据

最初计划从 MergeBERT 数据集中收集所有语言（Java/C#/JavaScript/TypeScript）的数据；
现已改为从 merge_nature 的 top 50 仓库（repos）中收集数据。

In [6]:
from collections import defaultdict
import os
import json
from util import Conflict
from tqdm import tqdm
from pathlib import Path

file_path = Path.cwd()  # absolute path of the notebook's working directory

def getAllSubFiles(dirPath: str):
    """Lazily yield the path of every ``.json`` file found beneath *dirPath*.

    The directory tree is walked recursively with ``os.walk``; each yielded
    path is the containing directory joined with the file name.
    """
    for parent, _, names in os.walk(dirPath):
        yield from (
            os.path.join(parent, name)
            for name in names
            if name.endswith(".json")
        )


def _split_lines(text):
    """Split one side of a conflict chunk into a list of lines.

    Leading/trailing newlines are stripped before splitting. A missing side
    (``None``) or a side that is empty after stripping yields ``[]``.
    """
    if text is None:
        # str(None) would otherwise produce the bogus one-line side ['None'].
        return []
    lines = str(text).strip("\n").split("\n")
    # An empty side splits to ['']; drop it so the lone empty string is not
    # treated as a real line when the sides are compared / diffed later.
    return [] if lines == [''] else lines


def _classify_resolution(ours, theirs, base, resolve):
    """Return the resolution kind of a conflict given its four line lists.

    The checks are ordered; the first one that matches wins:
    accept_ours, accept_theirs, accept_base, delete_all, newline (the
    resolution contains a line found in none of the three sides),
    concat_ours_theirs, concat_theirs_ours, and finally mixline (every
    resolution line comes from one of the sides).
    """
    if resolve == ours:
        return 'accept_ours'
    if resolve == theirs:
        return 'accept_theirs'
    if resolve == base:
        return 'accept_base'
    if resolve in ([''], []):
        return 'delete_all'
    # Build the pool of known lines once instead of re-concatenating the three
    # sides for every resolution line.
    known_lines = set(ours) | set(theirs) | set(base)
    if any(line not in known_lines for line in resolve):
        return 'newline'
    if resolve == ours + theirs:
        return 'concat_ours_theirs'
    if resolve == theirs + ours:
        return 'concat_theirs_ours'
    # Everything left consists solely of lines taken from the three sides.
    return 'mixline'


paths = list(getAllSubFiles(file_path / "data/index_conflict_files/"))

fields = ["ours", "theirs", "base", "resolve"]
results = []

whitespace_conflict = 0  # NOTE(review): never updated or read in this cell

kind_counter = defaultdict(int)

for jsonPath in tqdm(paths):
    with open(jsonPath, encoding="utf-8") as jsonFile:
        jsonData = json.load(jsonFile)

    for chunk in jsonData["conflicting_chunks"]:
        # Chunks without a resolution (key missing or null) are not collected.
        if chunk.get('resolve') is None:
            continue

        codes = [
            _split_lines(chunk[key])
            for key in ('a_contents', 'b_contents', 'base_contents', 'resolve')
        ]
        conflict_map = dict(zip(fields, codes))

        # NOTE(review): this instance is never used below; kept in case
        # Conflict.__init__ validates or normalizes its arguments - confirm
        # and remove.
        conflict = Conflict(*codes)

        conflict_map['resolution_kind'] = _classify_resolution(*codes)
        kind_counter[conflict_map['resolution_kind']] += 1

        results.append(conflict_map)

print(f'共收集到 {len(results)} 个冲突块')

for k, v in kind_counter.items():
    print(f'{k}:{v} , {v/len(results)*100:.2f}%')

output_path = file_path / "output/self_collected_most_50.json"
# Create the output directory on first run instead of failing in open().
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(results, f)


100%|██████████| 26369/26369 [00:10<00:00, 2550.50it/s]


共收集到 61166 个冲突块
accept_ours:28541 , 46.66%
mixline:6997 , 11.44%
accept_theirs:13620 , 22.27%
newline:10358 , 16.93%
delete_all:255 , 0.42%
concat_ours_theirs:915 , 1.50%
accept_base:146 , 0.24%
concat_theirs_ours:334 , 0.55%


## 处理数据

分析各类别占比，以及采用规则能够解决的数量占比

In [2]:
import re
from collections import defaultdict
import os
import json
from util import Conflict
from tqdm import tqdm
from pathlib import Path
from dpbt import compute

file_path = Path.cwd()  # absolute path of the notebook's working directory

# Load the collected conflict dataset from disk.
with (file_path / 'output/self_collected_most_50.json').open('r') as f:
    data = json.load(f)

# Rebuild one Conflict per record; the values are passed positionally in
# exactly this key order.
_RECORD_KEYS = ('ours', 'theirs', 'base', 'resolve', 'resolution_kind')
conflicts = [Conflict(*(record[key] for key in _RECORD_KEYS)) for record in data]

correct_with_no_empty_line = 0
resolution_offerable_kind = defaultdict(int)
kind_counter = defaultdict(int)
correct_counter = defaultdict(int)

# Compiled once instead of being looked up for every compared line.
_BLANK_LINE = re.compile(r'^\s*$')


def if_empty_line(line):
    """Return True when *line* is empty or consists solely of whitespace."""
    return _BLANK_LINE.match(line) is not None


def _without_blank_lines(lines):
    """Return *lines* with every empty / whitespace-only line removed."""
    return [line for line in lines if not if_empty_line(line)]


def _merge_disjoint_edits(base, ours, theirs, from_ours, from_theirs):
    """Apply both sides' edits to *base*, or return None if two edits overlap.

    *from_ours* / *from_theirs* are the edits produced by ``compute(base, side)``,
    already sorted by ``seq1Range.start``. Each edit's ``seq1Range`` is used to
    slice *base* and its ``seq2Range`` to slice the corresponding side.
    The edits are interleaved in base order with two cursors; when two edits
    start at the same base position, the one whose base range ends first (ours
    on ties) is emitted first.

    Returns the merged list of lines, or ``None`` as soon as an ours-edit and a
    theirs-edit have intersecting base ranges (presumably ``intersect`` returns
    ``None`` when the ranges do not overlap - confirm against dpbt).
    """
    merged = []
    cursor = 0  # first base line not yet copied into `merged`
    oi = ti = 0
    while oi < len(from_ours) and ti < len(from_theirs):
        ours_edit, theirs_edit = from_ours[oi], from_theirs[ti]
        ours_start, ours_end = ours_edit.seq1Range.start, ours_edit.seq1Range.end
        theirs_start, theirs_end = theirs_edit.seq1Range.start, theirs_edit.seq1Range.end

        if ours_edit.seq1Range.intersect(theirs_edit.seq1Range) is not None:
            return None

        if ours_start < theirs_start:
            merged.extend(base[cursor:ours_start])
            merged.extend(ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
            cursor = ours_end
            oi += 1
        elif ours_start > theirs_start:
            merged.extend(base[cursor:theirs_start])
            merged.extend(theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
            cursor = theirs_end
            ti += 1
        else:
            # Same start position: emit both edits, shorter base range first.
            merged.extend(base[cursor:ours_start])
            if ours_end <= theirs_end:
                merged.extend(ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
                merged.extend(theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
                cursor = theirs_end
            else:
                merged.extend(theirs[theirs_edit.seq2Range.start:theirs_edit.seq2Range.end])
                merged.extend(ours[ours_edit.seq2Range.start:ours_edit.seq2Range.end])
                cursor = ours_end
            oi += 1
            ti += 1

    # At most one of the two edit lists still has entries left; flush them.
    for edit in from_ours[oi:]:
        merged.extend(base[cursor:edit.seq1Range.start])
        cursor = edit.seq1Range.end
        merged.extend(ours[edit.seq2Range.start:edit.seq2Range.end])
    for edit in from_theirs[ti:]:
        merged.extend(base[cursor:edit.seq1Range.start])
        cursor = edit.seq1Range.end
        merged.extend(theirs[edit.seq2Range.start:edit.seq2Range.end])
    merged.extend(base[cursor:])  # append whatever tail of base remains
    return merged


for conflict in tqdm(conflicts, desc="Processing conflicts"):
    kind_counter[conflict.resolution_kind] += 1

    from_ours = sorted(compute(conflict.base, conflict.ours),
                       key=lambda x: x.seq1Range.start)
    from_theirs = sorted(compute(conflict.base, conflict.theirs),
                         key=lambda x: x.seq1Range.start)

    generated = _merge_disjoint_edits(
        conflict.base, conflict.ours, conflict.theirs, from_ours, from_theirs)
    if generated is None:  # the two sides' edits overlap: no suggestion offered
        continue
    resolution_offerable_kind[conflict.resolution_kind] += 1

    if generated == conflict.resolution:  # exact match with the recorded resolution
        correct_counter[conflict.resolution_kind] += 1

    # Second, looser comparison that ignores blank lines on both sides.
    # TODO: also consider ignoring whitespace differences within lines.
    if _without_blank_lines(generated) == _without_blank_lines(conflict.resolution):
        correct_with_no_empty_line += 1

# Collapse the per-kind tallies into overall totals.
correct = sum(correct_counter.values())
resolution_offerable = sum(resolution_offerable_kind.values())
conflict_sum = sum(kind_counter.values())

# Overall figures: how often a suggestion could be offered, and how often it
# matched the recorded resolution (exactly / ignoring blank lines).
print(f'没有重叠，可以提供推荐的数量: {resolution_offerable}/{conflict_sum} = {resolution_offerable/conflict_sum * 100}%')
print(f'过滤空行之前 准确率: {correct}/{conflict_sum} = {correct/conflict_sum * 100}%，占可以提供推荐的 {correct/resolution_offerable * 100}%')
print(f'过滤空行之后 准确率: {correct_with_no_empty_line}/{conflict_sum} = {correct_with_no_empty_line/conflict_sum * 100}%，占可以提供推荐的 {correct_with_no_empty_line/resolution_offerable * 100}%')

# Blank separator line followed by the per-kind conflict counts.
print('', '各种类型的冲突数量：', sep='\n')
for kind, total in kind_counter.items():
    print(f'{kind}:{total} , {total/conflict_sum*100:.2f}%')

# Blank separator line followed by the per-kind exact-match rate.
print('', '各种类型的冲突正确率：', sep='\n')
for kind, total in kind_counter.items():
    hits = correct_counter[kind]
    print(f'{kind}:{hits}/{total} , {hits/total*100:.2f}%')

Processing conflicts: 100%|██████████| 61166/61166 [12:11<00:00, 83.58it/s]   

没有重叠，可以提供推荐的数量: 15766/61166 = 25.77575777392669%
过滤空行之前 准确率: 5867/61166 = 9.59193015727692%，占可以提供推荐的 37.21298997843461%
过滤空行之后 准确率: 6850/61166 = 11.199032142039696%，占可以提供推荐的 43.44792591652924%

各种类型的冲突数量：
accept_ours:28541 , 46.66%
mixline:6997 , 11.44%
accept_theirs:13620 , 22.27%
newline:10358 , 16.93%
delete_all:255 , 0.42%
concat_ours_theirs:915 , 1.50%
accept_base:146 , 0.24%
concat_theirs_ours:334 , 0.55%

各种类型的冲突正确率：
accept_ours:479/28541 , 1.68%
mixline:4066/6997 , 58.11%
accept_theirs:374/13620 , 2.75%
newline:0/10358 , 0.00%
delete_all:109/255 , 42.75%
concat_ours_theirs:838/915 , 91.58%
accept_base:1/146 , 0.68%
concat_theirs_ours:0/334 , 0.00%





## 结果可视化

In [1]:
# Hard-coded number of conflicts per resolution kind, used by the plotting cells.
kind_counter = dict(
    accept_ours=28541,
    mixline=6997,
    accept_theirs=13620,
    newline=10358,
    delete_all=255,
    concat_ours_theirs=915,
    accept_base=146,
    concat_theirs_ours=334,
)

In [5]:
# 绘制所有类型冲突的占比饼图
import plotly.graph_objects as go

# Slice order chosen to line up with the corresponding MergeBERT dataset figure.
_labels = [
    'newline',
    'mixline',
    'accept_ours',
    'accept_theirs',
    'concat_ours_theirs',
    'concat_theirs_ours',
    'delete_all',
    'accept_base',
]
_values = [kind_counter[name] for name in _labels]

# sort=False keeps the slices in the order of _labels rather than letting
# plotly reorder them by size.
pie_trace = go.Pie(labels=_labels, values=_values, sort=False)
fig = go.Figure(data=[pie_trace])

# Title and fixed width for nicer rendering.
fig.update_layout(title_text='解决类型分布', width=600)

# Render the pie chart.
fig.show()

In [13]:
import pandas as pd
# Share of pseudo-conflicts per resolution kind (all-languages data): the
# percentage of conflicts of that kind for which a suggestion was offered.
suggestion_given_rate = {
    kind: resolution_offerable_kind[kind]/total*100
    for kind, total in kind_counter.items()
}

_rate_col = '伪冲突占比'
suggestion_given_rate_df_dic = {
    '类别': suggestion_given_rate.keys(),
    _rate_col: suggestion_given_rate.values(),
}
falsepositive_rate = pd.DataFrame(suggestion_given_rate_df_dic)
# Round to two decimals first, then render each value as a percent string.
falsepositive_rate[_rate_col] = falsepositive_rate[_rate_col].round(2)
falsepositive_rate[_rate_col] = falsepositive_rate[_rate_col].apply(lambda pct: f'{pct}%')
falsepositive_rate

Unnamed: 0,类别,伪冲突占比
0,accept_ours,14.46%
1,mixline,77.53%
2,accept_theirs,21.79%
3,newline,17.3%
4,delete_all,55.69%
5,concat_ours_theirs,97.16%
6,accept_base,66.44%
7,concat_theirs_ours,97.9%


In [9]:
# 所有语言数据中冲突的解决比例
import pandas as pd
# Per resolution kind: percentage of all conflicts of that kind whose
# generated merge matched the recorded resolution exactly.
correct_rate = {
    kind: correct_counter[kind]/total*100
    for kind, total in kind_counter.items()
}

_accuracy_col = '正确率'
correct_rate_df_dic = {
    '类别': correct_rate.keys(),
    _accuracy_col: correct_rate.values(),
}
correct_rate_df = pd.DataFrame(correct_rate_df_dic)
# Round to two decimals first, then render each value as a percent string.
correct_rate_df[_accuracy_col] = correct_rate_df[_accuracy_col].round(2)
correct_rate_df[_accuracy_col] = correct_rate_df[_accuracy_col].apply(lambda pct: f'{pct}%')
correct_rate_df

Unnamed: 0,类别,正确率
0,accept_ours,1.68%
1,mixline,58.11%
2,accept_theirs,2.75%
3,newline,0.0%
4,delete_all,42.75%
5,concat_ours_theirs,91.58%
6,accept_base,0.68%
7,concat_theirs_ours,0.0%


In [10]:
# 根据 伪冲突解决正确率 降序排列
# 所有语言数据中各类别伪冲突的正确率
import pandas as pd
# Per resolution kind: of the conflicts for which a suggestion was offered,
# the percentage whose suggestion was correct.
_precision_col = '伪冲突解决正确率'
correct_out_of_givin_dic = {
    '类别': correct_rate.keys(),
    _precision_col: [
        correct_rate[kind]/suggestion_given_rate[kind] * 100
        for kind in correct_rate
    ],
}
# Sort by that percentage, highest first, before formatting it as text.
correct_out_of_givin_df = pd.DataFrame(correct_out_of_givin_dic).sort_values(
    by=_precision_col, ascending=False)
correct_out_of_givin_df[_precision_col] = correct_out_of_givin_df[_precision_col].round(2)
correct_out_of_givin_df[_precision_col] = correct_out_of_givin_df[_precision_col].apply(
    lambda pct: f'{pct}%')
correct_out_of_givin_df

Unnamed: 0,类别,伪冲突解决正确率
5,concat_ours_theirs,94.26%
4,delete_all,76.76%
1,mixline,74.95%
2,accept_theirs,12.6%
0,accept_ours,11.61%
6,accept_base,1.03%
3,newline,0.0%
7,concat_theirs_ours,0.0%


In [21]:
# 使用 plotly 绘制条形图，横坐标是各个类型，
# 纵坐标高度由三部分组成：灰色表示总数减去给出建议的部分，绿色部分表示正确的部分、蓝色表示给出建议但不正确的部分

import plotly.graph_objects as go

# Stacked bar chart with one bar per resolution kind. Each bar is split into
# three segments whose heights are conflict counts derived from the per-kind
# percentages computed in the earlier cells.
kinds = list(kind_counter.keys())

fig = go.Figure()
# Green: conflicts whose suggestion was correct; labelled with the rate in %.
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * correct_rate[k] / 100 for k in kinds],
    name='correct',
    marker_color='green',
    customdata=[round(correct_rate[k], 1) for k in kinds],
    texttemplate="%{customdata}%",
))
# Blue: conflicts that received a suggestion which was not correct.
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (suggestion_given_rate[k] - correct_rate[k]) / 100 for k in kinds],
    name='incorrect',
    marker_color='rgb(52,130,198)',
))
# Grey: conflicts for which no suggestion was offered (total minus suggested);
# the legend label is kept as 'total'.
fig.add_trace(go.Bar(
    x=kinds,
    y=[kind_counter[k] * (100 - suggestion_given_rate[k]) / 100 for k in kinds],
    name='total',
    marker_color='#aaa'
))

fig.update_layout(
    title='各类别冲突伪冲突解决情况',
    xaxis_tickfont_size=14,
    width=1000,
    yaxis=dict(
        # `titlefont` is deprecated; set the axis title font through
        # title.font instead.
        title=dict(text='冲突数量', font=dict(size=16)),
        tickfont_size=14,
    ),
    legend=dict(
        x=0.8,
        y=1.0,
        bgcolor='rgba(255, 255, 255, 0)',
        bordercolor='rgba(255, 255, 255, 0)'
    ),
    barmode='stack',
    bargap=0.15,
    bargroupgap=0.1
)

fig.show()