In [1]:
import re
import numpy as np

def remove_docstrings(code):
    """Strip every triple-quoted span from ``code``.

    Spans delimited by three double quotes are removed first, then spans
    delimited by three single quotes. Matching is non-greedy and may cross
    line boundaries, so any triple-quoted string is removed, not only
    docstrings.

    Args:
        code (str): Source text to clean.

    Returns:
        str: ``code`` with all triple-quoted spans replaced by the empty string.
    """
    stripped = code
    # Order matters: double-quoted spans are handled before single-quoted ones.
    for pattern in (r'""".*?"""', r"'''.*?'''"):
        stripped = re.sub(pattern, '', stripped, flags=re.DOTALL)
    return stripped

def cosine_similarity_matrix(nl_features, code_features):
    """Compute the pairwise cosine similarity between two sets of row vectors.

    Args:
        nl_features: 2-D array-like with one feature vector per row.
        code_features: 2-D array-like with one feature vector per row and the
            same number of columns as ``nl_features``.

    Returns:
        np.ndarray: Matrix whose entry ``[i, j]`` is the cosine similarity of
        row ``i`` of ``nl_features`` and row ``j`` of ``code_features``.
        Pairs involving an all-zero vector get similarity 0 instead of NaN.
    """
    # Accept plain (nested) lists as well as arrays.
    nl_features = np.asarray(nl_features)
    code_features = np.asarray(code_features)

    # Per-row L2 norms, kept as column vectors so they broadcast below.
    nl_norms = np.linalg.norm(nl_features, axis=1, keepdims=True)
    code_norms = np.linalg.norm(code_features, axis=1, keepdims=True)

    # Pairwise dot products and the matching products of norms.
    dot_product = nl_features @ code_features.T
    norm_product = nl_norms * code_norms.T

    # Divide only where the denominator is non-zero; zero-norm pairs keep the
    # initial value 0, which avoids 0/0 -> NaN and the accompanying warning.
    cosine_similarity = np.zeros(
        dot_product.shape, dtype=np.result_type(dot_product, norm_product)
    )
    np.divide(dot_product, norm_product, out=cosine_similarity, where=norm_product != 0)

    return cosine_similarity

In [2]:
def remove_padding_intervals(tokenized_id_data):
    """
    Drop every `[0,0]` interval from tokenized_id_data, except one that sits
    at the very first position.

    Args:
    - tokenized_id_data (list of lists): intervals, each a list of length 2.
      A NumPy array is converted to nested lists first.

    Returns:
    - filtered_data (list of lists): the intervals with `[0,0]` padding removed.
    """
    padding = [0, 0]

    if isinstance(tokenized_id_data, np.ndarray):
        intervals = tokenized_id_data.tolist()
    else:
        intervals = tokenized_id_data

    # A leading `[0,0]` is kept; it is re-created as a fresh list.
    has_leading_padding = intervals and intervals[0] == padding
    if has_leading_padding:
        kept = [[0, 0]]
        remainder = intervals[1:]
    else:
        kept = []
        remainder = intervals[0:]

    # Everything after the (optional) leading padding is kept unless it is padding.
    kept.extend(entry for entry in remainder if entry != padding)
    return kept

In [3]:
import json
def load_loss_data(filepath):
    """Read a JSON file and return its content as a NumPy array.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        np.ndarray: ``np.array`` built from the decoded JSON value.
    """
    # Decode explicitly as UTF-8 (as the other loaders in this notebook do)
    # instead of relying on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return np.array(data)

In [4]:
# Load the training set: one JSON object per line (JSON Lines).
with open("/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/auto_labelling/train.jsonl", "r", encoding="utf-8") as f:
    # Iterate the file lazily and skip blank lines, which json.loads would reject.
    train_dataset = [json.loads(line) for line in f if line.strip()]


In [5]:
import json
import os

# Path of the tokenized code-token file (tokenized_code_tokens_train.json).
json_file_path = '/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/auto_labelling/tokenized_code_tokens_train.json'

# Read and decode the whole JSON document.
with open(json_file_path, mode='r', encoding='utf-8') as f:
    code_tokens_data = json.loads(f.read())

# Path of the tokenized comment-token file (tokenized_comment_tokens_train.json).
json_file_path = '/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/auto_labelling/tokenized_comment_tokens_train.json'


# Read and decode the whole JSON document.
with open(json_file_path, mode='r', encoding='utf-8') as f:
    nl_tokens_data = json.loads(f.read())

In [6]:
# Palette of 10 preset high-contrast colors, given as hex color strings.
high_contrast_colors = [
    "#FF0000",
    "#00FF00",
    "#0000FF",
    "#FF00FF",
    "#00FFFF",
    "#800000",
    "#008000",
    "#000080",
    "#808080",
    "#FFA500",
]

In [403]:
import json
input_path = "/home/yiming/cophi/projects/fork/CodeBERT/GraphCodeBERT/codesearch/auto_labelling/auto_label_unique.jsonl"
idx_list = []    # 'idx' field of every record, in file order
match_list = []  # 'match' field of every record, aligned with idx_list

with open(input_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip().rstrip(',')  # drop surrounding whitespace and a trailing comma
        if not line:
            # Blank (or comma-only) lines carry no record; json.loads would raise on them.
            continue
        json_obj = json.loads(line)
        idx_list.append(json_obj['idx'])
        match_list.append(json_obj['match'])

In [510]:
# Position in idx_list / match_list of the sample to inspect.
indice = 42

In [511]:
# Fetch the match entry for the selected sample.
match_values = match_list[indice]

# Show the raw entry.
print("Corresponding values in match_list:", match_values)


def _expand_intervals(flat_bounds):
    """Expand a flat [start0, end0, start1, end1, ...] list into every covered index (ends inclusive)."""
    expanded = []
    for lower, upper in zip(flat_bounds[::2], flat_bounds[1::2]):
        expanded.extend(range(lower, upper + 1))
    return expanded


# One expanded index list per match, for the comment side and the code side.
comment_list = []
code_list = []

# Each match is a pair: (comment intervals, code intervals).
for comment_intervals, code_intervals in match_values:
    comment_list.append(_expand_intervals(comment_intervals))
    code_list.append(_expand_intervals(code_intervals))

# Show the expanded indices.
print("Comment indices list:", comment_list)
print("Code indices list:", code_list)


Corresponding values in match_list: [[[0, 2], [13, 31]], [[4, 4, 6, 6], [10, 10, 33, 36]]]
Comment indices list: [[0, 1, 2], [4, 6]]
Code indices list: [[13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31], [10, 33, 34, 35, 36]]


In [512]:
import numpy as np
from collections import deque
from IPython.display import display, HTML


import html  # stdlib: escapes text before it is embedded in the HTML output

token_list1 = nl_tokens_data[idx_list[indice]][1:]
doc_snippet = train_dataset[idx_list[indice]]["docstring"]
# Pair every token with a placeholder weight (always 1.0, not used for rendering)
# and strip the "Ġ" marker from the token text (presumably the tokenizer's
# leading-space marker -- TODO confirm).
tokens_with_contributions = deque([(token.replace("Ġ", ""), 1.0) for token in token_list1])

# Give every index group in comment_list its own color.
color_map = {}
for i, indices in enumerate(comment_list):
    color = high_contrast_colors[i % len(high_contrast_colors)]  # cycle through the palette
    for idx in indices:
        color_map[idx] = color

# Build the colored HTML output.
html_string = "<pre>"
buffer = ""
current_index = 0  # position (in token_list1) of the next token still to be matched

# Walk the original docstring character by character.
for char in doc_snippet:
    # A token that is empty after stripping can never equal a non-empty buffer;
    # skip it (still counting its position) so it cannot block later tokens.
    while tokens_with_contributions and not tokens_with_contributions[0][0]:
        tokens_with_contributions.popleft()
        current_index += 1

    if char == "\n":
        # Newline: flush the pending buffer, emit <br>, and start over.
        html_string += html.escape(buffer) + "<br>"
        buffer = ""
    elif tokens_with_contributions:
        # Current token to match (its weight is ignored).
        token, _ = tokens_with_contributions[0]
        buffer += char

        if buffer == token:
            # Full token matched: emit it in its group color (black if it has none).
            color = color_map.get(current_index, "black")
            html_string += f'<span style="color: {color};">{html.escape(buffer)}</span>'
            buffer = ""
            tokens_with_contributions.popleft()  # this token is done
            current_index += 1
        elif not token.startswith(buffer):
            # Buffer cannot become this token: emit its first character and
            # keep matching with the rest of the buffer.
            color = color_map.get(current_index, "black")
            html_string += f'<span style="color: {color};">{html.escape(buffer[0])}</span>'
            buffer = buffer[1:]

    else:
        # No tokens left to match: emit the character as-is.
        color = color_map.get(current_index, "black")
        html_string += f'<span style="color: {color};">{html.escape(char)}</span>'

html_string += html.escape(buffer)  # flush whatever is still buffered
html_string += "</pre>"

In [513]:
import html  # stdlib: escapes text before it is embedded in the HTML output

# Original code snippet (triple-quoted strings removed) and its tokens.
code_snippet = remove_docstrings(train_dataset[idx_list[indice]]["code"])
token_list2 = code_tokens_data[idx_list[indice]][1:]


# Pair every token with a placeholder weight (always 1.0, not used for rendering)
# and strip the "Ġ" marker from the token text (presumably the tokenizer's
# leading-space marker -- TODO confirm).
tokens_with_contributions = deque([(token.replace("Ġ", ""), 1.0) for token in token_list2])

# Give every index group in code_list its own color.
color_map = {}
for i, indices in enumerate(code_list):
    color = high_contrast_colors[i % len(high_contrast_colors)]  # cycle through the palette
    for idx in indices:
        color_map[idx] = color

# Build the colored HTML output.
formatted_output = "<pre>"
buffer = ""
current_index = 0  # position (in token_list2) of the next token still to be matched

# Walk the code snippet character by character.
for char in code_snippet:
    # A token that is empty after stripping can never equal a non-empty buffer;
    # skip it (still counting its position) so it cannot block later tokens.
    while tokens_with_contributions and not tokens_with_contributions[0][0]:
        tokens_with_contributions.popleft()
        current_index += 1

    if char == "\n":
        # Newline: flush the pending buffer, emit <br>, and start over.
        formatted_output += html.escape(buffer) + "<br>"
        buffer = ""
    elif tokens_with_contributions:
        # Current token to match (its weight is ignored).
        token, _ = tokens_with_contributions[0]

        buffer += char

        if buffer == token:
            # Full token matched: emit it in its group color (black if it has none).
            color = color_map.get(current_index, "black")
            formatted_output += f'<span style="color: {color};">{html.escape(buffer)}</span>'
            buffer = ""
            tokens_with_contributions.popleft()  # this token is done
            current_index += 1
        elif not token.startswith(buffer):
            # Buffer cannot become this token: emit its first character and
            # keep matching with the rest of the buffer.
            color = color_map.get(current_index, "black")
            formatted_output += f'<span style="color: {color};">{html.escape(buffer[0])}</span>'
            buffer = buffer[1:]

    else:
        # No tokens left to match: emit the character as-is.
        color = color_map.get(current_index, "black")
        formatted_output += f'<span style="color: {color};">{html.escape(char)}</span>'

formatted_output += html.escape(buffer)  # flush whatever is still buffered
formatted_output += "</pre>"

# Show the colored docstring (html_string is built in the preceding cell).
display(HTML(html_string))
# Show the colored code snippet.
display(HTML(formatted_output))