In [5]:
import json
import os
import re

# Languages handled by this notebook
LANGS = [
    'java',
    'python',
    'javascript',
    'typescript',
    'go'
]

# Working directories used by the later cells.
# `repo_url` is listed as well: extract_distinct_repo_url_for_lang writes
# `repo_url/<lang>_repo_urls.json`, and open() does not create missing folders.
NECESSARY_DIRS = [f'dataset/{x}' for x in LANGS] \
    + [f'enre_out/{x}' for x in LANGS] \
    + [f'repo/{x}' for x in LANGS] \
    + ['repo_url']

for d in NECESSARY_DIRS:
    os.makedirs(d, exist_ok=True)


In [6]:
# Extract the repository URLs of each dataset (existing ones are ignored)

# Adjust for your setup: directory that stores the CoEdPilot training set.
# When the COEDPILOT_LOCATOR_DATA environment variable is set it overrides the
# machine-specific default below, so the cell does not have to be edited.
base_folder = os.environ.get(
    "COEDPILOT_LOCATOR_DATA",
    r"C:\Users\aaa\Desktop\CoEdPilot-final-phase\dataset&model\locator_data",
)

def commit_url_to_repo_url(commit_url):
    """Return the part of ``commit_url`` that precedes its trailing ``/commit/<id>``.

    The pattern requires the URL to end with ``/commit/`` followed by word
    characters. When it does not, the search yields ``None`` and subscripting
    it raises ``TypeError``.
    """
    commit_suffix_match = re.search(r"(.*?)/commit/\w+$", commit_url)
    return commit_suffix_match[1]

# Collect the repository URLs of the entries in the base_folder dataset file `from_path`
# whose file name ends with `.lang_suffix`, and store them in `to_path`.
def extract_distinct_repo_url(from_path, to_path, lang_suffix):
    """Write the distinct repository URLs found in a JSON-lines dataset file.

    Args:
        from_path: Path of the JSON-lines file, relative to ``base_folder``.
            Each non-blank line must be a JSON object with the keys
            ``"file_name"`` and ``"html_url"``.
        to_path: Destination file; receives a JSON list of URLs. Its parent
            directory is created when missing.
        lang_suffix: File extension without the dot (e.g. ``"java"``); only
            entries whose ``file_name`` ends with ``.<lang_suffix>`` count.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``"file_name"`` or ``"html_url"``.
    """
    distinct_repo_url = set()
    with open(os.path.join(base_folder, from_path), 'r', encoding="utf-8") as file:
        # Iterate the file lazily instead of loading every line with readlines().
        for line in file:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no record.
                continue
            json_data = json.loads(line)
            if json_data["file_name"].endswith(f".{lang_suffix}"):
                distinct_repo_url.add(commit_url_to_repo_url(json_data["html_url"]))

    # open() does not create missing folders, so prepare the destination first.
    out_dir = os.path.dirname(to_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(to_path, "w", encoding="utf-8") as f:
        # Sorted so repeated runs produce the same file; set order is arbitrary.
        json.dump(sorted(distinct_repo_url), f)

# Extract the repository URLs for the given lang
def extract_distinct_repo_url_for_lang(lang, lang_suffix):
    """Run ``extract_distinct_repo_url`` on ``<lang>/test.jsonl``.

    The result is written to ``repo_url/<lang>_repo_urls.json``; only the
    ``test`` split is read.
    """
    source_file = f"{lang}/test.jsonl"
    target_file = f"repo_url/{lang}_repo_urls.json"
    extract_distinct_repo_url(source_file, target_file, lang_suffix)

In [8]:
# Maps each split name to the set of repository URLs of its `.java` entries.
sets = {}

DATASET_TYPES = ["test", "train", "dev"]
for i in DATASET_TYPES:
    split_path = os.path.join(base_folder, f"java/{i}.jsonl")
    with open(split_path, 'r', encoding="utf-8") as file:
        # map() is lazy, so every line is parsed and filtered in turn.
        sets[i] = {
            commit_url_to_repo_url(json_data["html_url"])
            for json_data in map(json.loads, file.readlines())
            if json_data["file_name"].endswith(f".java")
        }

In [15]:
# NOTE(review): this call used to pass "" as the language. That produces the
# input path "/test.jsonl" (a rooted path, so os.path.join discards base_folder)
# and the output file "repo_url/_repo_urls.json". "javascript" is assumed from
# the "js" suffix and the LANGS list; confirm the dataset folder name.
extract_distinct_repo_url_for_lang("javascript", "js")

In [1]:
# 运行对应语言的 ENRE 程序，并制作数据集

import make_nn_dataset_for_lang
import importlib
importlib.reload(make_nn_dataset_for_lang)
import json

# Run make_dataset for the given language
def make_dataset_for_lang(lang):
    """Call ``make_nn_dataset_for_lang.make_dataset`` for every stored repository URL.

    The URLs are read from ``repo_url/<lang>_repo_urls.json``. Repositories
    are processed in list order; an exception raised for one of them
    propagates and ends the loop.
    """
    url_list_path = f"repo_url/{lang}_repo_urls.json"
    with open(url_list_path, "r", encoding="utf-8") as f:
        repos = json.load(f)

    for repo_url in repos:
        make_nn_dataset_for_lang.make_dataset(repo_url, lang)

In [3]:
# Run make_dataset for a single repository (zipkin), passing "java" as the language.
make_nn_dataset_for_lang.make_dataset("https://github.com/openzipkin/zipkin", "java")

Ignoring https://github.com/openzipkin/zipkin. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\zipkin-enre-out\zipkin-out.json existed. Skipping...


100%|██████████| 546/546 [00:28<00:00, 19.07it/s]


pos_n: 62104, neg_n: 10935
Totally 73039 samples
dataset length: 73039


In [3]:
import json
import random
# Load the content of dataset/java/zipkin/all.json for inspection.
# UTF-8 is requested explicitly, like the other dataset reads in this notebook,
# so the result does not depend on the platform's default encoding.
with open('dataset/java/zipkin/all.json', 'r', encoding='utf-8') as f:
    temp = json.load(f)

In [10]:
# Pretty-print one randomly chosen element of `temp`.
print(json.dumps(random.choice(temp), indent=4))

[
    [
        "    Endpoint three = Endpoint.newBuilder().serviceName(db).ip(\"127.0.0.3\").build();\n\n    List<Span> trace = asList(\n      Span.newBuilder().traceId(traceId).id(\"10\").name(\"get\")\n        .kind(Kind.SERVER)\n",
        "    Endpoint two = Endpoint.newBuilder().serviceName(backend).ip(\"127.0.0.2\").build();\n    Endpoint twoPort3002 = two.toBuilder().port(3002).build();\n    Endpoint three = Endpoint.newBuilder().serviceName(db).ip(\"127.0.0.3\").build();\n\n    List<Span> trace = asList(\n      Span.newBuilder().traceId(traceId).id(\"10\").name(\"get\")\n        .kind(Kind.SERVER)\n        .timestamp(TODAY * 1000L)\n        .duration(350 * 1000L)\n        .localEndpoint(one)\n"
    ],
    [
        1,
        1
    ],
    [
        {
            "src": 45104,
            "dest": 45116,
            "type": "Set"
        },
        {
            "src": 45104,
            "dest": 45115,
            "type": "Set"
        }
    ]
]


In [2]:
# Process every repository listed in repo_url/java_repo_urls.json.
# The output recorded below ends with a JSONDecodeError raised after the
# termux-app log lines.
make_dataset_for_lang("java")

Ignoring https://github.com/openzipkin/zipkin. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\zipkin-enre-out\zipkin-out.json existed. Skipping...


100%|██████████| 546/546 [00:56<00:00,  9.63it/s]


pos_n: 18858, neg_n: 37860
Totally 56718 samples
dataset length: 56718
Ignoring https://github.com/signalapp/Signal-Android. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\Signal-Android-enre-out\Signal-Android-out.json existed. Skipping...


100%|██████████| 2352/2352 [01:38<00:00, 23.87it/s]


pos_n: 36336, neg_n: 73356
Totally 109692 samples
dataset length: 109692
Ignoring https://github.com/oracle/graal. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\graal-enre-out\graal-out.json existed. Skipping...


100%|██████████| 11834/11834 [26:25<00:00,  7.46it/s]  


pos_n: 304324, neg_n: 612708
Totally 917032 samples
dataset length: 917032
Ignoring https://github.com/scwang90/SmartRefreshLayout. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\SmartRefreshLayout-enre-out\SmartRefreshLayout-out.json existed. Skipping...


100%|██████████| 206/206 [00:04<00:00, 47.09it/s]


pos_n: 384, neg_n: 2412
Totally 2796 samples
dataset length: 2796
Ignoring https://github.com/redisson/redisson. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\redisson-enre-out\redisson-out.json existed. Skipping...


100%|██████████| 2482/2482 [01:58<00:00, 20.98it/s] 


pos_n: 44037, neg_n: 89479
Totally 133516 samples
dataset length: 133516
Ignoring https://github.com/winterbe/java8-tutorial. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\java8-tutorial-enre-out\java8-tutorial-out.json existed. Skipping...


100%|██████████| 68/68 [00:00<00:00, 96.13it/s] 


pos_n: 315, neg_n: 631
Totally 946 samples
dataset length: 946
Ignoring https://github.com/proxyee-down-org/proxyee-down. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\proxyee-down-enre-out\proxyee-down-out.json existed. Skipping...


100%|██████████| 42/42 [00:01<00:00, 34.00it/s]


pos_n: 174, neg_n: 610
Totally 784 samples
dataset length: 784
Ignoring https://github.com/spring-projects/spring-framework. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\spring-framework-enre-out\spring-framework-out.json existed. Skipping...


100%|██████████| 7847/7847 [07:39<00:00, 17.07it/s] 


pos_n: 133153, neg_n: 270576
Totally 403729 samples
dataset length: 403729
Ignoring https://github.com/skylot/jadx. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\jadx-enre-out\jadx-out.json existed. Skipping...


100%|██████████| 1530/1530 [00:37<00:00, 41.09it/s] 


pos_n: 21642, neg_n: 43466
Totally 65108 samples
dataset length: 65108
Ignoring https://github.com/zxing/zxing. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\zxing-enre-out\zxing-out.json existed. Skipping...


100%|██████████| 515/515 [00:19<00:00, 26.25it/s]


pos_n: 6972, neg_n: 14070
Totally 21042 samples
dataset length: 21042
Ignoring https://github.com/spring-projects/spring-boot. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\spring-boot-enre-out\spring-boot-out.json existed. Skipping...


100%|██████████| 6667/6667 [03:06<00:00, 35.72it/s] 


pos_n: 61810, neg_n: 125448
Totally 187258 samples
dataset length: 187258
Ignoring https://github.com/termux/termux-app. Repo already cloned
ENRE out file c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\enre_out\java\termux-app-enre-out\termux-app-out.json existed. Skipping...


JSONDecodeError: Invalid \escape: line 377581 column 26 (char 10068035)

In [6]:
import os
import json
import random
import math
from run import train_on_single_lang, run_test
from itertools import chain

# Names of the dataset splits; the combine functions below read and write "<split>.json" for each.
DATASET_TYPES = ["test", "train", "valid"]

def combine_dataset_for_lang(lang, dataset_types=None):
    """Merge the per-repository split files of one language.

    For every split name, the lists stored in
    ``dataset/<lang>/<repo>/<split>.json`` are concatenated and written to
    ``dataset/<lang>/<split>.json``.

    Args:
        lang: Name of the folder below ``dataset/``.
        dataset_types: Split names to merge. Defaults to the module-level
            ``DATASET_TYPES``; passing it explicitly avoids depending on
            whichever cell last redefined that global.

    Raises:
        FileNotFoundError: If a repository folder lacks one of the split files.
    """
    if dataset_types is None:
        dataset_types = DATASET_TYPES
    dataset_dir = f"dataset/{lang}"
    with os.scandir(dataset_dir) as entries:
        # Sorted so the merged order does not depend on the directory listing order.
        repo_dirs = sorted(entry.path for entry in entries if entry.is_dir())
    for dataset_type in dataset_types:
        all_data = []
        for repo_dir in repo_dirs:
            with open(os.path.join(repo_dir, f"{dataset_type}.json"), "r", encoding="utf-8") as f:
                all_data += json.load(f)
        with open(os.path.join(dataset_dir, f"{dataset_type}.json"), "w", encoding="utf-8") as f:
            json.dump(all_data, f)

def combine_dataset_for_multiple_lang(langs, out_dir, weights=None, dataset_types=None):
    """Merge the split files of several languages into one shuffled dataset.

    For every split name, ``dataset/<lang>/<split>.json`` is read for each
    language, optionally down-sampled, concatenated, shuffled and written to
    ``<out_dir>/<split>.json``.

    Args:
        langs: Names of the folders below ``dataset/`` to merge.
        out_dir: Destination folder; created when missing.
        weights: Optional mapping ``lang -> fraction``. A language missing
            from the mapping gets 1.0. A value below 1.0 keeps a random
            sample of ``floor(len * weight)`` items; other values keep
            everything. The caller's mapping is not modified.
        dataset_types: Split names to merge. Defaults to the module-level
            ``DATASET_TYPES``.
    """
    if dataset_types is None:
        dataset_types = DATASET_TYPES
    # Copy before filling in defaults so the caller's dict stays untouched.
    weights = dict(weights) if weights else {}
    for lang in langs:
        weights.setdefault(lang, 1.0)
    print(weights)

    all_data = {lang: {dataset_type: [] for dataset_type in dataset_types} for lang in langs}

    for lang in langs:
        dataset_dir = f"dataset/{lang}"
        for dataset_type in dataset_types:
            with open(os.path.join(dataset_dir, f"{dataset_type}.json"), "r", encoding="utf-8") as f:
                data_to_add = json.load(f)
            if weights[lang] < 1.0:
                data_to_add = random.sample(data_to_add, math.floor(len(data_to_add) * weights[lang]))
            all_data[lang][dataset_type] += data_to_add

    # open() does not create missing folders, so prepare the destination first.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for dataset_type in dataset_types:
        data = list(chain.from_iterable(all_data[lang][dataset_type] for lang in langs))
        random.shuffle(data)
        with open(os.path.join(out_dir, f"{dataset_type}.json"), "w", encoding="utf-8") as f:
            json.dump(data, f)

def train_on_lang_default(lang):
    """Print the run settings, then call ``train_on_single_lang`` for ``lang``.

    Fixed settings: model 'huggingface/CodeBERTa-small-v1', batch size 32,
    train and test both enabled. The last argument is ``None``; presumably
    the checkpoint path, to be verified against ``run.train_on_single_lang``.
    """
    model_name, batch_size = 'huggingface/CodeBERTa-small-v1', 32
    train = test = True
    settings_line = f'--model: {model_name}, --lang: {lang}, --train: {train}, --test {test}, --batch_size: {batch_size}'
    print(settings_line)
    train_on_single_lang(model_name, lang, batch_size, train, test, None)


In [3]:
# Merge the per-repository split files under dataset/python into dataset/python/<split>.json.
combine_dataset_for_lang("python")

In [7]:
# Train and test with the default settings, passing "java_small" as the language name.
train_on_lang_default("java_small")

--model: huggingface/CodeBERTa-small-v1, --lang: java, --train: True, --test True, --batch_size: 32


Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You shoul

No checkpoint found. Training from scratch...


  0%|          | 0/478853 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (689 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 478853/478853 [05:14<00:00, 1522.07it/s]
100%|██████████| 68407/68407 [00:44<00:00, 1548.38it/s]
100%|██████████| 136816/136816 [01:29<00:00, 1529.26it/s]
Training: 100%|██████████| 14965/14965 [3:24:59<00:00,  1.22it/s, loss=1.01]    
Training: 100%|██████████| 14965/14965 [3:18:42<00:00,  1.26it/s, loss=0.0333]  
100%|██████████| 2138/2138 [08:24<00:00,  4.23it/s]


mode: dev, A depend on B
acc: 0.9698130308301782, precision: 0.964363916359874, recall: 0.985637696129064, F1: 0.9748847618005132
mode: dev, A depend by B
acc: 0.9673132866519508, precision: 0.96370441922241, recall: 0.9819635826771653, F1: 0.9727483241925655
Validation Loss: 0.10413794356012339


100%|██████████| 4276/4276 [16:48<00:00,  4.24it/s]


mode: test, A depend on B
acc: 0.9695284177289206, precision: 0.9641039730511025, recall: 0.9857828588179254, F1: 0.9748229027641059
mode: test, A depend by B
acc: 0.9670140919190738, precision: 0.9636727791703146, recall: 0.9818661713671245, F1: 0.9726844089893898
Test Loss: 0.10475712369907894


Training: 100%|██████████| 14965/14965 [3:00:18<00:00,  1.38it/s, loss=0.0292]   
Training: 100%|██████████| 14965/14965 [7:06:37<00:00,  1.71s/it, loss=0.0116]   
100%|██████████| 2138/2138 [18:00<00:00,  1.98it/s]


mode: dev, A depend on B
acc: 0.9746078617685324, precision: 0.9666482844606421, recall: 0.991490826816192, F1: 0.978911969308842
mode: dev, A depend by B
acc: 0.9740669814492668, precision: 0.9660862474216914, recall: 0.9911417322834646, F1: 0.9784536157601963
Validation Loss: 0.08887577650657358


100%|██████████| 4276/4276 [28:28<00:00,  2.50it/s]


mode: test, A depend on B
acc: 0.9747178692550579, precision: 0.9668389216993714, recall: 0.9917677378378709, F1: 0.9791446848791429
mode: test, A depend by B
acc: 0.974045433282657, precision: 0.9663763419080413, recall: 0.991091939977516, F1: 0.97857810756191
Test Loss: 0.08917729847261227


Training:   1%|          | 156/14965 [04:53<7:44:48,  1.88s/it, loss=0.0111] 


KeyboardInterrupt: 

In [4]:
# Run the test routine with the Java checkpoint file and "python" as the language.
# Presumably a cross-language evaluation; the argument meanings are defined in
# run.run_test, which is not shown in this notebook.
run_test("model/java/model_checkpoint_java_23_12_24.bin", "python", 'huggingface/CodeBERTa-small-v1', 32)

Some weights of the model checkpoint at huggingface/CodeBERTa-small-v1 were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You shoul

Loading checkpoint...


  9%|▉         | 2269/25469 [00:00<00:08, 2676.12it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (755 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 25469/25469 [00:12<00:00, 2097.12it/s]
100%|██████████| 796/796 [03:20<00:00,  3.97it/s]

mode: test, A depend on B
acc: 0.7414503906710118, precision: 0.5545750854598934, recall: 0.8766021613470721, F1: 0.6793592053367095
mode: test, A depend by B
acc: 0.5549491538733362, precision: 0.3475036179450072, recall: 0.4824814768303403, F1: 0.40401703559598295
Test Loss: 1.5594558830387029





In [5]:
# Merge the python and java splits into dataset/python_java: python is kept in
# full (weight 1.0) and java is cut to a random 10% sample (weight 0.1).
combine_dataset_for_multiple_lang(["python", "java"], "dataset/python_java", {"python": 1.0, "java": 0.1})

{'python': 1.0, 'java': 0.1}


In [2]:
def train_on_lang_from_checkpoint(lang, checkpoint_path=None, epoch=10, batch_size=32):
    """Print the run settings, then call ``train_on_single_lang`` for ``lang``.

    The model is always 'huggingface/CodeBERTa-small-v1' with train and test
    both enabled. ``checkpoint_path`` and ``epoch`` are forwarded unchanged as
    the last two positional arguments.
    """
    model_name = 'huggingface/CodeBERTa-small-v1'
    train = test = True
    settings_line = f'--model: {model_name}, --lang: {lang}, --train: {train}, --test {test}, --batch_size: {batch_size}'
    print(settings_line)
    train_on_single_lang(model_name, lang, batch_size, train, test, checkpoint_path, epoch)

# Train on "java_new_small" with epoch=6 and batch size 16; no checkpoint path is passed.
train_on_lang_from_checkpoint("java_new_small", epoch=6, batch_size=16)
# Disabled alternative: "python_java" with the checkpoint pattern ./model/java_small/*.bin.
# train_on_lang_from_checkpoint("python_java", "./model/java_small/*.bin", epoch=8, batch_size=16)

--model: huggingface/CodeBERTa-small-v1, --lang: java_small, --train: True, --test True, --batch_size: 16


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No checkpoint found. Training from scratch...


  1%|          | 1607/143655 [00:00<01:00, 2349.59it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2800 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 143655/143655 [01:00<00:00, 2362.15it/s]
100%|██████████| 20522/20522 [00:08<00:00, 2430.77it/s]
100%|██████████| 41044/41044 [00:16<00:00, 2437.22it/s]
Training: 100%|██████████| 8979/8979 [55:50<00:00,  2.68it/s, loss=0.222]  
Training: 100%|██████████| 8979/8979 [55:37<00:00,  2.69it/s, loss=0.135]  
100%|██████████| 1283/1283 [02:33<00:00,  8.37it/s]


mode: dev, A depend on B
acc: 0.9465451710359614, precision: 0.9523265306122449, recall: 0.9578783151326054, F1: 0.955094355069794
mode: dev, B depend by A
acc: 0.9414774388461163, precision: 0.9486153092067642, recall: 0.9530531845042679, F1: 0.9508290685772774
Validation Loss: 0.17346096512682255


Training: 100%|██████████| 8979/8979 [51:17<00:00,  2.92it/s, loss=0.0194]  
Training: 100%|██████████| 8979/8979 [51:19<00:00,  2.92it/s, loss=0.0138]   
100%|██████████| 1283/1283 [02:33<00:00,  8.37it/s]


mode: dev, A depend on B
acc: 0.9547802358444596, precision: 0.9612199721242929, recall: 0.9626406108875935, F1: 0.9619297669839186
mode: dev, B depend by A
acc: 0.9520027287788715, precision: 0.9599934280785345, recall: 0.9591267235718975, F1: 0.9595598801165989
Validation Loss: 0.14275865586723518


Training: 100%|██████████| 8979/8979 [51:19<00:00,  2.92it/s, loss=0.00176]  
Training: 100%|██████████| 8979/8979 [51:18<00:00,  2.92it/s, loss=0.00417]  
100%|██████████| 1283/1283 [02:33<00:00,  8.38it/s]


mode: dev, A depend on B
acc: 0.9609687164993665, precision: 0.955849358974359, recall: 0.9794728631250513, F1: 0.9675169309379943
mode: dev, B depend by A
acc: 0.9589708605399084, precision: 0.9541159513132608, recall: 0.9779218647406435, F1: 0.9658722438391699
Validation Loss: 0.14923144726442727


Training: 100%|██████████| 8979/8979 [51:18<00:00,  2.92it/s, loss=0.0382]   
Training: 100%|██████████| 8979/8979 [51:18<00:00,  2.92it/s, loss=0.00041]  
100%|██████████| 1283/1283 [02:33<00:00,  8.38it/s]


mode: dev, A depend on B
acc: 0.9496637754604814, precision: 0.9387498031806015, recall: 0.9790623203875524, F1: 0.9584823761102849
mode: dev, B depend by A
acc: 0.9481532014423546, precision: 0.9395256916996048, recall: 0.9754596191726855, F1: 0.9571555126036885
Validation Loss: 0.19144804556077363


100%|██████████| 41044/41044 [00:17<00:00, 2406.59it/s]
100%|██████████| 2566/2566 [05:06<00:00,  8.37it/s]


mode: test, A depend on B
acc: 0.9523925543319365, precision: 0.9418713450292397, recall: 0.9811956786613597, F1: 0.9611314449395291
mode: test, B depend by A
acc: 0.950687067537277, precision: 0.9426489599247855, recall: 0.9772570361044551, F1: 0.9596410767696909
Test Loss: 0.17644720438354622
--model: huggingface/CodeBERTa-small-v1, --lang: python_java, --train: True, --test True, --batch_size: 16


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using detected checkpoint file: ./model/java_small\model_java_small_20231225_225929.bin


  1%|▏         | 1869/136995 [00:00<01:05, 2054.80it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (953 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 136995/136995 [01:00<00:00, 2261.34it/s]
100%|██████████| 19571/19571 [00:08<00:00, 2289.92it/s]
100%|██████████| 39150/39150 [00:16<00:00, 2307.88it/s]
Training: 100%|██████████| 8563/8563 [53:02<00:00,  2.69it/s, loss=0.47]   
Training: 100%|██████████| 8563/8563 [53:05<00:00,  2.69it/s, loss=0.128]  
100%|██████████| 1224/1224 [02:26<00:00,  8.37it/s]


mode: dev, A depend on B
acc: 0.9508456389555976, precision: 0.9313440077632217, recall: 0.9509536784741145, F1: 0.9410466968991299
mode: dev, B depend by A
acc: 0.946349190128251, precision: 0.9151780432982373, recall: 0.9587309455942497, F1: 0.9364483718678126
Validation Loss: 0.18471033477948773


Training: 100%|██████████| 8563/8563 [48:57<00:00,  2.91it/s, loss=0.00892] 
Training: 100%|██████████| 8563/8563 [49:00<00:00,  2.91it/s, loss=0.0244]   
100%|██████████| 1224/1224 [02:26<00:00,  8.37it/s]


mode: dev, A depend on B
acc: 0.9597363445914874, precision: 0.9399758454106281, recall: 0.9639583849393114, F1: 0.9518160694631284
mode: dev, B depend by A
acc: 0.9585100403658474, precision: 0.9366951498375256, recall: 0.9645557070268931, F1: 0.9504212968616437
Validation Loss: 0.12643623364139334


Training: 100%|██████████| 8563/8563 [48:59<00:00,  2.91it/s, loss=0.000844] 
Training: 100%|██████████| 8563/8563 [50:56<00:00,  2.80it/s, loss=0.000804] 
100%|██████████| 1224/1224 [02:33<00:00,  7.98it/s]


mode: dev, A depend on B
acc: 0.9562107199427725, precision: 0.9348114230630197, recall: 0.9608620262571216, F1: 0.9476577291882978
mode: dev, B depend by A
acc: 0.9549844157171324, precision: 0.9301101005265677, recall: 0.9631924649894659, F1: 0.9463622526636225
Validation Loss: 0.1652732740799395


Training:   9%|▉         | 793/8563 [04:43<46:21,  2.79it/s, loss=0.00348]  

Unexpected exception formatting exception. Falling back to standard exception



Traceback (most recent call last):
  File "c:\Users\aaa\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\aaa\AppData\Local\Temp\ipykernel_31864\2672801796.py", line 9, in <module>
    train_on_lang_from_checkpoint("python_java", "./model/java_small/*.bin", epoch=8, batch_size=16)
  File "C:\Users\aaa\AppData\Local\Temp\ipykernel_31864\2672801796.py", line 6, in train_on_lang_from_checkpoint
    train_on_single_lang(model_name, lang, batch_size, train, test, checkpoint_path, epoch)
  File "c:\Users\aaa\Desktop\edit-pilot-model\dependency_analyzer\run.py", line 174, in train_on_single_lang
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\aaa\AppData\Local\Programs\Python\Python311\Lib\site-packages\IPython\core\interactiveshell.py", line 2102, in showtraceback
 

In [3]:
# Train on "java_small" with epoch=10 and batch size 16, passing the pattern
# ./model/java/*.bin as the checkpoint path.
train_on_lang_from_checkpoint("java_small", "./model/java/*.bin", epoch=10, batch_size=16)

--model: huggingface/CodeBERTa-small-v1, --lang: java_small, --train: True, --test True, --batch_size: 16


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using detected checkpoint file: ./model/java\model_checkpoint_java_23_12_24.bin


  1%|          | 1656/143655 [00:00<00:59, 2404.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (2800 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 143655/143655 [01:02<00:00, 2286.07it/s]
100%|██████████| 20522/20522 [00:08<00:00, 2291.92it/s]
100%|██████████| 41044/41044 [00:17<00:00, 2328.42it/s]
Training:   1%|          | 67/8979 [00:29<1:04:27,  2.30it/s, loss=0.82] 


KeyboardInterrupt: 

In [3]:
# Write a random 30% sample of every dataset/java split to dataset/java_small.
dataset_dir = 'dataset/java/'
# open(..., 'w') does not create missing folders, so make sure the target exists.
os.makedirs('dataset/java_small/', exist_ok=True)
for dataset_type in DATASET_TYPES:
    with open(os.path.join(dataset_dir, f"{dataset_type}.json"), "r", encoding="utf-8") as f:
        data_to_add = json.load(f)
    # The input is closed before the output is opened, so the two handles
    # no longer share the name `f` while both are open.
    data_to_add = random.sample(data_to_add, math.floor(len(data_to_add) * 0.3))
    with open('dataset/java_small/' + dataset_type + '.json', 'w', encoding="utf-8") as f:
        json.dump(data_to_add, f)

In [4]:
import json
# Count the items stored across all dataset/python split files.
DATASET_TYPES = ["test", "train", "valid"]
tot_len = 0
for i in DATASET_TYPES:
    split_path = f"./dataset/python/{i}.json"
    with open(split_path, "r", encoding="utf-8") as f:
        split_samples = json.load(f)
    tot_len = tot_len + len(split_samples)
print(tot_len)

127310


In [3]:
import json
import os
import random
# Folder with one sub-folder per repository (each holding an all.json).
ENRE_OUT_PATH = "dataset/java"

# Repositories reserved for the validation split and for the test split.
VALID_REPOS = ['redisson', 'Signal-Android']
TEST_REPOS = ['spring-boot', 'spring-framework']

# One independent accumulator list per split, in the order train, valid, test.
datasets = {split_name: [] for split_name in ('train', 'valid', 'test')}

def filter_data_to(i, repo_set, dataset):
    """Append a repository's samples to ``dataset`` when it belongs to ``repo_set``.

    Args:
        i: Repository folder name below ``ENRE_OUT_PATH``.
        repo_set: Collection of repository names assigned to one split.
        dataset: List extended in place with the content of the repository's
            ``all.json``.

    Returns:
        ``True`` when ``i`` is in ``repo_set`` and its data was added,
        ``False`` otherwise (nothing is read or changed in that case).
    """
    if i not in repo_set:
        return False
    print(f'using {i} for repo type {repo_set}')
    all_json_path = os.path.join(ENRE_OUT_PATH, i, "all.json")
    with open(all_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
        dataset += data
    return True

# Assign each repository folder to exactly one split.
for i in os.listdir(ENRE_OUT_PATH):
    repo_dir = os.path.join(ENRE_OUT_PATH, i)
    if not os.path.isdir(repo_dir):
        continue
    # Validation is tried first; `or` short-circuits, so the test list is only
    # consulted for repositories that are not validation repositories.
    held_out = filter_data_to(i, VALID_REPOS, datasets['valid']) or filter_data_to(i, TEST_REPOS, datasets['test'])
    if held_out:
        continue
    print(f'using {i} for training')
    with open(os.path.join(repo_dir, "all.json"), "r", encoding="utf-8") as f:
        data = json.load(f)
        datasets['train'] += data

# Report the size of each split, shuffle it and write it next to the repository folders.
for dataset_type in ('train', 'valid', 'test'):
    print(f'{dataset_type} dataset length: {len(datasets[dataset_type])}')
    split_samples = datasets[dataset_type]
    random.shuffle(split_samples)
    split_path = os.path.join(ENRE_OUT_PATH, f"{dataset_type}.json")
    with open(split_path, 'w') as f:
        json.dump(split_samples, f)

using graal for training
using jadx for training
using java8-tutorial for training
using proxyee-down for training
using redisson for repo type ['redisson', 'Signal-Android']
using Signal-Android for repo type ['redisson', 'Signal-Android']
using SmartRefreshLayout for training
using spring-boot for repo type ['spring-boot', 'spring-framework']
using spring-framework for repo type ['spring-boot', 'spring-framework']
using zipkin for training
using zxing for training
train dataset length: 1064426
valid dataset length: 243208
test dataset length: 590987


In [4]:
# Write a random 10% sample of each merged split to the sibling java_new_small folder.
# open(..., 'w') fails when the folder is missing, so create it up front. The
# path is normalised first because os.makedirs is documented to get confused
# by ".." components.
os.makedirs(os.path.normpath(os.path.join(ENRE_OUT_PATH, '../java_new_small')), exist_ok=True)
for dataset_type in 'train', 'valid', 'test':
    with open(os.path.join(ENRE_OUT_PATH, f"{dataset_type}.json"), 'r', encoding="utf-8") as f:
        large_dataset = json.load(f)
    small_dataset = random.sample(large_dataset, int(len(large_dataset)*0.1))
    with open(os.path.join(ENRE_OUT_PATH, '../java_new_small', f"{dataset_type}.json"), 'w') as f:
        json.dump(small_dataset, f)

In [7]:
def train_on_lang_from_checkpoint(lang, checkpoint_path=None, epoch=10, batch_size=32):
    """Print the run settings, then call ``train_on_single_lang`` for ``lang``.

    The model is always 'huggingface/CodeBERTa-small-v1' with train and test
    both enabled. ``checkpoint_path`` and ``epoch`` are forwarded unchanged as
    the last two positional arguments.
    """
    model_name = 'huggingface/CodeBERTa-small-v1'
    train = test = True
    settings_line = f'--model: {model_name}, --lang: {lang}, --train: {train}, --test {test}, --batch_size: {batch_size}'
    print(settings_line)
    train_on_single_lang(model_name, lang, batch_size, train, test, checkpoint_path, epoch)

# Train on "java_new_small" with epoch=6 and batch size 16; no checkpoint path is passed.
train_on_lang_from_checkpoint("java_new_small", epoch=6, batch_size=16)
# Disabled alternative: "python_java" with the checkpoint pattern ./model/java_small/*.bin.
# train_on_lang_from_checkpoint("python_java", "./model/java_small/*.bin", epoch=8, batch_size=16)

--model: huggingface/CodeBERTa-small-v1, --lang: java_new_small, --train: True, --test True, --batch_size: 16


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at huggingface/CodeBERTa-small-v1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


No checkpoint found. Training from scratch...


  0%|          | 0/106442 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (756 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 106442/106442 [00:52<00:00, 2028.32it/s]
100%|██████████| 24320/24320 [00:10<00:00, 2265.45it/s]
100%|██████████| 59098/59098 [00:25<00:00, 2292.85it/s]
Training: 100%|██████████| 6653/6653 [43:14<00:00,  2.56it/s, loss=0.169]  
Training: 100%|██████████| 6653/6653 [43:10<00:00,  2.57it/s, loss=0.186]  
100%|██████████| 1520/1520 [03:02<00:00,  8.31it/s]


mode: dev, A depend on B
acc: 0.941735197368421, precision: 0.8371749044743005, recall: 0.9860627177700348, F1: 0.9055396306912873
mode: dev, B depend by A
acc: 0.9868009868421053, precision: 0.9624293785310735, recall: 0.9919930120832727, F1: 0.9769875976772529
Validation Loss: 0.18777880140972372


Training: 100%|██████████| 6653/6653 [39:09<00:00,  2.83it/s, loss=0.000283]
Training: 100%|██████████| 6653/6653 [38:14<00:00,  2.90it/s, loss=0.000218] 
100%|██████████| 1520/1520 [03:00<00:00,  8.41it/s]


mode: dev, A depend on B
acc: 0.9048519736842106, precision: 0.7516505281690141, recall: 0.9917247386759582, F1: 0.8551577366049075
mode: dev, B depend by A
acc: 0.9847861842105263, precision: 0.958904109589041, recall: 0.9884990537196099, F1: 0.9734767025089605
Validation Loss: 0.24292665464707552


Training: 100%|██████████| 6653/6653 [38:50<00:00,  2.85it/s, loss=0.00034]  
Training: 100%|██████████| 6653/6653 [39:14<00:00,  2.83it/s, loss=0.0347]  
100%|██████████| 1520/1520 [03:11<00:00,  7.93it/s]


mode: dev, A depend on B
acc: 0.9050986842105263, precision: 0.7510964912280702, recall: 0.9944831591173054, F1: 0.8558220889555223
mode: dev, B depend by A
acc: 0.9850740131578948, precision: 0.9575246132208157, recall: 0.991119522492357, F1: 0.9740324772873596
Validation Loss: 0.21279558275588753


100%|██████████| 59098/59098 [00:26<00:00, 2268.38it/s]
100%|██████████| 3694/3694 [07:52<00:00,  7.81it/s]


mode: test, A depend on B
acc: 0.99076110866696, precision: 0.9900773963087914, recall: 0.974223784417106, F1: 0.9820866141732284
mode: test, B depend by A
acc: 0.9935700023689465, precision: 0.9889610389610389, recall: 0.9863989637305699, F1: 0.9876783398184176
Test Loss: 0.02648400789081337


In [None]:
import torch
# Re-save the checkpoint with only its 'model_state_dict' entry, into the current directory.
cp = torch.load('model/java_new_small/model_java_new_small_20240103_135049_epoch5.bin')
slim_checkpoint = {'model_state_dict': cp['model_state_dict']}
torch.save(slim_checkpoint, 'model_java_new_small_20240103_135049_epoch5.bin')