In [1]:
import json
import os
import sys
from datetime import datetime
import pandas as pd
import torch
from torch import BoolTensor
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

MODEL_NAME = 'distilbert-base-uncased'
MAX_LENGTH = 256

DIR_OUTPUT = 'results'
DEVICE_DEFAULT = 'cuda'

INDEXS_LABELS = [3231, 3698, 4031, 4044, 5310, 6502, 8241]
LABELS = ['testcase', 'targetvm', 'product', 'nimbus', 'usererror', 'infra', 'testbed']
DICT_CODE_LABEL = {
     6502: 'infra',
     3698: 'targetvm',
     3231: 'testcase',
     4044: 'nimbus',
     5310: 'usererror',
     4031: 'product',
     8241: 'testbed'
}

def get_ts():
    return datetime.utcnow().replace(microsecond=0).isoformat()
# end



class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __getitem__(self, idx):
        masks, embedding = self.pairs[idx]
        
        while len(embedding) < MAX_LENGTH:
            embedding.append(0)
        # end

        attention = list([1 for _ in range(MAX_LENGTH)])

        pt_embedding = torch.LongTensor(embedding)
        pt_label = torch.LongTensor(embedding)
        pt_attention = torch.LongTensor(attention)

        pt_index_masks = torch.LongTensor(masks)

        pt_embedding.index_fill_(0, pt_index_masks, 103)
        pt_attention.masked_fill_(pt_attention == 103, 0)
        pt_attention.masked_fill_(pt_attention == 0, 0)

        item = {'input_ids': pt_embedding, 'attention_mask': pt_attention, 'labels': pt_label}
        return item

    def __len__(self):
        return len(self.pairs)
    # end
# end

def read_passages(path_data, test_size=0):
    df = pd.read_csv(path_data)

    pairs_mask_embedding = [(eval(str_masks), eval(str_embedding)) for str_masks, str_embedding in zip(df['masks'], df['embedding'])]
    
    if test_size > 0:
        return train_test_split(pairs_mask_embedding, test_size=test_size, random_state=234)
    else:
        return (pairs_mask_embedding, pairs_mask_embedding)
    # end
# end


def compute_metrics(pred):


    # labels: (64, 512)
    # preds: (64, 512, 30522)
    
    labels_all = pred.label_ids
    preds_all = pred.predictions.argmax(-1)
    
    
    filter_token = labels_all > 103
    labels = labels_all[filter_token]
    preds = preds_all[filter_token]
    
    # print(f'labels: {labels.shape}')
    # print(f'preds: {preds.shape}')

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    precision = precision_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    recall = recall_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
# end

def predict_plus(input_tokenized, model):

    out = model(**{k:v.unsqueeze(0).to(DEVICE_DEFAULT) for k,v in input_tokenized.items()}, output_attentions=True)
    logits = out.logits.cpu().squeeze(0)
    return logits
# end

def main_train_and_evaluate(name_train, path_train, path_test, path_output):
    print('[{}] start main_train_and_evaluate with {} {}'.format(get_ts(), path_train, path_test))

    model_name = MODEL_NAME
    max_length = MAX_LENGTH
    output_dir = DIR_OUTPUT

#     train_samples, valid_samples, train_labels, valid_labels = read_passages(path_train, 0.1)
    
#     # # TODO: DEBUG
#     train_samples = train_samples[:100]
#     train_labels = train_labels[:100]
    
#     valid_samples = valid_samples[:100]
#     valid_labels = valid_labels[:100]
#     # ###

#     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

#     train_dataset = SimpleDataset(train_samples, train_labels)
#     valid_dataset = SimpleDataset(valid_samples, valid_labels)


    train_pairs, valid_pairs = read_passages(path_train, 120)
    
    # TODO: debug only
    # train_pairs = train_pairs[:32]
    # valid_pairs = valid_pairs[:32]
    
    train_dataset = SimpleDataset(train_pairs)
    valid_dataset = SimpleDataset(valid_pairs)

    model = DistilBertForMaskedLM.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        num_train_epochs=10,  # total number of training epochs
        per_device_train_batch_size=8,  # batch size per device during training
        per_device_eval_batch_size=8,  # batch size for evaluation
        warmup_steps=0,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        load_best_model_at_end=True,
        # load the best model when finished training (default metric is loss)    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
        logging_steps=1,  # log & save weights each logging_steps
        evaluation_strategy="epoch",  # evaluate each `logging_steps`
        learning_rate=2e-5,
        save_strategy='epoch',
        save_total_limit=5,
        metric_for_best_model='f1',
        # eval_accumulation_steps=5
    )

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=valid_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # the callback that computes metrics of interest
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    # trainer = Trainer(
    #     model=model,  # the instantiated Transformers model to be trained
    #     args=training_args,  # training arguments, defined above
    #     train_dataset=train_dataset,  # training dataset
    #     eval_dataset=valid_dataset,  # evaluation dataset
    #     compute_metrics=compute_metrics
    # )

    print('[{}] start training...'.format(get_ts()))
    trainer.train()

    info_state_model = trainer.evaluate()
    print('[{}] finish training.'.format(get_ts()))

    ################## start to do eval ##################

    test_pairs, _ = read_passages(path_test, 0)
    test_dataset = SimpleDataset(test_pairs)

    list_conf_output = []
    list_label_output = []
    list_answer_output = []

    # INDEXS_LABELS = [3231, 3698, 4031, 4044, 5310, 6502, 6922]
    # LABELS = ['testcase', 'targetvm', 'product', 'nimbus', 'usererror', 'infra', 'testbed']
    
    for test_tokenized in test_dataset:
        
        with torch.no_grad():
            # out = model(**input_tokenized, output_hidden_states=True, output_attentions=True)
            logits_all_token = predict_plus(test_tokenized, model)
        # end
        
        # logits_this: (512, 30522)
        input_ids = test_tokenized['input_ids']
        index_mask = (input_ids == 103).nonzero(as_tuple=True)[-1].tolist()[0]
        # print(index_mask)

        logits_target_token = logits_all_token[index_mask, :]  # 35535, tensor
        logits_this = torch.index_select(logits_target_token, 0, torch.LongTensor(INDEXS_LABELS))
        
        probas_evaluate = torch.nn.functional.softmax(logits_this, dim=-1)
        answer_evaluate = int(probas_evaluate.argmax())
        label_evaluate = LABELS[answer_evaluate]

        list_conf_output.append(probas_evaluate.tolist()[answer_evaluate])
        list_label_output.append(label_evaluate)
        
        code_origin = test_tokenized['labels'][index_mask].item()
        label_origin = DICT_CODE_LABEL[code_origin]
        list_answer_output.append(label_origin)
    # end

    print('[{}] finish testing.'.format(get_ts()))

    pairs_label_conf = [[a, b, c] for a, b, c in zip(list_label_output, list_conf_output, list_answer_output)]

    filename_output = f'output-{name_train}.json'
    path_file_output = os.path.join(path_output, filename_output)

    with open(path_file_output, 'w+') as file:
        file.write(json.dumps(pairs_label_conf))
    # end

    print('[{}] main_train_and_evaluate finished.'.format(get_ts()))
    

# end


In [2]:
path_folder_train = 'data_model_mlm'
path_output = 'output_noseed_mlm'

os.makedirs(path_output, exist_ok=True)

import numpy as np
import random
import torch

seed_val = 234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

import os
import subprocess

filenames = sorted([filename for filename in os.listdir(path_folder_train) if filename[0] != '.' and 'train' in filename])
# filenames = [filenames[0]]

for _filename in filenames:
    filename_base = _filename.split('_')[0]
    
    filename_test = f'{filename_base}_test.csv'
    filename_train = _filename
    
    
    path_train = os.path.join(path_folder_train, filename_train)
    path_test = os.path.join(path_folder_train, filename_test)
    
    main_train_and_evaluate(filename_base, path_train, path_test, path_output)

    subprocess.run("rm -rf results", shell=True)
    # subprocess.run("rm -rf mlruns", shell=True)
# end

[2023-08-25T15:06:56] start main_train_and_evaluate with data_model_mlm/202305091133_train_0.35_15.csv data_model_mlm/202305091133_test.csv


***** Running training *****
  Num examples = 57105
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 71390


[2023-08-25T15:07:16] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0035,0.014325,0.98956,0.984708,0.972969,0.97079
2,0.0001,0.005455,0.995351,0.993387,0.987383,0.986933
3,0.0,0.003921,0.996411,0.996369,0.992277,0.992977
4,0.0011,0.004197,0.997145,0.994408,0.995119,0.991992
5,0.0,0.002023,0.998206,0.997142,0.997857,0.997147
6,0.0,0.002155,0.997961,0.998225,0.998094,0.998028
7,0.0001,0.001776,0.998369,0.996385,0.998359,0.996339
8,0.0002,0.001732,0.998287,0.99722,0.998286,0.996868
9,0.0,0.001817,0.997961,0.996829,0.998243,0.996622
10,0.0,0.001375,0.998532,0.997272,0.998562,0.997039


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7139
Configuration saved in results/checkpoint-7139/config.json
Model weights saved in results/checkpoint-7139/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-20260] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-14278
Configuration saved in results/checkpoint-14278/config.json
Model weights saved in results/checkpoint-14278/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-25325] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-21417
Configuration saved in results/checkpoint-21417/config.json
Model weights saved in results/checkpoint-21417/pytorch_model.bin
Deleting older checkpoint [results/checkpoint-30390] due to args.save_total_limit
***** Running Ev

[2023-08-25T17:42:58] finish training.
[2023-08-25T17:42:58] finish testing.
[2023-08-25T17:42:58] main_train_and_evaluate finished.
[2023-08-25T17:43:00] start main_train_and_evaluate with data_model_mlm/202305221222_train_0.35_15.csv data_model_mlm/202305221222_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-25T17:43:18] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0203,0.007502,0.994826,0.988938,0.983218,0.979346
2,0.0026,0.002538,0.99801,0.996138,0.993084,0.992167
3,0.003,0.001739,0.998806,0.997112,0.995233,0.994073
4,0.0044,0.000968,0.999363,0.999569,0.996363,0.99625
5,0.003,0.000851,0.999363,0.998844,0.995676,0.995347
6,0.0003,0.000704,0.999284,0.998758,0.995966,0.995494
7,0.0004,0.000728,0.999363,0.999653,0.996102,0.996153
8,0.0002,0.000705,0.999363,0.999541,0.996027,0.996057
9,0.0011,0.000598,0.999443,0.999601,0.997973,0.997923
10,0.0001,0.000496,0.999443,0.999633,0.996271,0.996236


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7204
Configuration saved in results/checkpoint-7204/config.json
Model weights saved in results/checkpoint-7204/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-14408
Configuration saved in results/checkpoint-14408/config.json
Model weights saved in results/checkpoint-14408/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-21612
Configuration saved in results/checkpoint-21612/config.json
Model weights saved in results/checkpoint-21612/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-28816
Configuration saved in results/checkpoint-28816/config.json
Model weights saved in results/checkpoint-28816/pytorch_model.bin
***** Running E

[2023-08-25T20:22:24] finish training.
[2023-08-25T20:22:24] finish testing.
[2023-08-25T20:22:24] main_train_and_evaluate finished.
[2023-08-25T20:22:25] start main_train_and_evaluate with data_model_mlm/202307030930_train_0.35_15.csv data_model_mlm/202307030930_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-25T20:22:39] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0144,0.012876,0.991554,0.987491,0.981854,0.978942
2,0.0001,0.006931,0.994796,0.994693,0.990943,0.990551
3,0.0001,0.004728,0.996531,0.99395,0.994706,0.991528
4,0.0005,0.003,0.997662,0.993826,0.995468,0.991838
5,0.0,0.002731,0.998039,0.99775,0.995801,0.995878
6,0.0,0.002267,0.998341,0.996633,0.99833,0.996483
7,0.0,0.00225,0.998265,0.99839,0.998242,0.998089
8,0.0,0.002147,0.998039,0.99734,0.995876,0.994754
9,0.0,0.00176,0.998643,0.995602,0.998718,0.995408
10,0.0,0.001917,0.998718,0.995683,0.998768,0.995475


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7349
Configuration saved in results/checkpoint-7349/config.json
Model weights saved in results/checkpoint-7349/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-14698
Configuration saved in results/checkpoint-14698/config.json
Model weights saved in results/checkpoint-14698/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-22047
Configuration saved in results/checkpoint-22047/config.json
Model weights saved in results/checkpoint-22047/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-29396
Configuration saved in results/checkpoint-29396/config.json
Model weights saved in results/checkpoint-29396/pytorch_model.bin
***** Running E

[2023-08-25T23:03:50] finish training.
[2023-08-25T23:03:51] finish testing.
[2023-08-25T23:03:51] main_train_and_evaluate finished.
[2023-08-25T23:03:52] start main_train_and_evaluate with data_model_mlm/202307050919_train_0.35_15.csv data_model_mlm/202307050919_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-25T23:04:08] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.023,0.010909,0.992901,0.988023,0.976226,0.973474
2,0.0268,0.005385,0.996303,0.994258,0.989703,0.988636
3,0.0015,0.002783,0.998077,0.996593,0.994491,0.993485
4,0.0,0.003413,0.997856,0.997269,0.993796,0.99353
5,0.0003,0.002651,0.998299,0.997342,0.996774,0.996482
6,0.0006,0.002084,0.998521,0.998362,0.994761,0.995125
7,0.0001,0.002077,0.998299,0.996977,0.99318,0.992943
8,0.0,0.002342,0.998521,0.997914,0.994192,0.994524
9,0.0,0.002115,0.998595,0.997861,0.994445,0.994217
10,0.0001,0.001823,0.998669,0.997928,0.994939,0.994542


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7808
Configuration saved in results/checkpoint-7808/config.json
Model weights saved in results/checkpoint-7808/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-15616
Configuration saved in results/checkpoint-15616/config.json
Model weights saved in results/checkpoint-15616/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-23424
Configuration saved in results/checkpoint-23424/config.json
Model weights saved in results/checkpoint-23424/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-31232
Configuration saved in results/checkpoint-31232/config.json
Model weights saved in results/checkpoint-31232/pytorch_model.bin
***** Running E

[2023-08-26T01:55:56] finish training.
[2023-08-26T01:55:56] finish testing.
[2023-08-26T01:55:56] main_train_and_evaluate finished.
[2023-08-26T01:55:57] start main_train_and_evaluate with data_model_mlm/202307070327_train_0.35_15.csv data_model_mlm/202307070327_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-26T01:56:14] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0139,0.006809,0.995398,0.993905,0.990126,0.990222
2,0.0033,0.003073,0.998006,0.997763,0.997038,0.997219
3,0.0003,0.001994,0.998773,0.996823,0.998006,0.996373
4,0.0022,0.001465,0.99908,0.996012,0.998815,0.995699
5,0.0032,0.000913,0.999233,0.999151,0.999154,0.999127
6,0.0029,0.001256,0.999233,0.997578,0.999311,0.997586
7,0.0001,0.000658,0.999386,0.999214,0.999448,0.999292
8,0.0001,0.000793,0.99931,0.998684,0.999307,0.998943
9,0.0002,0.000632,0.999386,0.998948,0.999411,0.999133
10,0.0,0.000633,0.999386,0.999101,0.99939,0.999203


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7834
Configuration saved in results/checkpoint-7834/config.json
Model weights saved in results/checkpoint-7834/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-15668
Configuration saved in results/checkpoint-15668/config.json
Model weights saved in results/checkpoint-15668/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-23502
Configuration saved in results/checkpoint-23502/config.json
Model weights saved in results/checkpoint-23502/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-31336
Configuration saved in results/checkpoint-31336/config.json
Model weights saved in results/checkpoint-31336/pytorch_model.bin
***** Running E

[2023-08-26T04:48:29] finish training.
[2023-08-26T04:48:29] finish testing.
[2023-08-26T04:48:29] main_train_and_evaluate finished.
[2023-08-26T04:48:30] start main_train_and_evaluate with data_model_mlm/202307191009_train_0.35_15.csv data_model_mlm/202307191009_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-26T04:48:47] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0231,0.014921,0.991763,0.986898,0.965219,0.961964
2,0.0005,0.006583,0.99526,0.989797,0.985067,0.980645
3,0.0,0.00471,0.996426,0.992344,0.988772,0.985063
4,0.0001,0.004199,0.997436,0.995802,0.993328,0.991164
5,0.0004,0.004177,0.997047,0.994013,0.990561,0.987976
6,0.0001,0.003248,0.997669,0.991953,0.99319,0.989059
7,0.0,0.003307,0.997902,0.996446,0.99656,0.995336
8,0.0001,0.003014,0.997824,0.994084,0.993289,0.991078
9,0.0011,0.002886,0.99798,0.991819,0.992815,0.988855
10,0.0,0.002725,0.998213,0.994459,0.993429,0.989729


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7887
Configuration saved in results/checkpoint-7887/config.json
Model weights saved in results/checkpoint-7887/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-15774
Configuration saved in results/checkpoint-15774/config.json
Model weights saved in results/checkpoint-15774/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-23661
Configuration saved in results/checkpoint-23661/config.json
Model weights saved in results/checkpoint-23661/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-31548
Configuration saved in results/checkpoint-31548/config.json
Model weights saved in results/checkpoint-31548/pytorch_model.bin
***** Running E

[2023-08-26T07:39:43] finish training.
[2023-08-26T07:39:43] finish testing.
[2023-08-26T07:39:43] main_train_and_evaluate finished.
[2023-08-26T07:39:44] start main_train_and_evaluate with data_model_mlm/202308030456_train_0.35_15.csv data_model_mlm/202308030456_test.csv


loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at /home/jovyan/.cache/huggingface/transformers/23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.12.0",
  "vocab_size": 30522
}

loading weights file https://huggingface.co/distilbert-base-uncased/resolve/main/pytorch_model.bin from cache at /home/jovyan/.cache/huggingface/transformers/9c169103d7e5a73936dd2b627e42851bec0831212b677

[2023-08-26T07:40:00] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0042,0.009741,0.993852,0.989815,0.981337,0.98008
2,0.0033,0.004209,0.997275,0.993972,0.991945,0.989844
3,0.0008,0.002593,0.998114,0.997747,0.993706,0.993569
4,0.0072,0.00254,0.998463,0.997782,0.994231,0.99457
5,0.0077,0.001853,0.998603,0.998099,0.998049,0.997705
6,0.0001,0.001448,0.998812,0.999125,0.995574,0.996117
7,0.0,0.001486,0.999092,0.999423,0.998297,0.998734
8,0.0026,0.001294,0.999162,0.999057,0.998857,0.998864
9,0.0001,0.001338,0.999022,0.998973,0.99801,0.998134
10,0.0006,0.001078,0.998952,0.998812,0.997953,0.998023


***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-7939
Configuration saved in results/checkpoint-7939/config.json
Model weights saved in results/checkpoint-7939/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-15878
Configuration saved in results/checkpoint-15878/config.json
Model weights saved in results/checkpoint-15878/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-23817
Configuration saved in results/checkpoint-23817/config.json
Model weights saved in results/checkpoint-23817/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 120
  Batch size = 8
Saving model checkpoint to results/checkpoint-31756
Configuration saved in results/checkpoint-31756/config.json
Model weights saved in results/checkpoint-31756/pytorch_model.bin
***** Running E

[2023-08-26T10:33:14] finish training.
[2023-08-26T10:33:14] finish testing.
[2023-08-26T10:33:14] main_train_and_evaluate finished.


In [3]:
print('hello')

hello
