In [1]:
import json
import os
import sys
from datetime import datetime
import pandas as pd
import torch
from torch import BoolTensor
from transformers import DistilBertTokenizerFast, DistilBertForMaskedLM, EarlyStoppingCallback
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

MODEL_NAME = 'distilbert-base-uncased'
MAX_LENGTH = 512

DIR_OUTPUT = 'results'
DEVICE_DEFAULT = 'cuda'

INDEXS_LABELS = [3231, 3698, 4031, 4044, 5310, 6502, 6922]
LABELS = ['testcase', 'targetvm', 'product', 'nimbus', 'usererror', 'infra', 'testbed']
DICT_CODE_LABEL = {
     6502: 'infra',
     3698: 'targetvm',
     3231: 'testcase',
     4044: 'nimbus',
     5310: 'usererror',
     4031: 'product',
     8241: 'testbed'
}

def get_ts():
    return datetime.utcnow().replace(microsecond=0).isoformat()
# end


# class SimpleDataset(torch.utils.data.Dataset):
#     def __init__(self, contents, labels):
#         self.contents = contents
#         self.labels = labels

#     def __getitem__(self, idx):
#         item = {'input_ids': self.contents[idx][0], 'attention_mask': self.contents[idx][1], 'labels': self.labels[idx]}
#         return item

#     def __len__(self):
#         return len(self.labels)
#     # end
# # end


# def read_passages(path_data, test_size=0):
#     df = pd.read_csv(path_data)

#     pairs_mask_embedding = [(eval(str_masks), eval(str_embedding)) for str_masks, str_embedding in zip(df['masks'], df['embedding'])]
    
#     samples = []
#     labels = []
                             
#     for masks, embedding in pairs_mask_embedding:

#         while len(embedding) < MAX_LENGTH:
#             embedding.append(0)
#         # end

#         attention = list([1 for _ in range(MAX_LENGTH)])

#         pt_embedding = torch.LongTensor(embedding)
#         pt_label = torch.LongTensor(embedding)
#         pt_attention = torch.LongTensor(attention)

#         pt_index_masks = torch.LongTensor(masks)

#         pt_embedding.index_fill_(0, pt_index_masks, 103)
#         pt_attention.masked_fill_(pt_attention == 103, 0)
#         pt_attention.masked_fill_(pt_attention == 0, 0)

#         samples.append((pt_embedding, pt_attention))
#         labels.append(pt_label)
#     # end
    
    
#     if test_size > 0:
#         return train_test_split(samples, labels, test_size=test_size, random_state=234)
#     else:
#         return (samples, samples, labels, labels)
#     # end
# # end


class SimpleDataset(torch.utils.data.Dataset):
    def __init__(self, pairs):
        self.pairs = pairs

    def __getitem__(self, idx):
        masks, embedding = self.pairs[idx]
        
        while len(embedding) < MAX_LENGTH:
            embedding.append(0)
        # end

        attention = list([1 for _ in range(MAX_LENGTH)])

        pt_embedding = torch.LongTensor(embedding)
        pt_label = torch.LongTensor(embedding)
        pt_attention = torch.LongTensor(attention)

        pt_index_masks = torch.LongTensor(masks)

        pt_embedding.index_fill_(0, pt_index_masks, 103)
        pt_attention.masked_fill_(pt_attention == 103, 0)
        pt_attention.masked_fill_(pt_attention == 0, 0)

        item = {'input_ids': pt_embedding, 'attention_mask': pt_attention, 'labels': pt_label}
        return item

    def __len__(self):
        return len(self.pairs)
    # end
# end

def read_passages(path_data, test_size=0):
    df = pd.read_csv(path_data)

    pairs_mask_embedding = [(eval(str_masks), eval(str_embedding)) for str_masks, str_embedding in zip(df['masks'], df['embedding'])]
    
    if test_size > 0:
        return train_test_split(pairs_mask_embedding, test_size=test_size, random_state=234)
    else:
        return (pairs_mask_embedding, pairs_mask_embedding)
    # end
# end


def compute_metrics(pred):


    # labels: (64, 512)
    # preds: (64, 512, 30522)
    
    labels_all = pred.label_ids
    preds_all = pred.predictions.argmax(-1)
    
    
    filter_token = labels_all > 103
    labels = labels_all[filter_token]
    preds = preds_all[filter_token]
    
    # print(f'labels: {labels.shape}')
    # print(f'preds: {preds.shape}')

    accuracy = accuracy_score(y_true=labels, y_pred=preds)
    precision = precision_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    recall = recall_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')
    f1 = f1_score(y_true=labels, y_pred=preds, zero_division=1, average='macro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
# end

def predict_plus(input_tokenized, model):

    out = model(**{k:v.unsqueeze(0).to(DEVICE_DEFAULT) for k,v in input_tokenized.items()}, output_attentions=True)
    logits = out.logits.cpu().squeeze(0)
    return logits
# end

def main_train_and_evaluate(name_train, path_train, path_test, path_output):
    print('[{}] start main_train_and_evaluate with {} {}'.format(get_ts(), path_train, path_test))

    model_name = MODEL_NAME
    max_length = MAX_LENGTH
    output_dir = DIR_OUTPUT

#     train_samples, valid_samples, train_labels, valid_labels = read_passages(path_train, 0.1)
    
#     # # TODO: DEBUG
#     train_samples = train_samples[:100]
#     train_labels = train_labels[:100]
    
#     valid_samples = valid_samples[:100]
#     valid_labels = valid_labels[:100]
#     # ###

#     tokenizer = DistilBertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

#     train_dataset = SimpleDataset(train_samples, train_labels)
#     valid_dataset = SimpleDataset(valid_samples, valid_labels)


    train_pairs, valid_pairs = read_passages(path_train, 0.1)
    
    # TODO: debug only
    train_pairs = train_pairs[:32]
    valid_pairs = valid_pairs[:32]
    
    train_dataset = SimpleDataset(train_pairs)
    valid_dataset = SimpleDataset(valid_pairs)

    model = DistilBertForMaskedLM.from_pretrained(model_name)

    training_args = TrainingArguments(
        output_dir=output_dir,  # output directory
        num_train_epochs=1,  # total number of training epochs
        per_device_train_batch_size=2,  # batch size per device during training
        per_device_eval_batch_size=2,  # batch size for evaluation
        warmup_steps=0,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        load_best_model_at_end=True,
        # load the best model when finished training (default metric is loss)    # but you can specify `metric_for_best_model` argument to change to accuracy or other metric
        logging_steps=1,  # log & save weights each logging_steps
        evaluation_strategy="epoch",  # evaluate each `logging_steps`
        learning_rate=2e-5,
        save_strategy='epoch',
        save_total_limit=5,
        metric_for_best_model='f1'
    )

    trainer = Trainer(
        model=model,  # the instantiated Transformers model to be trained
        args=training_args,  # training arguments, defined above
        train_dataset=train_dataset,  # training dataset
        eval_dataset=valid_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # the callback that computes metrics of interest
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)]
    )

    # trainer = Trainer(
    #     model=model,  # the instantiated Transformers model to be trained
    #     args=training_args,  # training arguments, defined above
    #     train_dataset=train_dataset,  # training dataset
    #     eval_dataset=valid_dataset,  # evaluation dataset
    #     compute_metrics=compute_metrics
    # )

    print('[{}] start training...'.format(get_ts()))
    trainer.train()

    info_state_model = trainer.evaluate()
    print('[{}] finish training.'.format(get_ts()))

    ################## start to do eval ##################

    test_pairs, _ = read_passages(path_test, 0)
    test_dataset = SimpleDataset(test_pairs)

    list_conf_output = []
    list_label_output = []
    list_answer_output = []

    # INDEXS_LABELS = [3231, 3698, 4031, 4044, 5310, 6502, 6922]
    # LABELS = ['testcase', 'targetvm', 'product', 'nimbus', 'usererror', 'infra', 'testbed']
    
    for test_tokenized in test_dataset:
        
        with torch.no_grad():
            # out = model(**input_tokenized, output_hidden_states=True, output_attentions=True)
            logits_all_token = predict_plus(test_tokenized, model)
        # end
        
        # logits_this: (512, 30522)
        input_ids = test_tokenized['input_ids']
        index_mask = (input_ids == 103).nonzero(as_tuple=True)[-1].tolist()[0]
        # print(index_mask)

        logits_target_token = logits_all_token[index_mask, :]  # 35535, tensor
        logits_this = torch.index_select(logits_target_token, 0, torch.LongTensor(INDEXS_LABELS))
        
        probas_evaluate = torch.nn.functional.softmax(logits_this, dim=-1)
        answer_evaluate = int(probas_evaluate.argmax())
        label_evaluate = LABELS[answer_evaluate]

        list_conf_output.append(probas_evaluate.tolist()[answer_evaluate])
        list_label_output.append(label_evaluate)
        
        code_origin = test_tokenized['labels'][index_mask].item()
        label_origin = DICT_CODE_LABEL[code_origin]
        list_answer_output.append(label_origin)
    # end

    print('[{}] finish testing.'.format(get_ts()))

    pairs_label_conf = [[a, b, c] for a, b, c in zip(list_label_output, list_conf_output, list_answer_output)]

    filename_output = f'output-{name_train}.json'
    path_file_output = os.path.join(path_output, filename_output)

    with open(path_file_output, 'w+') as file:
        file.write(json.dumps(pairs_label_conf))
    # end

    print('[{}] main_train_and_evaluate finished.'.format(get_ts()))
    

# end


In [2]:
path_folder_train = 'data_model_mlm'
path_output = 'output_noseed_mlm'

os.makedirs(path_output, exist_ok=True)

import numpy as np
import random
import torch

seed_val = 234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

import os
import subprocess

filenames = sorted([filename for filename in os.listdir(path_folder_train) if filename[0] != '.' and 'train' in filename])
filenames = filenames[:1]

for _filename in filenames:
    filename_base = _filename.split('_')[0]
    
    filename_test = f'{filename_base}_test.csv'
    filename_train = _filename
    
    
    path_train = os.path.join(path_folder_train, filename_train)
    path_test = os.path.join(path_folder_train, filename_test)
    
    main_train_and_evaluate(filename_base, path_train, path_test, path_output)

    subprocess.run("rm -rf results", shell=True)
    # subprocess.run("rm -rf mlruns", shell=True)
# end

[2023-08-24T11:24:28] start main_train_and_evaluate with data_model_mlm/202206171000_train_0.35_15.csv data_model_mlm/202206171000_test.csv


***** Running training *****
  Num examples = 32
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 16


[2023-08-24T11:24:39] start training...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,4.8115,4.540048,0.714345,0.637693,0.754855,0.477078


***** Running Evaluation *****
  Num examples = 32
  Batch size = 2
Saving model checkpoint to results/checkpoint-16
Configuration saved in results/checkpoint-16/config.json
Model weights saved in results/checkpoint-16/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-16 (score: 0.47707843954229356).
***** Running Evaluation *****
  Num examples = 32
  Batch size = 2


[2023-08-24T11:24:57] finish training.


KeyError: 2271

In [None]:
print('hello')