In [1]:
from data_loader import get_task1_conver, get_task2_conver, preprocess

In [2]:
import pandas as pd

In [3]:
# df = get_task1_conver("../Task1//annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [4]:
# df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [5]:
# df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [6]:
import sys
sys.path.append('..')

from utils import load_jsonl, dump_jsonl, set_random_seed

In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import wandb
# from sklearn.metrics import precision_recall_fscore_support, classification_report
# from pythainlp.tokenize import word_tokenize
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer



In [8]:
from sklearn.utils import compute_class_weight
import torch.nn as nn
import os, shutil
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import mean_squared_error, r2_score

def run_exp(out_dir, df, report="none", regressor_configs=None):

    set_random_seed()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device("cpu")
    print("START")
    print("step 1: load data")
    train, val, test = df
    
#     train = train.head(100)
#     val = val.head(100)
#     test = test.head(100)

    print("step 2: load tokenizer")
    model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["usr", "sys", "rep"]})

    print("step 3: init data")
    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(train)
    ds['val'] = Dataset.from_pandas(val)
    ds['test'] = Dataset.from_pandas(test)

    if regressor_configs is None:
        labels = train["label"].unique()
        num_labels = len(labels)
        print(labels)
    
        class_weights = compute_class_weight("balanced", classes=labels, y=train["label"].values)
        class_weights = torch.tensor(class_weights).float().to(device)

        id2label = {i:l for i, l in enumerate(labels)}
        label2id = {l:i for i, l in enumerate(labels)}

        def word_tokenize(d, tokenizer=None, label2id=None, max_length=256):
            texts = [preprocess(t) for t in d["text"]]
    #         print(texts)
            tokens = tokenizer(texts, truncation=True, max_length=max_length)
            num = [len(t) for t in tokens["input_ids"]]
    #         print(num)
    #         print("AVG", len(num), sum(num)/len(num))
            tokens["label"] = [label2id[label] for label in d["label"]]
            return tokens
    else:
#         labels = train["label"].unique()
        num_labels = 1

        id2label = {1: regressor_configs["label"]}
        label2id = [regressor_configs["not_label"], regressor_configs["label"]]

        def word_tokenize(d, tokenizer=None, label2id=None, max_length=256):
            texts = [preprocess(t) for t in d["text"]]
    #         print(texts)
            tokens = tokenizer(texts, truncation=True, max_length=max_length)
            num = [len(t) for t in tokens["input_ids"]]
    #         print(num)
    #         print("AVG", len(num), sum(num)/len(num))
            tokens["label"] = [regressor_configs["label_fn"](label) for label in d["label"]]
            return tokens
        
        
        
    tokenized_ds = ds.map(word_tokenize, batched=True, fn_kwargs={"tokenizer":tokenizer, "label2id": label2id, "max_length":max_length})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("step 4: load model")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id);
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)

    if regressor_configs is None:
        metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return metrics.compute(predictions=predictions, references=labels, average="macro")
    else:
        label_fn = regressor_configs["label_fn"]
        def compute_metrics(eval_pred):           
            predictions, actual = eval_pred
            predictions = predictions.reshape(-1)
            
            predicted_labels = [label_fn(p) for p in predictions]
            actual_labels = [label_fn(p) for p in actual]
            p, r, f1, _ = precision_recall_fscore_support(actual_labels, predicted_labels, average='macro')
            
            return {
                "r2_score": r2_score(actual, predictions),
                "mean_squared_error": np.sqrt(mean_squared_error(actual, predictions)),
                "accuracy": accuracy_score(actual_labels, predicted_labels),
                "f1": f1,
                "precision": p,
                "recall": r,
            }


    print("step 5: fine-tune")
    
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to=report,
        metric_for_best_model="f1",
        save_total_limit=2,
        load_best_model_at_end=True,
        push_to_hub=False,
        run_name=out_dir,
    )
    
    if regressor_configs is None:
        class CustomTrainer(Trainer):
            def compute_loss(self, model, inputs, return_outputs=False):
                labels = inputs.get("labels")
                # forward pass
                outputs = model(**inputs)

                logits = outputs.get("logits")

                loss_fct = nn.CrossEntropyLoss(weight=class_weights)
                loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
                return (loss, outputs) if return_outputs else loss
    else:
        CustomTrainer = Trainer
        
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,   
    )

    trainer.train()
    best_ckpt_path = trainer.state.best_model_checkpoint
    print(best_ckpt_path)

    modle_out_path = out_dir+"/best_model"
    if os.path.exists(modle_out_path):
        shutil.rmtree(modle_out_path)
        
    os.rename(best_ckpt_path, modle_out_path)
    best_ckpt_path = modle_out_path
    
    print("step 6: evaluate")
    e = trainer.evaluate(tokenized_ds["test"])
    print(e)

    print("DONE")

## Task1: Train Model

In [9]:
report = "none"
batch_size = 16
max_length = 128
num_epochs = 20

In [10]:
# import os
# stream = os.popen('nohup python3 run_train_task_classifier.py > train2.out &')
# output = stream.read()
# output

In [11]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# df = (df[0].head(), df[1].head(), df[2].head())
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1096 60 60


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,551
2. Know each other,230
3. Don't know each other,435


In [12]:
# df[0]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [13]:
# df[1]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [14]:
# df[2]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [15]:
def closeness_label_fn(label):
    if label == '1. Close':
        return 1
    elif label =='2. Know each other':
        return 0.5
    elif label == "3. Don't know each other":
        return 0
    elif type(label)==str:
        assert(False)
    
    # [0, 0.33) =>
    # [0.33, 0.66) =>
    # [0.66, 1] =>
    
    if label > 0.66:
        return '1. Close'
    elif label > 0.33:
        return '2. Know each other'
    else:
        return "3. Don't know each other"
    
run_exp("./Regressors/task1_clse_usr", df, report=report, regressor_configs={
    "label": "close",
    "not_label": "not_close",
    "label_fn": closeness_label_fn,
})

START
step 1: load data
step 2: load tokenizer
step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.124605,0.408794,0.352994,0.7,0.671802,0.722222,0.727686
2,No log,0.109696,0.479533,0.331203,0.666667,0.57445,0.577886,0.573039
3,No log,0.0862,0.591012,0.293598,0.733333,0.666497,0.666473,0.67351
4,No log,0.091351,0.566574,0.302243,0.7,0.564348,0.583403,0.563952
5,No log,0.09563,0.546271,0.309241,0.716667,0.653266,0.655704,0.664194
6,No log,0.085599,0.593864,0.292573,0.733333,0.649784,0.654401,0.648378
7,No log,0.097556,0.537132,0.312339,0.716667,0.608116,0.62338,0.603577
8,0.090700,0.088962,0.577906,0.298265,0.7,0.603046,0.61348,0.596848
9,0.090700,0.084994,0.596731,0.291538,0.7,0.641753,0.655556,0.647113
10,0.090700,0.081809,0.611845,0.286023,0.733333,0.666949,0.674169,0.670922


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
Saving model checkpoint to ./Regressors/task1_clse_usr/checkpoint-69
Configuration saved in ./Regressors/task1_clse_usr/checkpoint-69/config.json
Model weights saved in ./Regressors/task1_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Regressors/task1_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Regressors/task1_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safe

./Regressors/task1_clse_usr/checkpoint-1380
step 6: evaluate


{'eval_loss': 0.06106751784682274, 'eval_r2_score': 0.7007920664827132, 'eval_mean_squared_error': 0.24711842834949493, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.7307692307692308, 'eval_precision': 0.7509803921568627, 'eval_recall': 0.7229390681003585, 'eval_runtime': 0.6888, 'eval_samples_per_second': 87.11, 'eval_steps_per_second': 5.807, 'epoch': 20.0}
DONE


In [16]:
run_exp("./Models/task1_clse_usr", df, report=report)

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['1. Close' "3. Don't know each other" '2. Know each other']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1. Close",
    "1": "3. Don't know each other",
    "2": "2. Know each other"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Close": 0,
    "2. Know each other": 2,
    "3. Don't know each other": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.858159,0.733333,0.52992,0.49537,0.572981
2,No log,0.765787,0.716667,0.610147,0.612001,0.611341
3,No log,0.744573,0.75,0.661561,0.668779,0.657695
4,No log,0.760221,0.683333,0.642628,0.664021,0.662928
5,No log,0.688059,0.8,0.724808,0.723765,0.726305
6,No log,0.786535,0.783333,0.623859,0.631944,0.636416
7,No log,0.731113,0.733333,0.681634,0.695833,0.696055
8,0.625600,0.803274,0.7,0.654447,0.680693,0.677421
9,0.625600,0.842892,0.733333,0.68265,0.695609,0.698643
10,0.625600,1.013666,0.716667,0.679365,0.706539,0.706694


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Models/task1_clse_usr/checkpoint-69
Configuration saved in ./Models/task1_clse_usr/checkpoint-69/config.json
Model weights saved in ./Models/task1_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task1_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task1_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceCla

./Models/task1_clse_usr/checkpoint-345
step 6: evaluate


{'eval_loss': 0.6723384857177734, 'eval_accuracy': 0.75, 'eval_f1': 0.6599640125955916, 'eval_precision': 0.6772058823529411, 'eval_recall': 0.6514336917562724, 'eval_runtime': 0.7019, 'eval_samples_per_second': 85.486, 'eval_steps_per_second': 5.699, 'epoch': 20.0}
DONE


In [17]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "authority", skips = ["3. Not respect"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1098 61 61


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0. Very respect,248
1. Respect,289
2. Normal,683


In [18]:
def authority_label_fn(label):
    if label == '0. Very respect':
        return 1
    elif label =='1. Respect':
        return 0.5
    elif label == "2. Normal":
        return 0
    elif type(label)==str:
        assert(False)
    
    # [0, 0.33) =>
    # [0.33, 0.66) =>
    # [0.66, 1] =>
    
    if label > 0.66:
        return '0. Very respect'
    elif label > 0.33:
        return '1. Respect'
    else:
        return "2. Normal"
    
run_exp("./Regressors/task1_auth_usr", df, report=report, regressor_configs={
    "label": "respect",
    "not_label": "not_respect",
    "label_fn": authority_label_fn,
})

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": [
    "not_respect",
    "respect"
  ],
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.140429,0.099854,0.374738,0.491803,0.327916,0.29955,0.368627
2,No log,0.137052,0.121501,0.370205,0.47541,0.387336,0.662142,0.426471
3,No log,0.139538,0.105565,0.373547,0.540984,0.378151,0.346405,0.435294
4,No log,0.153806,0.014104,0.392181,0.377049,0.374275,0.561111,0.428431
5,No log,0.143608,0.079474,0.378957,0.47541,0.468989,0.597222,0.517647
6,No log,0.132371,0.151505,0.363828,0.491803,0.261672,0.229483,0.306536
7,No log,0.122363,0.215651,0.349805,0.590164,0.473151,0.648268,0.456536
8,0.171400,0.131772,0.15534,0.363005,0.606557,0.424558,0.677019,0.44281
9,0.171400,0.119984,0.230905,0.346387,0.606557,0.557239,0.585317,0.551961
10,0.171400,0.128107,0.178832,0.357921,0.57377,0.493243,0.636765,0.464706


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Regressors/task1_auth_usr/checkpoint-69
Configuration saved in ./Regressors/task1_auth_usr/checkpoint-69/config.json
Model weights saved in ./Regressors/task1_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Regressors/task1_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Regressors/task1_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `Cam

./Regressors/task1_auth_usr/checkpoint-1380


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16


step 6: evaluate


{'eval_loss': 0.16568411886692047, 'eval_r2_score': 0.04342795844278746, 'eval_mean_squared_error': 0.4070431590080261, 'eval_accuracy': 0.5081967213114754, 'eval_f1': 0.4935711631363806, 'eval_precision': 0.5576190476190477, 'eval_recall': 0.4875, 'eval_runtime': 0.7216, 'eval_samples_per_second': 84.537, 'eval_steps_per_second': 5.543, 'epoch': 20.0}
DONE


In [19]:
run_exp("./Models/task1_auth_usr", df, report=report)

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['1. Respect' '2. Normal' '0. Very respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1. Respect",
    "1": "2. Normal",
    "2": "0. Very respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0. Very respect": 2,
    "1. Respect": 0,
    "2. Normal": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_toke

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.069297,0.311475,0.220588,0.365828,0.360131
2,No log,1.033785,0.590164,0.460894,0.481611,0.463399
3,No log,1.000865,0.57377,0.396011,0.35303,0.45098
4,No log,0.897825,0.52459,0.512021,0.517874,0.538889
5,No log,0.873287,0.590164,0.55914,0.55,0.589216
6,No log,0.928451,0.590164,0.536448,0.551012,0.529739
7,No log,1.029924,0.508197,0.489947,0.508718,0.505556
8,0.880100,0.983697,0.57377,0.538847,0.556999,0.544771
9,0.880100,1.087047,0.540984,0.517507,0.534872,0.537582
10,0.880100,1.443468,0.672131,0.545513,0.663702,0.530392


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
Saving model checkpoint to ./Models/task1_auth_usr/checkpoint-69
Configuration saved in ./Models/task1_auth_usr/checkpoint-69/config.json
Model weights saved in ./Models/task1_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task1_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task1_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task1_auth_usr/checkpoint-345
step 6: evaluate


{'eval_loss': 1.0375056266784668, 'eval_accuracy': 0.47540983606557374, 'eval_f1': 0.4509031198686371, 'eval_precision': 0.4423076923076923, 'eval_recall': 0.47212301587301586, 'eval_runtime': 0.7083, 'eval_samples_per_second': 86.125, 'eval_steps_per_second': 5.648, 'epoch': 20.0}
DONE


## Task2: Train Model

In [20]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1495 186 186


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,222
2. Know each other,158
3. Don't know each other,1487


In [21]:
run_exp("./Regressors/task2_clse_usr", df, report=report, regressor_configs={
    "label": "close",
    "not_label": "not_close",
    "label_fn": closeness_label_fn,
})

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "close"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": [
    "not_close",
    "close"
  ],
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "ty

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.133882,-0.063556,0.365899,0.747312,0.312443,0.294697,0.342908
2,No log,0.111323,0.115654,0.333651,0.623656,0.313614,0.615624,0.313806
3,No log,0.109697,0.128568,0.331206,0.715054,0.440014,0.677027,0.447447
4,No log,0.105042,0.165552,0.324101,0.731183,0.428345,0.667756,0.425934
5,No log,0.109765,0.128033,0.331307,0.741935,0.430898,0.584087,0.409693
6,0.108500,0.097504,0.225434,0.312256,0.704301,0.471415,0.560479,0.47896
7,0.108500,0.103255,0.179748,0.321333,0.709677,0.486842,0.572759,0.474657
8,0.108500,0.112339,0.107585,0.33517,0.758065,0.464167,0.581491,0.44539
9,0.108500,0.099276,0.211352,0.315081,0.736559,0.464612,0.578886,0.450236
10,0.108500,0.108264,0.139956,0.329035,0.774194,0.467582,0.631337,0.459149


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 186
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Regressors/task2_clse_usr/checkpoint-94
Configuration saved in ./Regressors/task2_clse_usr/checkpoint-94/config.json
Model weights saved in ./Regressors/task2_clse_usr/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./Regressors/task2_clse_usr/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./Regressors/task2_clse_usr/checkpoint-94/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `Ca

./Regressors/task2_clse_usr/checkpoint-1128
step 6: evaluate


{'eval_loss': 0.1304672509431839, 'eval_r2_score': -0.03636878056182069, 'eval_mean_squared_error': 0.3612024784088135, 'eval_accuracy': 0.7741935483870968, 'eval_f1': 0.42439560439560436, 'eval_precision': 0.47797578472425095, 'eval_recall': 0.420612944102877, 'eval_runtime': 1.8704, 'eval_samples_per_second': 99.444, 'eval_steps_per_second': 6.416, 'epoch': 20.0}
DONE


In [22]:
run_exp("./Models/task2_clse_usr", df, report=report)

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['2. Know each other' "3. Don't know each other" '1. Close']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Know each other",
    "1": "3. Don't know each other",
    "2": "1. Close"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Close": 2,
    "2. Know each other": 0,
    "3. Don't know each other": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.017085,0.38172,0.333442,0.356964,0.43487
2,No log,1.078606,0.473118,0.359277,0.382293,0.415485
3,No log,1.038162,0.747312,0.494355,0.652848,0.4626
4,No log,1.042971,0.505376,0.401805,0.410867,0.468274
5,No log,1.114519,0.473118,0.334397,0.397588,0.430213
6,1.020500,1.127517,0.424731,0.32673,0.411377,0.434208
7,1.020500,0.980263,0.634409,0.503602,0.4903,0.571253
8,1.020500,1.004971,0.521505,0.433274,0.449022,0.529243
9,1.020500,1.02572,0.741935,0.539372,0.544749,0.536052
10,1.020500,0.855173,0.623656,0.50772,0.502529,0.58513


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 186
  Batch size = 16
Saving model checkpoint to ./Models/task2_clse_usr/checkpoint-94
Configuration saved in ./Models/task2_clse_usr/checkpoint-94/config.json
Model weights saved in ./Models/task2_clse_usr/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./Models/task2_clse_usr/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./Models/task2_clse_usr/checkpoint-94/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this mess

./Models/task2_clse_usr/checkpoint-1316
step 6: evaluate


{'eval_loss': 0.8854197859764099, 'eval_accuracy': 0.7473118279569892, 'eval_f1': 0.54585326953748, 'eval_precision': 0.5300235036543354, 'eval_recall': 0.5819761893587396, 'eval_runtime': 1.8457, 'eval_samples_per_second': 100.772, 'eval_steps_per_second': 6.501, 'epoch': 20.0}
DONE


In [23]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1876 234 234


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Respect,319
2. Normal,1661
3. Not respect,364


In [24]:
def authority2_label_fn(label):
    if label == '1. Respect':
        return 1
    elif label =='2. Normal':
        return 0.5
    elif label == "3. Not respect":
        return 0
    elif type(label)==str:
        assert(False)
    
    if label > 0.66:
        return '1. Respect'
    elif label > 0.33:
        return '2. Normal'
    else:
        return "3. Not respect"

run_exp("./Regressors/task2_auth_usr", df, report=report, regressor_configs={
    "label": "respect",
    "not_label": "not_respect",
    "label_fn": authority2_label_fn,
})

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": [
    "not_respect",
    "respect"
  ],
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.057401,0.278662,0.239585,0.675214,0.601399,0.591984,0.635712
2,No log,0.048853,0.386083,0.221027,0.705128,0.633363,0.629526,0.683342
3,No log,0.041869,0.473841,0.20462,0.752137,0.702352,0.679728,0.737997
4,No log,0.041747,0.475382,0.204321,0.752137,0.702102,0.679753,0.743327
5,0.066500,0.041258,0.481532,0.203119,0.786325,0.751808,0.718938,0.805342
6,0.066500,0.039422,0.5046,0.198549,0.764957,0.75021,0.739035,0.78654
7,0.066500,0.042149,0.470325,0.205303,0.747863,0.677238,0.676239,0.682663
8,0.066500,0.063887,0.197155,0.252759,0.619658,0.551315,0.62886,0.605142
9,0.046200,0.039107,0.50855,0.197756,0.794872,0.747337,0.732637,0.767282
10,0.046200,0.040081,0.496322,0.200201,0.730769,0.67922,0.679959,0.727189


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 234
  Batch size = 16
Saving model checkpoint to ./Regressors/task2_auth_usr/checkpoint-118
Configuration saved in ./Regressors/task2_auth_usr/checkpoint-118/config.json
Model weights saved in ./Regressors/task2_auth_usr/checkpoint-118/pytorch_model.bin
tokenizer config file saved in ./Regressors/task2_auth_usr/checkpoint-118/tokenizer_config.json
Special tokens file saved in ./Regressors/task2_auth_usr/checkpoint-118/special_tokens_map.json
Deleting older checkpoint [Regressors/task2_auth_usr/checkpoint-1652] due to args.save_total_limit
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have be

./Regressors/task2_auth_usr/checkpoint-1652
step 6: evaluate


{'eval_loss': 0.03201077878475189, 'eval_r2_score': 0.5525477856687152, 'eval_mean_squared_error': 0.17891556024551392, 'eval_accuracy': 0.7905982905982906, 'eval_f1': 0.7337138345303059, 'eval_precision': 0.6999205876513798, 'eval_recall': 0.7869849586541203, 'eval_runtime': 2.3902, 'eval_samples_per_second': 97.9, 'eval_steps_per_second': 6.276, 'epoch': 20.0}
DONE


In [25]:
run_exp("./Models/task2_auth_usr", df, report=report)

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}



START
step 1: load data
step 2: load tokenizer


loading file sentencepiece.bpe.model from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/sentencepiece.bpe.model
loading file tokenizer.json from cache at None
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/tokenizer_config.json
loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "cla

step 3: init data
['2. Normal' '3. Not respect' '1. Respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Normal",
    "1": "3. Not respect",
    "2": "1. Respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Respect": 2,
    "2. Normal": 0,
    "3. Not respect": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.587866,0.773504,0.723465,0.705228,0.754135
2,No log,0.596551,0.790598,0.74023,0.727321,0.770515
3,No log,0.595517,0.816239,0.773542,0.764795,0.78342
4,No log,0.662912,0.811966,0.777886,0.74772,0.825915
5,0.427700,1.589213,0.833333,0.760353,0.839275,0.724265
6,0.427700,1.182574,0.816239,0.775388,0.759325,0.794731
7,0.427700,1.413458,0.824786,0.770465,0.784708,0.76499
8,0.427700,1.693525,0.833333,0.7717,0.814347,0.749552
9,0.088400,1.364664,0.84188,0.802448,0.79816,0.807309
10,0.088400,1.689437,0.82906,0.768159,0.801699,0.755776


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 234
  Batch size = 16
Saving model checkpoint to ./Models/task2_auth_usr/checkpoint-118
Configuration saved in ./Models/task2_auth_usr/checkpoint-118/config.json
Model weights saved in ./Models/task2_auth_usr/checkpoint-118/pytorch_model.bin
tokenizer config file saved in ./Models/task2_auth_usr/checkpoint-118/tokenizer_config.json
Special tokens file saved in ./Models/task2_auth_usr/checkpoint-118/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this

./Models/task2_auth_usr/checkpoint-1062
step 6: evaluate


{'eval_loss': 1.2533892393112183, 'eval_accuracy': 0.8247863247863247, 'eval_f1': 0.7676337930756904, 'eval_precision': 0.7615776669130327, 'eval_recall': 0.7803696179070431, 'eval_runtime': 2.3852, 'eval_samples_per_second': 98.104, 'eval_steps_per_second': 6.289, 'epoch': 20.0}
DONE


## Task3: Train Model

In [26]:
df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()


Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1090 60 60


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,462
2. Know each other,696
3. Don't know each other,52


In [27]:
run_exp("./Regressors/task3_clse_usr", df, report=report, regressor_configs={
    "label": "close",
    "not_label": "not_close",
    "label_fn": closeness_label_fn,
})

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "close"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": [
    "not_close",
    "close"
  ],
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "ty

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.150253,-0.809065,0.387625,0.483333,0.294074,0.442593,0.310833
2,No log,0.062383,0.248898,0.249766,0.7,0.50553,0.519231,0.49875
3,No log,0.08397,-0.011008,0.289776,0.6,0.498506,0.496296,0.510694
4,No log,0.114989,-0.384482,0.3391,0.416667,0.196078,0.138889,0.333333
5,No log,0.086265,-0.038646,0.29371,0.433333,0.218615,0.474576,0.34375
6,No log,0.093363,-0.124098,0.305553,0.416667,0.196078,0.138889,0.333333
7,No log,0.07833,0.056896,0.279875,0.533333,0.231884,0.177778,0.333333
8,0.094400,0.084721,-0.020058,0.291069,0.533333,0.231884,0.177778,0.333333
9,0.094400,0.083792,-0.008863,0.289468,0.533333,0.231884,0.177778,0.333333
10,0.094400,0.088217,-0.062139,0.297013,0.533333,0.231884,0.177778,0.333333


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
Saving model checkpoint to ./Regressors/task3_clse_usr/checkpoint-69
Configuration saved in ./Regressors/task3_clse_usr/checkpoint-69/config.json
Model weights saved in ./Regressors/task3_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Regressors/task3_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Regressors/task3_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safe

./Regressors/task3_clse_usr/checkpoint-138
step 6: evaluate


{'eval_loss': 0.042262692004442215, 'eval_r2_score': 0.44217882703627065, 'eval_mean_squared_error': 0.20557895302772522, 'eval_accuracy': 0.7, 'eval_f1': 0.5786243386243387, 'eval_precision': 0.5782051282051283, 'eval_recall': 0.6183183183183183, 'eval_runtime': 0.6676, 'eval_samples_per_second': 89.875, 'eval_steps_per_second': 5.992, 'epoch': 20.0}
DONE


In [28]:
run_exp("./Models/task3_clse_usr", df, report=report)

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

Assigning ['usr', 'sys'

step 3: init data
['2. Know each other' "3. Don't know each other" '1. Close']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Know each other",
    "1": "3. Don't know each other",
    "2": "1. Close"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Close": 2,
    "2. Know each other": 0,
    "3. Don't know each other": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.754529,0.8,0.551553,0.555973,0.555417
2,No log,0.531545,0.816667,0.745177,0.729249,0.770139
3,No log,0.585955,0.816667,0.770333,0.776768,0.767222
4,No log,0.709013,0.8,0.678169,0.70915,0.659028
5,No log,0.730013,0.833333,0.782828,0.786587,0.780556
6,No log,0.726652,0.85,0.795683,0.794882,0.799722
7,No log,1.026091,0.816667,0.688889,0.729515,0.666528
8,0.482100,0.843036,0.85,0.795116,0.796717,0.793889
9,0.482100,1.051181,0.816667,0.770333,0.776768,0.767222
10,0.482100,1.253408,0.783333,0.665879,0.699172,0.645694


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
Saving model checkpoint to ./Models/task3_clse_usr/checkpoint-69
Configuration saved in ./Models/task3_clse_usr/checkpoint-69/config.json
Model weights saved in ./Models/task3_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task3_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task3_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task3_clse_usr/checkpoint-414
step 6: evaluate


{'eval_loss': 1.4971728324890137, 'eval_accuracy': 0.8333333333333334, 'eval_f1': 0.5735430157261795, 'eval_precision': 0.5594135802469136, 'eval_recall': 0.6036036036036037, 'eval_runtime': 0.676, 'eval_samples_per_second': 88.762, 'eval_steps_per_second': 5.917, 'epoch': 20.0}
DONE


In [29]:
df = get_task1_conver("../Task3/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()
run_exp("./Models/task3_auth_usr", df, report=report)

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1099 61 61
START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['2. Normal' '1. Respect' '3. Not respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Normal",
    "1": "1. Respect",
    "2": "3. Not respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Respect": 1,
    "2. Normal": 0,
    "3. Not respect": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.795773,0.885246,0.70933,0.726424,0.702614
2,No log,0.774079,0.803279,0.665107,0.629293,0.746732
3,No log,0.698042,0.737705,0.591258,0.615079,0.643791
4,No log,1.013291,0.803279,0.611772,0.627778,0.620915
5,No log,0.863391,0.803279,0.619062,0.599473,0.669935
6,No log,1.393473,0.803279,0.541474,0.577778,0.544118
7,No log,1.427778,0.836066,0.57906,0.628931,0.55719
8,0.445000,2.085276,0.786885,0.434119,0.413462,0.460784
9,0.445000,2.234012,0.786885,0.434119,0.413462,0.460784
10,0.445000,2.136647,0.836066,0.57906,0.628931,0.55719


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
Saving model checkpoint to ./Models/task3_auth_usr/checkpoint-69
Configuration saved in ./Models/task3_auth_usr/checkpoint-69/config.json
Model weights saved in ./Models/task3_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task3_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task3_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task3_auth_usr/checkpoint-69
step 6: evaluate


{'eval_loss': 0.584971010684967, 'eval_accuracy': 0.8852459016393442, 'eval_f1': 0.7664720600500416, 'eval_precision': 0.7757575757575758, 'eval_recall': 0.7592592592592592, 'eval_runtime': 0.7069, 'eval_samples_per_second': 86.292, 'eval_steps_per_second': 5.658, 'epoch': 20.0}
DONE


In [30]:
run_exp("./Regressors/task3_auth_usr", df, report=report, regressor_configs={
    "label": "respect",
    "not_label": "not_respect",
    "label_fn": authority2_label_fn,
})

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

Assigning ['usr', 'sys'

step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "1": "respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": [
    "not_respect",
    "respect"
  ],
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.085439,-1.098467,0.292299,0.360656,0.385193,0.539367,0.521242
2,No log,0.047561,-0.168145,0.218084,0.754098,0.522751,0.50528,0.552288
3,No log,0.046304,-0.137272,0.215183,0.836066,0.569164,0.610806,0.55719
4,No log,0.051438,-0.263368,0.226799,0.836066,0.624143,0.61619,0.633987
5,No log,0.079456,-0.951516,0.281879,0.770492,0.415429,0.393333,0.454248
6,No log,0.060507,-0.486114,0.245982,0.852459,0.651512,0.672269,0.640523
7,No log,0.044141,-0.084143,0.210097,0.786885,0.529444,0.565986,0.537582
8,0.043500,0.054235,-0.33208,0.232885,0.737705,0.498161,0.540691,0.517974
9,0.043500,0.059654,-0.465172,0.244242,0.803279,0.541474,0.577778,0.544118
10,0.043500,0.045618,-0.120429,0.213584,0.786885,0.60064,0.618096,0.614379


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
Saving model checkpoint to ./Regressors/task3_auth_usr/checkpoint-69
Configuration saved in ./Regressors/task3_auth_usr/checkpoint-69/config.json
Model weights saved in ./Regressors/task3_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Regressors/task3_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Regressors/task3_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safe

./Regressors/task3_auth_usr/checkpoint-414
step 6: evaluate


{'eval_loss': 0.026724383234977722, 'eval_r2_score': 0.010533113879875766, 'eval_mean_squared_error': 0.16347593069076538, 'eval_accuracy': 0.9016393442622951, 'eval_f1': 0.8148148148148149, 'eval_precision': 0.8148148148148149, 'eval_recall': 0.8148148148148149, 'eval_runtime': 0.715, 'eval_samples_per_second': 85.32, 'eval_steps_per_second': 5.595, 'epoch': 20.0}
DONE


In [31]:
"DOOOOOM"

'DOOOOOM'