In [1]:
from data_loader import get_task1_conver, get_task2_conver, preprocess

In [2]:
import pandas as pd

In [3]:
# df = get_task1_conver("../Task1//annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [4]:
# df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [5]:
# df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [6]:
import sys
sys.path.append('..')

from utils import load_jsonl, dump_jsonl, set_random_seed

In [7]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import wandb
# from sklearn.metrics import precision_recall_fscore_support, classification_report
# from pythainlp.tokenize import word_tokenize
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer



In [8]:
from sklearn.utils import compute_class_weight
import torch.nn as nn
import os, shutil

def run_exp(out_dir, df, report="none"):

    set_random_seed()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device("cpu")
    print("START")
    print("step 1: load data")
    train, val, test = df
    
    #train = train.head(100)
    #val = val.head(100)
    #test = test.head(100)

    print("step 2: load tokenizer")
    model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["usr", "sys", "rep"]})

    print("step 3: init data")
    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(train)
    ds['val'] = Dataset.from_pandas(val)
    ds['test'] = Dataset.from_pandas(test)

    labels = train["label"].unique()
    num_labels = len(labels)
    print(labels)
    
    class_weights = compute_class_weight("balanced", classes=labels, y=train["label"].values)
    class_weights = torch.tensor(class_weights).float().to(device)
    
    id2label = {i:l for i, l in enumerate(labels)}
    label2id = {l:i for i, l in enumerate(labels)}

    def word_tokenize(d, tokenizer=None, label2id=None, max_length=256):
        texts = [preprocess(t) for t in d["text"]]
#         print(texts)
        tokens = tokenizer(texts, truncation=True, max_length=max_length)
        num = [len(t) for t in tokens["input_ids"]]
#         print(num)
#         print("AVG", len(num), sum(num)/len(num))
        tokens["label"] = [label2id[label] for label in d["label"]]
        return tokens

    tokenized_ds = ds.map(word_tokenize, batched=True, fn_kwargs={"tokenizer":tokenizer, "label2id": label2id, "max_length":max_length})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("step 4: load model")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id);
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)

    metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return metrics.compute(predictions=predictions, references=labels, average="macro")



    print("step 5: fine-tune")
    
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to=report,
        metric_for_best_model="f1",
        save_total_limit=2,
        load_best_model_at_end=True,
        push_to_hub=False,
        run_name=out_dir,
    )
    
    class CustomTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            labels = inputs.get("labels")
            # forward pass
            outputs = model(**inputs)
            logits = outputs.get("logits")
            
            loss_fct = nn.CrossEntropyLoss(weight=class_weights)
            loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
            return (loss, outputs) if return_outputs else loss

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,   
    )

    trainer.train()
    best_ckpt_path = trainer.state.best_model_checkpoint
    print(best_ckpt_path)

    modle_out_path = out_dir+"/best_model"
    if os.path.exists(modle_out_path):
        shutil.rmtree(modle_out_path)
        
    os.rename(best_ckpt_path, modle_out_path)
    best_ckpt_path = modle_out_path
    
    print("step 6: evaluate")
    e = trainer.evaluate(tokenized_ds["test"])
    print(e)

    print("DONE")

## Task1: Train Model

In [9]:
report = "none"
batch_size = 16
max_length = 128
num_epochs = 5

In [10]:
# import os
# stream = os.popen('nohup python3 run_train_task_classifier.py > train2.out &')
# output = stream.read()
# output

In [11]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# df = (df[0].head(), df[1].head(), df[2].head())
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()
run_exp("./Models/task1_clse_usr", df, report=report)

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1096 60 60
START
step 1: load data
step 2: load tokenizer
step 3: init data
['1. Close' "3. Don't know each other" '2. Know each other']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.858184,0.65,0.49011,0.484416,0.598148
2,No log,0.810594,0.716667,0.546458,0.542588,0.648148
3,No log,0.703888,0.716667,0.540055,0.529085,0.605556
4,No log,0.655914,0.7,0.548822,0.545139,0.618519
5,No log,0.640477,0.733333,0.554167,0.540476,0.612963


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
Saving model checkpoint to ./Models/task1_clse_usr/checkpoint-69
Configuration saved in ./Models/task1_clse_usr/checkpoint-69/config.json
Model weights saved in ./Models/task1_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task1_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task1_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task1_clse_usr/checkpoint-345
step 6: evaluate


{'eval_loss': 0.8123156428337097, 'eval_accuracy': 0.6666666666666666, 'eval_f1': 0.3947746195808211, 'eval_precision': 0.4102564102564103, 'eval_recall': 0.3871158392434988, 'eval_runtime': 0.6838, 'eval_samples_per_second': 87.749, 'eval_steps_per_second': 5.85, 'epoch': 5.0}
DONE


In [12]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "authority", skips = ["3. Not respect"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()
run_exp("./Models/task1_auth_usr", df, report=report)

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1098 61 61
START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['2. Normal' '0. Very respect' '1. Respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Normal",
    "1": "0. Very respect",
    "2": "1. Respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "0. Very respect": 1,
    "1. Respect": 2,
    "2. Normal": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_toke

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.101633,0.344262,0.328449,0.417647,0.396693
2,No log,1.071011,0.52459,0.348709,0.365801,0.349735
3,No log,1.119959,0.540984,0.391765,0.424339,0.405556
4,No log,1.090216,0.557377,0.439948,0.448686,0.442063
5,No log,1.075473,0.540984,0.412387,0.417262,0.409392


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
Saving model checkpoint to ./Models/task1_auth_usr/checkpoint-69
Configuration saved in ./Models/task1_auth_usr/checkpoint-69/config.json
Model weights saved in ./Models/task1_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task1_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task1_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task1_auth_usr/checkpoint-276
step 6: evaluate


{'eval_loss': 1.1755664348602295, 'eval_accuracy': 0.3770491803278688, 'eval_f1': 0.3663824504160638, 'eval_precision': 0.37103174603174605, 'eval_recall': 0.3637037037037037, 'eval_runtime': 0.7009, 'eval_samples_per_second': 87.035, 'eval_steps_per_second': 5.707, 'epoch': 5.0}
DONE


## Task2: Train Model

In [13]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

run_exp("./Models/task2_clse_usr", df, report=report)

Loaded 2463 records from ../Task2/annotated/annotated.jsonl
N 1495 186 186
START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
["3. Don't know each other" '2. Know each other' '1. Close']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "3. Don't know each other",
    "1": "2. Know each other",
    "2": "1. Close"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Close": 2,
    "2. Know each other": 1,
    "3. Don't know each other": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.100057,0.311828,0.29483,0.426487,0.404218
2,No log,0.998645,0.569892,0.435973,0.470728,0.462818
3,No log,0.99674,0.586022,0.457904,0.491238,0.479289
4,No log,1.04607,0.634409,0.454364,0.487032,0.477223
5,No log,1.0066,0.655914,0.482706,0.494429,0.515221


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 186
  Batch size = 16
Saving model checkpoint to ./Models/task2_clse_usr/checkpoint-94
Configuration saved in ./Models/task2_clse_usr/checkpoint-94/config.json
Model weights saved in ./Models/task2_clse_usr/checkpoint-94/pytorch_model.bin
tokenizer config file saved in ./Models/task2_clse_usr/checkpoint-94/tokenizer_config.json
Special tokens file saved in ./Models/task2_clse_usr/checkpoint-94/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this mess

./Models/task2_clse_usr/checkpoint-470
step 6: evaluate


{'eval_loss': 1.0347660779953003, 'eval_accuracy': 0.6182795698924731, 'eval_f1': 0.4694461789009754, 'eval_precision': 0.4980901451489687, 'eval_recall': 0.47104668381264125, 'eval_runtime': 1.8693, 'eval_samples_per_second': 99.505, 'eval_steps_per_second': 6.42, 'epoch': 5.0}
DONE


In [14]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

run_exp("./Models/task2_auth_usr", df, report=report)

Loaded 2463 records from ../Task2/annotated/annotated.jsonl
N 1642 205 205
START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['2. Normal' '3. Not respect' '1. Respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "2. Normal",
    "1": "3. Not respect",
    "2": "1. Respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Respect": 2,
    "2. Normal": 0,
    "3. Not respect": 1
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.581241,0.663415,0.634901,0.640859,0.692504
2,No log,0.43069,0.829268,0.787835,0.776331,0.802583
3,No log,0.593266,0.795122,0.759779,0.764052,0.787101
4,No log,0.51617,0.839024,0.79587,0.794421,0.806861
5,0.523600,0.565616,0.809756,0.76968,0.768041,0.794113


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 205
  Batch size = 16
Saving model checkpoint to ./Models/task2_auth_usr/checkpoint-103
Configuration saved in ./Models/task2_auth_usr/checkpoint-103/config.json
Model weights saved in ./Models/task2_auth_usr/checkpoint-103/pytorch_model.bin
tokenizer config file saved in ./Models/task2_auth_usr/checkpoint-103/tokenizer_config.json
Special tokens file saved in ./Models/task2_auth_usr/checkpoint-103/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this

./Models/task2_auth_usr/checkpoint-412
step 6: evaluate


{'eval_loss': 0.8025819063186646, 'eval_accuracy': 0.7707317073170732, 'eval_f1': 0.7648739164696611, 'eval_precision': 0.7783709434773266, 'eval_recall': 0.7665112665112664, 'eval_runtime': 2.0915, 'eval_samples_per_second': 98.016, 'eval_steps_per_second': 6.216, 'epoch': 5.0}
DONE


## Task3: Train Model

In [11]:
df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()
run_exp("./Models/task3_clse_usr", df, report=report)

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1090 60 60
START
step 1: load data
step 2: load tokenizer
step 3: init data
['1. Close' '2. Know each other' "3. Don't know each other"]


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.970821,0.683333,0.439338,0.425,0.454718
2,No log,0.810533,0.733333,0.493582,0.478114,0.524419
3,No log,0.850519,0.766667,0.517598,0.500561,0.550972
4,No log,0.779988,0.766667,0.629574,0.81886,0.591157
5,No log,0.753837,0.75,0.674884,0.701502,0.656472


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 60
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Models/task3_clse_usr/checkpoint-69
Configuration saved in ./Models/task3_clse_usr/checkpoint-69/config.json
Model weights saved in ./Models/task3_clse_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task3_clse_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task3_clse_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceCla

./Models/task3_clse_usr/checkpoint-345
step 6: evaluate


{'eval_loss': 0.6559752821922302, 'eval_accuracy': 0.65, 'eval_f1': 0.38003565062388595, 'eval_precision': 0.39829059829059826, 'eval_recall': 0.375, 'eval_runtime': 0.6718, 'eval_samples_per_second': 89.313, 'eval_steps_per_second': 5.954, 'epoch': 5.0}
DONE


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
df = get_task1_conver("../Task3/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()
run_exp("./Models/task3_auth_usr", df, report=report)

Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1099 61 61
START
step 1: load data
step 2: load tokenizer


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 25005
}

loading file sentencepi

step 3: init data
['1. Respect' '2. Normal' '3. Not respect']


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


loading configuration file config.json from cache at /home/imtk/.cache/huggingface/hub/models--airesearch--wangchanberta-base-att-spm-uncased/snapshots/b81d38df6b4755dbedec0bfea863c9956cbb963e/config.json
Model config CamembertConfig {
  "_name_or_path": "airesearch/wangchanberta-base-att-spm-uncased",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1. Respect",
    "1": "2. Normal",
    "2": "3. Not respect"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1. Respect": 0,
    "2. Normal": 1,
    "3. Not respect": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "camembert",
  "num_attention_head": 12,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.905223,0.622951,0.516219,0.504006,0.655674
2,No log,0.957578,0.52459,0.448141,0.464806,0.613121
3,No log,0.85701,0.737705,0.589075,0.576907,0.679078
4,No log,0.798238,0.754098,0.630644,0.593567,0.738652
5,No log,0.866463,0.819672,0.702891,0.658333,0.793262


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
Saving model checkpoint to ./Models/task3_auth_usr/checkpoint-69
Configuration saved in ./Models/task3_auth_usr/checkpoint-69/config.json
Model weights saved in ./Models/task3_auth_usr/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Models/task3_auth_usr/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Models/task3_auth_usr/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this messa

./Models/task3_auth_usr/checkpoint-345
step 6: evaluate


{'eval_loss': 0.2966509163379669, 'eval_accuracy': 0.819672131147541, 'eval_f1': 0.813040293040293, 'eval_precision': 0.795791487326638, 'eval_recall': 0.8916666666666666, 'eval_runtime': 0.7108, 'eval_samples_per_second': 85.819, 'eval_steps_per_second': 5.627, 'epoch': 5.0}
DONE
