In [34]:
from data_loader import get_task1_conver, get_task2_conver, preprocess

In [35]:
import pandas as pd

In [36]:
# df = get_task1_conver("../Task1//annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [37]:
# df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [38]:
# df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = [], only_user=False)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [39]:
import sys
sys.path.append('..')

from utils import load_jsonl, dump_jsonl, set_random_seed

In [40]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# import wandb
# from sklearn.metrics import precision_recall_fscore_support, classification_report
# from pythainlp.tokenize import word_tokenize
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer



In [41]:
from sklearn.utils import compute_class_weight
import torch.nn as nn
import os, shutil
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import mean_squared_error, r2_score

def run_exp(out_dir, df, report="none", regressor_configs=None):

    set_random_seed()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # device = torch.device("cpu")
    print("START")
    print("step 1: load data")
    train, val, test = df
    
#     train = train.head(100)
#     val = val.head(100)
#     test = test.head(100)

    print("step 2: load tokenizer")
    model_name = "airesearch/wangchanberta-base-att-spm-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    num_added_toks = tokenizer.add_special_tokens({"additional_special_tokens": ["usr", "sys", "rep"]})

    print("step 3: init data")
    ds = DatasetDict()
    ds['train'] = Dataset.from_pandas(train)
    ds['val'] = Dataset.from_pandas(val)
    ds['test'] = Dataset.from_pandas(test)

    if regressor_configs is None:
        labels = train["label"].unique()
        num_labels = len(labels)
        print(labels)
    
        class_weights = compute_class_weight("balanced", classes=labels, y=train["label"].values)
        class_weights = torch.tensor(class_weights).float().to(device)

        id2label = {i:l for i, l in enumerate(labels)}
        label2id = {l:i for i, l in enumerate(labels)}

        def word_tokenize(d, tokenizer=None, label2id=None, max_length=256):
            texts = [preprocess(t) for t in d["text"]]
    #         print(texts)
            tokens = tokenizer(texts, truncation=True, max_length=max_length)
            num = [len(t) for t in tokens["input_ids"]]
    #         print(num)
    #         print("AVG", len(num), sum(num)/len(num))
            tokens["label"] = [label2id[label] for label in d["label"]]
            return tokens
    else:
#         labels = train["label"].unique()
        num_labels = 1

        id2label = {1: regressor_configs["label"]}
        label2id = [regressor_configs["not_label"], regressor_configs["label"]]

        def word_tokenize(d, tokenizer=None, label2id=None, max_length=256):
            texts = [preprocess(t) for t in d["text"]]
    #         print(texts)
            tokens = tokenizer(texts, truncation=True, max_length=max_length)
            num = [len(t) for t in tokens["input_ids"]]
    #         print(num)
    #         print("AVG", len(num), sum(num)/len(num))
            tokens["label"] = [regressor_configs["label_fn"](label) for label in d["label"]]
            return tokens
        
        
        
    tokenized_ds = ds.map(word_tokenize, batched=True, fn_kwargs={"tokenizer":tokenizer, "label2id": label2id, "max_length":max_length})
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    print("step 4: load model")
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id);
    model.resize_token_embeddings(len(tokenizer))
    model = model.to(device)

    if regressor_configs is None:
        metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return metrics.compute(predictions=predictions, references=labels, average="macro")
    else:
        label_fn = regressor_configs["label_fn"]
        def compute_metrics(eval_pred):           
            predictions, actual = eval_pred
            predictions = predictions.reshape(-1)
            
            predicted_labels = [label_fn(p) for p in predictions]
            actual_labels = [label_fn(p) for p in actual]
            p, r, f1, _ = precision_recall_fscore_support(actual_labels, predicted_labels, average='macro')
            
            return {
                "r2_score": r2_score(actual, predictions),
                "mean_squared_error": np.sqrt(mean_squared_error(actual, predictions)),
                "accuracy": accuracy_score(actual_labels, predicted_labels),
                "f1": f1,
                "precision": p,
                "recall": r,
            }


    print("step 5: fine-tune")
    
    training_args = TrainingArguments(
        output_dir=out_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        report_to=report,
        metric_for_best_model="f1",
        save_total_limit=2,
        load_best_model_at_end=True,
        push_to_hub=False,
        run_name=out_dir,
    )
    
    if regressor_configs is None:
        class CustomTrainer(Trainer):
            def compute_loss(self, model, inputs, return_outputs=False):
                labels = inputs.get("labels")
                # forward pass
                outputs = model(**inputs)

                logits = outputs.get("logits")

                loss_fct = nn.CrossEntropyLoss(weight=class_weights)
                loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
                return (loss, outputs) if return_outputs else loss
    else:
        CustomTrainer = Trainer
        
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["val"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,   
    )

    trainer.train()
    best_ckpt_path = trainer.state.best_model_checkpoint
    print(best_ckpt_path)

    modle_out_path = out_dir+"/best_model"
    if os.path.exists(modle_out_path):
        shutil.rmtree(modle_out_path)
        
    os.rename(best_ckpt_path, modle_out_path)
    best_ckpt_path = modle_out_path
    
    print("step 6: evaluate")
    e = trainer.evaluate(tokenized_ds["test"])
    print(e)

    print("DONE")

## Task1: Train Model

In [44]:
report = "none"
batch_size = 16
max_length = 128
num_epochs = 100

In [45]:
# import os
# stream = os.popen('nohup python3 run_train_task_classifier.py > train2.out &')
# output = stream.read()
# output

In [46]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# df = (df[0].head(), df[1].head(), df[2].head())
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1096 60 60


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,551
2. Know each other,230
3. Don't know each other,435


In [47]:
# df[0]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [48]:
# df[1]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [49]:
# df[2]["label"].value_counts().loc[['1. Close', '2. Know each other', "3. Don't know each other"]].plot.bar()

In [50]:
# def closeness_label_fn(label):
#     if label == '1. Close':
#         return 1
#     elif label =='2. Know each other':
#         return 0.5
#     elif label == "3. Don't know each other":
#         return 0
#     elif type(label)==str:
#         assert(False)
    
#     # [0, 0.33) =>
#     # [0.33, 0.66) =>
#     # [0.66, 1] =>
    
#     if label > 0.66:
#         return '1. Close'
#     elif label > 0.33:
#         return '2. Know each other'
#     else:
#         return "3. Don't know each other"
    
# run_exp("./Regressors/task1_clse_usr100e", df, report=report, regressor_configs={
#     "label": "close",
#     "not_label": "not_close",
#     "label_fn": closeness_label_fn,
# })

In [51]:
# run_exp("./Models/task1_clse_usr100e", df, report=report)

In [52]:
df = get_task1_conver("../Task1/annotated_conersations.jsonl", "authority", skips = ["3. Not respect"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 1234 records from ../Task1/annotated_conersations.jsonl
N 1098 61 61


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0. Very respect,248
1. Respect,289
2. Normal,683


In [56]:
df[0]["label"].unique()

array(['1. Respect', '2. Normal', '0. Very respect'], dtype=object)

In [60]:
# df[0]["label"].value_counts().loc[["0. Very respect", "1. Respect", "2. Normal"]].plot.bar()

In [61]:
# df[1]["label"].value_counts().loc[["0. Very respect", "1. Respect", "2. Normal"]].plot.bar()

In [62]:
# df[2]["label"].value_counts().loc[["0. Very respect", "1. Respect", "2. Normal"]].plot.bar()

In [53]:
def authority_label_fn(label):
    if label == '0. Very respect':
        return 1
    elif label =='1. Respect':
        return 0.5
    elif label == "2. Normal":
        return 0
    elif type(label)==str:
        assert(False)
    
    # [0, 0.33) =>
    # [0.33, 0.66) =>
    # [0.66, 1] =>
    
    if label > 0.66:
        return '0. Very respect'
    elif label > 0.33:
        return '1. Respect'
    else:
        return "2. Normal"
    
run_exp("./Regressors/task1_auth_usr100e", df, report=report, regressor_configs={
    "label": "respect",
    "not_label": "not_respect",
    "label_fn": authority_label_fn,
})

START
step 1: load data
step 2: load tokenizer
step 3: init data


  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

step 4: load model


Some weights of the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased were not used when initializing CamembertForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CamembertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at airesearch/wa

step 5: fine-tune
['labels']


Epoch,Training Loss,Validation Loss,R2 Score,Mean Squared Error,Accuracy,F1,Precision,Recall
1,No log,0.174809,-0.120523,0.418101,0.508197,0.224638,0.178161,0.303922
2,No log,0.172884,-0.108184,0.415793,0.508197,0.227106,0.181287,0.303922
3,No log,0.153108,0.018577,0.391291,0.508197,0.355571,0.328054,0.403268
4,No log,0.150741,0.033749,0.388254,0.409836,0.369243,0.455922,0.417647
5,No log,0.136468,0.12524,0.369416,0.508197,0.390296,0.488733,0.408824
6,No log,0.135456,0.131728,0.368043,0.47541,0.319048,0.292963,0.358824
7,No log,0.142805,0.08462,0.377896,0.442623,0.409457,0.533251,0.393137
8,0.170300,0.148909,0.045498,0.385887,0.52459,0.399441,0.650999,0.418627
9,0.170300,0.13941,0.106385,0.373376,0.508197,0.478658,0.575054,0.462745
10,0.170300,0.169991,-0.089638,0.412299,0.606557,0.394053,0.669872,0.417974


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 61
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to ./Regressors/task1_auth_usr100e/checkpoint-69
Configuration saved in ./Regressors/task1_auth_usr100e/checkpoint-69/config.json
Model weights saved in ./Regressors/task1_auth_usr100e/checkpoint-69/pytorch_model.bin
tokenizer config file saved in ./Regressors/task1_auth_usr100e/checkpoint-69/tokenizer_config.json
Special tokens file saved in ./Regressors/task1_auth_usr100e/checkpoint-69/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: text. If text are 

./Regressors/task1_auth_usr100e/checkpoint-4071
step 6: evaluate


{'eval_loss': 0.17200611531734467, 'eval_r2_score': 0.006928068459504999, 'eval_mean_squared_error': 0.41473624110221863, 'eval_accuracy': 0.5409836065573771, 'eval_f1': 0.4314514151174768, 'eval_precision': 0.5452380952380952, 'eval_recall': 0.43273809523809526, 'eval_runtime': 0.7107, 'eval_samples_per_second': 85.831, 'eval_steps_per_second': 5.628, 'epoch': 100.0}
DONE


In [22]:
# run_exp("./Models/task1_auth_usr100e", df, report=report)

## Task2: Train Model

In [20]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1495 186 186


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,222
2. Know each other,158
3. Don't know each other,1487


In [54]:
run_exp("./Regressors/task2_clse_usr100e", df, report=report, regressor_configs={
    "label": "close",
    "not_label": "not_close",
    "label_fn": closeness_label_fn,
})

NameError: name 'closeness_label_fn' is not defined

In [24]:
# run_exp("./Models/task2_clse_usr100e", df, report=report)

In [25]:
df = get_task2_conver("../Task2/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()

Loaded 2486 records from ../Task2/annotated/annotated.jsonl
N 1876 234 234


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Respect,319
2. Normal,1661
3. Not respect,364


In [26]:
# def authority2_label_fn(label):
#     if label == '1. Respect':
#         return 1
#     elif label =='2. Normal':
#         return 0.5
#     elif label == "3. Not respect":
#         return 0
#     elif type(label)==str:
#         assert(False)
    
#     if label > 0.66:
#         return '1. Respect'
#     elif label > 0.33:
#         return '2. Normal'
#     else:
#         return "3. Not respect"

# run_exp("./Regressors/task2_auth_usr100e", df, report=report, regressor_configs={
#     "label": "respect",
#     "not_label": "not_respect",
#     "label_fn": authority2_label_fn,
# })

In [27]:
# run_exp("./Models/task2_auth_usr100e", df, report=report)

## Task3: Train Model

In [26]:
df = get_task1_conver("../Task3/annotated/annotated.jsonl", "closeness", skips = ["4. Don't like each other"], only_user=True)
# print(df[0]["text"][0])
pd.concat(df).groupby("label").count()


Loaded 1221 records from ../Task3/annotated/annotated.jsonl
N 1090 60 60


Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
1. Close,462
2. Know each other,696
3. Don't know each other,52


In [28]:
# run_exp("./Regressors/task3_clse_usr100e", df, report=report, regressor_configs={
#     "label": "close",
#     "not_label": "not_close",
#     "label_fn": closeness_label_fn,
# })

In [29]:
# run_exp("./Models/task3_clse_usr100e", df, report=report)

In [32]:
# df = get_task1_conver("../Task3/annotated/annotated.jsonl", "authority", skips = [], only_user=True)
# # print(df[0]["text"][0])
# pd.concat(df).groupby("label").count()

In [33]:
# run_exp("./Regressors/task3_auth_usr100e", df, report=report, regressor_configs={
#     "label": "respect",
#     "not_label": "not_respect",
#     "label_fn": authority2_label_fn,
# })

In [None]:
# run_exp("./Models/task3_auth_usr", df, report=report)

In [31]:
"DOOOOOM"

'DOOOOOM'