In [1]:
import os
import json
import shutil
import scrapbook as sb


  from pyarrow import HadoopFileSystem


# Settings

In [2]:
src_dir = "../../src"  # Ignore this!

# Pipeline settings
task = "sequence_classification"  # chinese_word_segmentation OR target_classification, OR sequence_classification
model = 'BERT_CLS'  # None: Default model 
device = 0
text_prepro = None  # None: Default steps 

model_params = {
    'num_train_epochs': 5, 
    'max_length': 256,
#     'tokenizer_name': "bert-base-chinese" 
#     'pretrained_lm': "bert-base-chinese" 
#     'embedding_trainable': True, 
#     'output_hidden_act_func': "PReLU", 
#     'output_hidden_dim': 128, 
#     'output_use_bn': False, 
#     'optimizer': "AdamW",
#     'learning_rate': 2e-5,
#     'weight_decay': 0.0,
#     'gradient_accumulation_steps': 1,
#     'adam_epsilon': 1e-8,
#     'max_grad_norm': 1.0,
}

train_params = {
    'batch_size': 16, 
    'seed': 42, 
    'optimization_metric': "macro_f1", 
    'early_stop': None
}

eval_params = {
    'batch_size': 32
}

model_dir = f"../output/test_pipeline_{task}_tmp"  # output dir for new model

In [3]:
os.chdir(src_dir)
from pipeline import Pipeline

# Load data

In [4]:

train_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
dev_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
test_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
print(train_raw_data[0])

{'content': '<p> zaia係咩, 我都想睇水舞間, 朋友睇完都話好睇!', 'label': 1}


# Run pipeline

## Initialize pipeline

In [5]:

pipeline = Pipeline(
    task=task, 
    model=model, 
    device=device, 
    text_prepro=text_prepro
)

2021-11-29 02:22:22 ***** Model class is specified for sequence_classification. *****
2021-11-29 02:22:22   Model = BERT_CLS


../config/examples/sequence_classification/BERT_CLS
['.ipynb_checkpoints', 'model', 'result', 'logs', 'run.yaml']


## Train a new model

In [6]:

pipeline.train(
    model_dir, 
    train_raw_data=train_raw_data, 
    dev_raw_data=dev_raw_data, 
    model_params=model_params,
    train_params=train_params
)

2021-11-29 02:22:23 ***** Initializing pipeline *****
2021-11-29 02:22:23 ***** Loading tokenizer *****
2021-11-29 02:22:23   Tokenizer source = 'transformers'
2021-11-29 02:22:23 ***** Initializing model *****
2021-11-29 02:22:23   Task = sequence_classification
2021-11-29 02:22:23   Model class = BERT_CLS
2021-11-29 02:22:23   Model path = ../output/test_pipeline_sequence_classification_tmp/model/model.pt


['tokenizer', 'label_to_id.json', 'model.pt', 'run.yaml', 'model.yaml']


2021-11-29 02:22:32 ***** Loading data *****
2021-11-29 02:22:32   Raw data is provided.
3it [00:00, 86.38it/s]
2021-11-29 02:22:32   Loaded samples = 3
2021-11-29 02:22:32 ***** Loading data *****
2021-11-29 02:22:32   Raw data is provided.
3it [00:00, 131.16it/s]
2021-11-29 02:22:32   Loaded samples = 3
2021-11-29 02:22:32 ***** Running training *****
2021-11-29 02:22:32   Num examples = 3
2021-11-29 02:22:32   Num Epochs = 5
2021-11-29 02:22:32   Sampler = 
2021-11-29 02:22:32   Batch size = 16
2021-11-29 02:22:32   Gradient Accumulation steps = 1
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration:   0%|          | 0/1 [00:00<?, ?it/s, tr_loss=0.619][A
Iteration: 100%|██████████| 1/1 [00:00<00:00,  7.65it/s, tr_loss=0.619][A
2021-11-29 02:22:32 ***** Epoch end: 0 *****
2021-11-29 02:22:32 ***** Running evaluation *****
2021-11-29 02:22:32   Num examples = 3
2021-11-29 02:22:32   Batch size = 64

Evaluating: 100%|██████████| 1

## Test

In [7]:

metrics = pipeline.test(
    test_raw_data=test_raw_data,
    eval_params=eval_params
)

2021-11-29 02:22:33 ***** Loading data *****
2021-11-29 02:22:33   Raw data is provided.
3it [00:00, 117.46it/s]
2021-11-29 02:22:33   Loaded samples = 3
2021-11-29 02:22:33 ***** Running evaluation *****
2021-11-29 02:22:33   Num examples = 3
2021-11-29 02:22:33   Batch size = 32
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 46.04it/s]
2021-11-29 02:22:33   accuracy = 1.0
2021-11-29 02:22:33   macro_f1 = 1.0
2021-11-29 02:22:33   micro_f1 = 1.0
2021-11-29 02:22:33   support = 3
2021-11-29 02:22:33   -1-precision = 1.0
2021-11-29 02:22:33   -1-recall = 1.0
2021-11-29 02:22:33   -1-f1-score = 1.0
2021-11-29 02:22:33   -1-support = 1
2021-11-29 02:22:33   0-precision = 1.0
2021-11-29 02:22:33   0-recall = 1.0
2021-11-29 02:22:33   0-f1-score = 1.0
2021-11-29 02:22:33   0-support = 1
2021-11-29 02:22:33   1-precision = 1.0
2021-11-29 02:22:33   1-recall = 1.0
2021-11-29 02:22:33   1-f1-score = 1.0
2021-11-29 02:22:33   1-support = 1
2021-11-29 02:22:33   loss = 0.49821820855140686
2021-1

## Load and Predict

In [5]:

pipeline = Pipeline(
    model_dir=model_dir, 
    device=device, 
)

print("Input:")
print(test_raw_data[0])

output = pipeline.predict(
    data_dict=test_raw_data[0],
)

print("Output:")
print(output)

2021-11-29 02:50:57 ***** Existing model is provided. *****
2021-11-29 02:50:57   Model directory = ../output/test_pipeline_sequence_classification_tmp
2021-11-29 02:50:57 ***** Initializing pipeline *****
2021-11-29 02:50:57 ***** Loading tokenizer *****
2021-11-29 02:50:57   Tokenizer source = 'transformers'
2021-11-29 02:50:57 ***** Initializing model *****
2021-11-29 02:50:57   Task = sequence_classification
2021-11-29 02:50:57   Model class = BERT_CLS
2021-11-29 02:50:57   Model path = ../output/test_pipeline_sequence_classification_tmp/model/model.pt


../output/test_pipeline_sequence_classification_tmp/model
['tokenizer', 'label_to_id.json', 'model.pt', 'run.yaml', 'model.yaml']
['tokenizer', 'label_to_id.json', 'model.pt', 'run.yaml', 'model.yaml']
Input:
{'content': '<p> zaia係咩, 我都想睇水舞間, 朋友睇完都話好睇!', 'label': 1}
Output:
{'prediction_id': 0, 'prediction': '1'}


## Explain

In [None]:
import time

t0 = time.time()

scores = pipeline.explain(
    data_dict=test_raw_data[0],
    method='IntegratedGradients',
    layer='pretrained_model.embeddings.word_embeddings', 
    norm='l2'
)

# print(scores)
print(time.time() - t0)

In [7]:
scores

[('[CLS]', 0.02911902405321598),
 ('<', 0.020030846819281578),
 ('p', 0.0189193207770586),
 ('>', 0.018668590113520622),
 ('z', 0.014856795780360699),
 ('##ai', 0.015659550204873085),
 ('##a', 0.015582692809402943),
 ('系', 0.016651133075356483),
 ('咩', 0.024479681625962257),
 (',', 0.016806714236736298),
 ('我', 0.01255938783288002),
 ('都', 0.012607353739440441),
 ('想', 0.016966380178928375),
 ('睇', 0.018306516110897064),
 ('水', 0.017799602821469307),
 ('舞', 0.021416891366243362),
 ('间', 0.01682175137102604),
 (',', 0.01767466403543949),
 ('朋', 0.011837850324809551),
 ('友', 0.011017942801117897),
 ('睇', 0.01700235903263092),
 ('完', 0.015345368534326553),
 ('都', 0.01331076119095087),
 ('话', 0.013242393732070923),
 ('好', 0.012461753562092781),
 ('睇', 0.021983789280056953),
 ('!', 0.026536865159869194),
 ('[SEP]', 0.0717211440205574)]

In [11]:
from IPython.core.display import HTML, display

def show_text_attr(scores):
    rgb = lambda x: '255,0,0' if x < 0 else '0,255,0'
    alpha = lambda x: abs(x) ** 0.5
    token_marks = [
        f'<mark style="background-color:rgba({rgb(attr)},{alpha(attr)})">{token}</mark>'
        for token, attr in scores
    ]
    
    display(HTML('<p>' + ' '.join(token_marks) + '</p>'))

In [12]:
show_text_attr(scores)

# Clear output folder

In [1]:
shutil.rmtree(model_dir)

NameError: name 'shutil' is not defined

# Export variables

In [10]:
sb.glue("macro_f1", metrics['macro_f1'])
sb.glue("micro_f1", metrics['micro_f1'])