In [1]:
import os
import json
import shutil
import scrapbook as sb


  from pyarrow import HadoopFileSystem


# Settings

In [4]:
# Location of the project sources, relative to this notebook's folder.
# A later cell changes the working directory to this path.
src_dir = "../../src"

# --- Pipeline settings -------------------------------------------------
# Supported tasks: chinese_word_segmentation, target_classification,
# sequence_classification.
task = "sequence_classification"
model = "BERT_CLS"   # None selects the default model
device = 0
text_prepro = None   # None selects the default preprocessing steps

# Model hyper-parameters handed to pipeline.train().
# Optional overrides (example values; uncomment the ones you need):
#     tokenizer_name="bert-base-chinese",
#     pretrained_lm="bert-base-chinese",
#     embedding_trainable=True,
#     output_hidden_act_func="PReLU",
#     output_hidden_dim=128,
#     output_use_bn=False,
#     optimizer="AdamW",
#     learning_rate=2e-5,
#     weight_decay=0.0,
#     gradient_accumulation_steps=1,
#     adam_epsilon=1e-8,
#     max_grad_norm=1.0,
model_params = dict(
    num_train_epochs=5,
    max_length=256,
)

# Training-loop settings handed to pipeline.train().
train_params = dict(
    batch_size=16,
    seed=42,
    optimization_metric="macro_f1",
    early_stop=None,
)

# Evaluation settings handed to pipeline.test().
eval_params = dict(batch_size=32)

# Output directory for the newly trained model (deleted again by the
# "Clear output folder" cell at the end of the notebook).
model_dir = f"../output/test_pipeline_{task}_tmp"


In [3]:
# Switch the working directory to the project source folder before importing
# the project-local `pipeline` module (presumably the import only resolves
# from there — TODO confirm). The "../data" and "../output" relative paths
# used by other cells are interpreted from this directory once it is set.
# NOTE(review): src_dir is a relative path, so running this cell a second time
# without restarting the kernel changes directory again from the new location.
os.chdir(src_dir)
from pipeline import Pipeline

# Load data

In [4]:

def _load_json(path):
    """Read a JSON file and return the decoded object.

    The file is opened inside a ``with`` block so the handle is closed as
    soon as parsing finishes, and it is decoded explicitly as UTF-8 so the
    non-ASCII sample text loads the same way regardless of the platform's
    default locale encoding.

    Args:
        path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


# NOTE(review): the dev and test splits read the same train_sample.json file
# as the train split. That looks deliberate for a smoke test on the sample
# dataset, but confirm before using the resulting metrics for anything else.
# Each split is loaded separately so the three variables hold distinct objects.
train_raw_data = _load_json(f"../data/datasets/sample/{task}/train_sample.json")
dev_raw_data = _load_json(f"../data/datasets/sample/{task}/train_sample.json")
test_raw_data = _load_json(f"../data/datasets/sample/{task}/train_sample.json")
print(train_raw_data[0])

{'content': '<p> zaia係咩, 我都想睇水舞間, 朋友睇完都話好睇!', 'label': 1}


# Run pipeline

## Initialize pipeline

In [5]:

# Create a pipeline from the notebook settings: task, model class, device
# and text preprocessing are all taken from the Settings cell.
pipeline = Pipeline(task=task, model=model, device=device, text_prepro=text_prepro)

2021-11-29 02:22:22 ***** Model class is specified for sequence_classification. *****
2021-11-29 02:22:22   Model = BERT_CLS


../config/examples/sequence_classification/BERT_CLS
['.ipynb_checkpoints', 'model', 'result', 'logs', 'run.yaml']


## Train a new model

In [6]:

# Train a new model, writing it to model_dir. The train and dev splits and
# both parameter dictionaries are passed through to the pipeline as-is.
pipeline.train(
    model_dir,
    model_params=model_params,
    train_params=train_params,
    train_raw_data=train_raw_data,
    dev_raw_data=dev_raw_data,
)

2021-11-29 02:22:23 ***** Initializing pipeline *****
2021-11-29 02:22:23 ***** Loading tokenizer *****
2021-11-29 02:22:23   Tokenizer source = 'transformers'
2021-11-29 02:22:23 ***** Initializing model *****
2021-11-29 02:22:23   Task = sequence_classification
2021-11-29 02:22:23   Model class = BERT_CLS
2021-11-29 02:22:23   Model path = ../output/test_pipeline_sequence_classification_tmp/model/model.pt


['tokenizer', 'label_to_id.json', 'model.pt', 'run.yaml', 'model.yaml']


2021-11-29 02:22:32 ***** Loading data *****
2021-11-29 02:22:32   Raw data is provided.
3it [00:00, 86.38it/s]
2021-11-29 02:22:32   Loaded samples = 3
2021-11-29 02:22:32 ***** Loading data *****
2021-11-29 02:22:32   Raw data is provided.
3it [00:00, 131.16it/s]
2021-11-29 02:22:32   Loaded samples = 3
2021-11-29 02:22:32 ***** Running training *****
2021-11-29 02:22:32   Num examples = 3
2021-11-29 02:22:32   Num Epochs = 5
2021-11-29 02:22:32   Sampler = 
2021-11-29 02:22:32   Batch size = 16
2021-11-29 02:22:32   Gradient Accumulation steps = 1
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration:   0%|          | 0/1 [00:00<?, ?it/s, tr_loss=0.619][A
Iteration: 100%|██████████| 1/1 [00:00<00:00,  7.65it/s, tr_loss=0.619][A
2021-11-29 02:22:32 ***** Epoch end: 0 *****
2021-11-29 02:22:32 ***** Running evaluation *****
2021-11-29 02:22:32   Num examples = 3
2021-11-29 02:22:32   Batch size = 64

Evaluating: 100%|██████████| 1

## Test

In [8]:

# Evaluate on the test split. The returned `metrics` object is indexed by
# metric name in the "Export variables" cell.
metrics = pipeline.test(eval_params=eval_params, test_raw_data=test_raw_data)

2021-11-30 02:57:45 ***** Loading data *****
2021-11-30 02:57:45   Raw data is provided.
1821it [00:07, 237.49it/s]
2021-11-30 02:57:53   Loaded samples = 1821
2021-11-30 02:57:53 ***** Running evaluation *****
2021-11-30 02:57:53   Num examples = 1821
2021-11-30 02:57:53   Batch size = 32
Evaluating: 100%|██████████| 57/57 [00:09<00:00,  5.97it/s]
2021-11-30 02:58:02   accuracy = 0.9104887424492037
2021-11-30 02:58:02   macro_f1 = 0.9104212080067818
2021-11-30 02:58:02   micro_f1 = 0.9104171559402365
2021-11-30 02:58:02   support = 1821
2021-11-30 02:58:02   0-precision = 0.9359720605355064
2021-11-30 02:58:02   0-recall = 0.881578947368421
2021-11-30 02:58:02   0-f1-score = 0.9079616036137775
2021-11-30 02:58:02   0-support = 912
2021-11-30 02:58:02   1-precision = 0.8877338877338877
2021-11-30 02:58:02   1-recall = 0.9394939493949395
2021-11-30 02:58:02   1-f1-score = 0.9128808123997861
2021-11-30 02:58:02   1-support = 909
2021-11-30 02:58:02   loss = 0.2819006933520238
2021-11-30 

## Load and Predict

In [7]:

# Rebuild the pipeline from the saved model directory instead of from a
# task/model specification, then run a prediction on the first test record.
pipeline = Pipeline(model_dir=model_dir, device=device)

# Label on one line, value on the next.
print("Input:", test_raw_data[0], sep="\n")

output = pipeline.predict(data_dict=test_raw_data[0])

print("Output:", output, sep="\n")

2021-11-30 02:57:33 ***** Existing model is provided. *****
2021-11-30 02:57:33   Model directory = ../output/explainable_ai_paper/bert_cls_sst
2021-11-30 02:57:33 ***** Initializing pipeline *****
2021-11-30 02:57:33 ***** Loading tokenizer *****
2021-11-30 02:57:33   Tokenizer source = 'transformers'
2021-11-30 02:57:33 ***** Initializing model *****
2021-11-30 02:57:33   Task = sequence_classification
2021-11-30 02:57:33   Model class = BERT_CLS
2021-11-30 02:57:33   Model path = ../output/explainable_ai_paper/bert_cls_sst/model/model.pt


../output/explainable_ai_paper/bert_cls_sst/model
['run.yaml', 'model.yaml', 'tokenizer', 'label_to_id.json', 'model.pt']
['run.yaml', 'model.yaml', 'tokenizer', 'label_to_id.json', 'model.pt']
Input:
{'content': 'if you sometimes like to go to the movies to have fun wasabi is a good place to start', 'label': 1}
Output:
{'prediction_id': 0, 'prediction': '1'}


## Explain

In [None]:
# from captum.attr import visualization as viz
# import numpy as np


# def visualize_data_record_bert(pipeline, raw_data):
    
#     true_class = raw_data['label']

#     tokens, scores, attr_target, attr_target_prob = pipeline.explain(
#         data_dict=raw_data,
#         method='IntegratedGradients',
#         layer='pretrained_model.embeddings.word_embeddings', 
#         norm='sum'
#     )
    
#     attr_class = pipeline.args.label_to_id_inv[attr_target]
#     start_position_vis = viz.VisualizationDataRecord(
#                             scores,
#                             pred_prob=attr_target_prob,
#                             pred_class=attr_class,
#                             true_class=true_class,
#                             attr_class=attr_class,
#                             attr_score=np.sum(scores),       
#                             raw_input=tokens,
#                             convergence_score=None)
#     viz.visualize_text([start_position_vis])

# raw_data = test_raw_data[1]
# visualize_data_record_bert(pipeline, raw_data)

# Clear output folder

In [1]:
# Delete the temporary output directory that was passed to pipeline.train().
# The existence check keeps this cell re-runnable: shutil.rmtree raises when
# the directory is missing (e.g. training was skipped or the cell already ran).
# Requires the imports cell (os, shutil) and the Settings cell (model_dir) to
# have been executed in the current kernel.
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)

NameError: name 'shutil' is not defined

# Export variables

In [10]:
# Record the headline test metrics with scrapbook, each under its own name.
for metric_name in ("macro_f1", "micro_f1"):
    sb.glue(metric_name, metrics[metric_name])