In [1]:
import html
import json
import os
import shutil

import scrapbook as sb
from pyarrow import HadoopFileSystem


# Settings

In [2]:
# Relative location of the project sources; the next cell changes the working
# directory to it. Ignore this!
src_dir = "../../src"

# ---------------------------------------------------------------------------
# Pipeline settings
# ---------------------------------------------------------------------------
# Supported tasks: chinese_word_segmentation, target_classification,
# sequence_classification.
task = "sequence_classification"
model = 'BERT_CLS'    # None: use the default model
device = 0
text_prepro = None    # None: use the default preprocessing steps

# Model hyper-parameters passed to pipeline.train(). The commented-out entries
# are optional overrides kept here for reference.
model_params = dict(
    num_train_epochs=5,
    max_length=256,
    tokenizer_name="bert-base-cased",
    pretrained_lm="bert-base-cased",
    # embedding_trainable=True,
    # output_hidden_act_func="PReLU",
    # output_hidden_dim=128,
    # output_use_bn=False,
    # optimizer="AdamW",
    # learning_rate=2e-5,
    # weight_decay=0.0,
    # gradient_accumulation_steps=1,
    # adam_epsilon=1e-8,
    # max_grad_norm=1.0,
)

# Training-loop settings passed to pipeline.train().
train_params = dict(
    batch_size=16,
    seed=42,
    optimization_metric="macro_f1",
    early_stop=None,
)

# Evaluation settings passed to pipeline.test().
eval_params = dict(batch_size=32)

# Output directory for a newly trained model (derived from the task name).
model_dir = f"../output/test_pipeline_{task}_sst"

In [3]:
# Move the working directory to the source tree before importing: the relative
# "../..." paths used by the following cells are resolved from here, and this
# presumably is what makes the project-local `pipeline` module importable.
# NOTE(review): src_dir is a relative path, so this cell is not idempotent --
# running it a second time moves the working directory again. Restart the kernel
# before re-running.
os.chdir(src_dir)
from pipeline import Pipeline

# Load data

In [4]:

# train_raw_data = json.load(open(f"../data/datasets/public/{task}/sst/train.json", 'r'))
# dev_raw_data = json.load(open(f"../data/datasets/public/{task}/sst/test.json", 'r'))
# test_raw_data = json.load(open(f"../data/datasets/public/{task}/sst/test.json", 'r'))
# print(train_raw_data[0])

{'content': 'the rock is destined to be the 21st century s new conan and that he s going to make a splash even greater than arnold schwarzenegger jean claud van damme or steven segal', 'label': 1}


In [4]:

def _read_json(path):
    """Parse one JSON file and return its content.

    The file is opened in a ``with`` block so the handle is closed right after
    parsing, and it is decoded explicitly as UTF-8 so that the Chinese text loads
    identically regardless of the platform's default encoding.
    """
    with open(path, 'r', encoding='utf-8') as fp:
        return json.load(fp)


train_raw_data = _read_json("../data/datasets/internal/sequence_classification/post_sentiment/train.json")
# NOTE(review): the dev split and the test split are read from the same test.json,
# so any model selection done on the dev split also sees the test examples --
# confirm that this is intended.
dev_raw_data = _read_json("../data/datasets/internal/sequence_classification/post_sentiment/test.json")
test_raw_data = _read_json("../data/datasets/internal/sequence_classification/post_sentiment/test.json")
print(train_raw_data[0])

{'content': '德國政府證實,在大衆旗下豪華汽車品牌奧迪的多款柴油車上發現專門應付尾氣排放檢測的作弊軟件,包括奧迪a8。『大衆旗下奧迪a8首曝"排放門"_手機新浪網』http://t.cn/rsatfqj', 'label': -1}


# Extra settings

In [6]:
# model_dir = f"../output/explainable_ai_paper/bert_cls_sst"

In [7]:
# model_dir = f"/ailab/shared/Users/quincy/canton-target-sentiment/output/explainable_ai_paper/bert_cls_sst"

In [10]:
# Overrides the model_dir chosen in the settings cell: the cells below pass this
# directory to pipeline.train(...) and Pipeline(model_dir=...).
# (Plain string: the previous f-string had no placeholders.)
model_dir = "../output/knowledge_distillation/teacher_model_cls"

# Run pipeline

## Initialize pipeline

In [7]:

# Build a pipeline from the task / model / device / preprocessing choices made in
# the settings cell.
pipeline = Pipeline(task=task, model=model, device=device, text_prepro=text_prepro)

2021-12-01 01:38:48 ***** Model class is specified for sequence_classification. *****
2021-12-01 01:38:48   Model = BERT_CLS


../config/examples/sequence_classification/BERT_CLS
['.ipynb_checkpoints', 'run.yaml', 'model', 'result', 'logs']


## Train a new model

In [12]:

# Inputs for the training run: both raw data splits plus the hyper-parameter
# dictionaries defined in the settings cell.
train_inputs = dict(
    train_raw_data=train_raw_data,
    dev_raw_data=dev_raw_data,
    model_params=model_params,
    train_params=train_params,
)

# Train a new model; model_dir is passed as the first positional argument.
pipeline.train(model_dir, **train_inputs)

2021-11-30 01:58:03 ***** Initializing pipeline *****
2021-11-30 01:58:03 ***** Loading tokenizer *****
2021-11-30 01:58:03   Tokenizer source = 'transformers'
2021-11-30 01:58:06 ***** Initializing model *****
2021-11-30 01:58:06   Task = sequence_classification
2021-11-30 01:58:06   Model class = BERT_CLS


['tokenizer']


2021-11-30 01:58:07 ***** Loading pretrained language model *****
2021-11-30 01:58:07   Pretrained BERT = 'bert-base-cased'
2021-11-30 01:58:19 ***** Loading data *****
2021-11-30 01:58:19   Raw data is provided.
6920it [00:30, 230.15it/s]
2021-11-30 01:58:49   Loaded samples = 6920
2021-11-30 01:58:49 ***** Loading data *****
2021-11-30 01:58:49   Raw data is provided.
1821it [00:07, 229.67it/s]
2021-11-30 01:58:57   Loaded samples = 1821
2021-11-30 01:58:57 ***** Running training *****
2021-11-30 01:58:57   Num examples = 6920
2021-11-30 01:58:57   Num Epochs = 5
2021-11-30 01:58:57   Sampler = 
2021-11-30 01:58:57   Batch size = 16
2021-11-30 01:58:57   Gradient Accumulation steps = 1
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/433 [00:00<?, ?it/s][A
Iteration:   0%|          | 0/433 [00:00<?, ?it/s, tr_loss=0.671][A
Iteration:   0%|          | 1/433 [00:00<02:13,  3.24it/s, tr_loss=0.671][A
Iteration:   0%|          | 1/433 [00:00<02:13,  3.24it/s,

## Test

In [13]:

# Evaluate on the test split. The returned metrics are read again by the export
# cell at the end of the notebook (macro_f1 / micro_f1).
metrics = pipeline.test(test_raw_data=test_raw_data, eval_params=eval_params)

2021-11-30 02:10:28 ***** Loading data *****
2021-11-30 02:10:28   Raw data is provided.
1821it [00:07, 233.53it/s]
2021-11-30 02:10:35   Loaded samples = 1821
2021-11-30 02:10:35 ***** Running evaluation *****
2021-11-30 02:10:35   Num examples = 6920
2021-11-30 02:10:35   Batch size = 32
Evaluating: 100%|██████████| 217/217 [00:36<00:00,  5.96it/s]
2021-11-30 02:11:12   accuracy = 0.9992774566473989
2021-11-30 02:11:12   macro_f1 = 0.9992761232323626
2021-11-30 02:11:12   micro_f1 = 0.9992774701162376
2021-11-30 02:11:12   support = 6920
2021-11-30 02:11:12   0-precision = 0.9987926350739511
2021-11-30 02:11:12   0-recall = 0.9996978851963746
2021-11-30 02:11:12   0-f1-score = 0.9992450551109768
2021-11-30 02:11:12   0-support = 3310
2021-11-30 02:11:12   1-precision = 0.9997227612974772
2021-11-30 02:11:12   1-recall = 0.9988919667590028
2021-11-30 02:11:12   1-f1-score = 0.9993071913537482
2021-11-30 02:11:12   1-support = 3610
2021-11-30 02:11:12   loss = 0.003224152470788648
2021

## Load and Predict

In [11]:

# Re-create the pipeline from a saved model directory (no task / model arguments).
pipeline = Pipeline(model_dir=model_dir, device=device)

# Predict on the first test example and echo both the input and the output.
first_example = test_raw_data[0]
print("Input:", first_example, sep="\n")

output = pipeline.predict(data_dict=first_example)

print("Output:", output, sep="\n")

2021-12-02 02:01:50 ***** Existing model is provided. *****
2021-12-02 02:01:50   Model directory = ../output/knowledge_distillation/teacher_model_cls
2021-12-02 02:01:50 ***** Initializing pipeline *****
2021-12-02 02:01:50 ***** Loading tokenizer *****
2021-12-02 02:01:50   Tokenizer source = 'transformers'
2021-12-02 02:01:50 ***** Initializing model *****
2021-12-02 02:01:50   Task = sequence_classification
2021-12-02 02:01:50   Model class = BERT_CLS
2021-12-02 02:01:50   Model path = ../output/knowledge_distillation/teacher_model_cls/model/model.pt


../output/knowledge_distillation/teacher_model_cls/model
['tokenizer', 'label_to_id.json', 'run.yaml', 'model.yaml', 'model.pt']
['tokenizer', 'label_to_id.json', 'run.yaml', 'model.yaml', 'model.pt']
Input:
{'content': '# 现在的选手很少有这种强悍气势啊', 'label': '1'}
Output:
{'prediction_id': 1, 'prediction': '1'}


## Explain model predictions

In [7]:
from IPython.core.display import HTML, display
import numpy as np


def show_text_attr(scores, show_top_n=None, reverse_color=False):
    """Display tokens in the notebook, highlighted by their attribution score.

    Every token is wrapped in a ``<mark>`` element whose background is red or
    green depending on the sign of its score; tokens with a score of exactly 0
    get a fully transparent background.

    Args:
        scores: Sequence of ``(token, score)`` pairs.
        show_top_n: If not None, the first and the last pair are dropped
            (presumably the [CLS] / [SEP] special tokens -- TODO confirm) and
            only the ``show_top_n`` highest-scoring remaining tokens keep their
            score; all others are reset to 0 and therefore not highlighted.
        reverse_color: False (default) colours negative scores red and all other
            scores green. True colours positive scores red and all other scores
            green.

    Returns:
        None. The markup is rendered with ``display(HTML(...))``.
    """
    if show_top_n is not None:
        # Copy into a fresh list: entries are reassigned below, which must work
        # for tuple input too and must never touch the caller's sequence.
        scores = list(scores[1:len(scores) - 1])
        arr = np.array([x[1] for x in scores])
        idxs = arr.argsort()[-1::-1]  # indices ordered from highest to lowest score
        for i in idxs[show_top_n:]:
            scores[i] = (scores[i][0], 0)

    def rgb(value):
        """Return the 'r,g,b' colour string for one score."""
        is_red = value > 0 if reverse_color else value < 0
        return '255,0,0' if is_red else '0,255,0'

    def alpha(value):
        """Return the opacity for one score: zero scores are transparent."""
        return 0 if value == 0 else 0.5

    # Tokens come from raw text and may contain '<', '>' or '&'; escape them so
    # they are shown literally instead of being interpreted as markup.
    token_marks = [
        f'<mark style="background-color:rgba({rgb(attr)},{alpha(attr)})">'
        f'{html.escape(str(token))}</mark>'
        for token, attr in scores
    ]

    display(HTML('<p>' + ' '.join(token_marks) + '</p>'))

### Positive keywords

In [85]:
import time
import random

# t0 = time.time()
# Shuffle the test examples in place (other cells read test_raw_data by index).
# A dedicated, seeded generator makes the order reproducible from a fresh kernel
# without touching the global random state.
random.Random(42).shuffle(test_raw_data)

# Show attributions for up to 25 examples predicted as '1'.
# The loop walks the test set at most once, so it ends cleanly when fewer than
# 25 such predictions exist instead of indexing past the end of the list.
cnt = 0
for sample in test_raw_data:
    if cnt >= 25:
        break

    output = pipeline.predict(
        data_dict=sample,
    )
    pred = output['prediction']
    if pred == '1':
        tokens, scores, attr_target, attr_target_prob = pipeline.explain(
            data_dict=sample,
            method='IntegratedGradients',
            layer='pretrained_model.embeddings.word_embeddings',
            norm=None
        )
        # Highlight only the three highest-scoring tokens.
        show_text_attr(scores, show_top_n=3)
        cnt += 1
# print(scores)
# print(time.time() - t0)

### Negative keywords

In [89]:

# Show attributions for up to 25 examples predicted as '0'.
# NOTE(review): confirm that '0' is the negative class of the loaded dataset.
# The loop walks the test set at most once, so it ends cleanly when fewer than
# 25 such predictions exist instead of indexing past the end of the list.
cnt = 0
for sample in test_raw_data:
    if cnt >= 25:
        break

    output = pipeline.predict(
        data_dict=sample,
    )
    pred = output['prediction']
    if pred == '0':
        tokens, scores, attr_target, attr_target_prob = pipeline.explain(
            data_dict=sample,
            method='IntegratedGradients',
            layer='pretrained_model.embeddings.word_embeddings',
            norm=None
        )
        # reverse_color=True: positive scores are drawn in red for this class.
        show_text_attr(scores, show_top_n=3, reverse_color=True)
        cnt += 1


### VisualizationDataRecord

In [12]:
from captum.attr import visualization as viz
import numpy as np

In [13]:


def visualize_data_record_bert(pipeline, raw_data):
    """Explain one example and render it with Captum's text visualisation.

    Args:
        pipeline: Object providing ``explain(...)`` and ``args.label_to_id_inv``.
        raw_data: Mapping for a single example; its ``'label'`` entry is shown as
            the true class and the whole mapping is passed to ``pipeline.explain``.

    Returns:
        None. The record is rendered through ``viz.visualize_text``.
    """
    true_class = raw_data['label']

    explanation = pipeline.explain(
        data_dict=raw_data,
        method='IntegratedGradients',
        layer='pretrained_model.embeddings.word_embeddings',
        norm='sum',
    )
    tokens, scores, attr_target, attr_target_prob = explanation

    # The attribution target is looked up in the pipeline's label_to_id_inv mapping;
    # the same value is reported as both the predicted and the attributed class.
    attr_class = pipeline.args.label_to_id_inv[attr_target]

    record_fields = dict(
        pred_prob=attr_target_prob,
        pred_class=attr_class,
        true_class=true_class,
        attr_class=attr_class,
        attr_score=np.sum(scores),  # total attribution over all tokens
        raw_input=tokens,
        convergence_score=None,
    )
    record = viz.VisualizationDataRecord(scores, **record_fields)
    viz.visualize_text([record])

# Visualise the attribution for the second entry of test_raw_data.
# NOTE(review): the keyword cell shuffles test_raw_data in place, so which example
# this is depends on whether (and how often) that cell ran before this one.
raw_data = test_raw_data[1]
visualize_data_record_bert(pipeline, raw_data)

True Label,Predicted Label,Attribution Label,Attribution Score,Word Importance
0.0,1 (0.94),1.0,1.46,"[CLS] 三 钢 闽 光 最 佳 买 、 卖 点 位 得 出 , 战 友 们 不 要 被 主 力 阴 谋 所 迷 惑 [SEP]"
,,,,


In [16]:
!pip install ailab

Collecting ailab
  Downloading ailab-20210210-py2.py3-none-any.whl (10 kB)
Collecting cefpython3==66.0
  Downloading cefpython3-66.0-py2.py3-none-manylinux1_x86_64.whl (79.6 MB)
[K     |████████████████████████████████| 79.6 MB 4.0 MB/s eta 0:00:014K     |▌                               | 1.3 MB 12.0 MB/s eta 0:00:07
[?25hCollecting entangle-python
  Downloading entangle_python-20201010.2-py2.py3-none-any.whl (6.8 kB)
Collecting autobahn[twisted]
  Downloading autobahn-21.11.1.tar.gz (365 kB)
[K     |████████████████████████████████| 365 kB 9.5 MB/s eta 0:00:01
Collecting bcrypt
  Downloading bcrypt-3.2.0-cp36-abi3-manylinux2010_x86_64.whl (63 kB)
[K     |████████████████████████████████| 63 kB 4.1 MB/s  eta 0:00:01
Collecting cryptography>=3.4.6
  Downloading cryptography-36.0.0-cp36-abi3-manylinux_2_24_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 14.4 MB/s eta 0:00:01
Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
Collecting hyperlink>=21.0.

In [19]:
!pip install ltp

Collecting ltp
  Downloading ltp-4.1.5.post2-py3-none-any.whl (94 kB)
[K     |████████████████████████████████| 94 kB 3.4 MB/s  eta 0:00:01
Collecting pygtrie<2.5,>=2.3.0
  Downloading pygtrie-2.4.2.tar.gz (35 kB)
Collecting transformers<=4.7.0,>=4.0.0
  Downloading transformers-4.7.0-py3-none-any.whl (2.5 MB)
[K     |████████████████████████████████| 2.5 MB 86.2 MB/s eta 0:00:01
Collecting huggingface-hub==0.0.8
  Downloading huggingface_hub-0.0.8-py3-none-any.whl (34 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 57.1 MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: pygtrie
  Building wheel for pygtrie (setup.py) ... [?25ldone
[?25h  Created wheel for pygtrie: filename=pygtrie-2.4.2-py3-none-any.whl size=19063 sha256=9d0d4e6eb22b423dadd79833bb944c316a32f4f979ad5c9dc674b0dcff412234
  Store

In [18]:
# from ailabuap.tokenizer import LTPTokenizer

# ltp_tokenizer = LTPTokenizer()

ModuleNotFoundError: No module named 'ailab.tokenizer'

In [20]:
# Smoke test of the LTP library: segment a single Chinese sentence.
from ltp import LTP

ltp = LTP()  # loads the Small model by default
seg, hidden = ltp.seg(["他叫汤姆去拿外衣。"])

HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=164437832.0), HTML(value='')))

ChunkedEncodingError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))

# Clear output folder

In [None]:
# Delete the model directory and everything below it.
# NOTE(review): model_dir is reassigned in the "Extra settings" section to the
# knowledge-distillation teacher model directory, which the "Load and Predict"
# cell loads as an existing model -- make sure this is really the folder you
# want to remove before running this cell.
# The existence check keeps the cell from raising when the directory is already
# gone (for example when training was skipped or the cell is run twice).
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)

# Export variables

In [10]:
# Record the headline F1 scores with scrapbook under their metric names.
for metric_name in ("macro_f1", "micro_f1"):
    sb.glue(metric_name, metrics[metric_name])