In [1]:
import os
import json
import shutil
import scrapbook as sb


  from pyarrow import HadoopFileSystem


# Settings

In [2]:
src_dir = "../../src"  # Ignore this!

# Pipeline settings
task = "chinese_word_segmentation"  # chinese_word_segmentation OR target_classification, OR sequence_classification
model = 'CNN_CRF'  # None: Default model 
device = 0
text_prepro = None  # None: Default steps 
model_params = {
    'num_train_epochs': 100
}
train_params = {
    'batch_size': 16
}
eval_params = {
    'batch_size': 32
}
model_dir = f"../output/test_pipeline_{task}_tmp"  # output dir for new model

In [3]:
os.chdir(src_dir)
from pipeline import Pipeline

# Load data

In [4]:

train_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
dev_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
test_raw_data = json.load(open(f"../data/datasets/sample/{task}/train_sample.json", 'r'))
print(train_raw_data[0])

{'docid': 'S_00000235', 'content': '佢指，最大機會係有患者進入街市，經佢嘅糞便或者口水傳播，但究竟點傳播，係因為口水或者係由老鼠將病毒帶到四周圍，暫時未知道。', 'words': ['佢', '指', '，', '最', '大', '機會', '係', '有', '患者', '進入', '街市', '，', '經', '佢', '嘅', '糞便', '或者', '口水', '傳播', '，', '但', '究竟', '點', '傳播', '，', '係', '因為', '口水', '或者', '係', '由', '老鼠', '將', '病毒', '帶', '到', '四周圍', '，', '暫時', '未', '知道', '。'], 'postags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'sent_indexs': [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5], [5, 7], [7, 8], [8, 9], [9, 11], [11, 13], [13, 15], [15, 16], [16, 17], [17, 18], [18, 19], [19, 21], [21, 23], [23, 25], [25, 27], [27, 28], [28, 29], [29, 31], [31, 32], [32, 34], [34, 35], [35, 36], [36, 38], [38, 40], [40, 42], [42, 43], [43, 44], [44, 46], [46, 47], [47, 49], [49, 50], [50, 51], [51, 54], [54, 55], [55, 57], [57, 58], [58, 60], [60, 61]]}


# Run pipeline

## Initialize pipeline

In [5]:

pipeline = Pipeline(
    task=task, 
    model=model, 
    device=device, 
    text_prepro=text_prepro
)

2021-11-26 02:23:35 ***** Model class is specified for chinese_word_segmentation. *****
2021-11-26 02:23:35   Model = CNN_CRF


../config/examples/chinese_word_segmentation/CNN_CRF
['.ipynb_checkpoints', 'run.yaml', 'model', 'result', 'logs']


## Train a new model

In [6]:

pipeline.train(
    model_dir, 
    train_raw_data=train_raw_data, 
    dev_raw_data=dev_raw_data, 
    model_params=model_params,
    train_params=train_params
)

2021-11-26 02:23:36 ***** Initializing pipeline *****
2021-11-26 02:23:36 ***** Loading tokenizer *****
2021-11-26 02:23:36   Tokenizer source = 'char_split'
2021-11-26 02:23:36 ***** Building vocab from dataset *****
2021-11-26 02:23:36   Datasets = ['train']
100%|██████████| 1/1 [00:00<00:00, 187.98it/s]
2021-11-26 02:23:36   Number of infrequency words = 0
2021-11-26 02:23:36   Infrequenct words = 0
2021-11-26 02:23:36   Vocab size = 45
2021-11-26 02:23:36 ***** Initializing model *****
2021-11-26 02:23:36   Task = chinese_word_segmentation
2021-11-26 02:23:36   Model class = CNN_CRF


['word_to_id.json']


2021-11-26 02:23:42 ***** Loading data *****
2021-11-26 02:23:42   Raw data is provided.
32it [00:00, 247.55it/s]
2021-11-26 02:23:43   Loaded samples = 32
2021-11-26 02:23:43 ***** Loading data *****
2021-11-26 02:23:43   Raw data is provided.
32it [00:00, 251.60it/s]
2021-11-26 02:23:43   Loaded samples = 32
2021-11-26 02:23:43 ***** Running training *****
2021-11-26 02:23:43   Num examples = 32
2021-11-26 02:23:43   Num Epochs = 100
2021-11-26 02:23:43   Sampler = 
2021-11-26 02:23:43   Batch size = 16
2021-11-26 02:23:43   Gradient Accumulation steps = 1
Epoch:   0%|          | 0/100 [00:00<?, ?it/s]
Iteration:   0%|          | 0/2 [00:00<?, ?it/s][A
Iteration:   0%|          | 0/2 [00:00<?, ?it/s, tr_loss=80.4][A
Iteration:  50%|█████     | 1/2 [00:00<00:00,  3.17it/s, tr_loss=80.4][A
Iteration:  50%|█████     | 1/2 [00:00<00:00,  3.17it/s, tr_loss=99.4][A
Iteration: 100%|██████████| 2/2 [00:00<00:00,  3.06it/s, tr_loss=99.4][A
2021-11-26 02:23:43 ***** Epoch end: 0 *****
202

## Test

In [7]:

metrics = pipeline.test(
    test_raw_data=test_raw_data,
    eval_params=eval_params
)

2021-11-26 02:25:43 ***** Loading data *****
2021-11-26 02:25:43   Raw data is provided.
32it [00:00, 253.37it/s]
2021-11-26 02:25:43   Loaded samples = 32
2021-11-26 02:25:43 ***** Running evaluation *****
2021-11-26 02:25:43   Num examples = 32
2021-11-26 02:25:43   Batch size = 32
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
2021-11-26 02:25:44   macro_f1 = 0.6531557614360162
2021-11-26 02:25:44   micro_f1 = 0.6531557614360162
2021-11-26 02:25:44   support = 884
2021-11-26 02:25:44   O-precision = 0.6690391459074733
2021-11-26 02:25:44   O-recall = 0.6380090497737556
2021-11-26 02:25:44   O-f1-score = 0.6531557614360162
2021-11-26 02:25:44   O-support = 884
2021-11-26 02:25:44   loss = 15.432387351989746
2021-11-26 02:25:44   dataset = train
2021-11-26 02:25:44 ***** Running evaluation *****
2021-11-26 02:25:44   Num examples = 32
2021-11-26 02:25:44   Batch size = 32
Evaluating: 100%|██████████| 1/1 [00:00<00:00,  3.84it/s]
2021-11-26 02:25:44   macro_f1 = 0.6531557614

## Predict

In [8]:
print("Input:")
print(test_raw_data[0])

output = pipeline.predict(
    data_dict=test_raw_data[0],
)

print("Output:")
print(output)

Input:
{'content': '原來是瑞士製錶品牌Hublot，黑色太陽花還暗藏機關。', 'target_locs': [[9, 15]], 'label': 'neutral'}
Output:
{'prediction_id': 0, 'prediction': 'neutral'}


# Clear output folder

In [1]:
shutil.rmtree(model_dir)

NameError: name 'shutil' is not defined

# Export variables

In [10]:
sb.glue("macro_f1", metrics['macro_f1'])
sb.glue("micro_f1", metrics['micro_f1'])