In [1]:
# Standard library
import json
import os
import shutil

# Third-party
import scrapbook as sb
# NOTE(review): HadoopFileSystem is not referenced in the visible cells; it is
# presumably imported for a side effect -- confirm before removing. The line
# was previously indented, which raises IndentationError in a fresh kernel.
from pyarrow import HadoopFileSystem


# Settings

In [2]:
# Relative path that a later cell passes to os.chdir (boilerplate; ignore this).
src_dir = "../../src"

# ---- Pipeline settings ----
# Accepted task names: chinese_word_segmentation, target_classification,
# sequence_classification.
task = "sequence_classification"
# Device identifier forwarded to Pipeline(...).
device = 0
# Overrides handed to pipeline.train via `model_params`.
model_params = dict(num_train_epochs=5)
# Output directory for the newly trained model.
model_dir = f"../output/test_pipeline_{task}_tmp"

In [3]:
# Resolve to an absolute path before changing directory so that re-running this
# cell is idempotent: a relative "../../src" would otherwise be applied again
# against the already-changed working directory and climb further up the tree.
src_dir = os.path.abspath(src_dir)
os.chdir(src_dir)

# Imported only after the chdir. Presumably `pipeline` is a project module that
# is found via the new working directory -- confirm. The relative data/output
# paths used by later cells are likewise interpreted from this directory.
from pipeline import Pipeline

# Load data

In [4]:

def _load_json(path):
    """Parse one JSON file and return its content.

    The file is opened as UTF-8 (the samples contain CJK text, so the platform
    default encoding must not be relied on) and is closed deterministically.
    """
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)


# NOTE(review): train, dev and test all read the same `train_sample.json`.
# This looks intentional for a smoke test of the pipeline, but it means the
# reported metrics are computed on the training samples -- confirm.
sample_path = f"../data/datasets/sample/{task}/train_sample.json"

# Each split is loaded separately so the three variables hold independent objects.
train_raw_data = _load_json(sample_path)
dev_raw_data = _load_json(sample_path)
test_raw_data = _load_json(sample_path)
print(train_raw_data[0])

{'content': '<p> zaia係咩, 我都想睇水舞間, 朋友睇完都話好睇!', 'label': 1}


# Run pipeline

## Initialize pipeline

In [5]:

# Build the pipeline for the configured task and device. No model class is
# passed here; the recorded log shows the pipeline falling back to BERT_CLS.
pipeline = Pipeline(task=task, device=device)

2021-11-26 01:29:44 ***** Model class is not specified for sequence_classification. *****
2021-11-26 01:29:44   Default model = BERT_CLS


../config/examples/sequence_classification/BERT_CLS
['.ipynb_checkpoints', 'run.yaml', 'model', 'result', 'logs']


## Train a new model

In [6]:

# Train a new model into `model_dir` from the in-memory train/dev samples,
# applying the overrides in `model_params`.
pipeline.train(
    model_dir,
    train_raw_data=train_raw_data,
    dev_raw_data=dev_raw_data,
    model_params=model_params,
)

2021-11-26 01:29:45 ***** Initializing pipeline *****
2021-11-26 01:29:45 ***** Loading tokenizer *****
2021-11-26 01:29:45   Tokenizer source = 'transformers'
2021-11-26 01:29:48 ***** Initializing model *****
2021-11-26 01:29:48   Task = sequence_classification
2021-11-26 01:29:48   Model class = BERT_CLS


['tokenizer']


2021-11-26 01:29:49 ***** Loading pretrained language model *****
2021-11-26 01:29:49   Pretrained BERT = 'bert-base-chinese'
2021-11-26 01:29:59 ***** Loading data *****
2021-11-26 01:29:59   Raw data is provided.
3it [00:00, 105.36it/s]
2021-11-26 01:29:59   Loaded samples = 3
2021-11-26 01:29:59 ***** Loading data *****
2021-11-26 01:29:59   Raw data is provided.
3it [00:00, 134.71it/s]
2021-11-26 01:29:59   Loaded samples = 3
2021-11-26 01:29:59 ***** Running training *****
2021-11-26 01:29:59   Num examples = 3
2021-11-26 01:29:59   Num Epochs = 5
2021-11-26 01:29:59   Sampler = 
2021-11-26 01:29:59   Batch size = 32
2021-11-26 01:29:59   Gradient Accumulation steps = 1
Epoch:   0%|          | 0/5 [00:00<?, ?it/s]
Iteration:   0%|          | 0/1 [00:00<?, ?it/s][A
Iteration:   0%|          | 0/1 [00:00<?, ?it/s, tr_loss=1.17][A
Iteration: 100%|██████████| 1/1 [00:00<00:00,  7.87it/s, tr_loss=1.17][A
2021-11-26 01:29:59 ***** Epoch end: 0 *****
2021-11-26 01:29:59 ***** Running 

## Test

In [7]:

# Evaluate on the in-memory test samples; the returned mapping is read later
# for its 'macro_f1' and 'micro_f1' entries.
metrics = pipeline.test(test_raw_data=test_raw_data)

2021-11-25 02:14:51 ***** Loading data *****
2021-11-25 02:14:51   Raw data is provided.
3it [00:00, 114.58it/s]
2021-11-25 02:14:51   Loaded samples = 3
2021-11-25 02:14:51 ***** Running evaluation *****
2021-11-25 02:14:51   Num examples = 3
2021-11-25 02:14:51   Batch size = 64
Evaluating: 100%|██████████| 1/1 [00:00<00:00, 45.95it/s]
2021-11-25 02:14:51   accuracy = 1.0
2021-11-25 02:14:51   macro_f1 = 1.0
2021-11-25 02:14:51   micro_f1 = 1.0
2021-11-25 02:14:51   support = 3
2021-11-25 02:14:51   -1-precision = 1.0
2021-11-25 02:14:51   -1-recall = 1.0
2021-11-25 02:14:51   -1-f1-score = 1.0
2021-11-25 02:14:51   -1-support = 1
2021-11-25 02:14:51   0-precision = 1.0
2021-11-25 02:14:51   0-recall = 1.0
2021-11-25 02:14:51   0-f1-score = 1.0
2021-11-25 02:14:51   0-support = 1
2021-11-25 02:14:51   1-precision = 1.0
2021-11-25 02:14:51   1-recall = 1.0
2021-11-25 02:14:51   1-f1-score = 1.0
2021-11-25 02:14:51   1-support = 1
2021-11-25 02:14:51   loss = 0.7164847254753113
2021-11

## Predict

In [7]:
# Show the first test sample, run a single-item prediction on it, then show
# what the pipeline returned. The label and value are printed on separate lines.
print("Input:", test_raw_data[0], sep="\n")

output = pipeline.predict(data_dict=test_raw_data[0])

print("Output:", output, sep="\n")

Input:
{'content': '<p> zaia係咩, 我都想睇水舞間, 朋友睇完都話好睇!', 'label': 1}
Output:
{'prediction_id': 0, 'prediction': '1'}


# Clear output folder

In [8]:
# Remove the temporary model directory that was passed to pipeline.train.
# Guarded so this cell can be re-run, or run when the directory was never
# created, without raising FileNotFoundError.
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)

# Export variables

In [8]:
# Record the two F1 scores with scrapbook, each under its own metric name.
for metric_name in ("macro_f1", "micro_f1"):
    sb.glue(metric_name, metrics[metric_name])