### Introduction

From data augmentation experiment for Apple Care 2 dataset, highlighted factors can ensure the quality of unlabeled dataset which enhance the performance of model.

Experiment details in [2022_01_13 biweekly discussion.pptx](https://jira.wisers.com:18090/download/attachments/82808396/2022_01_13%20biweekly%20discussion.pptx?version=1&modificationDate=1642063801000&api=v2)

For code design please browse [Confluence Proposed Module](https://jira.wisers.com:18090/display/RES/Proposed+Module2) 

In [2]:
import os
os.chdir('../src/')
os.getcwd()

'/home/developer/Users/hinova/canton-target-sentiment/src'

### Model Generation

Model is required to predict labels for unlabel data in order to sample data. This step can be skipped if user already got trained model which exists:
- model directory
    - run.yaml
    - model.yaml
    - label_to_id.json
    - model.pt

In [3]:
os.system(f"python run.py --config_dir='../config/examples/sequence_classification/BERT_AVG'")

0

### Arguments

User can sample label ratio based on the ratio of train set, or request desired label ratio. It depends on existence of argument (either input this argument or not):

- label_ratio (if required desired label ratio)


Optional Arguments

In [6]:
# user can sample label ratio based on the ratio of train set, or request desired label ratio

# label_ratio = {'-1': 0.3, '0': 0.4, '1': 0.3}

Required Arguments

In [29]:
# model
model_dir = '../config/examples/sequence_classification/BERT_AVG/model' # model inference

# unlabel data path
unlabel_path = '../data/datasets/sample/sequence_classification/unlabeled_sample.json' 

# save path
save_path = '../data/datasets/sample/sequence_classification'

# sample size and certainty
sample_size = 150 # required, integer and smaller than size of unlabeled data
certainty = 0.8 # required, only select the data that max(p)>certainty, 
                # p is the predicted probabilities (0 - 1) over the label space
                # default certainty is 0

In [35]:
import json
from pathlib import Path
from utils import load_yaml
class arg():
    def __init__(self, model_dir: str):
        # run_yaml configuration
        run_config = load_yaml(Path(model_dir) / "run.yaml")
        self.task = run_config['task']
        self.data_config = run_config['data']
        self.data_dir = Path(self.data_config['data_dir'])
        self.prepro_config = run_config['text_prepro']
        self.eval_config = run_config['eval']
        self.train_config = run_config['train']
        self.device = run_config['device']
        model_class = self.train_config['model_class']

        # model_yaml configuration
        self.model_config = load_yaml(Path(model_dir) / "model.yaml")[model_class]
        self.model_config['pretrained_lm_from_prev'] = model_dir

        # label_to_id mapping
        with open(model_dir+'/label_to_id.json', 'rb') as outfile:
            self.label_to_id = json.load(outfile)
        self.label_to_id_inv = dict(zip(self.label_to_id.values(), self.label_to_id.keys()))

        # model directory which model locates
        self.model_dir = Path(model_dir)

args = arg(model_dir)

In [36]:
print('actual arguments: \n', args.__dict__)

actual arguments: 
 {'task': 'sequence_classification', 'data_config': {'output_dir': '../config/examples/sequence_classification/BERT_AVG', 'data_dir': '../data/datasets/sample/sequence_classification', 'train': 'train_sample.json', 'dev': 'train_sample.json', 'test': 'train_sample.json'}, 'data_dir': PosixPath('../data/datasets/sample/sequence_classification'), 'prepro_config': {'steps': ['utf8_replace', 'simplified_chinese', 'lower_case', 'full_to_half']}, 'eval_config': {'batch_size': 64, 'model_file': 'model.pt'}, 'train_config': {'model_class': 'BERT_AVG', 'kd': {'use_kd': False, 'teacher_dir': '../output/post_sentiment_20210707_bert_avg/model', 'loss_type': 'mse', 'soft_lambda': 0.5, 'kl_T': 5}, 'seed': 42, 'log_steps': 100, 'batch_size': 32, 'final_model': 'best', 'optimization_metric': 'macro_f1', 'early_stop': 5}, 'device': 1, 'model_config': {'max_length': 256, 'tokenizer_source': 'transformers', 'tokenizer_name': 'bert-base-chinese', 'pretrained_lm': 'bert-base-chinese', 'o

### get label ratio of train set

This part should skip if user inputs label_to_ratio argument

In [21]:
# if following ratio of train set
from tokenizer import get_tokenizer
from dataset import get_dataset
    
tokenizer = get_tokenizer(args = args) 
train_dataset = get_dataset(dataset="train", tokenizer=tokenizer, args=args)

print('train dataset size: ', len(train_dataset))
    

../config/examples/sequence_classification/BERT_AVG/model
['run.yaml', 'model.yaml', 'tokenizer', 'label_to_id.json', 'model.pt']


3it [00:00, 104.52it/s]

train dataset size:  3





In [26]:
label = [train_dataset[i]['label'].item() for i in range(len(train_dataset))]
print('The first three labels of trainset: \n', label[:3])

The first three labels of trainset: 
 [0, 1, 2]


In [27]:
def get_label_ratio(label):
    '''
        input:
        - label: list

        output:
        - label_ratio: dict
    '''
    result = {}
    for i in label:
        key = args.label_to_id_inv[i]
        if key not in result:
            result[key] = 0
        result[key] = result[key] + 1/len(label)
    return result

label_ratio = get_label_ratio(label)
print('label ratio of train set: \n',label_ratio)

label ratio of train set: 
 {'1': 0.3333333333333333, '0': 0.3333333333333333, '-1': 0.3333333333333333}


### generate pseudo prediction label 

In [37]:
args.data_config['data_dir'] = '/'.join(unlabel_path.split('/')[:-1])
args.data_dir = Path(args.data_config['data_dir'])
args.data_config['unlabeled'] = unlabel_path.split('/')[-1]
unlabel_dataset = get_dataset(dataset="unlabeled", tokenizer=tokenizer, args=args)
print('unlabel dataset size: ', len(unlabel_dataset))

64it [00:00, 167.79it/s]

unlabel dataset size:  64





In [38]:
from model import get_model
model = get_model(args)

In [47]:
from torch.utils.data import DataLoader
from trainer import prediction_step
from itertools import chain

def predict_label(dataset, model):
    '''
        input:
        - dataset: torch.dataset
        - model: torch.model
        
        output:
        - list
    '''
    dataloader = DataLoader(
        dataset,
        shuffle=False,
        batch_size=args.eval_config["batch_size"],
        # collate_fn=eval_dataset.pad_collate,
    )

    results = []
    for batch in dataloader:
        result = prediction_step(model, batch, args=args)
        results.append(result)
    return results

prediction = predict_label(unlabel_dataset, model)
pseudo_label = prediction[0]['prediction']
print('The first three labels of unlabel: \n', pseudo_label[:3])

The first three labels of unlabel: 
 ['-1', '-1', '-1']


In [49]:
args.label_to_id_inv

{0: '1', 1: '0', 2: '-1'}

In [50]:
pseudo_label_id = prediction[0]['prediction_id']
pseudo_label_ratio = get_label_ratio(pseudo_label_id)
print('pseudo label ratio of unlabel set: \n',pseudo_label_ratio)

pseudo label ratio of unlabel set: 
 {'-1': 0.8125, '1': 0.140625, '0': 0.046875}


### sampling 
- label ratio
- certainty

In [47]:
import json
import random
import numpy as np

def sampling(unlabel_path, pseudo_label, label_ratio, certainty):
    '''
        input:
        - unlabel_dataset: torch.dataset
        - pseudo_label: list
        - label_ratio: dict
        - certainty: float

        output:
        - list
    '''
    # collect probability of prediction
    prob_ls = []
    for batch_pred in pseudo_label:
            prob_ls = prob_ls + batch_pred['probabilities']
    prob_np = np.array(prob_ls)

    # sample size for labels
    ss_idx = []
    remain_size = sample_size
    for i, key in enumerate(label_ratio.keys()):
        # sample size computation
        if i != len(label_ratio.keys()) - 1:
            # sample size follows label ratio
            key_size = int(sample_size * label_ratio[key])
            remain_size = remain_size - key_size
        else:
            key_size = remain_size
        # certainty index
        key_certain_idx = np.argwhere((prob_np.argmax(axis=1)==key) & (prob_np.max(axis=1)>=certainty)).flatten()
        ss_idx = ss_idx + (random.sample(key_certain_idx.tolist(), key_size))
    ss_idx = random.sample(ss_idx, sample_size)

    # indexing unlabel data
    with open(unlabel_path, 'rb') as outfile:
        unlabel_data = np.array(json.load(outfile))
    return unlabel_data[ss_idx].tolist()

sampled_ls = sampling(
    unlabel_path = data_dir + '/' + unlabel_file, 
    pseudo_label = prediction, 
    label_ratio = label_ratio, 
    certainty = certainty)

##### sampled label ratio output

In [49]:
print('Total size of sampled result', len(sampled_ls))
print('Overview of ', sampled_ls[0])

1500
{'content': '【创意无限】这个iphone扩音器应该是你最好的朋友。是橡胶做的。它轻盈,柔软,可爱!更酷的是,它不需要电源驱动,只需将扬声器放在iphone的底部,就可以放大手机播放的声音。'}


### save data

In [56]:
def save_data(sample_list: list, save_path: str):
    with open(save_path, 'w') as outfile:
        json.dump(sample_list, outfile)

save_data(sampled_ls, save_path)

### review

In [58]:
with open(save_path, 'rb') as outfile:
    result = json.load(outfile)
print(len(result))
print(result[0])

1500
{'content': '【创意无限】这个iphone扩音器应该是你最好的朋友。是橡胶做的。它轻盈,柔软,可爱!更酷的是,它不需要电源驱动,只需将扬声器放在iphone的底部,就可以放大手机播放的声音。'}
