In [None]:
# Run the preprocessing script below


import random
import numpy as np
import json

def convert(data, tokenizer, max_seq_length, prefix):
    '''
    Encode sentence pairs, shuffle them, and dump the result as raw binary files.

    Every (first, second) pair is encoded with ``tokenizer.encode_batch``.
    Pairs whose encoding has more than ``max_seq_length`` ids are dropped; the
    remaining encodings are padded with ``pad(max_seq_length)``, shuffled
    together with their labels, and written out.

    Args:
        data (list of list): list of [firstSentence, secondSentence, label]
                            data = [
                                    ["First sentence", "Second Sentence", 0],
                                    ...
                                ]

        tokenizer (object of class Cantokenizer): must provide
            ``encode_batch(list_of_pairs)`` returning one encoding per pair,
            each exposing ``ids``, ``attention_mask``, ``type_ids`` and
            ``pad(length)``.
        max_seq_length (int): maximum number of token ids per sample; longer
            samples are skipped, shorter ones are padded to this length
        prefix (string): path prefix of the output files (e.g. 'train', 'test')

    Returns:
        None. Four files are written as a side effect:
            <prefix>_ids       token ids,        int16
            <prefix>_mask      attention masks,  int8
            <prefix>_type_ids  token type ids,   int8
            <prefix>_label     labels,           int8
    '''
    # Build the sentence pairs once; they feed both the preview and the encoder
    pairs = [(e[0], e[1]) for e in data]

    print("samples of input")
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the preview size for datasets with fewer than 10 rows
    print(json.dumps(random.sample(pairs, min(10, len(pairs))), indent=2))
    print()
    print("samples of encoded")
    encodeds = tokenizer.encode_batch(pairs)
    print(encodeds[:10])
    print()

    # One record per kept sample: (ids, attention_mask, type_ids, label)
    records = []
    for e, t in zip(encodeds, data):
        # e is the encoding, t is the original [first, second, label] row
        label = t[2]
        # Samples longer than the maximum sequence length are skipped
        if len(e.ids) > max_seq_length:
            continue

        # Assumes pad() extends the encoding in place up to max_seq_length,
        # so every kept row has the same width (TODO confirm for CanTokenizer)
        e.pad(max_seq_length)
        records.append((e.ids, e.attention_mask, e.type_ids, label))

    # Shuffle whole records so ids/mask/type_ids/label stay aligned and the
    # output is not grouped by the input order (e.g. by class)
    random.shuffle(records)

    # Cast to compact fixed-width types for the raw binary dump.
    # NOTE: astype wraps silently on overflow, so token ids must fit in int16
    # (<= 32767) and labels/type ids in int8 (<= 127).
    ids = np.array([r[0] for r in records]).astype(np.int16)
    attn = np.array([r[1] for r in records]).astype(np.int8)
    type_ids = np.array([r[2] for r in records]).astype(np.int8)
    labels = np.array([r[3] for r in records]).astype(np.int8)

    print(f"writing to {prefix}_xx")

    # Write the raw array bytes; shape and dtype are NOT stored in the files,
    # so readers must know max_seq_length and the dtypes above
    with open(prefix+"_ids", 'wb') as f:
        f.write(ids.tobytes())
    with open(prefix+"_mask", 'wb') as f:
        f.write(attn.tobytes())
    with open(prefix+"_type_ids", 'wb') as f:
        f.write(type_ids.tobytes())
    with open(prefix+"_label", 'wb') as f:
        f.write(labels.tobytes())


In [None]:
'''
Load the JSON files containing the final training and testing datasets.
'''
import json
# Load json file into python's object
# Read both dataset files into Python objects.
# The encoding is given explicitly: open() otherwise uses the platform's
# locale encoding, which can mis-decode or fail on non-ASCII (Cantonese)
# text on systems whose default is not UTF-8.
with open('final_train_set.json', encoding='utf-8') as f:
    final_train_set = json.load(f)
with open('final_test_set.json', encoding='utf-8') as f:
    final_test_set = json.load(f)
    


In [None]:
'''
Import the cantokenizer and convert the training and testing dataset into binary file format.
'''

import json

from cantokenizer import CanTokenizer, NORM_OPTIONS


# Build the tokenizer from the vocabulary file, wrapping each encoded
# sequence with the <s> / </s> special tokens and applying the selected
# normalization options.
tokenizer = CanTokenizer('cantokenizer-vocab.txt', 
                         add_special=True, 
                         add_special_cls='<s>', 
                         add_special_sep='</s>',
                         norm_options = NORM_OPTIONS.ZH_NORM_MAPPING | 
                                        NORM_OPTIONS.SIMPL_TO_TRAD | 
                                        NORM_OPTIONS.SEPARATE_INTEGERS | 
                                        NORM_OPTIONS.SEPARATE_SYMBOLS
                        )


# Maximum sequence length, shared by both convert() calls below so the
# train and test files are always written with the same row width
max_seq_length = 96

# Keep only rows where both the first and the second sentence are present
# (non-empty), then convert them into the binary files
convert([e for e in final_train_set if e[0] and e[1]], tokenizer, max_seq_length, 'train')

convert([e for e in final_test_set if e[0] and e[1]], tokenizer, max_seq_length, 'test')


In [None]:


# We will train the model here -> https://colab.research.google.com/drive/13nW5HluBSDaQVlck5V4cbiuP9OzA7ubi
