### Postprocess vocab encoders from pretraining

Add additional tokens: CLS, PAD, and MASK to the feature vocab, and PAD to the time and type vocabs.

In [1]:
import os
import pickle
import posixpath
from typing import Dict

from smart_open import open

from common import S3_OUTPUT_PREFIX


In [2]:
def load_vocab(filename: str) -> Dict[str, int]:
    """Load a pickled vocab encoder.

    The file is read from
    ``<S3_OUTPUT_PREFIX>/output/pretrain/encoders/<filename>``.

    Args:
        filename: Name of the pickle file inside the encoders location.

    Returns:
        The unpickled vocab object (annotated as a token -> integer id
        mapping).
    """
    # Always join with "/": os.path.join uses the host OS separator, which
    # would put backslashes into the path on Windows (the prefix is presumably
    # an S3 URI, where only "/" is meaningful).
    s3_vocab_path = posixpath.join(
        S3_OUTPUT_PREFIX, "output", "pretrain", "encoders", filename
    )
    # NOTE: unpickling can execute arbitrary code; only load encoder files
    # that come from a trusted location.
    with open(s3_vocab_path, "rb") as f:
        vocab = pickle.load(f)

    return vocab


def save_vocab(vocab: Dict[str, int], filename: str) -> None:
    """Pickle a vocab encoder.

    The file is written to
    ``<S3_OUTPUT_PREFIX>/output/pretrain/encoders/<filename>``.

    Args:
        vocab: The vocab object to serialize (annotated as a token ->
            integer id mapping).
        filename: Name of the pickle file to write inside the encoders
            location.
    """
    # Always join with "/": os.path.join uses the host OS separator, which
    # would put backslashes into the path on Windows (the prefix is presumably
    # an S3 URI, where only "/" is meaningful).
    s3_save_path = posixpath.join(
        S3_OUTPUT_PREFIX, "output", "pretrain", "encoders", filename
    )
    with open(s3_save_path, "wb") as f:
        pickle.dump(vocab, f, protocol=pickle.HIGHEST_PROTOCOL)


In [3]:
# Read the pretraining feature vocab
feature_vocab = load_vocab("feature_vocab.pickle")

# Append the CLS, PAD and MASK feature tokens, in that order, giving them
# consecutive ids that start right after the largest id already in use
first_special_id = max(feature_vocab.values()) + 1
feature_vocab.update(
    {
        token: first_special_id + offset
        for offset, token in enumerate(("CLS", "PAD", "MASK"))
    }
)

# Write the extended vocab under a new file name
save_vocab(feature_vocab, "feature_vocab_with_cls_pad_mask.pickle")


In [4]:
# Read the pretraining time vocab
time_vocab = load_vocab("time_vocab.pickle")

# Append the PAD time token with the next id after the largest one in use
time_vocab.update({"PAD": max(time_vocab.values()) + 1})

# Write the extended vocab under a new file name
save_vocab(time_vocab, "time_vocab_with_pad.pickle")


In [5]:
# Read the pretraining type vocab
type_vocab = load_vocab("type_vocab.pickle")

# Append the PAD code type token with the next id after the largest one in use
type_vocab.update({"PAD": max(type_vocab.values()) + 1})

# Write the extended vocab under a new file name
save_vocab(type_vocab, "type_vocab_with_pad.pickle")
