In [1]:
import os
import sys
from typing import Optional, cast

import src.evals.data as data_module
import src.hf_bert as hf_bert_module
import src.mosaic_bert as mosaic_bert_module
import src.flex_bert as flex_bert_module
import transformers
from composer import Trainer, algorithms, Evaluator
from composer.callbacks import LRMonitor, MemoryMonitor, OptimizerMonitor, RuntimeEstimator, SpeedMonitor
from composer.core.types import Dataset
from composer.loggers import WandBLogger
from composer.optim import DecoupledAdamW
from composer.optim.scheduler import (
    ConstantWithWarmupScheduler,
    CosineAnnealingWithWarmupScheduler,
    LinearWithWarmupScheduler,
)
from src.scheduler import WarmupStableDecayScheduler
from composer.utils import dist, reproducibility
from omegaconf import DictConfig
from omegaconf import OmegaConf as om
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm
  @custom_fwd
  @custom_bwd


In [2]:
def build_my_dataloader(cfg: DictConfig, device_batch_size: int):
    """Create a dataloader for classification.

    **Modify this function to train on your own dataset!**

    This function is provided as starter code to simplify fine-tuning a BERT
    classifier on your dataset. We'll use the dataset for QNLI (one of the
    GLUE tasks) as a demonstration.

    Args:
        cfg (DictConfig): An omegaconf config that houses all the configuration
            variables needed to instruct dataset/dataloader creation. The keys
            ``split``, ``tokenizer_name``, ``max_seq_len``, ``drop_last``,
            ``shuffle`` and ``num_workers`` are read directly; ``pin_memory``,
            ``prefetch_factor``, ``persistent_workers`` and ``timeout`` are
            optional and fall back to ``True``, ``2``, ``True`` and ``0``.
        device_batch_size (int): The size of the batches that the dataloader
            should produce.

    Returns:
        dataloader: A dataloader set up for use of the Composer Trainer.
    """
    # As a demonstration, we're using the QNLI dataset from the GLUE suite
    # of tasks.
    #
    # Note: We create our dataset using the `create_glue_dataset` utility of
    #   the `src.evals.data` module (imported in this notebook as
    #   `data_module`). Per the original starter notes, that utility tokenizes
    #   the raw text and drops some columns so that each example is a
    #   dictionary with the following:
    #
    #     - 'input_ids': the tokenized raw text
    #     - 'label': the target class that the text belongs to
    #     - 'attention_mask': a list of 1s and 0s to indicate padding
    #
    # When you set up your own dataset, it should handle tokenization to yield
    # examples with a similar structure!
    #
    # REPLACE THIS WITH YOUR OWN DATASET:
    dataset = data_module.create_glue_dataset(
        task="qnli",
        split=cfg.split,
        tokenizer_name=cfg.tokenizer_name,
        max_seq_length=cfg.max_seq_len,
    )
    dataset = cast(Dataset, dataset)

    loader_kwargs = dict(
        # As an alternative to formatting the examples inside the dataloader,
        # you can write a custom data collator to do that instead.
        collate_fn=transformers.default_data_collator,
        batch_size=device_batch_size,
        sampler=dist.get_sampler(dataset, drop_last=cfg.drop_last, shuffle=cfg.shuffle),
        num_workers=cfg.num_workers,
        pin_memory=cfg.get("pin_memory", True),
        timeout=cfg.get("timeout", 0),
    )

    # `prefetch_factor` and `persistent_workers` only make sense with worker
    # processes; DataLoader raises ValueError if they are supplied while
    # num_workers == 0, so only forward them when workers are requested.
    if cfg.num_workers > 0:
        loader_kwargs["prefetch_factor"] = cfg.get("prefetch_factor", 2)
        loader_kwargs["persistent_workers"] = cfg.get("persistent_workers", True)

    return DataLoader(dataset, **loader_kwargs)

# Build the run config: defaults <- task YAML <- command-line overrides.
yaml_path = "yamls/test/sequence_classification.yaml"

# Inside a Jupyter kernel, sys.argv carries the kernel launcher's own
# arguments (e.g. the connection-file path), not user overrides. Keep only
# `key=value` tokens so those launcher arguments are not merged into the
# config as junk keys.
args_list = [arg for arg in sys.argv[2:] if "=" in arg and not arg.startswith("-")]

with open("yamls/defaults.yaml") as f:
    default_cfg = om.load(f)
with open(yaml_path) as f:
    yaml_cfg = om.load(f)
cli_cfg = om.from_cli(args_list)

# Later sources win: CLI overrides beat the task YAML, which beats the defaults.
cfg = om.merge(default_cfg, yaml_cfg, cli_cfg)
cfg = cast(DictConfig, cfg)  # for type checking

# Per-device batch size = global batch size split evenly across ranks.
train_loader = build_my_dataloader(
    cfg.train_loader,
    cfg.global_train_batch_size // dist.get_world_size(),
)

# Get one batch from the train_loader
data = next(iter(train_loader))
# Print the batch keys first, then the full batch
print(data.keys())
print(data)



Example from glue dataset:
{'question': 'When did the third Digimon series begin?', 'sentence': 'Unlike the two seasons before it and most of the seasons that followed, Digimon Tamers takes a darker and more realistic approach to its story featuring Digimon who do not reincarnate after their deaths and more complex character development in the original Japanese.', 'label': 1, 'idx': 0}
{'question': 'Which missile batteries often have individual launchers several kilometres from one another?', 'sentence': 'When MANPADS is operated by specialists, batteries may have several dozen teams deploying separately in small sections; self-propelled air defence guns may deploy in pairs.', 'label': 1, 'idx': 1}
{'question': "What two things does Popper argue Tarski's theory involves in an evaluation of truth?", 'sentence': 'He bases this interpretation on the fact that examples such as the one described above refer to two things: assertions and the facts to which they refer.', 'label': 0, 'idx': 2}

In [7]:
import torch  # used by CustomDataset below; imported here so the cell runs on a fresh kernel
import transformers
from generate_dataset import generate_synthetic_dataset

# Configuration parameters
decimal_token = "<DEC>"

# Empty override dict: every `cfg.get(...)` below falls back to its default.
cfg = {}

# Generate the synthetic dataset
df = generate_synthetic_dataset(
    n_samples=cfg.get("n_samples", 100),
    n_continuous_features=cfg.get("n_continuous_features", 15),
    n_discrete_features=cfg.get("n_discrete_features", 15),
    n_classes=cfg.get("n_classes", 2),
    class_distribution=cfg.get("class_distribution", [0.8, 0.2]),
    n_bins=cfg.get("n_bins", 10),
    n_redundant=cfg.get("n_redundant", 5),
    n_noisy=cfg.get("n_noisy", 20),
    class_sep=cfg.get("class_sep", 0.1),
)

# Change structure to "sentence", "label" and "idx"
# Every column except 'label' is a feature; the feature values are joined into
# one space-separated sentence with '.' replaced by the decimal token.
df['sentence'] = df.drop(columns=['label']).apply(lambda x: ' '.join([f"{val}".replace('.', decimal_token) for val in x]), axis=1)

# Create dummy sentence based on label: if 1 then "4.23245", if 0 then "5.7655"
# NOTE: this deliberately overwrites the feature-based sentence built above, so
# the label is trivially recoverable from the text (sanity-check setup).
df['sentence'] = df['label'].apply(lambda x: f"4{decimal_token}23245" if x == 1 else f"5{decimal_token}7655")

# Reorder columns and add index.
# .copy() makes the selection an independent frame, so adding 'idx' does not
# assign into a slice of the original (SettingWithCopyWarning).
df = df[['sentence', 'label']].copy()
df['idx'] = df.index

# Tokenize the dataset
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Register the decimal marker as a single token so it is not split into pieces
tokenizer.add_tokens([decimal_token])

# Make sure a padding token exists: reuse EOS when the tokenizer has one
# (GPT-2 style), otherwise register a dedicated '[PAD]' token. The two options
# are alternatives; applying both would overwrite the new pad id with the EOS id.
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize sentences
tokenized_dataset = tokenizer(
    df['sentence'].tolist(),  # Ensure this is a list of strings
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

# Print input sentence and tokenization results
print("\nTokenizer Debug Info:")
print("-" * 50)
# Print first 3 examples
for i in range(min(3, len(df))):
    print(f"\nExample {i+1}:")
    print(f"Input sentence: {df['sentence'].iloc[i]}")
    print(f"Label: {df['label'].iloc[i]}")

    # Get tokenized ids for this example
    tokens = tokenizer.encode(df['sentence'].iloc[i])
    print(f"Token IDs: {tokens}")

    # Decode back to string to verify tokenization
    decoded = tokenizer.decode(tokens)
    print(f"Decoded text: {decoded}")

    # Print individual tokens
    tokens_list = tokenizer.convert_ids_to_tokens(tokens)
    print(f"Individual tokens: {tokens_list}")
print("-" * 50)
# Create a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: Mapping from field name to an indexable container holding
            one entry per example.
        labels: Indexable sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at `idx`, then attach the label as a tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Pair the tokenized encodings with the plain-Python list of labels so each
# dataset item carries its target under the 'labels' key.
labels = df['label'].tolist()
custom_dataset = CustomDataset(tokenized_dataset, labels)


Tokenizer Debug Info:
--------------------------------------------------

Example 1:
Input sentence: 5<DEC>7655
Label: 0
Token IDs: [101, 1019, 30522, 6146, 24087, 102]
Decoded text: [CLS] 5 <DEC> 7655 [SEP]
Individual tokens: ['[CLS]', '5', '<DEC>', '76', '##55', '[SEP]']

Example 2:
Input sentence: 4<DEC>23245
Label: 1
Token IDs: [101, 1018, 30522, 20666, 19961, 102]
Decoded text: [CLS] 4 <DEC> 23245 [SEP]
Individual tokens: ['[CLS]', '4', '<DEC>', '232', '##45', '[SEP]']

Example 3:
Input sentence: 5<DEC>7655
Label: 0
Token IDs: [101, 1019, 30522, 6146, 24087, 102]
Decoded text: [CLS] 5 <DEC> 7655 [SEP]
Individual tokens: ['[CLS]', '5', '<DEC>', '76', '##55', '[SEP]']
--------------------------------------------------




In [2]:
import string
import itertools

def create_extended_letter_mapping(df, label_column, max_length=3):
    """
    Map every distinct value of each feature column to a globally unique letter code.

    Codes are drawn, in order, from A, B, ..., Z, AA, AB, ..., ZZ, AAA, ... and
    are never reused across columns, so a code identifies both the column and
    the value. The label column is skipped. The resulting mapping is also
    printed.

    Args:
        df (pd.DataFrame): The input dataframe.
        label_column (str): The name of the label column.
        max_length (int): Longest letter code to generate. The default of 3
            yields 26 + 26**2 + 26**3 codes; raise it if the dataframe has
            more distinct (column, value) pairs than that.

    Returns:
        dict: A dictionary where keys are column names and values are mappings
        from the column's original values to unique letter codes.

    Raises:
        ValueError: If the remaining pool of codes is too small for a column.
    """
    letters = string.ascii_uppercase  # Use uppercase letters A-Z

    # All codes of length 1..max_length, shortest first, alphabetical within a length.
    extended_letters = [
        ''.join(comb)
        for length in range(1, max_length + 1)
        for comb in itertools.product(letters, repeat=length)
    ]

    letter_mapping = {}
    letter_index = 0  # First code not yet handed out to any column

    for col in df.columns:
        if col == label_column:
            continue  # Skip the label column
        unique_values = df[col].unique()
        if len(unique_values) > len(extended_letters) - letter_index:
            raise ValueError(f"Too many unique values in column {col} to map to extended letters.")

        # Hand this column the next contiguous run of unused codes, in the
        # order the values first appear in the column.
        next_index = letter_index + len(unique_values)
        column_letters = extended_letters[letter_index:next_index]
        letter_mapping[col] = dict(zip(unique_values, column_letters))

        # The next column starts where this one stopped
        letter_index = next_index

    print("Letter mapping:")
    print(letter_mapping)

    return letter_mapping

def transform_discrete_to_letters(df, letter_mapping):
    """
    Return a copy of ``df`` with the mapped columns re-encoded as letter codes.

    Only columns that appear as keys of ``letter_mapping`` are rewritten; every
    other column is carried over unchanged. The input dataframe is not modified.

    Args:
        df (pd.DataFrame): The input dataframe.
        letter_mapping (dict): Per-column dictionaries translating original
            values to letter codes.

    Returns:
        pd.DataFrame: The transformed dataframe.
    """
    encoded = df.copy()

    for column_name in letter_mapping:
        value_to_code = letter_mapping[column_name]
        encoded[column_name] = encoded[column_name].map(value_to_code)

    return encoded

In [6]:
from generate_dataset import generate_synthetic_dataset
import transformers

# Empty override dict: every `cfg.get(...)` below falls back to its default.
cfg = {}

# Generate the synthetic dataset
df = generate_synthetic_dataset(
    n_samples=cfg.get("n_samples", 100),
    n_continuous_features=cfg.get("n_continuous_features", 0),  # Set to 0 since all features are discrete
    n_discrete_features=cfg.get("n_discrete_features", 15),
    n_classes=cfg.get("n_classes", 2),
    class_distribution=cfg.get("class_distribution", [0.8, 0.2]),
    n_bins=cfg.get("n_bins", 10),
    n_redundant=cfg.get("n_redundant", 0),
    n_noisy=cfg.get("n_noisy", 0),
    class_sep=cfg.get("class_sep", 0.1),
)

# Define the label column
label_column = 'label'

print(df.shape)

# Create extended letter mapping for all columns except the label column
letter_mapping = create_extended_letter_mapping(df, label_column)

# Transform all columns to letter encoding except the label column
df_transformed = transform_discrete_to_letters(df.drop(columns=[label_column]), letter_mapping)

# Add the label column back to the transformed dataframe
df_transformed[label_column] = df[label_column]

# Change structure to "sentence", "label" and "idx"
# Every column except the label is a feature; the letter codes are joined into
# one space-separated sentence per row.
df_transformed['sentence'] = df_transformed.drop(columns=[label_column]).apply(lambda x: ' '.join([str(val) for val in x]), axis=1)

# Reorder columns and add index.
# .copy() makes the selection an independent frame, so adding 'idx' does not
# assign into a slice of the wider frame (SettingWithCopyWarning).
df_transformed = df_transformed[['sentence', label_column]].copy()
df_transformed['idx'] = df_transformed.index

# Tokenize the dataset
tokenizer = transformers.AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")

# Make sure a padding token exists: reuse EOS when the tokenizer has one
# (GPT-2 style), otherwise register a dedicated '[PAD]' token. The two options
# are alternatives; applying both would overwrite the new pad id with the EOS id.
if tokenizer.pad_token is None:
    if tokenizer.eos_token is not None:
        tokenizer.pad_token = tokenizer.eos_token
    else:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Tokenize sentences
tokenized_dataset = tokenizer(
    df_transformed['sentence'].tolist(),  # Ensure this is a list of strings
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)

# Print input sentence and tokenization results
print("\nTokenizer Debug Info:")
print("-" * 50)
# Print first 3 examples
for i in range(min(3, len(df_transformed))):
    print(f"\nExample {i+1}:")
    print(f"Input sentence: {df_transformed['sentence'].iloc[i]}")
    print(f"Label: {df_transformed[label_column].iloc[i]}")

    # Get tokenized ids for this example
    tokens = tokenizer.encode(df_transformed['sentence'].iloc[i])
    print(f"Token IDs: {tokens}")

    # Decode back to string to verify tokenization
    decoded = tokenizer.decode(tokens)
    print(f"Decoded text: {decoded}")

    # Print individual tokens
    tokens_list = tokenizer.convert_ids_to_tokens(tokens)
    print(f"Individual tokens: {tokens_list}")
print("-" * 50)

  from .autonotebook import tqdm as notebook_tqdm


(100, 16)
Letter mapping:
{'discrete_feature_0': {3: 'A', 1: 'B', 4: 'C', 7: 'D', 5: 'E', 6: 'F', 2: 'G', 9: 'H', 8: 'I', 0: 'J'}, 'discrete_feature_1': {3: 'K', 5: 'L', 4: 'M', 8: 'N', 6: 'O', 7: 'P', 2: 'Q', 1: 'R', 0: 'S', 9: 'T'}, 'discrete_feature_2': {6: 'U', 4: 'V', 5: 'W', 1: 'X', 9: 'Y', 3: 'Z', 2: 'AA', 0: 'AB', 7: 'AC', 8: 'AD'}, 'discrete_feature_3': {4: 'AE', 2: 'AF', 1: 'AG', 6: 'AH', 8: 'AI', 7: 'AJ', 5: 'AK', 3: 'AL', 9: 'AM', 0: 'AN'}, 'discrete_feature_4': {5: 'AO', 3: 'AP', 4: 'AQ', 1: 'AR', 7: 'AS', 6: 'AT', 9: 'AU', 8: 'AV', 2: 'AW', 0: 'AX'}, 'discrete_feature_5': {6: 'AY', 3: 'AZ', 4: 'BA', 5: 'BB', 0: 'BC', 2: 'BD', 8: 'BE', 7: 'BF', 1: 'BG', 9: 'BH'}, 'discrete_feature_6': {7: 'BI', 3: 'BJ', 6: 'BK', 4: 'BL', 1: 'BM', 5: 'BN', 2: 'BO', 0: 'BP', 9: 'BQ', 8: 'BR'}, 'discrete_feature_7': {5: 'BS', 3: 'BT', 2: 'BU', 4: 'BV', 6: 'BW', 7: 'BX', 8: 'BY', 0: 'BZ', 9: 'CA', 1: 'CB'}, 'discrete_feature_8': {3: 'CC', 6: 'CD', 7: 'CE', 5: 'CF', 2: 'CG', 4: 'CH', 0: 'CI', 9



In [None]:
# (scratch cell) The stray bare name `e` was removed: it is not defined anywhere
# in this notebook and would raise NameError under Restart & Run All.

In [10]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig, BertTokenizerFast

class CustomBertEmbeddings(nn.Module):
    """BERT-style input embeddings with word-aligned position ids.

    The module sums word, position and token-type embeddings, then applies
    LayerNorm and dropout. When no position ids are supplied, WordPiece
    continuation tokens (those starting with ``##``) reuse the position of the
    token that started their word instead of getting a position of their own.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
        tokenizer: Object exposing ``convert_ids_to_tokens``; used to detect
            ``##`` continuation tokens.
    """

    def __init__(self, config, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None):
        """Embed ``input_ids`` and return the normalized, dropped-out sum.

        Args:
            input_ids: 2-D integer tensor of token ids (rows are sequences).
            token_type_ids: Optional tensor like ``input_ids``; defaults to zeros.
            position_ids: Optional tensor like ``input_ids``; defaults to the
                output of :meth:`create_position_ids`.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing hidden dimension.
        """
        if position_ids is None:
            position_ids = self.create_position_ids(input_ids)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        inputs_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeds + token_type_embeds
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Log word IDs and their corresponding positional encodings
        # (debug output: prints one line per token on every forward pass).
        self.log_positional_encodings(input_ids, position_ids, position_embeds)

        return embeddings

    def create_position_ids(self, input_ids):
        """Return position ids in which ``##`` pieces share their word's position.

        Positions start as 0..seq_len-1 for every row. A token starting with
        ``##`` is then given the index of the most recent token that did not
        start with ``##`` (0 if there is none); other tokens keep their index.

        Args:
            input_ids: 2-D integer tensor of token ids (rows are sequences).

        Returns:
            A new long tensor with the same shape and device as ``input_ids``.
        """
        position_ids = torch.arange(input_ids.size(1), dtype=torch.long, device=input_ids.device)
        # expand_as returns a view in which every row aliases the same memory;
        # clone() gives each row its own storage so the in-place edits below
        # cannot leak from one sequence of the batch into another.
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids).clone()

        # Adjust position_ids for subwords
        for i, input_id in enumerate(input_ids):
            tokens = self.tokenizer.convert_ids_to_tokens(input_id.tolist())
            current_position = 0
            for j, token in enumerate(tokens):
                if token.startswith("##"):
                    position_ids[i, j] = current_position
                else:
                    # Word-initial tokens keep their own index, which is j.
                    current_position = j

        return position_ids

    def log_positional_encodings(self, input_ids, position_ids, position_embeds):
        """Print the token id, position id and position embedding of every token."""
        for i in range(input_ids.size(0)):
            for j in range(input_ids.size(1)):
                word_id = input_ids[i, j].item()
                pos_id = position_ids[i, j].item()
                pos_embed = position_embeds[i, j].detach().cpu().numpy()
                print(f"Word ID: {word_id}, Position ID: {pos_id}, Positional Encoding: {pos_embed}")

# Demo: build the embedding module from the pretrained bert-base-uncased
# config and tokenizer (the embedding weights themselves are freshly initialized).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased')
custom_embeddings = CustomBertEmbeddings(config=config, tokenizer=tokenizer)

# Tokenize a sample string, derive its position ids, and run the module on it.
input_ids = tokenizer("AQ 232 342 3434", return_tensors='pt')['input_ids']
position_ids = custom_embeddings.create_position_ids(input_ids=input_ids)
embeddings = custom_embeddings(input_ids=input_ids, position_ids=position_ids)

Word ID: 101, Position ID: 0, Positional Encoding: [-9.70485628e-01  1.80302894e+00 -8.19857240e-01 -4.84720543e-02
 -4.84500259e-01 -7.87522435e-01  1.42060244e+00  9.14309800e-01
 -1.07837605e+00 -9.44993675e-01  5.85320652e-01  7.50791788e-01
 -3.15190166e-01  3.66069764e-01  1.99320063e-01  1.41126049e+00
 -6.08427644e-01 -8.32416356e-01 -5.92485607e-01 -2.27414772e-01
  8.52364749e-02 -4.28160161e-01 -7.88723707e-01  1.72249937e+00
 -6.51998222e-01 -1.11059844e+00  1.07758093e+00 -1.69642956e-03
  1.56997752e+00 -2.07585990e-01  2.21596289e+00 -1.38046294e-01
 -5.01735151e-01  1.58870006e+00 -1.87656701e-01 -8.02190065e-01
 -2.14086667e-01  1.44588482e+00 -6.23634756e-02 -6.01941466e-01
  6.12250268e-02  3.31105828e-01  8.72132778e-02  6.61423981e-01
 -7.28164241e-02  1.83970883e-01 -7.90686384e-02  1.52451217e+00
 -1.25623792e-01 -1.14020097e+00  2.65612394e-01 -1.05548000e+00
 -7.18414411e-02 -5.37619412e-01  9.36248899e-01  2.30263799e-01
 -1.50864887e+00  2.07146263e+00  5.305

In [30]:
import torch
import torch.nn as nn
from transformers import BertModel, BertConfig, BertTokenizerFast

class CustomBertEmbeddings(nn.Module):
    """BERT-style input embeddings with number-aware, word-level position ids.

    The module sums word, position and token-type embeddings, then applies
    LayerNorm and dropout. Position ids are assigned per whitespace-separated
    word, so all pieces of one number (``##`` continuations, the decimal point
    and the digits right after it) share a single position.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``type_vocab_size``,
            ``layer_norm_eps`` and ``hidden_dropout_prob``.
        tokenizer: Object exposing ``convert_ids_to_tokens`` and ``decode``.
    """

    def __init__(self, config, tokenizer):
        super().__init__()
        self.tokenizer = tokenizer
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, text=None):
        """Embed ``input_ids`` and return the normalized, dropped-out sum.

        Args:
            input_ids: 2-D integer tensor of token ids (rows are sequences).
            token_type_ids: Optional tensor like ``input_ids``; defaults to zeros.
            position_ids: Optional tensor like ``input_ids``. When omitted it
                is computed with :meth:`create_position_ids`.
            text: Optional raw text forwarded to :meth:`create_position_ids`
                when ``position_ids`` is omitted.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing hidden dimension.
        """
        if position_ids is None:
            # `text` may be None; create_position_ids then falls back to the
            # decoded token string, so this call no longer fails for lack of text.
            position_ids = self.create_position_ids(input_ids, text)

        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        inputs_embeds = self.word_embeddings(input_ids)
        position_embeds = self.position_embeddings(position_ids)
        token_type_embeds = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + position_embeds + token_type_embeds
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        # Per-token logging is disabled here; call
        # self.log_positional_encodings(input_ids, position_ids, position_embeds)
        # manually when that debug output is wanted.

        return embeddings

    def create_position_ids(self, input_ids, text=None):
        """Assign one position id per whitespace-separated word of ``text``.

        Rules applied to each token, in order:
          * ``[CLS]`` gets 0 and ``[SEP]`` gets ``len(words) + 1``;
          * a ``##`` piece, a ``.`` token, or the token right after a ``.``
            reuses the position of the preceding word-initial token;
          * any other token starts a new word: it takes the next position, or
            the word's own 1-based index if the token equals a whole word.

        Debug information is printed for every sequence.

        Args:
            input_ids: 2-D integer tensor of token ids (rows are sequences).
            text: The raw text that was tokenized. A single string is applied
                to every row; a sequence of strings is indexed per row. When
                ``None``, each row's decoded token string is used instead,
                which is only an approximation because decoding may not
                reproduce the original whitespace (e.g. around '.').

        Returns:
            A long tensor of position ids with the same shape as ``input_ids``.
        """
        position_ids = torch.zeros_like(input_ids, dtype=torch.long)
        print(f"Position IDs (Before): {input_ids}")

        for i, input_id in enumerate(input_ids):
            print(f"\nExample {i}:")
            tokens = self.tokenizer.convert_ids_to_tokens(input_id)
            original_text = self.tokenizer.decode(input_id, skip_special_tokens=True)

            # Pick the text belonging to this row.
            if text is None:
                row_text = original_text
            elif isinstance(text, str):
                row_text = text
            else:
                row_text = text[i]

            words = row_text.split()
            word_positions = {word: idx + 1 for idx, word in enumerate(words)}
            print(f"\nOriginal Text: {original_text}")
            print(f"Words: {words}")
            print(f"Word Positions: {word_positions}")

            current_position = 1
            # Reset per sequence: otherwise the first token of a row would be
            # compared against the previous row's last token, or against an
            # unbound name when the very first token is not [CLS]/[SEP].
            last_token = None
            for j, token in enumerate(tokens):
                if token == '[CLS]':
                    position_ids[i, j] = 0  # Special ID for [CLS]
                elif token == '[SEP]':
                    position_ids[i, j] = len(words) + 1  # Special ID for [SEP]
                elif token.startswith("##") or token == '.' or last_token == '.':
                    # Still inside the current number: reuse its position.
                    position_ids[i, j] = current_position - 1
                else:
                    word = token.replace("##", "")
                    if word in word_positions:
                        current_position = word_positions[word]
                    position_ids[i, j] = current_position
                    current_position = current_position + 1
                last_token = token

        return position_ids

    def log_positional_encodings(self, input_ids, position_ids, position_embeds):
        """Print the token id, position id and position embedding of every token."""
        for i in range(input_ids.size(0)):
            for j in range(input_ids.size(1)):
                word_id = input_ids[i, j].item()
                pos_id = position_ids[i, j].item()
                pos_embed = position_embeds[i, j].detach().cpu().numpy()
                print(f"Word ID: {word_id}, Position ID: {pos_id}, Positional Encoding: {pos_embed}")

# Demo: build the embedding module from the pretrained bert-base-uncased
# config and tokenizer (the embedding weights themselves are freshly initialized).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
config = BertConfig.from_pretrained('bert-base-uncased')
custom_embeddings = CustomBertEmbeddings(config=config, tokenizer=tokenizer)

# Sample input: three numbers, two of them with a decimal point.
text = "343434 34.554534 4.32434"
input_ids = tokenizer(text, return_tensors='pt')['input_ids']
position_ids = custom_embeddings.create_position_ids(input_ids=input_ids, text=text)

# Show how the text was split and which position each token received.
print("\nTokenizer Debug Info:")
tokens = tokenizer.convert_ids_to_tokens(input_ids[0])
print(f"Tokens: {tokens}")
print(f"Input IDs: {input_ids}")
print(f"Position IDs: {position_ids}")

# Run the module with the precomputed word-level positions.
embeddings = custom_embeddings(input_ids=input_ids, position_ids=position_ids)

Position IDs (Before): tensor([[  101, 27810, 23777,  2549,  4090,  1012,  4583, 19961, 22022,  1018,
          1012, 27234, 22022,   102]])

Example 0:

Original Text: 343434 34. 554534 4. 32434
Words: ['343434', '34.554534', '4.32434']
Word Positions: {'343434': 1, '34.554534': 2, '4.32434': 3}

Tokenizer Debug Info:
Tokens: ['[CLS]', '343', '##43', '##4', '34', '.', '55', '##45', '##34', '4', '.', '324', '##34', '[SEP]']
Input IDs: tensor([[  101, 27810, 23777,  2549,  4090,  1012,  4583, 19961, 22022,  1018,
          1012, 27234, 22022,   102]])
Position IDs: tensor([[0, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4]])
