In [None]:
srun -G1 --pty bash -c "source /data/ai_club/team_3_2024-25/team3-env-finetune/bin/activate; \
    hostname; \
    jupyter notebook \
        --ServerApp.root_dir=$(pwd) \
        --ServerApp.password='' \
        --ServerApp.open_browser=False \
        --ServerApp.allow_origin='*' \
        --ServerApp.allow_remote_access=True \
        --ServerApp.port=14321 \
        --ServerApp.ip='*'
"

In [1]:
import torch
import torch.nn as nn
from transformers.models.qwen2.modeling_qwen2 import Qwen2DecoderLayer
from transformers import Qwen2ForCausalLM, Qwen2Config
import transformers
# import json
# import gc

2025-04-09 23:31:37.143611: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-09 23:31:37.435675: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744255897.555380 1160978 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744255897.590489 1160978 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-09 23:31:37.876111: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

In [36]:
class IMDecoderLayer(nn.Module):
    mask = None
    vspace_to_emb = None
    emb_to_vspace = None
    block_strength = []

    scratch = None
    norm = None

    def __init__(self, original_layer, emb_to_vspace, vspace_to_emb, norm, config, block_idx):
        super().__init__()
        self.original_layer = original_layer

        if IMDecoderLayer.vspace_to_emb == None:
            IMDecoderLayer.vspace_to_emb = vspace_to_emb

        if IMDecoderLayer.emb_to_vspace == None:
            IMDecoderLayer.emb_to_vspace =  emb_to_vspace

        if IMDecoderLayer.scratch == None:
            IMDecoderLayer.scratch = torch.zeros(config.vocab_size, dtype=bool).to('cuda')

        if IMDecoderLayer.norm == None:
            IMDecoderLayer.norm = norm

        self.block_idx = len(IMDecoderLayer.block_strength)
        IMDecoderLayer.block_strength.append(
            nn.Parameter(torch.tensor(1.0, dtype=torch.float32).to('cuda'))
        )

    def forward(self, hidden_states, *args, **kwargs):
        hidden_states = self.original_layer(hidden_states, *args, **kwargs)
        hidden_states = hidden_states[0]

        residual = hidden_states
        hidden_states = IMDecoderLayer.emb_to_vspace(residual)
        
        assert IMDecoderLayer.mask != None

        for i, positions in enumerate(IMDecoderLayer.mask):
            for j, toks_allowed in enumerate(positions):
                if not toks_allowed:
                    hidden_states[i,j,:] = 0 
                    continue

                hidden_states[i,j,:] = 0 

                IMDecoderLayer.scratch[:] = False
                IMDecoderLayer.scratch[toks_allowed] = True
                hidden_states[i,j,IMDecoderLayer.scratch] += 1/IMDecoderLayer.scratch.sum()

                IMDecoderLayer.scratch[:] = True
                IMDecoderLayer.scratch[toks_allowed] = False
                hidden_states[i,j,IMDecoderLayer.scratch] -= 1/IMDecoderLayer.scratch.sum()

        # print(hidden_states)
        hidden_states = hidden_states @ IMDecoderLayer.vspace_to_emb.weight
        hidden_states = hidden_states * IMDecoderLayer.block_strength[self.block_idx]
        hidden_states = hidden_states + residual

        return (hidden_states,)

def gen_mask(tokens):
    mask = [] # mask[batch, position, allowed_tok]
    for batch_size in tokens['attention_mask'].argmin(axis=1):
        if batch_size == 0:
            batch_size = tokens['attention_mask'].shape[1]
        mask.append([[] for i in range(batch_size)])

    return mask

for i, s in enumerate(IMDecoderLayer.block_strength):
    c=3 # This is a hyperparameter
    s.data.fill_(c*i/(i+c))

In [60]:
def get_model(do_masking):
    MODEL_NAME = 'Qwen/Qwen2.5-0.5B-Instruct'

    config = Qwen2Config.from_pretrained(MODEL_NAME)

    model = Qwen2ForCausalLM.from_pretrained(MODEL_NAME).to('cuda')
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

    if do_masking:
        # FREEZE existing model. Only the new layer in IMDecoderLayer will be trained
        for param in model.parameters():
            param.requires_grad = False

        # REPLACE transformer blocks with IM ones
        for i, _ in enumerate(model.model.layers):
            model.model.layers[i] = IMDecoderLayer(model.model.layers[i], model.lm_head, model.model.embed_tokens, model.model.norm, config, i)

    def tokenize(batch):
        tokens = tokenizer(batch, return_tensors='pt', padding=True)
        tokens = {k:v.to('cuda') for k,v in tokens.items()}
        return tokens

    def tokof(s, check=True):
        toks = tokenizer(s, add_special_tokens=False)['input_ids']
        if check:
            if len(toks) > 1: raise Exception(f'This is more than one tok: {toks}')
            return toks[0]
        return toks

    return model, tokenize, tokenizer, tokof

model, tokenize, tokenizer, tokof = get_model(True)

In [136]:
vocab = [
    'terve', 'hei', 'talo', 'vesi', 'ystävä', 'huomenta', 'velho', 'suomi', 'koira', 'nimi', 'nimeni', 'nimesi', 'nimensä',
    'ystäväni', 'ystäväsi', 'ystävänsä', 'vanha', 'hyvää', 'suomalainen', 'mukava', 'minä', 'minun', 'olen', 'olenko', 'sinä', 'sinun', 'olet',
    'oletko', 'hän', 'hänen', 'on', 'onko', 'matti', 'aleksi', 'sami', 'kyllä', 'ei', 'mitä', 'mikä', 'kuka', 'rossi', 'lucas', '.', '!'
]

vocab += [v[0].upper() + v[1:] for v in vocab]
vocab += [(' '+v if v.isalpha() else v) for v in vocab]
# vocab += [v+'.' for v in vocab]

vocab = list(set(vocab))

# --- BUILD DA TRIE ---

trie = {}

for v in vocab:
    curr_node = trie

    toks = tokof(v, check=False)

    for tok in toks:
        tok = tokenizer.decode(tok) # FOR VISUALIZING
        if tok not in curr_node:
            curr_node[tok] = {}
        curr_node = curr_node[tok]

    curr_node[None] = {}

def get_next_allowed(given, trie):
    allowed = trie
    for tok in given:
        # assume given already has valid seq
        if tok in allowed:
            allowed = allowed[tok]
        elif None in allowed:
            allowed = trie[tok]
        else:
            raise Exception(f'Unexpected token {tok}')

    allowed = list(allowed.keys())

    if None in allowed and given:
        allowed += [t for t in trie.keys() if t[0] == ' ' or not t.isalpha()]

    allowed = [v for v in allowed if v]

    if not given:
        allowed = [v for v in allowed if v[0] != ' ']

    return allowed
    

In [137]:
print(
    get_next_allowed([], trie),
    get_next_allowed([' o'], trie),
    get_next_allowed([' o', 'len'], trie),
    get_next_allowed([' o', 'len', 'ko'], trie),
    get_next_allowed([' o', 'len', 'ko', ' hu'], trie),
    get_next_allowed([' o', 'len', 'ko', ' hu', 'oment'], trie),
    get_next_allowed([' o', 'len', 'ko', ' hu', 'oment', 'a'], trie),
    sep='\n\n'
)

['m', 'Sin', 'O', 'Van', 'o', 'N', 'Y', 'H', 'n', 'on', 'ei', 'hy', 'On', 'V', 'M', 'ross', 't', 'T', 'Mit', 'K', '!', 'olen', 'yst', 'l', 'Hy', 'ky', 'Su', 'sin', 'S', 'Min', 'su', 'min', 'E', '.', 'Vel', 'vel', 'h', 'k', 'Ale', 'hu', 'Hu', 'He', 'ale', 'Luc', 'Ross', 'van', 'hei', 'ko', 'ves', 'hä', 's', 'ter', 'mit']

['let', 'len']

['ko', ' hu', ' y', ' on', ' sin', ' K', ' vel', ' Hu', ' o', ' T', ' ei', ' Muk', ' h', ' N', ' On', ' k', ' n', '!', ' matt', ' V', ' su', ' Y', ' hei', ' m', ' hä', ' van', ' Sam', ' ale', ' ro', ' Lucas', ' O', ' E', ' mit', '.', ' Ky', ' Hä', ' ter', ' Su', ' mik', ' Sin', ' Min', ' Hy', ' tal', ' v', ' Ale', ' min', ' Mik', ' Ko', ' Mit', ' ko', ' Vel', ' luc', ' He', ' Van', ' Rossi', ' sam', ' H', ' Matt', ' hy']

[' hu', ' y', ' on', ' sin', ' K', ' vel', ' Hu', ' o', ' T', ' ei', ' Muk', ' h', ' N', ' On', ' k', ' n', '!', ' matt', ' V', ' su', ' Y', ' hei', ' m', ' hä', ' van', ' Sam', ' ale', ' ro', ' Lucas', ' O', ' E', ' mit', '.', ' Ky', 

In [58]:
prompt = 'Here\'s a Finnish sentence: Onko'

print(prompt, end='')

for _ in range(50):
    tokens = tokenize(prompt)

    # TODO: incorporate get_next_allowed into mask
    # TODO: for now, only mask last token -- pros: simper mask, faster inference. cons: cant cannot parallelize training (doesn't matter for now)

    IMDecoderLayer.mask = gen_mask(tokens)
    IMDecoderLayer.mask[0][-1] += [tokof(' sin'), tokof('sin')]

    out = model.generate(
        **tokens,
        max_new_tokens=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.000001,
        # do_sample=True,
        return_dict_in_generate=True,
        # output_hidden_states=True
    )

    tok = tokenizer.decode(out.sequences[0][-1])
    prompt += tok
    print(tok, end='')

Here's a Finnish sentence: Onko sinulle tarkoitus, että tämän sinulla on sinu sinua? (Is it possible that this is the same person as you?) The correct answer is:

A) Yes
B) No

C) It depends
