In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [2]:
# 4-bit bitsandbytes settings: NF4 quantisation, double quantisation enabled,
# float16 used as the compute dtype.
# NOTE(review): neither from_pretrained call visible in this notebook passes
# this object (e.g. as quantization_config=quant_cfg) -- confirm whether it is
# meant to be used.
quant_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Module names used as device_map keys: the embedding, 32 decoder layers,
# the final norm and the LM head, in that order.
keys = [
    "model.embed_tokens",
    *(f"model.layers.{i}" for i in range(32)),
    "model.norm",
    "lm_head",
]

# The first n_on_gpu modules are assigned to device 0; every remaining module
# is assigned to "cpu". With n_on_gpu = 0 everything is placed on the CPU.
n_on_gpu = 0
values = [0] * n_on_gpu + ["cpu"] * (len(keys) - n_on_gpu)
device_map = dict(zip(keys, values))

In [3]:
# Load the base checkpoint; it supplies the reference weights that the
# fine-tuned model is compared against. Placement follows device_map.
base_model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map=device_map,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Tokenizer for the base checkpoint.
# NOTE(review): no other cell visible in this notebook references `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

In [5]:
# Load the math fine-tuned checkpoint; this is the model that gets prompted
# and whose weights are later edited in place. Placement follows device_map.
math_model = AutoModelForCausalLM.from_pretrained(
    "nvidia/OpenMath-Mistral-7B-v0.1-hf",
    device_map=device_map,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# Tokenizer matching the math fine-tuned checkpoint; passed to run_math_model
# for both encoding the prompt and decoding the generation.
math_tokenizer = AutoTokenizer.from_pretrained("nvidia/OpenMath-Mistral-7B-v0.1-hf")

In [7]:
def math_template(question):
    """Wrap ``question`` in the System/User/Assistant prompt layout.

    The returned string consists of a fixed system instruction, the question
    under a ``User:`` header, and a trailing ``Assistant:`` header for the
    model to continue from.
    """
    system_turn = (
        "System:\n"
        "You're an expert Python programmer and mathematician. "
        "Help the user to solve this problem using code when necessary. "
        "Make sure to put the answer (and only answer) inside \\boxed{}.\n\n"
    )
    # str.format applies the same default formatting to `question` as an
    # f-string placeholder would.
    dialogue = "User:\n{}\n\nAssistant:\n".format(question)
    return system_turn + dialogue

In [8]:
def run_math_model(question, math_model, math_tokenizer):
    """Generate an answer to ``question`` and print the decoded sequence.

    The question is wrapped with ``math_template``, tokenised, moved to
    ``math_model.device`` and passed to ``math_model.generate`` with a cap of
    1024 new tokens, using the tokenizer's EOS id as the padding id. The first
    decoded sequence is printed; nothing is returned.
    """
    encoded = math_tokenizer(math_template(question), return_tensors="pt")
    encoded = encoded.to(math_model.device)

    # Generation only: gradient tracking is disabled for the forward passes.
    with torch.no_grad():
        generated = math_model.generate(
            **encoded,
            max_new_tokens=1024,
            pad_token_id=math_tokenizer.eos_token_id,
        )

    decoded = math_tokenizer.batch_decode(generated)
    print(decoded[0])

In [9]:
def dare(p, sft_params, base_params, clip=True, generator=None):
    """Apply DARE (drop and rescale) to a fine-tuned tensor, in place.

    Each element of the delta ``sft_params - base_params`` is dropped
    (reset to the base value) independently with probability ``p``. The
    surviving deltas are scaled by ``1 / (1 - p)``, i.e. a surviving element
    becomes ``base + (sft - base) / (1 - p)``, so the expected value of every
    element stays equal to its original fine-tuned value.

    Args:
        p: Drop rate, ``0 <= p < 1``.
        sft_params: Fine-tuned tensor; modified in place. When this is a
            parameter that requires grad, call inside ``torch.no_grad()``.
        base_params: Base tensor, same shape as ``sft_params``.
        clip: If true, rescaled values are saturated to ``[0, 255]`` and cast
            to ``uint8`` before being written back. If false, they are
            written back unchanged.
        generator: Optional ``torch.Generator`` used to draw the drop mask,
            for reproducible results. It must live on ``sft_params.device``.

    Raises:
        ValueError: If ``p`` is outside ``[0, 1)``; ``p == 1`` would divide
            by zero when rescaling.
    """
    if not 0.0 <= p < 1.0:
        raise ValueError(f"drop rate p must satisfy 0 <= p < 1, got {p!r}")

    # True where the delta is discarded; this happens with probability p.
    drop = torch.rand(sft_params.shape, device=sft_params.device, generator=generator) < p
    keep = ~drop

    # Algebraically equal to base + (sft - base) / (1 - p). Computed from the
    # untouched kept entries, so the order of the two writes does not matter.
    rescaled = (sft_params[keep] - p * base_params[keep]) / (1 - p)
    if clip:
        # TODO: How to deal with truncation? Saturating at both ends at least
        # avoids an out-of-range float -> uint8 cast for negative values.
        # NOTE(review): this branch presumably targets uint8-stored weights;
        # confirm that element-wise arithmetic on that storage is meaningful.
        rescaled = rescaled.clamp(min=0, max=255).byte()

    sft_params[drop] = base_params[drop]
    sft_params[keep] = rescaled

In [10]:
def get_attention_parameters(layer):
    """Return the weight parameters of ``layer.self_attn``'s projections.

    The list is ordered ``[q_proj, k_proj, v_proj, o_proj]`` and holds the
    registered ``"weight"`` parameter objects themselves, not copies.
    """
    attention = layer.self_attn
    projection_names = ("q_proj", "k_proj", "v_proj", "o_proj")
    return [
        getattr(attention, name)._parameters["weight"]
        for name in projection_names
    ]

def get_mlp_parameters(layer):
    """Return the weight parameters of ``layer.mlp``'s projections.

    The list is ordered ``[gate_proj, up_proj, down_proj]`` and holds the
    registered ``"weight"`` parameter objects themselves, not copies.
    """
    feed_forward = layer.mlp
    projection_names = ("gate_proj", "up_proj", "down_proj")
    return [
        getattr(feed_forward, name)._parameters["weight"]
        for name in projection_names
    ]

In [11]:
# Two candidate prompts. Previously both were assigned to `question` back to
# back, so the first assignment was a dead store; keep both under explicit
# names and select the one to run.
sister_age_question = "If I am 10 and my sister is half my age, what age is my sister when I am 20?"
clips_question = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

# `question` is reused by a later cell, so the selection must stay bound to it.
question = clips_question
run_math_model(question, math_model, math_tokenizer)

<s> System:
You're an expert Python programmer and mathematician. Help the user to solve this problem using code when necessary. Make sure to put the answer (and only answer) inside \boxed{}.

User:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Assistant:
 Let's solve this problem using Python code.
<llm-code>
clips_sold_in_april = 48
clips_sold_in_may = clips_sold_in_april / 2
clips_sold_in_april_and_may = clips_sold_in_april + clips_sold_in_may
clips_sold_in_april_and_may
</llm-code>
<llm-code-output>
72.0
</llm-code-output>
Thus Natalia sold \boxed{72} clips in April and May.</s>


In [12]:
# Run dare() over every attention and MLP projection weight of math_model,
# pairing each with the corresponding weight of base_model layer by layer.
# dare() writes into the math_model tensors in place, so re-running this cell
# applies it again to the already-modified weights.
dare_p = 0.9  # value forwarded as dare()'s `p` argument

with torch.no_grad():
    layer_pairs = zip(math_model.model.layers, base_model.model.layers)
    for math_layer, base_layer in layer_pairs:
        math_weights = get_attention_parameters(math_layer) + get_mlp_parameters(math_layer)
        base_weights = get_attention_parameters(base_layer) + get_mlp_parameters(base_layer)

        for math_params, base_params in zip(math_weights, base_weights):
            dare(dare_p, math_params, base_params, clip=False)


In [13]:
# Re-run the same prompt now that math_model's weights have been modified in
# place, so the output can be compared with the earlier generation.
run_math_model(question, math_model, math_tokenizer)

<s> System:
You're an expert Python programmer and mathematician. Help the user to solve this problem using code when necessary. Make sure to put the answer (and only answer) inside \boxed{}.

User:
Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Assistant:
 Let's solve this problem using Python code.
<llm-code>
clips_sold_in_april = 48
clips_sold_in_may = clips_sold_in_april / 2
clips_sold_in_april_and_may = clips_sold_in_april + clips_sold_in_may
clips_sold_in_april_and_may
</llm-code>
<llm-code-output>
72.0
</llm-code-output>
Th Natalia sold \boxed{72} clips altogether in April and May.</s>
