In [1]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Little hack to ensure that we can import Coconut correctly

In [2]:
import os
import sys

sys.path.append(f"{os.getcwd()}/..")

Imports

In [3]:
from coconut import Coconut
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

Constants

In [4]:
MODEL_ARCHITECTURE = "openai-community/gpt2"
COCONUT_CHECKPOINT = "../checkpoints/gsm-coconut/checkpoint_25"
THOUGHT_CHECKPOINT = "../checkpoints/gsm-cot/checkpoint_15"

GPU_DEVICE = 0

In [5]:
def build_coconut_model(
    architecture: str,
    checkpoint: str,
    latent_token: str = "<|latent|>",
    start_token: str = "<|start-latent|>",
    end_token: str = "<|end-latent|>",
    gpu_device: int = 0,
) -> dict:
    
    model = AutoModelForCausalLM.from_pretrained(architecture)
    tokenizer = AutoTokenizer.from_pretrained(architecture)
    tokenizer.pad_token = tokenizer.eos_token

    coconut_tokens = [latent_token, start_token, end_token]
    tokenizer.add_tokens(coconut_tokens)
    token_ids = tokenizer.convert_tokens_to_ids(coconut_tokens)

    model.resize_token_embeddings(len(tokenizer))
    embeddings = model.get_input_embeddings()
    target_id = tokenizer.convert_tokens_to_ids("<<")

    for token_id in token_ids:
        target_embedding = embeddings.weight.data[token_id]
        embeddings.weight.data[token_id] = target_embedding

    model = Coconut(model, *token_ids, tokenizer.eos_token_id)

    load_kwargs = {
        "map_location": torch.device(GPU_DEVICE),
        "weights_only": False,
    }
    weights = torch.load(checkpoint, **load_kwargs)
    model.load_state_dict(weights, strict=False)

    return {"model": model.to(gpu_device), "tokenizer": tokenizer}



def build_thought_model(
    architecture: str,
    checkpoint: str,
    gpu_device: int = 0,
) -> dict:

    model = AutoModelForCausalLM.from_pretrained(architecture)
    tokenizer = AutoTokenizer.from_pretrained(architecture)

    load_kwargs = {
        "map_location": torch.device(GPU_DEVICE),
        "weights_only": False,
    }
    weights = torch.load(checkpoint, **load_kwargs)
    model.load_state_dict(weights, strict=False)

    return {"model": model.to(gpu_device), "tokenizer": tokenizer}

In [6]:
coconut = build_coconut_model(MODEL_ARCHITECTURE, COCONUT_CHECKPOINT, gpu_device=GPU_DEVICE)
thought = build_thought_model(MODEL_ARCHITECTURE, THOUGHT_CHECKPOINT, gpu_device=GPU_DEVICE)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [15]:
question = "I have 7 apples and I gave Josh 3 apples and he gives Paul 2 apples. How many apples do I have?"
# question = "What would I have to do to earn one million dollars?"

for approach_name, approach in {"Coconut": coconut, "CoT": thought}.items():
    tokens = approach["tokenizer"](question, return_tensors="pt").to(0)
    outputs = approach["model"].generate(
        **tokens,
        max_new_tokens=40,
        num_beams=15,
        num_return_sequences=15,
    )
    print(f"\n{approach_name}")
    for output_id in range(len(outputs)):
        print(f"\nOutput {output_id + 1}:")
        output = approach["tokenizer"].decode(outputs[output_id])
        print(f"{output}")
    print("")

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Coconut

Output 1:
I have 7 apples and I gave Josh 3 apples and he gives Paul 2 apples. How many apples do I have?### 4


CoT

Output 1:
I have 7 apples and I gave Josh 3 apples and he gives Paul 2 apples. How many apples do I have?
<<3+2=5>>
<<7-5=2>>
### 2<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

Output 2:
I have 7 apples and I gave Josh 3 apples and he gives Paul 2 apples. How many apples do I have?
<<3+2=5>>
<<7+5=12>>
### 12<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

Output 3:
I have 7 apples and I gave Josh 3 apples and he gives Paul 2 apples. How many apples do I have?
<<7-3-2=2>>
### 2<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>

Output 4:
I have 7 apples and I gave Josh 3 apples

In [8]:
def coconut_next_token_distribution(
    model: Coconut,
    input_ids: torch.Tensor,
    **kwargs,
) -> torch.Tensor:
    gen_fwd_cnt = 0

    assert input_ids.shape[0] == 1, "only support batch_size == 1 now"

    outputs = model.forward(
        input_ids,
        torch.ones_like(input_ids, device=input_ids.device),
        input_ids.clone(),
        torch.arange(
            0, input_ids.shape[1], dtype=torch.long, device=input_ids.device
        ).reshape(1, -1),
        **kwargs,
    )

    return torch.nn.functional.softmax(outputs.logits[:, -1], dim=1)


question = "I have 7 apples and I gave Luke 3 apples and he gives Paul 2 apples. How many apples do I have?"

for approach in [coconut]:
    tokens = approach["tokenizer"](question, return_tensors="pt").to(0)
    logits = coconut_next_token_distribution(approach["model"], tokens["input_ids"])

    print(logits)
    print(torch.sum(logits))
    print(torch.max(logits))

tensor([[6.9018e-08, 3.2274e-06, 3.5362e-08,  ..., 1.1694e-03, 1.8761e-03,
         2.6242e-07]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor(1.0000, device='cuda:0', grad_fn=<SumBackward0>)
tensor(0.5681, device='cuda:0', grad_fn=<MaxBackward1>)


In [20]:
import pandas as pd

COT_DATA = "cot_pos_neg_filtered"

df = pd.read_csv(f"{os.getcwd()}/../data/{COT_DATA}.csv")

def add_chevrons(row):
    if row:
        try:
            steps = row.split(',')  # Split the string by commas
            enclosed_steps = [f"<<{step}>>" for step in steps]  # Add chevrons to each step
            return ''.join(enclosed_steps)  # Join the steps back into a single string
        except:
            print(row)


df['pred_steps'] = df['pred_steps'].apply(add_chevrons)

df


nan
nan
nan
nan
nan


Unnamed: 0,q_idx,qa_idx,question,true_anwser,pred_steps,pred_anwser,correct_anwser,raw_output
0,0,0_0,Janet’s ducks lay 16 eggs per day. She eats th...,18,<<3+4=7>><<16-7=9>><<9*2=18>>,18.0,True,Janet’s ducks lay 16 eggs per day. She eats th...
1,0,0_1,Janet’s ducks lay 16 eggs per day. She eats th...,18,<<3+4=7>><<16-7=9>><<9*7=63>><<63*2=126>>,126.0,False,Janet’s ducks lay 16 eggs per day. She eats th...
2,0,0_2,Janet’s ducks lay 16 eggs per day. She eats th...,18,<<3+4=7>><<16-7=9>><<16*9=144>><<144*2=288>>,288.0,False,Janet’s ducks lay 16 eggs per day. She eats th...
3,0,0_3,Janet’s ducks lay 16 eggs per day. She eats th...,18,<<16-3=13>><<3+4=7>><<13-7=6>><<6*2=12>>,12.0,False,Janet’s ducks lay 16 eggs per day. She eats th...
4,0,0_4,Janet’s ducks lay 16 eggs per day. She eats th...,18,<<3+4=7>><<16-7=9>><<9*4=36>><<36*2=72>>,72.0,False,Janet’s ducks lay 16 eggs per day. She eats th...
...,...,...,...,...,...,...,...,...
5284,1318,1318_0,Henry and 3 of his friends order 7 pizzas for ...,14,<<7*8=56>><<56/4=14>>,14.0,True,Henry and 3 of his friends order 7 pizzas for ...
5285,1318,1318_1,Henry and 3 of his friends order 7 pizzas for ...,14,<<7*8=56>><<1+3=4>><<56/4=14>>,14.0,True,Henry and 3 of his friends order 7 pizzas for ...
5286,1318,1318_2,Henry and 3 of his friends order 7 pizzas for ...,14,<<1+3=4>><<7*8=56>><<56/4=14>>,14.0,True,Henry and 3 of his friends order 7 pizzas for ...
5287,1318,1318_3,Henry and 3 of his friends order 7 pizzas for ...,14,<<1+3=4>><<56/4=14>>,14.0,True,Henry and 3 of his friends order 7 pizzas for ...
