## Load Model

In [1]:
from mamba import Mamba, ModelArgs
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Identifier of the pretrained checkpoint handed to Mamba.from_pretrained.
pretrained_model_name = 'state-spaces/mamba-370m'
model = Mamba.from_pretrained(pretrained_model_name)
# NOTE: the tokenizer is loaded from a different repository id than the model
# weights ('EleutherAI/gpt-neox-20b'); keep the two in sync if the checkpoint changes.
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/gpt-neox-20b')

  return self.fget.__get__(instance, owner)()
Downloading tokenizer_config.json: 100%|██████████| 156/156 [00:00<00:00, 35.7kB/s]
Downloading vocab.json: 100%|██████████| 1.08M/1.08M [00:00<00:00, 1.44MB/s]
Downloading merges.txt: 100%|██████████| 457k/457k [00:00<00:00, 1.17MB/s]
Downloading tokenizer.json: 100%|██████████| 2.11M/2.11M [00:01<00:00, 2.08MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 90.0/90.0 [00:00<00:00, 108kB/s]
Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.


## Generate Text

In [3]:
import torch
import torch.nn.functional as F

In [6]:
def generate(model, 
			 tokenizer, 
			 prompt: str,
			 n_tokens_to_gen: int = 50,
			 sample: bool = True,
			 top_k: int = 40):
	"""Autoregressively extend ``prompt`` and return the decoded text.

	Args:
		model: Callable invoked as ``model(input_ids)``. Its result is indexed
			with ``[:, -1]`` and the outcome is treated as a
			``(batch, vocab_size)`` tensor of next-token logits. ``model.eval()``
			is called before generation starts.
		tokenizer: Called as ``tokenizer(prompt, return_tensors='pt')``; the
			result must expose ``.input_ids``. ``tokenizer.decode`` is used to
			turn the final list of ids back into text.
		prompt: Text to continue.
		n_tokens_to_gen: Number of tokens appended to the prompt.
		sample: If True, draw the next token with ``torch.multinomial``;
			otherwise pick the most probable token (greedy).
		top_k: If not None, restrict the distribution to the ``top_k`` most
			probable tokens (ties with the k-th value are kept) and
			renormalise. Values larger than the vocabulary size are clamped
			to the vocabulary size.

	Returns:
		The decoded first sequence of the batch, prompt included.

	Raises:
		ValueError: If ``top_k`` is given but is smaller than 1.
	"""
	if top_k is not None and top_k < 1:
		raise ValueError(f"top_k must be a positive integer or None, got {top_k}")

	model.eval()
	input_ids = tokenizer(prompt, return_tensors='pt').input_ids

	for _ in range(n_tokens_to_gen):
		with torch.no_grad():
			# The whole sequence is fed back in on every step; only the logits
			# at the last position are needed to choose the next token.
			next_token_logits = model(input_ids)[:, -1]

		probs = F.softmax(next_token_logits, dim=-1)

		if top_k is not None:
			# Clamp k: torch.topk raises if k exceeds the size of the last dim.
			k = min(top_k, probs.shape[-1])
			kth_largest = torch.topk(probs, k=k).values[:, -1, None]
			# Zero out everything strictly below the k-th largest probability,
			# then renormalise so each row sums to 1 again.
			probs = probs.masked_fill(probs < kth_largest, 0)
			probs = probs / probs.sum(dim=1, keepdim=True)

		if sample:
			next_indices = torch.multinomial(probs, num_samples=1)
		else:
			next_indices = torch.argmax(probs, dim=-1)[:, None]

		input_ids = torch.cat([input_ids, next_indices], dim=1)

	# Only the first row is returned, so only that row is decoded.
	return tokenizer.decode(input_ids[0].tolist())

In [7]:
# generate() samples by default (sample=True, top_k=40) and no random seed is
# set in the cells shown, so the completion printed below varies between runs.
print(generate(model, tokenizer, 'Mamba is the'))

2024-05-01 00:22:15.955730: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Mamba is the third of a planned seven-film series.

According to a press note in the magazine, the first film in the universe, which includes the first 10 episodes, "was released in the U.S. July 1, 1988, with foreign


In [8]:
# Second prompt with the same default sampling settings; the output is
# stochastic and will not be reproduced exactly on re-run.
print(generate(model, tokenizer, 'U of T grad students are'))

U of T grad students are working with the community.
<dpm> I think it's a good idea in the long run. I think it's the right place to host it. There are a lot of people willing to host this
<tvansteenburgh
