In [1]:
import sys, os

In [2]:
from exllamav2 import(
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Tokenizer,
    model_init,
)

from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2BaseGenerator,
    ExLlamaV2Sampler
)


In [3]:
# Directory containing the model files to load. The path can be overridden
# without editing the notebook by setting the EXL2_MODEL_DIR environment
# variable; an unset or empty variable falls back to the original location.
model_directory = (
    os.environ.get("EXL2_MODEL_DIR")
    or "/home/cwinkler/models/Mistral-7B-instruct-exl2"
)

In [4]:
# Build the model configuration from the files found in model_directory.
config = ExLlamaV2Config()
config.model_dir = model_directory
config.prepare()

# Instantiate the model object from the prepared config and report which
# directory is being loaded.
model = ExLlamaV2(config)
print(f"Loading model: {model_directory}")

# The cache is created with lazy=True and then handed to load_autosplit(),
# which performs the actual load.
# NOTE(review): presumably lazy=True defers cache allocation until
# load_autosplit() places the weights — confirm against the exllamav2 docs.
cache = ExLlamaV2Cache(model, lazy=True)
model.load_autosplit(cache)

Loading model: /home/cwinkler/models/Mistral-7B-instruct-exl2


In [5]:
# The tokenizer is built from the same config object that was used to
# construct the model.
tokenizer = ExLlamaV2Tokenizer(config)

# Initialize generator: the streaming generator is given the loaded model,
# its cache and the tokenizer, and is what later cells drive token by token.

generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)

In [6]:
# Token budget for the streaming loop further down.
max_new_tokens = 500

# Sampler configuration used by begin_stream().
settings = ExLlamaV2Sampler.Settings()
settings.token_repetition_penalty = 1.05
settings.top_p = 0.8
settings.top_k = 50
settings.temperature = 0.1

# The EOS token is explicitly disallowed, so the sampler cannot pick it.
# Together with the empty stop-condition list set before streaming, this means
# generation is expected to run until max_new_tokens is reached rather than
# stopping at a natural end of the answer.
settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])


In [7]:
# Text handed to the tokenizer as-is; no chat/instruction template is wrapped
# around it in this notebook.
prompt = "Tell me about Alan Turing!"

In [8]:
# Tokenize the prompt; input_ids is what begin_stream() receives later.
input_ids = tokenizer.encode(prompt)
# Size of the last dimension of input_ids.
# NOTE(review): presumably the prompt length in tokens — confirm the shape
# returned by tokenizer.encode().
prompt_tokens = input_ids.shape[-1]

In [9]:
# Run the generator's warmup step before the timed generation cell.
# NOTE(review): presumably this keeps one-time initialization cost out of the
# %%time measurement — confirm against the exllamav2 documentation.
generator.warmup()

In [10]:
# No stop conditions are registered (empty list), so the generator itself is
# not given any token or string on which to end the stream.
generator.set_stop_conditions([])
# Start a new stream from the encoded prompt using the sampler settings
# defined above; tokens are then pulled one stream() call at a time.
generator.begin_stream(input_ids, settings)

In [11]:
%%time
generated_tokens = 0
while True:
    chunk, eos, _ = generator.stream()
    generated_tokens += 1
    print (chunk, end = "")
    sys.stdout.flush()
    if eos or generated_tokens == max_new_tokens: break



Alan Turing was a mathematician and computer scientist, widely considered to be the father of theoretical computer science and artificial intelligence. He is best known for his role in cracking the Nazi's Enigma code during World War II while working with the British government on a top-secret codebreaking program. Turing's work in codebreaking was instrumental in the Allied victory in the war.

Turing is also known for his contributions to the development of the first computer designs, and for his pioneering work in the field of artificial intelligence. In 1950, he proposed the Turing Test, an experiment to measure a machine's ability to exhibit intelligent behavior equivalent to or indistinguishable from a human.

Sadly, Turing's personal life was marked by tragedy and persecution. In 1952, he was prosecuted for homosexuality, which was then considered a crime in the UK. He accepted chemical castration as an alternative to prison, and two years later he died from cyanide poisoning 

In [12]:
# Shell escape: print the current GPU status (memory in use, utilization)
# after the generation cell has run.
!nvidia-smi

Wed May 29 15:43:49 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 545.23.08              Driver Version: 545.23.08    CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        On  | 00000000:26:00.0 Off |                  Off |
|  0%   47C    P2              65W / 450W |   5524MiB / 24564MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    