In [1]:
import torch
# Sanity check: report whether PyTorch can see a CUDA device before loading the model.
torch.cuda.is_available()

True

In [3]:
# Mistral-7B needs the latest transformers build. Run the installs below in a terminal, not in the notebook.
#!pip install -q -U git+https://github.com/huggingface/transformers.git 
#!pip install -q trl xformers

In [2]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain.chains import ConversationChain
import transformers
import torch
import warnings

warnings.filterwarnings("ignore")

from dotenv import load_dotenv
import torch
import os

# Load variables from a local .env file into the process environment.
load_dotenv()

# Hugging Face Hub token read from the environment (None when unset).
# It is not referenced again in the visible cells; kept so later code can use it.
HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

model_name = "mistralai/Mistral-7B-v0.1"

# `device_map` only applies to model weights, so it is not passed to the tokenizer.
# `trust_remote_code` is left at its default (False): the checkpoint loads as the
# built-in MistralForCausalLM class, so there is no reason to allow repository
# code to execute on this machine.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # let accelerate place the weights on the available devices
    torch_dtype=torch.bfloat16,  # half-size weights compared with float32
    # load_in_8bit=True,
)

# Switch to evaluation mode (disables dropout and similar training-only behaviour).
# Being the last expression of the cell, this also displays the model structure.
model.eval()

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)model.bin.index.json: 100%|██████████| 23.9k/23.9k [00:00<00:00, 7.99MB/s]
Downloading (…)l-00001-of-00002.bin: 100%|██████████| 9.94G/9.94G [14:34<00:00, 11.4MB/s]
Downloading (…)l-00002-of-00002.bin: 100%|██████████| 5.06G/5.06G [07:25<00:00, 11.4MB/s]
Downloading shards: 100%|██████████| 2/2 [22:01<00:00, 660.73s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:32<00:00, 16.21s/it]
Downloading (…)neration_config.json: 100%|██████████| 116/116 [00:00<00:00, 58.0kB/s]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )
    (norm): MistralRM

In [27]:
# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Without a stopping criterion the model tends to wander off on a tangent after
# answering the initial question.
# Earlier candidate list, kept for reference: ['\n\nQuestion:', '\n```\n']
stop_list = ['[/INST]']

# Tokenize every stop string (no BOS/EOS added) and move the ids to `device`.
stop_token_ids = [
    torch.LongTensor(tokenizer(stop_text, add_special_tokens=False)['input_ids']).to(device)
    for stop_text in stop_list
]

from transformers import StoppingCriteria, StoppingCriteriaList

# define custom stopping criteria object
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires once a sentinel token sequence shows up.

    Args:
        sentinel_token_ids: Token ids of the stop marker. The last dimension is
            the marker length; leading dimensions (for example the batch
            dimension produced by ``return_tensors="pt"``) are broadcast.
        starting_idx: Position in each sequence from which the search starts.
            Tokens before this index are ignored. NOTE(review): with
            ``starting_idx=0`` the whole sequence is searched, which presumably
            includes the prompt tokens - confirm this is intended.
    """

    def __init__(self, sentinel_token_ids: torch.LongTensor,
                 starting_idx: int):
        super().__init__()
        self.sentinel_token_ids = sentinel_token_ids
        self.starting_idx = starting_idx

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when any sequence in ``input_ids`` contains the sentinel.

        Args:
            input_ids: Batch of token id sequences; iterated row by row.
            scores: Unused; accepted to match the stopping-criteria call signature.

        Returns:
            True if the sentinel occurs at or after ``starting_idx`` in at least
            one sequence, otherwise False.
        """
        sentinel_len = self.sentinel_token_ids.shape[-1]
        # Compare on the device of the generated ids so a sentinel created on a
        # different device does not raise a device-mismatch error.
        # The extra axis lets the sentinel broadcast against every window at once.
        sentinel = self.sentinel_token_ids.to(input_ids.device).unsqueeze(-2)

        for sample in input_ids:
            trimmed_sample = sample[self.starting_idx:]
            # Too short to contain the sentinel yet (unfold would fail). Skip.
            if trimmed_sample.shape[-1] < sentinel_len:
                continue

            # All sliding windows of sentinel length: (num_windows, sentinel_len).
            windows = trimmed_sample.unfold(0, sentinel_len, 1)
            # A window matches when every position equals the sentinel (across
            # every leading sentinel dimension, mirroring a full torch.all).
            per_window = torch.eq(sentinel, windows).all(dim=-1)
            per_window = per_window.reshape(-1, windows.shape[0]).all(dim=0)
            if bool(per_window.any()):
                return True
        return False
        
        
# Token ids of the stop marker, tokenized without BOS/EOS. They are placed on the
# `device` selected at the top of this cell rather than a hard-coded "cuda", so the
# CPU fallback keeps working on machines without a GPU.
sentinel_token_ids = tokenizer("\n[/INST]", add_special_tokens=False, return_tensors="pt").input_ids.to(device)

# starting_idx=0 makes the criterion search each sequence from its first token.
stopping_criteria = StoppingCriteriaList([StopOnTokens(sentinel_token_ids=sentinel_token_ids, starting_idx=0)])
sentinel_token_ids

tensor([[28705,    13, 28792, 28748, 16289, 28793]], device='cuda:0')

In [40]:
from transformers import LogitsProcessorList, LogitsProcessor
class EosTokenRewardLogitsProcessor(LogitsProcessor):
    """Logits processor that pushes generation towards EOS near ``max_length``.

    Scores are left untouched until the sequence reaches 80% of ``max_length``.
    From then on the EOS score is set to ``100 * ratio`` and every other score is
    rescaled by ``ratio * 10 * exp(-sign(score))``, where
    ``ratio = current_length / max_length``.

    Args:
        eos_token_id: Vocabulary index of the end-of-sequence token (>= 0).
        max_length: Sequence length the reward is scaled against (>= 1).

    Raises:
        ValueError: If either argument is not an int or is out of range.
    """

    # Fraction of ``max_length`` after which the EOS reward starts to apply.
    _START_FRACTION = 0.8

    def __init__(self, eos_token_id: int, max_length: int):
        if not isinstance(eos_token_id, int) or eos_token_id < 0:
            raise ValueError(f"`eos_token_id` has to be a non-negative integer, but is {eos_token_id}")

        if not isinstance(max_length, int) or max_length < 1:
            raise ValueError(f"`max_length` has to be an integer of at least 1, but is {max_length}")

        self.eos_token_id = eos_token_id
        self.max_length = max_length

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        """Rescale ``scores`` in place once the sequence is long enough.

        Args:
            input_ids: Token ids so far; only the size of the last dimension
                (the current length) is used.
            scores: Scores indexed as ``[row, vocab_index]``; modified in place.

        Returns:
            The same ``scores`` tensor.
        """
        cur_len = input_ids.shape[-1]
        start_len = int(self.max_length * self._START_FRACTION)

        # The previous version looped over the tuple (start_len, max_length), which
        # shadowed `cur_len` and applied the reward twice on every step regardless
        # of the real length. The reward must depend on the actual length.
        if cur_len < start_len:
            return scores

        ratio = cur_len / self.max_length
        num_tokens = scores.shape[1]  # size of vocab

        # Boolean column mask selecting every token except EOS, built once per
        # call instead of three Python lists of vocabulary size.
        not_eos = torch.ones(num_tokens, dtype=torch.bool, device=scores.device)
        not_eos[self.eos_token_id] = False

        other_scores = scores[:, not_eos]
        scores[:, not_eos] = other_scores * ratio * 10 * torch.exp(-torch.sign(other_scores))
        scores[:, self.eos_token_id] = 1e2 * ratio
        return scores

In [49]:
# Single source of truth for the generation budget: the pipeline's `max_length`
# and the EOS-reward processor must agree, otherwise the reward ramps up at the
# wrong point of the sequence.
MAX_LENGTH = 500

pipeline = transformers.pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        use_cache=True,
        device_map="auto",
        max_length=MAX_LENGTH,
        do_sample=True,
        top_k=5,  # sample only among the 5 most likely tokens
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        # Reuse EOS as the padding id.
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=stopping_criteria,
        # Wrap in LogitsProcessorList, the container type the generation API documents.
        logits_processor=LogitsProcessorList(
            [EosTokenRewardLogitsProcessor(eos_token_id=tokenizer.eos_token_id, max_length=MAX_LENGTH)]
        ),
)

# Expose the transformers pipeline through LangChain's LLM interface.
llm = HuggingFacePipeline(pipeline=pipeline)
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text_generation.TextGenerationPipeline object at 0x000001E3CD8495B0>)

In [50]:
from langchain import PromptTemplate, LLMChain

#### Prompt
# The prompt ends right after [/INST] so the model writes the answer next. The
# previous template appended "</s>" (end-of-sequence) there, which tells the
# model the sequence is already finished before it has answered.
# NOTE(review): the tokenizer may already prepend "<s>" on its own, in which case
# the literal "<s>" below yields a duplicate BOS token - confirm and drop if so.
template = """<s>[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words.
{question} [/INST]"""

question_p = """What are you?"""
# Not used by the template above (it has no {context} placeholder); kept so a
# context-aware template can pick it up later.
context_p = """ On August 10 said that its arm JSW Neo Energy has agreed to buy a portfolio of 1753 mega watt renewable energy generation capacity from Mytrah Energy India Pvt Ltd for Rs 10,530 crore."""

# Declare only the variables the template actually contains; the former extra
# "context" entry never reached the prompt text.
prompt = PromptTemplate(template=template, input_variables=["question"])
llm_chain = LLMChain(prompt=prompt, llm=llm)
response = llm_chain.run({"question": question_p})
print(response)


[INST] You are a helpful, respectful and honest assistant. Answer exactly in few words.
What are you? [/INST] 


In [51]:
# Call the LangChain LLM wrapper directly with a raw string (no prompt template)
# and print whatever text it returns.
question = "What is a LLM?."
answer = llm(question)
print(answer)



A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Juris Doctor (JD).

## What is a LLM?.

A Master of Laws (LLM) is a postgraduate degree that is earned after completing a Bachelor of Laws (LLB) or Ju

In [None]:
# Stand-alone draft of the streaming generation that the `stream` helper wraps.
# Imported here because this cell uses TextStreamer before the cell that
# originally imported it.
from transformers import TextStreamer

# Example input so the cell runs on a fresh kernel (the name was undefined before).
user_prompt = "What is a LLM?"

runtimeFlag = "cuda:0"
system_prompt = 'The conversation between Human and AI assisatance named Gathnex\n'
B_INST, E_INST = "[INST]", "[/INST]"

# System line followed by the user text wrapped in [INST] ... [/INST].
# Note: this rebinds `prompt` from a template object to a plain string.
prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n{E_INST}"

inputs = tokenizer([prompt], return_tensors="pt").to(runtimeFlag)

# Print tokens as they are generated, without echoing the prompt or special tokens.
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

_ = model.generate(**inputs, streamer=streamer, max_new_tokens=20)

In [52]:
from transformers import  TextStreamer

# Generation settings forwarded to model.generate() by stream().
TEMPERATURE = 0.2  # passed as `temperature`
REP_PENALTY = 1.2  # passed as `repetition_penalty`
NO_REPEAT_NGRAM_SIZE = 10  # passed as `no_repeat_ngram_size`
NUM_RETURN_SEQUENCES = 1  # passed as `num_return_sequences`

def stream(user_prompt, max_new_tokens=200):
    """Generate a reply to ``user_prompt`` and print it token by token.

    The user text is stripped, wrapped in ``[INST] ... [/INST]`` after a fixed
    system line, tokenized, and passed to ``model.generate`` with a
    ``TextStreamer`` that writes the generated text to stdout.

    Args:
        user_prompt: The user's question or instruction.
        max_new_tokens: Upper bound on the number of generated tokens
            (defaults to 200).

    Returns:
        None. The generated text is only streamed, not returned.
    """
    system_prompt = 'The conversation between Human and AI assisatance named Gathnex\n'
    B_INST, E_INST = "[INST]", "[/INST]"

    prompt = f"{system_prompt}{B_INST}{user_prompt.strip()}\n{E_INST}"

    # Put the inputs on the model's own device instead of a hard-coded "cuda:0".
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Print tokens as they arrive, skipping the prompt and special tokens.
    streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    model.generate(
        **inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        # `temperature` only takes effect when sampling is enabled; without
        # do_sample=True generation is greedy and TEMPERATURE is ignored.
        do_sample=True,
        temperature=TEMPERATURE,
        repetition_penalty=REP_PENALTY,
        no_repeat_ngram_size=NO_REPEAT_NGRAM_SIZE,
        num_return_sequences=NUM_RETURN_SEQUENCES,
        # Same value generate() falls back to, passed explicitly to avoid the
        # "Setting `pad_token_id` to `eos_token_id`" notice on every call.
        pad_token_id=tokenizer.eos_token_id,
    )
    

# Example call; the prompt is Portuguese for "What is an LLM?".
stream("O que é um LLM?")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



GPT-3 is a large language model developed by OpenAI. It has been trained on a massive amount of text data, including books, articles, websites, and social media posts. This allows it to generate human-like responses to prompts in various languages.
LLMs are used for many different tasks such as translation, summarization, question answering, and more. They can be integrated into chatbots or other applications that require natural language processing capabilities.
The potential benefits of using an LLM include improved customer service experiences due to better understanding of user intent; faster response times because the machine does not need time offline like humans do when they get tired from working too hard; increased accuracy since machines don't make mistakes like people sometimes do; cost savings since there isn't any additional staff needed after implementation etc..
