# Mistral Quantization and Finetuning

In [2]:
!pip install -q accelerate
!pip install -qi https://pypi.org/simple/ bitsandbytes

In [6]:
import torch
from transformers import AutoTokenizer, BitsAndBytesConfig, AutoModelForCausalLM

In [4]:
# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

In [5]:
# Load the tokenizer that matches the checkpoint named in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token and pad on the right.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

## Quantization
Quantization is done to reduce the memory footprint and speed up inference while still retaining acceptable model performance. For this quantization, we will use bitsandbytes.

In [7]:
# --- Quantization settings (consumed when building the BitsAndBytesConfig) ---

# Load the model weights in 4-bit precision.
use_4bit = True

# 4-bit quantization type.
quantization_type = 'nf4'

# Name of the torch dtype used for computation with the 4-bit weights;
# it is resolved to a torch dtype object via getattr(torch, ...).
compute_dtype = "float16"

# Whether to enable nested (double) quantization.
use_nested_quant = False

In [8]:
# Resolve the dtype name (a string such as "float16") to the torch dtype object.
bnb_compute_dtype = getattr(torch, compute_dtype)

# Bundle the 4-bit settings into the config object handed to from_pretrained().
quantization_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_compute_dtype=bnb_compute_dtype,
    bnb_4bit_quant_type=quantization_type,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [None]:
# Load the causal-LM checkpoint, applying the quantization settings defined
# in `quantization_config`.
model = AutoModelForCausalLM.from_pretrained(
    model_name, quantization_config=quantization_config
)

In [None]:
# Tokenize a single instruction-formatted prompt and move the tensors to the
# device the model lives on (instead of hard-coding 'cuda').
# tokenizer(...) replaces the deprecated encode_plus(...) call.
chat_encoding = tokenizer(
    "[INST] Was Vivek Ramaswamy running for president ? [/INST]",
    return_tensors="pt",
).to(model.device)
chat_input = chat_encoding["input_ids"]

# Pass the attention mask explicitly: the tokenizer's pad token is the EOS
# token, so generate() cannot reliably infer the mask from the input ids.
generated_ids = model.generate(
    chat_input,
    attention_mask=chat_encoding["attention_mask"],
    max_new_tokens=1000,
    do_sample=True,
    pad_token_id=tokenizer.eos_token_id,
)
decoded = tokenizer.batch_decode(generated_ids)

# Show the generated text as the cell output.
decoded[0]

## LangChain and Prompt Engineering

In [None]:
!pip install -q langchain

In [None]:
from langchain import LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import pipeline
from langchain.embeddings.huggingface import HuggingFaceEmbeddings

In [None]:
# Wrap the loaded model and tokenizer in a Hugging Face text-generation pipeline.
text_generation_pipeline = pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    # temperature only takes effect when sampling is enabled; without
    # do_sample=True decoding is greedy and the temperature is ignored.
    do_sample=True,
    temperature=0.2,
    repetition_penalty=1.1,
    # Return the prompt together with the generated continuation.
    return_full_text=True,
    max_new_tokens=1000,
)

In [16]:
# Prompt skeleton with two placeholders, {context} and {question}, wrapped in
# the [INST] ... [/INST] instruction markers.
# The "When unsure..." sentence was previously cut off mid-clause; it is
# completed here so the instruction reads as a full sentence.
prompt_template = """
### [INST]
Instruction: You are an expert political analyst with vast knowledge of the United States electoral process. You answer questions with 
certainty and you do not hallucinate. When unsure, you politely reply that you do not have enough information. Using this knowledge, answer the following questions.
Here is a context to help:

{context}

### QUESTION:
{question}

[/INST]
"""

In [None]:
# Adapt the Hugging Face pipeline to LangChain's LLM interface.
llm_pipeline = HuggingFacePipeline(
    pipeline=text_generation_pipeline,
)

In [None]:
# Build the LangChain prompt from the template string; the declared input
# variables match the {context} and {question} placeholders in the template.
# (A comma was missing between the two keyword arguments, which made this
# cell a SyntaxError.)
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)

In [None]:
# Combine the prompt template with the pipeline-backed LLM into a single chain.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm_pipeline,
)