# **Exploring LLMs with Hugging Face and the LangChain Library: A Comprehensive Guide**


**Exploring different LLM models from Hugging Face with LangChain**


**Llama, Mistral, Phi**

**Step 1: Setting Up the Environment**

In [None]:
# Install or upgrade the libraries used in this notebook (-q: quiet output, -U: upgrade).
!pip install -q -U langchain transformers bitsandbytes accelerate

In [None]:
import torch
import os
from langchain import PromptTemplate, HuggingFacePipeline
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
from langchain_core.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain_core.messages import SystemMessage

In [None]:
# Store the Hugging Face access token in the HF_TOKEN environment variable.
# Replace the placeholder with your own token before running the notebook.
os.environ.update({"HF_TOKEN": "your_huggingface_API_key"})

**Step 2: Initializing the Language Model**

In [None]:
# Hugging Face Hub checkpoint to load. To try another model, swap in one of
# the commented-out alternatives below.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"
# MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
# MODEL_NAME = "meta-llama/Meta-Llama-3-8B"
# MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
# MODEL_NAME = "microsoft/phi-1_5"

# Quantization reduces a model's memory and compute requirements by storing
# its weights with fewer bits. Here: 4-bit loading, "nf4" quantization type,
# double quantization enabled, and float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Tokenizer for the chosen checkpoint; it preprocesses text into model input.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Options for loading the pre-trained causal language model, gathered in one
# place so the loading call itself stays short.
model_load_options = {
    "quantization_config": quantization_config,
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)



**Step 3: Configuring Generation Settings**

In [None]:
# Start from the generation settings published with the checkpoint, then
# override the values below.
generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024  # maximum number of new tokens the model may generate
generation_config.do_sample = True  # sample during generation instead of always taking the top token
generation_config.temperature = 0.7  # randomness of the generated text
# Nucleus (top-p) sampling: only the smallest set of most probable tokens whose
# cumulative probability reaches top_p is kept. A value of 0 would keep just the
# single most likely token, which makes do_sample and temperature ineffective,
# so use a value just below 1 to keep the output diverse.
generation_config.top_p = 0.95
# generation_config.repetition_penalty = 1.15 # the degree to which the model should avoid repeating tokens in the generated text

**Step 4: Creating the Pipeline**

In [None]:
# A pipeline is a single callable object that works as an API for the model.
# It combines the tokenizer instance, the model instance, and some
# post-processing settings. Here it is configured to return full-text outputs
# and to generate with the settings held in generation_config.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    generation_config=generation_config,
    return_full_text=True,
)

In [None]:
# Wrap the transformers pipeline in LangChain's HuggingFacePipeline so it can
# be invoked directly and composed with prompts in the cells below.
llm = HuggingFacePipeline(pipeline=pipe)

**Step 5: Testing the Model**

In [None]:
# Plain-text prompt used for a first smoke test of the model.
input_text = "Write me a poem about Machine Learning."

In [None]:
# Send the raw prompt string straight to the LLM and display the result.
output = llm.invoke(input_text)

print(output)

**Step 6: Further Testing with PromptTemplate and Chain**

In [None]:
# Prompt text with a {topic} placeholder that is filled in when the chain runs.
template = """
     Write me a poem about {topic}.
"""

In [None]:
# Value substituted for the {topic} placeholder of the prompt template.
topic = "Machine Learning"

In [None]:
# Turn the template string into a prompt object whose only variable is "topic".
prompt = PromptTemplate(template=template, input_variables=["topic"])

# Build a LangChain chain: the formatted prompt is piped into the LLM.
chain = prompt | llm

# Run the chain with the chosen topic and display the generated text.
output = chain.invoke({"topic": topic})
print(output)

**Step 7: Further Testing with ChatPromptTemplate**

In [None]:
topic = "Machine Learning"

# Fixed system instruction that tells the model what to write.
system_message = SystemMessage(
    content=" Write a poem related to the input topic in one paragraph"
)
# Human turn: the topic, wrapped in triple backticks, is filled in at run time.
human_message = HumanMessagePromptTemplate.from_template("```{topic}```")

# Combine both messages into one chat prompt and pipe it into the LLM.
prompt = ChatPromptTemplate.from_messages([system_message, human_message])
chain = prompt | llm

# Run the chain with the chosen topic and display the generated text.
output = chain.invoke({"topic": topic})
print(output)