# LLM Inference with llama.cpp and LangChain

## Checking GPU Availability

Before loading the model, we check whether GPU offloading is supported in the current environment.


In [None]:
import pathlib
from llama_cpp.llama_cpp import load_shared_library

In [None]:
def is_gpu_available() -> bool:
    """Report whether the installed llama.cpp build supports GPU offloading.

    The shared library is looked up in the ``lib`` directory next to the
    installed ``llama_cpp`` package instead of a hard-coded site-packages
    path, so the check does not depend on a particular Python version or
    environment prefix.

    Returns:
        True if ``llama_supports_gpu_offload`` reports support for GPU
        offloading, False otherwise.
    """
    import ctypes

    import llama_cpp

    lib_dir = pathlib.Path(llama_cpp.__file__).parent / 'lib'
    lib = load_shared_library('llama', lib_dir)

    # The C function returns a C `bool`. ctypes assumes an `int` return type
    # unless told otherwise, so declare the signature before calling.
    supports_gpu_offload = lib.llama_supports_gpu_offload
    supports_gpu_offload.argtypes = []
    supports_gpu_offload.restype = ctypes.c_bool
    return bool(supports_gpu_offload())

# Run the check; as the last expression of the cell, its result is shown as the cell output.
is_gpu_available()

## Inference with LangChain's LlamaCpp Wrapper

In [None]:
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import CallbackManager, StreamingStdOutCallbackHandler
from langchain_core.prompts import PromptTemplate

In [None]:
# USER SETTINGS -- adjust the values below for your setup

# How many of the model's layers to offload to the GPU (-1 offloads all of them)
n_gpu_layers = -1

# Location of the GGUF model file -- double-check that it is correct for your system!
model_path = "ai-models/DeepSeek-R1-Distill-Qwen-32B-Q4_K_M.gguf"

In [None]:
# Prompt text with a single `{question}` placeholder
template = "\nQuestion: {question}.\n"
prompt = PromptTemplate.from_template(template)

# Callback manager whose only handler streams generated tokens to stdout
callback_manager = CallbackManager(handlers=[StreamingStdOutCallbackHandler()])

In [None]:
# Load the model file through LangChain's llama.cpp wrapper
llm = LlamaCpp(
    model_path=model_path,
    temperature=0.75,
    max_tokens=200,
    top_p=1,
    n_gpu_layers=n_gpu_layers,
    # `callbacks` is the supported replacement for the deprecated
    # `callback_manager` keyword; it accepts the same manager object.
    callbacks=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

In [None]:
# The question text, including the leading and trailing newline
question = "\nWhat is Machine Learning?\n"

# NOTE(review): the raw question is sent to the model as-is; the `prompt`
# template built earlier in the notebook is not applied here -- confirm
# whether that is intended.
llm.invoke(question)