# Running Locally 

In [None]:
# working with ollama running locally
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain.schema.output_parser import StrOutputParser

# model name to use
model_name = "deepseek-r1:8b"

# question to ask the model
question = "What is the capital of France?"

# chat prompt template: a fixed system message plus a human turn filled from {question}
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        ("human", "{question}"),
    ]
)

llm = ChatOllama(model=model_name)

# creating a chain to invoke the model with the prompt
chain = prompt | llm | StrOutputParser()

# The first step of the chain is the prompt template itself, so the chain must be
# given the raw template variables. Passing already-formatted messages here would
# send them through the template a second time, and the model would receive the
# string form of the message list as its "question".
result = chain.invoke({"question": question})
print(result)

<think>
Okay, let's see what this query is asking for. The user wants to know the capital of France.

First, I need to recall that Paris is indeed the capital city of France. It's a well-known fact and should be straightforward unless there's some trick here. But wait, maybe the user expects something more than just the name? They might want additional information about why it's significant or any historical context around its status as the capital.

The system message identifies me as a helpful assistant, so my response should be accurate and concise. The human message is pretty direct: "What is the capital of France?" No hidden layers detected here. Just need to answer correctly with Paris. 

But I should consider if the user might have deeper needs. Are they testing for basic knowledge? Maybe preparing for a test or trivia game? Or perhaps they're looking for confirmation about something they already suspect but want to be sure? Since there's no indication of complexity beyond the o

In [None]:
from transformers import AutoTokenizer, pipeline
import torch, os

# checkpoint identifier and the folder handed to the loader as `offload_folder`
model = "deepseek-ai/deepseek-llm-7b-chat"
offload_dir = "./model_offload"
os.makedirs(offload_dir, exist_ok=True)

# bfloat16 when CUDA is available, float32 otherwise
if torch.cuda.is_available():
    dtype = torch.bfloat16
else:
    dtype = torch.float32

tokenizer = AutoTokenizer.from_pretrained(model)

# keyword arguments for building the text-generation pipeline
pipeline_options = {
    "model": model,
    "torch_dtype": dtype,
    "device_map": "auto",
    "trust_remote_code": True,
    "model_kwargs": {"offload_folder": offload_dir},
}
generator = pipeline("text-generation", **pipeline_options)

# conversation to send: one system turn followed by one user turn
system_turn = {"role": "system", "content": "You are a helpful assistant."}
user_turn = {"role": "user", "content": "What is the capital of France?"}
messages = [system_turn, user_turn]

# render the conversation to a single prompt string using the tokenizer's chat template
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# sampling settings for this call; return_full_text=False requests only the new text
generation_options = {
    "max_new_tokens": 200,
    "do_sample": True,
    "temperature": 0.7,
    "return_full_text": False,
}
out = generator(prompt, **generation_options)

answer = out[0]["generated_text"]
print(answer.strip())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Prompt:
 <｜begin▁of▁sentence｜>You are a helpful assistant.

User: What is the capital of France?

Assistant:
The capital of France is Paris.


In [None]:
# wrapping the code above in langchain
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import ChatPromptTemplate
import torch

model = "deepseek-ai/deepseek-llm-7b-chat"

pipe = pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
    trust_remote_code=True,
    model_kwargs={"offload_folder": "./model_offload"},
    # Return only the newly generated text; without this the response handed
    # back through LangChain starts with a copy of the prompt.
    return_full_text=False,
)

llm = HuggingFacePipeline(pipeline=pipe)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant."),
    ("human", "{question}"),
])

# LangChain message type -> role name used in chat-template message dicts
_ROLE_BY_MESSAGE_TYPE = {"system": "system", "human": "user", "ai": "assistant"}


def to_chat_text(prompt_value):
    """Render a LangChain prompt value with the tokenizer's chat template.

    HuggingFacePipeline is a plain text-in/text-out LLM, so a chat prompt piped
    straight into it is flattened to a generic "System: ... / Human: ..."
    transcript. Converting the messages to role/content dicts and applying the
    tokenizer's chat template gives the model its own prompt format.

    Args:
        prompt_value: the value produced by ``prompt`` (must offer ``to_messages()``).

    Returns:
        The prompt as a single string, ending with the generation prompt.

    Raises:
        KeyError: if a message type other than system/human/ai is encountered.
    """
    chat_messages = [
        {"role": _ROLE_BY_MESSAGE_TYPE[message.type], "content": message.content}
        for message in prompt_value.to_messages()
    ]
    return pipe.tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False,
        add_generation_prompt=True,
    )


# A plain function placed in the chain is wrapped into a runnable step by LangChain.
chain = prompt | to_chat_text | llm
print("Response:", chain.invoke({"question": "What is the capital of France?"}).strip())

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu


Response: System: You are a helpful assistant.
Human: What is the capital of France?
Assistant:The capital of France is Paris.
