# Running Locally 

In [None]:
# working with ollama running locally
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain.schema.output_parser import StrOutputParser

# model tag handed to ChatOllama
model_name = "deepseek-r1:8b"

# question to ask the model
question = "What is the capital of France?"

# chat template: a fixed system message plus a human turn filled from {question}
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        ("human", "{question}"),
    ]
)

llm = ChatOllama(model=model_name)

# chain: fill the template -> call the chat model -> reduce the reply to a str
chain = prompt | llm | StrOutputParser()

# The chain's first step is the prompt template, so it must be given the raw
# template variables. Formatting the messages by hand and passing them in
# would push them through the template a second time, and the model would be
# asked about the message list instead of the question.
result = chain.invoke({"question": question})
print(result)

<think>
Okay, let's see what this query is asking for. The user wants to know the capital of France.

First, I need to recall that Paris is indeed the capital city of France. It's a well-known fact and should be straightforward unless there's some trick here. But wait, maybe the user expects something more than just the name? They might want additional information about why it's significant or any historical context around its status as the capital.

The system message identifies me as a helpful assistant, so my response should be accurate and concise. The human message is pretty direct: "What is the capital of France?" No hidden layers detected here. Just need to answer correctly with Paris. 

But I should consider if the user might have deeper needs. Are they testing for basic knowledge? Maybe preparing for a test or trivia game? Or perhaps they're looking for confirmation about something they already suspect but want to be sure? Since there's no indication of complexity beyond the o

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline
import torch
import os

# Hugging Face repo id of the DeepSeek chat model to load
model_name = "deepseek-ai/deepseek-llm-7b-chat"

# Folder for offloaded weights (if they don't fit in memory)
offload_dir = "./model_offload"
os.makedirs(offload_dir, exist_ok=True)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with auto device placement + disk offload.
# bfloat16 is used only when CUDA is available; otherwise fall back to float32.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",               # use Accelerate to split across devices
    offload_folder=offload_dir,      # store offloaded weights here
    # NOTE(review): this flag permits running Python code shipped in the model
    # repository; it is unrelated to chat templates. Confirm this checkpoint
    # actually needs it and drop it otherwise.
    trust_remote_code=True,
)

# Load generation defaults and reuse the EOS token id as the pad token id
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Create generation pipeline (no device arg needed with device_map="auto")
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# Example chat
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]

# Render the chat into a single prompt string, ending with the assistant cue
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Generate. return_full_text=False makes the pipeline return only the newly
# generated text. Removing the prompt afterwards with str.replace() is
# fragile: the decoded output does not always reproduce the prompt string
# exactly (e.g. special tokens), and then the whole prompt gets printed.
result = generator(
    prompt,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    return_full_text=False,
)

# Output only the assistant's reply
reply = result[0]["generated_text"].strip()
print(reply)


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00, 99.24it/s]
Device set to use cpu


<｜begin▁of▁sentence｜>You are a helpful assistant.

User: What is the capital of France?

Assistant:
The capital of France is Paris.


None
