In [1]:
!pip install -q transformers datasets accelerate bitsandbytes google-cloud-storage gguf langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m46.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import drive
from google.cloud import storage
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Set environment variable for credentials
credentials_path = "/content/drive/MyDrive/gcp-key.json"  # Path to your key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# 3. Initialize GCS client
storage_client = storage.Client()
bucket_name = "llm-test-bucket-2025"  # Replace with your bucket name
bucket = storage_client.bucket(bucket_name)

Mounted at /content/drive


In [3]:
# 4. Save tokenizer and model to a local directory
model_dir = "model_weights_4bit"  # A local directory to save the model and tokenizer
os.makedirs(model_dir, exist_ok=True)

In [4]:
import os

# Define local directory where model files will be downloaded
def download_from_gcs(gcs_prefix, local_dir):
    """Download all files from GCS folder to local directory."""
    blobs = bucket.list_blobs(prefix=gcs_prefix)
    for blob in blobs:
        local_filepath = os.path.join(local_dir, os.path.basename(blob.name))
        print(f"Downloading {blob.name} to {local_filepath}...")
        blob.download_to_filename(local_filepath)

# Download model weights from GCS
download_from_gcs("model_weights", model_dir)

print("Model files downloaded successfully!")

Downloading model_weights/config.json to model_weights_4bit/config.json...
Downloading model_weights/generation_config.json to model_weights_4bit/generation_config.json...
Downloading model_weights/model-00001-of-00002.safetensors to model_weights_4bit/model-00001-of-00002.safetensors...
Downloading model_weights/model-00002-of-00002.safetensors to model_weights_4bit/model-00002-of-00002.safetensors...
Downloading model_weights/model.safetensors.index.json to model_weights_4bit/model.safetensors.index.json...
Downloading model_weights/special_tokens_map.json to model_weights_4bit/special_tokens_map.json...
Downloading model_weights/tokenizer.json to model_weights_4bit/tokenizer.json...
Downloading model_weights/tokenizer_config.json to model_weights_4bit/tokenizer_config.json...
Model files downloaded successfully!


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define paths
model_path = model_dir  # Path to downloaded model files

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model (ensure it uses 8-bit quantization)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Automatically map to available GPU
)

print("Model loaded successfully!")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model loaded successfully!


In [6]:
from transformers import pipeline
import torch
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
import markdown

def generate_response(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5000)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove everything before and including '</think>'
    if "</think>" in response:
        response = response.split("</think>")[-1].strip()

    return response

# Create the text-generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=5000,
    temperature=0.9,
    do_sample=True
)

# Create a HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Define a simple prompt template
template = """Question: {question}

Answer:"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Create the LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run the chain and clean output
question = '''
          Given a 2d matrix cost[][] of size n where cost[i][j] denotes the cost of moving from city i to city j.
          The task is to complete a tour from city 0 (0-based index) to all other cities such that we visit each city
          exactly once and then at the end come back to city 0 at minimum cost. Note the difference between Hamiltonian Cycle and TSP. T
          he Hamiltonian cycle problem is to find if there exists a tour that visits every city exactly once.
          Here we know that Hamiltonian Tour exists (because the graph is complete) and in fact, many such tours exist,
          the problem is to find a minimum weight Hamiltonian Cycle.
          Example:
          Input: cost[][] = [[0, 111], [112, 0]]
          Output: 223
          Explanation: We can visit 0->1->0 and cost = 111 + 112 = 223.
          Considering above: what is the minimum cost where cost[][] = [[0, 1000, 5000], [5000, 0, 1000], [1000, 5000, 0]], hint: the cost should be less thant 5000?
          '''
response = llm_chain.run(question).strip()

# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

print(response)

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  response = llm_chain.run(question).strip()


Given a cost matrix \( \text{cost[][]} \) of size \( n \) where \( \text{cost}[i][j] \) denotes the cost of moving from city \( i \) to city \( j \), the task is to find a Hamiltonian cycle that visits each city exactly once and returns to the starting city (city 0) at minimum cost.

The cost matrix provided is:
\[
\begin{bmatrix}
0 & 1000 & 5000 \\
5000 & 0 & 1000 \\
1000 & 5000 & 0
\end{bmatrix}
\]

The goal is to find the Hamiltonian cycle that starts and ends at city 0, visiting all other cities exactly once, with the minimum total cost.

Key steps:
1. Identify the possible Hamiltonian cycles.
2. Calculate the cost for each cycle.
3. Select the cycle with the minimum cost.

Possible cycles:
- \( 0 \rightarrow 1 \rightarrow 2 \rightarrow 0 \)
- \( 0 \rightarrow 2 \rightarrow 1 \rightarrow 0 \)

Calculating the cost for each cycle:
- Cycle \( 0 \rightarrow 1 \rightarrow 2 \rightarrow 0 \):
  - \( 0 \rightarrow 1 \) cost: 1000
  - \( 1 \rightarrow 2 \) cost: 1000
  - \( 2 \rightarrow 

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"

prompt = "Can you provide me with a 5 day trip plan for a trip to France?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids,max_new_tokens=5000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [16]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Okay, so I'm planning a 5-day trip to France, and I need to figure out a good itinerary. I've never been before, so I need to make sure I see the main attractions without feeling rushed. Let me think about what I know about France.

First, I know Paris is the capital, so that should definitely be the first day. I've heard the Eiffel Tower, the Louvre, and maybe the Sacré-Cœur. On the first day, I can arrive, maybe drop my bags off, and then explore those iconic spots. Maybe in the morning, visit the Eiffel Tower, then the Louvre in the afternoon. Evening could be a nice dinner and a stroll along the Seine. I think that's a solid first day.

On the second day, I want to explore more of Paris. Maybe visit some museums like the Musée d'Orsay or the Pompidou. I've heard the Pompidou is modern and has a great view from the roof. Maybe in the morning, go to the Musée d'Orsay, then in the afternoon visit the Pompidou, and end the day with a nice dinner and a cabaret show. That sounds fun.

Th

In [17]:
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

# print(response)
markdown_response = markdown.markdown(response)
print(markdown_response)

<p>Here's a well-organized and balanced 5-day trip plan for France, ensuring a mix of sightseeing, culture, and relaxation:</p>
<h3>Day 1: Arrival in Paris</h3>
<ul>
<li><strong>Morning:</strong> Arrive in Paris, check into your accommodation.</li>
<li><strong>Afternoon:</strong> Visit the Eiffel Tower and explore the surrounding area. Stroll along the Champ de Mars.</li>
<li><strong>Evening:</strong> Enjoy a picnic with a view from a rooftop or a romantic dinner by the Seine.</li>
<li><strong>Night:</strong> Overnight stay in Paris.</li>
</ul>
<h3>Day 2: Explore Parisian Museums</h3>
<ul>
<li><strong>Morning:</strong> Visit the Louvre Museum.</li>
<li><strong>Afternoon:</strong> Explore the Musée d'Orsay or the Pompidou Museum, then enjoy a riverside café.</li>
<li><strong>Evening:</strong> Take a sunset river cruise on the Seine.</li>
<li><strong>Night:</strong> Experience a traditional cabaret show.</li>
<li><strong>Stay:</strong> Overnight in Paris.</li>
</ul>
<h3>Day 3: Day Trip t

In [18]:
from IPython.display import HTML, display
display(HTML(markdown_response))

In [19]:
!pip install gcsfs

Collecting fsspec==2025.3.0 (from gcsfs)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2024.12.0
    Uninstalling fsspec-2024.12.0:
      Successfully uninstalled fsspec-2024.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datasets 3.4.1 requires fsspec[http]<=2024.12.0,>=2023.1.0, but you have fsspec 2025.3.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.3.0


In [20]:
import gcsfs

# Initialize GCS file system
fs = gcsfs.GCSFileSystem(project="genai-model-dev-deploy", token=credentials_path)

# Define your bucket path
bucket_name = "llm-test-bucket-2025"
bucket_path = f"gs://{bucket_name}/"

# Verify the bucket content
files = fs.ls(bucket_path)
print("Files in GCS bucket:", files)

Files in GCS bucket: ['llm-test-bucket-2025/model_weights']


In [23]:
file_path = f"{bucket_path}model_weights/tokenizer.json"

with fs.open(file_path, "r") as f:
    content = f.read()
    print("File content:", content[:100])  # Print first 500 characters

File content: {
  "version": "1.0",
  "truncation": null,
  "padding": null,
  "added_tokens": [
    {
      "id":


In [25]:
import gcsfs
import os

# Set up GCS authentication using service account
gcs_key_path = "/content/drive/MyDrive/gcp-key.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = gcs_key_path

# Initialize gcsfs
fs = gcsfs.GCSFileSystem(project="genai-model-dev-deploy", token=gcs_key_path)

# Define paths
bucket_name = "llm-test-bucket-2025"
gcs_model_path = f"gs://{bucket_name}/model_weights"
local_model_dir = "deepseek_model_4b"
os.makedirs(local_model_dir, exist_ok=True)

# Download all model files from GCS
def download_from_gcs(gcs_path, local_dir):
    """Download all files from a GCS directory to a local directory."""
    blobs = fs.ls(gcs_path)
    for blob in blobs:
        local_filepath = os.path.join(local_dir, os.path.basename(blob))
        print(f"Downloading {blob} to {local_filepath}...")
        fs.get(blob, local_filepath)

download_from_gcs(gcs_model_path, local_model_dir)

print("Model files downloaded successfully!")

Downloading llm-test-bucket-2025/model_weights/config.json to deepseek_model_4b/config.json...
Downloading llm-test-bucket-2025/model_weights/generation_config.json to deepseek_model_4b/generation_config.json...
Downloading llm-test-bucket-2025/model_weights/model-00001-of-00002.safetensors to deepseek_model_4b/model-00001-of-00002.safetensors...
Downloading llm-test-bucket-2025/model_weights/model-00002-of-00002.safetensors to deepseek_model_4b/model-00002-of-00002.safetensors...
Downloading llm-test-bucket-2025/model_weights/model.safetensors.index.json to deepseek_model_4b/model.safetensors.index.json...
Downloading llm-test-bucket-2025/model_weights/special_tokens_map.json to deepseek_model_4b/special_tokens_map.json...
Downloading llm-test-bucket-2025/model_weights/tokenizer.json to deepseek_model_4b/tokenizer.json...
Downloading llm-test-bucket-2025/model_weights/tokenizer_config.json to deepseek_model_4b/tokenizer_config.json...
Model files downloaded successfully!


In [26]:
# Define prompt
prompt = "Explain the benefits of LLM-based AI applications."

# Tokenize input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate output
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=150)

# Decode and print response
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Response:\n", response)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Response:
 Explain the benefits of LLM-based AI applications. What industries are most impacted by these tools?
Hmm, so I need to explain the benefits of LLM-based AI applications and the industries most affected by them. Let me start by recalling what LLM-based AI applications are. Large Language Models, like GPT-3 or 4, use massive datasets to generate text and perform tasks. They're powerful because they can understand and generate human-like text.

First, the benefits. Well, these models are good at understanding and generating text, which is useful for many tasks. So, I can think of areas where natural language processing is important. Like writing assistance—so for writers, it can help generate ideas or improve drafts. Similarly, content creation tools, maybe for marketers or copywriters.

Another
