In [10]:
!pip install -q transformers datasets accelerate bitsandbytes google-cloud-storage langchain_community

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.5 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.5/2.5 MB[0m [31m15.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.5/2.5 MB[0m [31m42.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
from google.colab import drive
from google.cloud import storage
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Set environment variable for credentials
credentials_path = "/content/drive/MyDrive/gcp-key.json"  # Path to your key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# 3. Initialize GCS client
storage_client = storage.Client()
bucket_name = "llm-test-bucket-2025"  # Replace with your bucket name
bucket = storage_client.bucket(bucket_name)

Mounted at /content/drive


In [3]:
# 4. Save tokenizer and model to a local directory
model_dir = "model_weights"  # A local directory to save the model and tokenizer
os.makedirs(model_dir, exist_ok=True)

In [7]:
import os

# Define local directory where model files will be downloaded
def download_from_gcs(gcs_prefix, local_dir):
    """Download all files from GCS folder to local directory."""
    blobs = bucket.list_blobs(prefix=gcs_prefix)
    for blob in blobs:
        local_filepath = os.path.join(local_dir, os.path.basename(blob.name))
        print(f"Downloading {blob.name} to {local_filepath}...")
        blob.download_to_filename(local_filepath)

# Download model weights from GCS
download_from_gcs("DeepSeek_R1_Distill_Qwen_7B", model_dir)

print("Model files downloaded successfully!")

Downloading DeepSeek_R1_Distill_Qwen_7B/config.json to model_weights/config.json...
Downloading DeepSeek_R1_Distill_Qwen_7B/generation_config.json to model_weights/generation_config.json...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00001-of-00007.safetensors to model_weights/model-00001-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00002-of-00007.safetensors to model_weights/model-00002-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00003-of-00007.safetensors to model_weights/model-00003-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00004-of-00007.safetensors to model_weights/model-00004-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00005-of-00007.safetensors to model_weights/model-00005-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00006-of-00007.safetensors to model_weights/model-00006-of-00007.safetensors...
Downloading DeepSeek_R1_Distill_Qwen_7B/model-00007-of

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define paths
model_path = model_dir  # Path to downloaded model files

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model (ensure it uses 8-bit quantization)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Automatically map to available GPU
)

print("Model loaded successfully!")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

Model loaded successfully!


In [14]:
from transformers import pipeline
import torch
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
import markdown

# Create the text-generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10000,
    temperature=0.9,
    do_sample=True
)

# Create a HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Define a simple prompt template
template = """Question: {question}

Answer:"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Create the LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run the chain and clean output
question = '''
          Can you solve this equation for x, where 5*x^2+2*x+120=453?
          '''
response = llm_chain.run(question).strip()

# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

print(response)

Device set to use cuda:0


To solve the equation \( 5x^2 + 2x + 120 = 453 \) for \( x \), follow these steps:

### Step 1: Rearrange the Equation
First, move all terms to one side to set the equation to zero.

\[
5x^2 + 2x + 120 - 453 = 0
\]

Simplify:

\[
5x^2 + 2x - 333 = 0
\]

### Step 2: Identify the Coefficients
This is a quadratic equation of the form \( ax^2 + bx + c = 0 \), where:
- \( a = 5 \)
- \( b = 2 \)
- \( c = -333 \)

### Step 3: Apply the Quadratic Formula
The quadratic formula to find the roots is:

\[
x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
\]

### Step 4: Calculate the Discriminant
Compute the discriminant (\( D \)):

\[
D = b^2 - 4ac = 2^2 - 4 \times 5 \times (-333) = 4 + 6660 = 6664
\]

### Step 5: Find the Square Root of the Discriminant
\[
\sqrt{6664} \approx 81.63
\]

### Step 6: Substitute into the Quadratic Formula
\[
x = \frac{-2 \pm 81.63}{10}
\]

This gives two potential solutions:

1. \( x = \frac{-2 + 81.63}{10} = \frac{79.63}{10} \approx 7.963 \)
2. \( x = \frac{-2 - 81.63}{10} = 

In [16]:
device = "cuda" if torch.cuda.is_available() else "cpu"

prompt = "Can you solve this equation for x, where 5*x^2+2*x+120=453?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids,max_new_tokens=5000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


In [17]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

First, I need to solve the equation \(5x^2 + 2x + 120 = 453\) for \(x\).

I'll start by subtracting 453 from both sides to set the equation to zero:
\[5x^2 + 2x + 120 - 453 = 0\]
Simplifying this gives:
\[5x^2 + 2x - 333 = 0\]

Next, I'll use the quadratic formula to solve for \(x\), where \(a = 5\), \(b = 2\), and \(c = -333\):
\[x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}\]

Plugging in the values:
\[x = \frac{-2 \pm \sqrt{(2)^2 - 4 \times 5 \times (-333)}}{2 \times 5}\]
\[x = \frac{-2 \pm \sqrt{4 + 6660}}{10}\]
\[x = \frac{-2 \pm \sqrt{6664}}{10}\]

Calculating the square root:
\[\sqrt{6664} \approx 81.63\]

Now, I'll find the two possible solutions for \(x\):
1. \(x = \frac{-2 + 81.63}{10} = \frac{79.63}{10} = 7.963\)
2. \(x = \frac{-2 - 81.63}{10} = \frac{-83.63}{10} = -8.363\)

So, the solutions are approximately \(x \approx 7.96\) and \(x \approx -8.36\).
</think>

Certainly! Let's solve the equation step by step.

**Given Equation:**
\[ 5x^2 + 2x + 120 = 453 \]

**Step 1: Simplify t

In [18]:
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

# print(response)
markdown_response = markdown.markdown(response)
print(markdown_response)

<p>Certainly! Let's solve the equation step by step.</p>
<p><strong>Given Equation:</strong>
[ 5x^2 + 2x + 120 = 453 ]</p>
<p><strong>Step 1: Simplify the Equation</strong></p>
<p>First, we want to set the equation to zero by subtracting 453 from both sides:
[
5x^2 + 2x + 120 - 453 = 0
]
[
5x^2 + 2x - 333 = 0
]</p>
<p><strong>Step 2: Identify the Quadratic Coefficients</strong></p>
<p>This is a quadratic equation in the form ( ax^2 + bx + c = 0 ), where:
- ( a = 5 )
- ( b = 2 )
- ( c = -333 )</p>
<p><strong>Step 3: Apply the Quadratic Formula</strong></p>
<p>The quadratic formula is:
[
x = \frac{-b \pm \sqrt{b^2 - 4ac}}{2a}
]</p>
<p>Plugging in the values:
[
x = \frac{-2 \pm \sqrt{(2)^2 - 4 \times 5 \times (-333)}}{2 \times 5}
]
[
x = \frac{-2 \pm \sqrt{4 + 6660}}{10}
]
[
x = \frac{-2 \pm \sqrt{6664}}{10}
]</p>
<p><strong>Step 4: Calculate the Square Root</strong></p>
<p>[
\sqrt{6664} \approx 81.63
]</p>
<p><strong>Step 5: Find the Two Solutions</strong></p>
<ol>
<li>
<p><strong>First 

In [19]:
from IPython.display import HTML, display
display(HTML(markdown_response))