In [1]:
!pip install -q transformers datasets accelerate bitsandbytes google-cloud-storage gguf langchain_community

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.2/76.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m25.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.3/423.3 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
from google.colab import drive
from google.cloud import storage
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive
drive.mount('/content/drive')

# 2. Set environment variable for credentials
credentials_path = "/content/drive/MyDrive/gcp-key.json"  # Path to your key file
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# 3. Initialize GCS client
storage_client = storage.Client()
bucket_name = "llm-test-bucket-2025"  # Replace with your bucket name
bucket = storage_client.bucket(bucket_name)

Mounted at /content/drive


In [3]:
# 4. Save tokenizer and model to a local directory
model_dir = "model_weights_4bit"  # A local directory to save the model and tokenizer
os.makedirs(model_dir, exist_ok=True)

In [4]:
import os

# Define local directory where model files will be downloaded
def download_from_gcs(gcs_prefix, local_dir):
    """Download all files from GCS folder to local directory."""
    blobs = bucket.list_blobs(prefix=gcs_prefix)
    for blob in blobs:
        local_filepath = os.path.join(local_dir, os.path.basename(blob.name))
        print(f"Downloading {blob.name} to {local_filepath}...")
        blob.download_to_filename(local_filepath)

# Download model weights from GCS
download_from_gcs("model_weights_tinyllama", model_dir)

print("Model files downloaded successfully!")

Downloading model_weights_tinyllama/config.json to model_weights_4bit/config.json...
Downloading model_weights_tinyllama/generation_config.json to model_weights_4bit/generation_config.json...
Downloading model_weights_tinyllama/model.safetensors to model_weights_4bit/model.safetensors...
Downloading model_weights_tinyllama/special_tokens_map.json to model_weights_4bit/special_tokens_map.json...
Downloading model_weights_tinyllama/tokenizer.json to model_weights_4bit/tokenizer.json...
Downloading model_weights_tinyllama/tokenizer.model to model_weights_4bit/tokenizer.model...
Downloading model_weights_tinyllama/tokenizer_config.json to model_weights_4bit/tokenizer_config.json...
Model files downloaded successfully!


In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Define paths
model_path = model_dir  # Path to downloaded model files

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load model (ensure it uses 8-bit quantization)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # Use float16 for efficiency
    device_map="auto"  # Automatically map to available GPU
)

print("Model loaded successfully!")

Model loaded successfully!


In [6]:
from transformers import pipeline
import torch
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
import markdown

def generate_response(prompt, model, tokenizer):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=5000)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove everything before and including '</think>'
    if "</think>" in response:
        response = response.split("</think>")[-1].strip()

    return response

# Create the text-generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=5000,
    temperature=0.9,
    do_sample=True
)

# Create a HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Define a simple prompt template
template = """Question: {question}

Answer:"""
prompt = PromptTemplate(template=template, input_variables=["question"])

# Create the LLMChain
llm_chain = LLMChain(prompt=prompt, llm=llm)

# Run the chain and clean output
question = '''
            Explain the benefits of LLM-based AI applications.
          '''
response = llm_chain.run(question).strip()

# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

print(response)

Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=text_generation_pipeline)
  llm_chain = LLMChain(prompt=prompt, llm=llm)
  response = llm_chain.run(question).strip()


Question: 

Answer: 

The benefits of LLM-based AI applications in the education sector are as follows:

1. Improved Learner Experience: The LLM-based AI applications provide highly personalized and interactive learning experiences for learners. The technology-enabled adaptive learning makes the learning environment more flexible, intuitive, and engaging, resulting in better learning outcomes for students.

2. Enhanced Learning Outcomes: LLM-based AI applications analyze the learner's learning patterns and offer personalized solutions based on contextual understanding. This approach enhances the learning outcomes by helping students to understand and apply the lesson learned.

3. Cost Reduction: LLM-based AI applications are more cost-effective than traditional learning methods. The technology-enabled adaptive learning reduces the cost of implementation, learning, and maintenance. Additionally, LLM-based AI applications provide cost-effective learning solutions to the education sector.

In [15]:
device = "cuda" if torch.cuda.is_available() else "cpu"

prompt = "Can you provide me with a 5 day road trip plan for a trip to Montreal, QC. I will be starting and ending my trip to and from New York City?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(model_inputs.input_ids,max_new_tokens=5000)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

In [17]:
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

Sure, here's a 5-day road trip plan for a trip to Montreal, QC, starting and ending in New York City:

Day 1:
- Depart from New York City at 9:00 AM
- Drive to Montreal, Quebec, Canada
- Arrive in Montreal at 1:00 PM
- Check-in at the Hotel Le Ritz, located in the heart of the city
- Enjoy the city's attractions, including the Notre-Dame Basilica, the Montreal Museum of Fine Arts, and the Montreal Museum of Science and Technology
- Dinner at Le Bistrot du Pain, a cozy bistro with a focus on local and seasonal ingredients

Day 2:
- Breakfast at Le Bistrot du Pain
- Drive to the Montmorency Falls, a stunning waterfall located in the Laurentians
- Take a guided hike to the falls, or simply enjoy the scenery from the comfort of a picnic blanket
- Lunch at Le Pain Quotidien, a casual bistro with a focus on healthy and organic options
- Visit the Musée des Beaux-Arts, a museum showcasing the art and culture of Quebec
- Dinner at Le Pain Quotidien, a casual bistro with a focus on healthy and 

In [18]:
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()
# Remove echoes of the question if present in response
if question in response:
    response = response.replace(question, "").strip()

# Remove everything before and including '</think>'
if "</think>" in response:
    response = response.split("</think>")[-1].strip()

# print(response)
markdown_response = markdown.markdown(response)
print(markdown_response)

<p>Sure, here's a 5-day road trip plan for a trip to Montreal, QC, starting and ending in New York City:</p>
<p>Day 1:
- Depart from New York City at 9:00 AM
- Drive to Montreal, Quebec, Canada
- Arrive in Montreal at 1:00 PM
- Check-in at the Hotel Le Ritz, located in the heart of the city
- Enjoy the city's attractions, including the Notre-Dame Basilica, the Montreal Museum of Fine Arts, and the Montreal Museum of Science and Technology
- Dinner at Le Bistrot du Pain, a cozy bistro with a focus on local and seasonal ingredients</p>
<p>Day 2:
- Breakfast at Le Bistrot du Pain
- Drive to the Montmorency Falls, a stunning waterfall located in the Laurentians
- Take a guided hike to the falls, or simply enjoy the scenery from the comfort of a picnic blanket
- Lunch at Le Pain Quotidien, a casual bistro with a focus on healthy and organic options
- Visit the Musée des Beaux-Arts, a museum showcasing the art and culture of Quebec
- Dinner at Le Pain Quotidien, a casual bistro with a focus 

In [19]:
from IPython.display import HTML, display
display(HTML(markdown_response))