# Install packages

In [None]:
!pip install -q fastapi uvicorn pyngrok nest_asyncio transformers accelerate bitsandbytes

In [None]:
import os
from google.colab import drive
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Mount Google Drive at the standard Colab location so its files are
#    reachable through the local filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# Load Model from Hugging Face

In [None]:
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# 1. Model ID: the Hugging Face Hub checkpoint to download and load.
model_id = "mistralai/Mistral-7B-Instruct-v0.3"

# 2. Quantization configuration.
#    The options are passed through a BitsAndBytesConfig object (the fix for the
#    deprecation warning). They are spelled out explicitly: an argument-less
#    BitsAndBytesConfig() does not state which quantization mode is wanted and
#    leaves the outcome to library defaults.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Nested quantization for better memory efficiency
    bnb_4bit_quant_type="nf4",  # Use 4-bit NormalFloat quantization
    bnb_4bit_compute_dtype=torch.float16,  # Use float16 for computation
)

# 3. Load the model with the quantization config applied; device_map="auto"
#    lets the library decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config,
)

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

# Save model and check its size

In [None]:
# 4. Save tokenizer and model to a local directory.
#    The directory is named after the checkpoint that was actually loaded
#    (the last path component of model_id), so the folder name cannot drift
#    from the model it contains.
model_dir = model_id.split("/")[-1]  # e.g. "Mistral-7B-Instruct-v0.3"
os.makedirs(model_dir, exist_ok=True)

# Save tokenizer: fetched from the same checkpoint as the model, then written
# next to the weights so model_dir is self-contained.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(model_dir)

# Save model weights in safetensors format.
model.save_pretrained(model_dir, safe_serialization=True)

tokenizer_config.json:   0%|          | 0.00/141k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [None]:
import os

# Walk the saved model directory and add up the on-disk size of every file,
# leaving out symbolic links.
saved_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk(model_dir)
    for name in names
)
total_size = sum(
    os.path.getsize(path)
    for path in saved_file_paths
    if not os.path.islink(path)
)

# Report the total in GB (bytes / 1024**3), two decimals.
print(f"Size of the saved model: {total_size / (1024**3):.2f} GB")

Size of the saved model: 4.16 GB


In [None]:
import re
import torch
from transformers import pipeline, AutoTokenizer

# Set device to "cuda" if available, otherwise "cpu"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Reload the tokenizer from the locally saved directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

prompt = "Can you write me a concise yet effective cover letter for a job of Data Engineer that requires Azure Databricks expertise?"

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]

# Render the conversation with the model's chat template, ending with the
# generation prompt so the model continues as the assistant.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# The model was loaded with device_map="auto", so send the inputs to wherever
# the model actually lives rather than to an independently guessed device.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Pass the full tokenizer output (input_ids AND attention_mask) and an explicit
# pad_token_id; passing input_ids alone triggers the "attention mask and the
# pad token id were not set" warnings.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
    pad_token_id=tokenizer.eos_token_id,
)

# generate() returns prompt + completion; drop the prompt tokens from each row
# so only the newly generated tokens remain.
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [None]:
# Turn the generated token ids back into text (special tokens dropped) and
# show the reply for the first entry in the batch.
decoded_batch = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
response = decoded_batch[0]
print(response)

Subject: Application for Data Engineer Position - Azure Databricks Expertise

[Your Name]
[Your Address]
[City, State, Zip Code]
[Your Email Address]
[Today's Date]

[Recipient's Name]
[Recipient's Position]
[Company Name]
[Company Address]
[City, State, Zip Code]

Dear [Recipient's Name],

I am writing to express my interest in the Data Engineer position advertised on your company's website. With my extensive experience in data engineering, strong proficiency in Azure Databricks, and a proven track record of delivering high-quality data solutions, I am confident that I would be a valuable addition to your team.

Over the past [number of years/decades] in the data engineering field, I have honed my skills in designing, building, and maintaining scalable and efficient data pipelines. My expertise in Azure Databricks has enabled me to effectively manage and process large-scale data sets, ensuring optimal performance and reliability.

In my previous role at [Previous Company Name], I was 