In [77]:
import gc
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Disable tokenizers parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Use the Apple-Silicon GPU backend (MPS) when it is available, otherwise the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Load model and tokenizer, then move the model's weights to `device`
model_name = "facebook/opt-350m"  # Replace with your model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Tokenize the prompt. The tokenizer returns a BatchEncoding (a dict-like
# container), not a bare tensor, so the individual tensors are pulled out of it
# and each one is moved to the same device as the model.
input_text = "The meaning of life is"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"].to(device)
# Pass the tokenizer's attention mask along instead of discarding it, so that
# generate() does not have to infer which positions are real tokens.
attention_mask = inputs["attention_mask"].to(device)

# Generate output (no gradients are needed for inference)
with torch.no_grad():
    output = model.generate(input_ids, attention_mask=attention_mask)

# Decode the first (and only) generated sequence and print it
print(tokenizer.decode(output[0], skip_special_tokens=True))


The meaning of life is to be happy.
I'm not sure if you're being sarcastic or not.
I'm


In [78]:
# Display the prompt's token ids; this tensor was moved to `device` when it was created.
input_ids

tensor([[   2,  133, 3099,    9,  301,   16]], device='mps:0')

In [79]:
inputs = tokenizer("The meaning of life is", return_tensors="pt")
print(inputs)

{'input_ids': tensor([[   2,  133, 3099,    9,  301,   16]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [80]:
# Free the local model before moving on to the Modal example.
# Dropping the name only removes this reference; run the garbage collector and
# ask the MPS allocator to release its cached blocks so the memory is actually
# handed back.
del model
gc.collect()
if torch.backends.mps.is_available():
    torch.mps.empty_cache()

In [81]:
!pip install modal



In [82]:
import modal
from transformers import AutoTokenizer, AutoModelForCausalLM

# Python packages installed into the container image. The first two entries
# were the original set; torch, accelerate and safetensors were added later.
pip_packages = (
    "fastapi[standard]",
    "transformers",
    "torch>=2.0.0",
    "accelerate",
    "safetensors",  # Optional but recommended for faster model loading
)

# Debian-slim base image with the packages above installed
image = modal.Image.debian_slim().pip_install(*pip_packages)

# Add the local Python module explicitly to avoid Modal's automounting warning
image_with_source = image.add_local_python_source("_remote_module_non_scriptable")

# The app is built on the image that includes the local source
# (it previously used the bare `image`).
app = modal.App(name="example-lifecycle-web", image=image_with_source)

# Define a stub class for your model
class MyModel:
    """Loads a causal language model and its tokenizer once and generates text with them."""

    def __init__(self, model_name: str = "facebook/opt-125m"):
        """Load the tokenizer and the model weights for `model_name`.

        Args:
            model_name: Checkpoint identifier passed to `from_pretrained`.
                Defaults to the checkpoint this class originally hard-coded.
        """
        # Load the tokenizer and model once during initialization
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

    def run_inference(self, input_text: str) -> str:
        """Generate a continuation of `input_text` and return it as decoded text.

        Args:
            input_text: Prompt to continue.

        Returns:
            The first decoded sequence, with special tokens stripped.
        """
        # The tokenizer returns the whole encoding (input ids plus attention
        # mask). `device_map="auto"` decides where the model lives, so the
        # encoding is moved to the model's device before generating.
        encoded = self.tokenizer(input_text, return_tensors="pt").to(self.model.device)
        generated = self.model.generate(**encoded)
        return self.tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

# Cache for the loaded model, filled on the first request.
# It lives at module level rather than as an attribute on `hello`: after
# decoration the name `hello` is bound to whatever the decorators return, not
# to the plain function, so attributes set on it land on that wrapper object.
_model_instance = None


# Define the web endpoint to accept requests and run inference
@app.function()
@modal.fastapi_endpoint()  # replaces the earlier @modal.web_endpoint()
def hello(input_text: str) -> str:
    """Run text generation for `input_text` and return the generated string.

    The model is created on the first call and reused by later calls.

    Args:
        input_text: Prompt to continue.

    Returns:
        The text produced by `MyModel.run_inference`.
    """
    global _model_instance
    if _model_instance is None:  # Only initialize on first call
        _model_instance = MyModel()
    return _model_instance.run_inference(input_text)

# To run the web app.
# `app.run()` returns a context manager: the app is only started when it is
# entered and is stopped again when the `with` block exits. Calling it bare
# never starts anything, so keep the block open for as long as the endpoint
# should be served. `modal.enable_output()` shows Modal's own logs while the
# app is running.
if __name__ == "__main__":
    with modal.enable_output(), app.run():
        input("App is running. Press Enter to stop it: ")