# Deploy Qwen3 on Amazon SageMaker using vLLM 0.9.0

## 0. Needed IAM Role Permissions

- AmazonEC2ContainerRegistryFullAccess (needed to create the ECR repository and push the image in step 1)

## 1. Create and Push Image to ECR **[ONLY RUN ONCE]**

In [None]:
!pip install -U --quiet sagemaker boto3 awscli

In [1]:
import boto3
import sagemaker
from sagemaker import get_execution_role

# --- User-tunable settings -------------------------------------------------
REGION_NAME = "us-east-1"  # set your region name here
REPO_NAME = "vllm_env"  # set your repo name here
VERSION = "v0.9.0"  # image tag; also passed to `docker build` as the VERSION build argument

# AWS account id of the credentials in use, looked up through STS.
ACCOUNT_ID = boto3.client("sts").get_caller_identity().get("Account")

# Full ECR image URI: <account>.dkr.ecr.<region>.amazonaws.com/<repo>:<tag>
CONTAINER = f"{ACCOUNT_ID}.dkr.ecr.{REGION_NAME}.amazonaws.com/{REPO_NAME}:{VERSION}"

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
# Create the ECR repository unless it already exists.
# `describe-repositories` is used as an existence check (its output is discarded);
# `create-repository` runs only when that check exits with a non-zero status.
# ⚠️ Please add the AmazonEC2ContainerRegistryFullAccess permission to your IAM Role.
!aws ecr describe-repositories --repository-names {REPO_NAME} --region {REGION_NAME} > /dev/null 2>&1 || aws ecr create-repository --repository-name {REPO_NAME} --region {REGION_NAME}

In [3]:
# Build the image locally.
# CONTAINER is recomputed here with the same expression as in the configuration cell.
CONTAINER = f"{ACCOUNT_ID}.dkr.ecr.{REGION_NAME}.amazonaws.com/{REPO_NAME}:{VERSION}"

# Log Docker in to this account's ECR registry: the password printed by
# `get-login-password` is piped to `docker login` through stdin.
!aws ecr get-login-password --region {REGION_NAME} | docker login --username AWS --password-stdin {ACCOUNT_ID}.dkr.ecr.{REGION_NAME}.amazonaws.com
print('Building docker. This may take few minutes...')
# Build from the current directory, pass VERSION as a build argument, and tag the
# local image as REPO_NAME:VERSION.
!docker build --quiet --build-arg VERSION={VERSION} -t {REPO_NAME}:{VERSION} .

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded
Building docker. This may take few minutes...
sha256:9f596cd9fa9bdaf9170efbe2808d4b8d2b00914f41e06036fc7f4ae7fe486de1


In [4]:
# Push the image to ECR.
# ⚠️ Please add the AmazonEC2ContainerRegistryFullAccess permission to your IAM Role.
# Give the locally built image its full ECR URI (CONTAINER) so that `docker push`
# uploads it to the ECR repository.
!docker tag {REPO_NAME}:{VERSION} {CONTAINER}
print('Pushing docker. This may take few minutes...')
!docker push {CONTAINER}

Pushing docker. This may take few minutes...
The push refers to repository [707684582322.dkr.ecr.us-east-1.amazonaws.com/vllm_env]

[1B21767f72: Preparing 
[1Bfdcb7075: Preparing 
[1B81d97e6f: Preparing 
[1B08cb7a01: Preparing 
[1Bb493b5dd: Preparing 
[1B5a093913: Preparing 
[1Be0fe2c90: Preparing 
[1B5c61a51d: Preparing 
[1B07f221b0: Preparing 
[1B9783bfa4: Preparing 
[1B75a9c340: Preparing 
[1Be31b14be: Preparing 
[1Ba4092c27: Preparing 
[1B852f509a: Preparing 
[1B75852a44: Preparing 
[1Baa4bda21: Preparing 
[1B4454a678: Preparing 
[12B0fe2c90: Waiting g 
[1Bae9b9700: Preparing 
[11B783bfa4: Waiting g 
[1B58f70e37: Preparing 
[14B7f221b0: Waiting g 
[1B43fcce68: Preparing 
[12B4092c27: Waiting g 
[15B5a9c340: Waiting g 
[9B0d2ed199: Waiting g 
[1B6a9e7760: Preparing 
[18Bv0.9.0: digest: sha256:9d3207b1947e2db87b2aaebb0fa5193d1efe4966a55a6f687117073745f6929a size: 6193


In [11]:
# Show the URI of the pushed image; the deployment cell passes it as `image_uri`.
print('Please use this container url for further deployment!', CONTAINER, sep='\n')

Please use this container url for further deployment!
707684582322.dkr.ecr.us-east-1.amazonaws.com/vllm_env:v0.9.0


## 2. Deploy

In [14]:
# ⚠️ Please add the AmazonS3FullAccess permission to your IAM Role.
REGION_NAME = "us-east-1"  # Set your region name

# Hosting instance type and number of instances behind the endpoint.
INSTANCE_TYPE = "ml.p4de.24xlarge"
INITIAL_INSTANCE_COUNT = 1

# vLLM server options.
# The container reads vLLM options from environment variables carrying the
# "SM_VLLM_" prefix, e.g. "--max_model_len 512" is written as
# {"SM_VLLM_MAX_MODEL_LEN": "512"}. All values are strings.
VLLM_ENV = {
    # Alternative (smaller) model: "Qwen/Qwen3-30B-A3B"
    "SM_VLLM_MODEL": "Qwen/Qwen3-235B-A22B",
    "SM_VLLM_TENSOR_PARALLEL_SIZE": "8",
    "SM_VLLM_MAX_MODEL_LEN": "32768",
    "SM_VLLM_MAX_NUM_SEQS": "128",
    "SM_VLLM_GPU_MEMORY_UTILIZATION": "0.9",
}

In [None]:
import os
import boto3
import datetime
import sagemaker
from sagemaker.s3 import S3Uploader


import re  # used only in this cell, to sanitise the resource-name prefix

# Millisecond-resolution timestamp (the last three microsecond digits are dropped).
timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S-%f")[:-3]

boto_session = boto3.Session(region_name=REGION_NAME)
sagemaker_session = sagemaker.Session(boto_session=boto_session)
iam_role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)

# Build unique names from the model that is actually configured in VLLM_ENV, so the
# model/endpoint names always describe what is being served.
# Only letters, digits and hyphens are kept; the prefix is capped at 39 characters so
# that "<prefix>-<23-character timestamp>" stays within 63 characters.
name_prefix = re.sub(r"[^A-Za-z0-9-]+", "-", VLLM_ENV["SM_VLLM_MODEL"].split("/")[-1])
name_prefix = name_prefix[:39].strip("-") or "vllm-model"
model_name = f"{name_prefix}-{timestamp}"
endpoint_name = sagemaker.utils.name_from_base(name_prefix)

# CONTAINER (the ECR image URI) is defined in the configuration cell of section 1;
# run that cell first, even when the image has already been pushed.
model = sagemaker.Model(
    name=model_name,
    image_uri=CONTAINER,
    sagemaker_session=sagemaker_session,
    role=iam_role,
    env=VLLM_ENV,
)

# The long startup health-check timeout is presumably there to cover fetching and
# loading the model weights before the container reports healthy.
predictor = model.deploy(
    instance_type=INSTANCE_TYPE,
    initial_instance_count=INITIAL_INSTANCE_COUNT,
    endpoint_name=endpoint_name,
    container_startup_health_check_timeout=3600
)
print(f'Endpoint Name: {endpoint_name}')

----------------------------

## 3. Test

In [16]:
# Name of the deployed endpoint to test. Set it to your own endpoint name; you can
# find it in your SageMaker AI Dashboard.
endpoint_name = "Qwen3-30B-2025-06-12-10-58-48-818"
# Region the endpoint is deployed in.
REGION_NAME = "us-east-1"

In [27]:
import json
import boto3
import base64

# Non-streaming chat-completion request; the user message is given as a list of
# typed content parts.
payload = {
    "model": "Qwen/Qwen3-235B-A22B",
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "Hi, how are you doing?"}],
        },
    ],
    "temperature": 0.7,
    "max_tokens": 100,
    "stream": False,
}

# Runtime client used for every invocation in this section.
runtime_sm_client = boto3.client("sagemaker-runtime", region_name=REGION_NAME)
response = runtime_sm_client.invoke_endpoint(
    EndpointName=endpoint_name,
    ContentType="application/json",
    Body=json.dumps(payload),
)

# Read the whole response body, decode it, and parse the JSON document.
response_body = json.loads(response["Body"].read().decode())
print(response_body)


{'id': 'chatcmpl-00c1d6caf4194b1bb6de91fd7b4fe339', 'object': 'chat.completion', 'created': 1749730277, 'model': 'Qwen/Qwen3-235B-A22B', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'reasoning_content': None, 'content': '<think>\nLet me analyze this greeting carefully. The user sent a simple friendly greeting in Chinese: "Hi, how are you doing?" This is a common conversational opener that serves multiple purposes - it establishes contact, shows basic courtesy, and opens the door for further interaction.\n\nFrom a technical perspective, this query is straightforward but important. It tests my ability to recognize and respond appropriately to casual greetings, which are fundamental to natural human-AI interaction. The greeting itself contains both an opening ("Hi")', 'tool_calls': []}, 'logprobs': None, 'finish_reason': 'length', 'stop_reason': None}], 'usage': {'prompt_tokens': 15, 'total_tokens': 115, 'completion_tokens': 100, 'prompt_tokens_details': None}, 'prompt_logpro

### Streaming responses (keeps the connection open for longer sessions, up to 8 minutes)

In [30]:
# Same request as the non-streaming test, except that "stream" is True.
payload = {
    "model": "Qwen/Qwen3-235B-A22B",
    "messages": [
        {
            "role": "user",
            "content": [{"type": "text", "text": "Hi, how are you doing?"}],
        },
    ],
    "temperature": 0.7,
    "max_tokens": 100,
    "stream": True,
}

In [31]:
# Use invoke_endpoint_with_response_stream for streaming
def _sse_data(event_block):
    """Return the text carried by the `data:` line(s) of one SSE event.

    Args:
        event_block: bytes of a single event (without the blank-line delimiter).

    Returns:
        The `data:` payloads joined with newlines, stripped; "" when the event
        has no `data:` line.
    """
    text = event_block.decode("utf-8", errors="replace")
    data_lines = [line[5:].strip() for line in text.split("\n") if line.startswith("data:")]
    return "\n".join(data_lines).strip()


def _iter_sse_data(byte_chunks):
    """Yield the `data:` payload of every SSE event found in a chunked byte stream.

    Chunks are buffered as bytes and split on the blank line that terminates an
    event, so neither a multi-byte character cut by a chunk boundary nor the text
    "data: " occurring inside a payload can corrupt the parsing.

    Args:
        byte_chunks: iterable of bytes objects with arbitrary boundaries.

    Yields:
        One non-empty string per event, including a final event that is not
        followed by a blank line.
    """
    pending = b""
    for raw in byte_chunks:
        # Normalise CRLF so that a single delimiter check is enough.
        pending = (pending + raw).replace(b"\r\n", b"\n")
        # Everything before the last delimiter is complete; the rest waits for more bytes.
        *complete_events, pending = pending.split(b"\n\n")
        for event_block in complete_events:
            data = _sse_data(event_block)
            if data:
                yield data
    # Flush whatever is left once the stream has ended.
    tail = _sse_data(pending)
    if tail:
        yield tail


response = runtime_sm_client.invoke_endpoint_with_response_stream(
    EndpointName=endpoint_name,
    ContentType='application/json',
    Body=json.dumps(payload)
)

# Raw bytes of each payload part; events of any other type are skipped.
byte_chunks = (
    event['PayloadPart']['Bytes']
    for event in response['Body']
    if 'PayloadPart' in event
)

# Process streaming response
for data in _iter_sse_data(byte_chunks):
    # End-of-stream sentinel, not JSON.
    if data == "[DONE]":
        continue

    try:
        # Parse JSON
        chunk_data = json.loads(data)
    except json.JSONDecodeError as e:
        print(f"[ERROR] Failed to parse: {data[:50]}... | Error: {e}")
        continue

    print(chunk_data)
    # Extract content
    # Implement here the parsing, e.g. the generated text of a chunk is in
    # chunk_data['choices'][0]['delta'].get('content')

{'id': 'chatcmpl-998a6800e0d04a28b6196e8580d8f92a', 'object': 'chat.completion.chunk', 'created': 1749730371, 'model': 'Qwen/Qwen3-235B-A22B', 'choices': [{'index': 0, 'delta': {'role': 'assistant', 'content': ''}, 'logprobs': None, 'finish_reason': None}]}
{'id': 'chatcmpl-998a6800e0d04a28b6196e8580d8f92a', 'object': 'chat.completion.chunk', 'created': 1749730371, 'model': 'Qwen/Qwen3-235B-A22B', 'choices': [{'index': 0, 'delta': {'content': '<think>', 'tool_calls': []}}]}
{'id': 'chatcmpl-998a6800e0d04a28b6196e8580d8f92a', 'object': 'chat.completion.chunk', 'created': 1749730371, 'model': 'Qwen/Qwen3-235B-A22B', 'choices': [{'index': 0, 'delta': {'content': '\n', 'tool_calls': []}}]}
{'id': 'chatcmpl-998a6800e0d04a28b6196e8580d8f92a', 'object': 'chat.completion.chunk', 'created': 1749730371, 'model': 'Qwen/Qwen3-235B-A22B', 'choices': [{'index': 0, 'delta': {'content': 'Let', 'tool_calls': []}}]}
{'id': 'chatcmpl-998a6800e0d04a28b6196e8580d8f92a', 'object': 'chat.completion.chunk', '