In [1]:
import boto3
import sagemaker
import os
import time
import jupyter_black

jupyter_black.load()

In [2]:
boto_session = boto3.session.Session()
region = boto_session.region_name
sess = sagemaker.Session()

In [None]:
# Replace with your own role ARN
BEDROCK_ROLE_ARN = "arn:aws:iam::427525985442:role/DuarteStack-DuarteBedrockSageMakerRoleFF6A8A20-qn1nlHfunLaT"
CONFIG = {
    "INSTANCE_TYPE": "ml.g6.12xlarge",
    "ENV": {
        "HF_MODEL_ID": "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",  # model to deploy
        "TENSOR_PARALLEL_DEGREE": "max",  # use all GPUs
        "OPTION_ENABLE_REASONING": "true",
        "OPTION_REASONING_PARSER": "deepseek_r1",
    },
    "IMAGE_NAME": "djl-inference:0.33.0-lmi15.0.0-cu128",
    "BEDROCK_ROLE_ARN": BEDROCK_ROLE_ARN,
}

In [4]:
ts = time.time()
hf_model_id = CONFIG["ENV"]["HF_MODEL_ID"]
model_name = hf_model_id.split("/")[-1]
base_model_name = sagemaker.utils.name_from_base(model_name)
print(f"{base_model_name=}")


current_region = os.environ["AWS_DEFAULT_REGION"]
print(f"{current_region=}")

image_name = CONFIG["IMAGE_NAME"]
inference_image_uri = (
    f"763104351884.dkr.ecr.{current_region}.amazonaws.com/{image_name}"
)
print(f"{inference_image_uri=}")

lmi_model = sagemaker.Model(
    image_uri=inference_image_uri,
    env=CONFIG["ENV"],
    role=CONFIG["BEDROCK_ROLE_ARN"],
    name=base_model_name,
)

endpoint_name = f"{base_model_name}-endpoint"
print(f"{endpoint_name=}")

base_model_name='DeepSeek-R1-Distill-Qwen-14B-2025-08-24-08-52-31-391'
current_region='eu-central-1'
inference_image_uri='763104351884.dkr.ecr.eu-central-1.amazonaws.com/djl-inference:0.33.0-lmi15.0.0-cu128'
endpoint_name='DeepSeek-R1-Distill-Qwen-14B-2025-08-24-08-52-31-391-endpoint'


In [5]:
lmi_model.deploy(
    initial_instance_count=1,
    instance_type=CONFIG["INSTANCE_TYPE"],
    container_startup_health_check_timeout=900,
    endpoint_name=endpoint_name,
)

----------------!

In [8]:
llm = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer(),
)

In [9]:
TEMPERATURE = 0.01
MAX_TOKENS = 1024

In [10]:
llm.predict(
    {
        "messages": [
            {
                "role": "user",
                "content": "What is the capital of Le Marche, Italy?",
            }
        ],
        "temperature": TEMPERATURE,
        "max_tokens": MAX_TOKENS,
    }
)

{'id': 'chatcmpl-139944987601104',
 'object': 'chat.completion',
 'created': 1756025028,
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'reasoning_content': "Okay, so I need to figure out what the capital of Le Marche, Italy is. I'm not super familiar with all the regions of Italy, but I know that Italy is divided into different regions, each with its own capital city. Le Marche is one of those regions, and I think it's in the central part of Italy. \n\nI remember that some regions have capitals that are pretty well-known. For example, Rome is the capital of Lazio, and Venice is the capital of Veneto. But I'm not sure about Le Marche. I think the capital might be a city that's not as famous as Rome or Venice, but still significant enough to be the regional capital.\n\nI've heard of a few cities in Le Marche before. Urbino comes to mind because I think it's known for its medieval architecture and maybe some universities. There's also Ancona, which I believe is a port 

In [11]:
from pydantic import BaseModel
from enum import Enum


class Difficulty(str, Enum):
    easy = "easy"
    medium = "medium"
    hard = "hard"


class Step(BaseModel):
    explanation: str
    output: str


class MathReasoning(BaseModel):
    steps: list[Step]
    final_answer: str
    difficulty: Difficulty

In [12]:
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant that generates JSON responses. You output only json, no other text. Guide the user through the solution step by step.",
    },
    {
        "role": "user",
        "content": "how can I solve 8x + 7 = -23",
    },
]

In [13]:
response = llm.predict(
    {
        "messages": messages,
        "temperature": TEMPERATURE,
        "max_tokens": MAX_TOKENS,
        "extra_body": {"guided_json": MathReasoning.model_json_schema()},
    }
)
response

{'id': 'chatcmpl-139944953052480',
 'object': 'chat.completion',
 'created': 1756025055,
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'reasoning_content': "Okay, so I need to solve the equation 8x + 7 = -23. Hmm, let me think about how to approach this. I remember from algebra that to solve for x, I need to isolate it on one side of the equation. \n\nFirst, I should probably get rid of that 7 that's being added to 8x. To do that, I can subtract 7 from both sides of the equation. That way, I'm keeping things balanced. So if I subtract 7 from both sides, it would look like this: 8x + 7 - 7 = -23 - 7. Simplifying that, the +7 and -7 on the left side cancel each other out, leaving just 8x. On the right side, -23 minus 7 is -30. So now the equation is 8x = -30.\n\nNext, I need to get x by itself. Since 8 is multiplied by x, I should divide both sides by 8 to undo that multiplication. So, dividing both sides by 8 gives me (8x)/8 = -30/8. On the left side, 8 divided by 8 

In [14]:
response = llm.predict(
    {
        "messages": messages,
        "temperature": TEMPERATURE,
        "max_tokens": MAX_TOKENS,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "math_reasoning",
                "schema": MathReasoning.model_json_schema(),
                "strict": True,
            },
        },
        "extra_body": {"guided_decoding_backend": "outlines"},
    }
)
response

{'id': 'chatcmpl-139944953052144',
 'object': 'chat.completion',
 'created': 1756025063,
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'reasoning_content': '{\n  "steps": [\n    {\n      "explanation": "First, subtract 7 from both sides to isolate the term with the variable.",\n      "output": "8x = -23 - 7"\n    },\n    {\n      "explanation": "Simplify the right side by combining the constants.",\n      "output": "8x = -30"\n    },\n    {\n      "explanation": "Next, divide both sides by 8 to solve for x.",\n      "output": "x = -30 / 8"\n    },\n    {\n      "explanation": "Simplify the fraction to get the final answer.",\n      "output": "x = -15/4"\n    }\n  ]\n  , "final_answer": "x = -15/4"\n  , "difficulty": "medium"\n}',
    'content': None},
   'logprobs': None,
   'finish_reason': 'eos_token'}],
 'usage': {'prompt_tokens': 48, 'completion_tokens': 180, 'total_tokens': 228}}

In [15]:
response = llm.predict(
    {
        "messages": messages,
        "temperature": TEMPERATURE,
        "max_tokens": MAX_TOKENS,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "math_reasoning",
                "schema": MathReasoning.model_json_schema(),
                "strict": True,
            },
        },
        "extra_body": {"guided_decoding_backend": "xgrammar"},
    }
)

response

{'id': 'chatcmpl-139944953200224',
 'object': 'chat.completion',
 'created': 1756025069,
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'reasoning_content': '{\n  "steps": [\n    {\n      "explanation": "First, subtract 7 from both sides to isolate the term with the variable.",\n      "output": "8x = -23 - 7"\n    },\n    {\n      "explanation": "Simplify the right side by performing the subtraction.",\n      "output": "8x = -30"\n    },\n    {\n      "explanation": "Next, divide both sides by 8 to solve for x.",\n      "output": "x = -30 / 8"\n    },\n    {\n      "explanation": "Simplify the fraction to get the final answer.",\n      "output": "x = -15/4"\n    }\n  ]\n  , "final_answer": "x = -15/4"\n  , "difficulty": "medium"\n}',
    'content': None},
   'logprobs': None,
   'finish_reason': 'eos_token'}],
 'usage': {'prompt_tokens': 48, 'completion_tokens': 180, 'total_tokens': 228}}

In [16]:
MathReasoning.model_validate_json(
    response["choices"][0]["message"]["reasoning_content"]
).model_dump()

{'steps': [{'explanation': 'First, subtract 7 from both sides to isolate the term with the variable.',
   'output': '8x = -23 - 7'},
  {'explanation': 'Simplify the right side by performing the subtraction.',
   'output': '8x = -30'},
  {'explanation': 'Next, divide both sides by 8 to solve for x.',
   'output': 'x = -30 / 8'},
  {'explanation': 'Simplify the fraction to get the final answer.',
   'output': 'x = -15/4'}],
 'final_answer': 'x = -15/4',
 'difficulty': <Difficulty.medium: 'medium'>}

In [6]:
sess.delete_endpoint(endpoint_name)
sess.delete_endpoint_config(endpoint_name)
lmi_model.delete_model()