In [1]:
# Imports
import torch

from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
# DeepSeek R1 Qwen model
class DEEPSEEKR1QWEN:
    """Chat wrapper around the DeepSeek-R1-Distill-Qwen-7B causal language model.

    The tokenizer and model are loaded once at construction time; `response`
    then turns a chat-style message list into one decoded completion.
    """

    # Hub identifier shared by the tokenizer and the model weights.
    MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

    def __init__(self) -> None:
        """Load the tokenizer and the bfloat16 model for `MODEL_ID`."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            device_map="auto",
            torch_dtype=torch.bfloat16,
            # NOTE(review): presumably needs the flash-attn package and a
            # supported GPU to load -- confirm for the target environment.
            attn_implementation="flash_attention_2",
        )

    def response(self, mes, max_new_tokens=10000, temperature=0.6):
        """Generate one completion for a chat conversation.

        Args:
            mes: Conversation passed to the tokenizer's chat template, e.g.
                ``[{"role": "user", "content": "..."}]``.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Sampling temperature forwarded to ``generate``.

        Returns:
            The decoded completion (prompt tokens removed, special tokens
            skipped) as a string.
        """
        input_ids = self.tokenizer.apply_chat_template(
            mes,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(self.model.device)

        # The prompt is a single, unpadded sequence, so every position is a
        # real token. Deriving the mask from `pad_token_id` is unsafe: it
        # fails when the tokenizer defines no pad token, and it hides genuine
        # prompt tokens whenever the pad id equals another token's id
        # (for example when pad and EOS share an id).
        attention_mask = torch.ones_like(input_ids)

        outputs = self.model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            # Request sampling explicitly so `temperature` is honoured
            # regardless of the checkpoint's default generation settings.
            do_sample=True,
            temperature=temperature,
            pad_token_id=self.tokenizer.eos_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
        )

        # `generate` returns prompt + completion; keep only the completion.
        completion_ids = outputs[0, input_ids.shape[1]:]
        return self.tokenizer.decode(completion_ids, skip_special_tokens=True)

In [2]:
class QWQ32B:
    """Chat wrapper around the Qwen/QwQ-32B causal language model.

    The tokenizer and model are loaded once at construction time; `response`
    then turns a chat-style message list into one decoded completion.
    """

    # Hub identifier shared by the tokenizer and the model weights.
    MODEL_ID = "Qwen/QwQ-32B"

    def __init__(self) -> None:
        """Load the tokenizer and the model for `MODEL_ID`."""
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_ID)
        self.model = AutoModelForCausalLM.from_pretrained(
            self.MODEL_ID,
            device_map="auto",
            torch_dtype="auto",
        )

    def response(self, mes, max_new_tokens=32768, temperature=0.6, top_p=0.95, top_k=40):
        """Generate one completion for a chat conversation.

        Args:
            mes: Conversation passed to the tokenizer's chat template, e.g.
                ``[{"role": "user", "content": "..."}]``.
            max_new_tokens: Upper bound on the number of generated tokens.
            temperature: Controls randomness (higher = more random).
            top_p: Nucleus sampling threshold (higher = more diverse).
            top_k: Limits token selection to the k most likely tokens.

        Returns:
            The decoded completion (prompt tokens removed, special tokens
            skipped) as a string.
        """
        # Render the conversation to text first, then tokenize it; the
        # tokenizer call supplies both `input_ids` and `attention_mask`.
        prompt_text = self.tokenizer.apply_chat_template(
            mes,
            tokenize=False,
            add_generation_prompt=True,
        )
        model_inputs = self.tokenizer([prompt_text], return_tensors="pt").to(self.model.device)

        # NOTE: `presence_penalty` (previously commented out here) is not a
        # Hugging Face `generate()` option; `repetition_penalty` is the
        # closest documented setting if repetition needs to be discouraged.
        output_ids = self.model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens,
            # Request sampling explicitly so temperature/top_p/top_k are
            # honoured regardless of the checkpoint's default settings.
            do_sample=True,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
        )

        # `generate` returns prompt + completion; keep only the completion.
        prompt_length = model_inputs["input_ids"].shape[1]
        completion_ids = output_ids[:, prompt_length:]
        return self.tokenizer.batch_decode(completion_ids, skip_special_tokens=True)[0]

In [3]:
# Single-turn conversation: one user message describing a blackjack hand and
# asking the model to choose between the two available actions.
message = [
    dict(
        role="user",
        content="You are playing BlackJack, the dealer has a 9 of Spades, you have a Queen of Hearts and a Three of Diamonds, availabe actions are Hit and Stand. What is your action?",
    ),
]

In [None]:
# Build the DeepSeek wrapper; its constructor loads the tokenizer and model.
llm = DEEPSEEKR1QWEN()
# Generate a single completion for the conversation held in `message`.
response = llm.response(message)

In [10]:
# Print (rather than echo the bare value) so the multi-line completion is
# shown with real line breaks.
print(response)

Okay, so I'm playing BlackJack against the dealer who has a 9 of Spades. My hand consists of a Queen of Hearts and a Three of Diamonds. I need to decide whether to hit or stand. Let me think through this step by step.

First, I should figure out the current state of the game. I have a Queen and a Three. In Blackjack, the goal is to get as close to 21 as possible without going over. So, with a Queen (12 points) and a Three (3 points), I have 15 points total. That's not too bad, but I need to consider the dealer's hand.

The dealer has a 9 of Spades, which is a high card, so they're probably using it to get a strong hand. I need to see if I can improve my hand or if I should bust.

If I hit, I risk getting another card that could push me over 21. Let's see what's available. I don't have any other cards, so I can't hit. Wait, no, I have two cards, but the action is only hit or stand. So, I can't hit because there are no more cards. Therefore, I must stand.

Wait, but maybe I should check 