In [None]:
# Imports
import pickle
import random
import re
from datetime import datetime

import rlcard
import yaml
from openai import OpenAI
from rlcard.agents import RandomAgent

In [4]:
# DeepSeek R1 model
class DEEPSEEKR1:
    """Minimal chat-completion client for the DeepSeek R1 model.

    Connection details are read from a YAML config file that must provide:

    - ``Keys.DEEPSEEKR1PAID``: the API key,
    - ``IDs.DEEPSEEKR1PAID``: the model identifier sent with every request,
    - ``Providers.DEEPSEEK``: the base URL handed to the ``OpenAI`` client.

    Attributes:
        model: Model identifier used for every completion request.
        client: ``OpenAI`` client bound to the configured base URL and key.
    """

    def __init__(self, config_path="../config.yaml") -> None:
        """Load credentials from ``config_path`` and build the API client.

        Args:
            config_path: Path of the YAML config file. Defaults to the
                location the notebook has always used (``../config.yaml``).

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If one of the expected config entries is missing.
        """
        with open(config_path, "r", encoding="utf-8") as file:
            config = yaml.safe_load(file)

        api_key = config["Keys"]["DEEPSEEKR1PAID"]
        self.model = config["IDs"]["DEEPSEEKR1PAID"]

        self.client = OpenAI(
            base_url=config["Providers"]["DEEPSEEK"],
            api_key=api_key,
        )

    def response(self, mes):
        """Send a chat history to the model and return the reply text.

        Args:
            mes: List of chat messages (``{"role": ..., "content": ...}``
                dicts) forwarded unchanged as the ``messages`` argument.

        Returns:
            The ``content`` of the first choice in the completion.
        """
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=mes,
        )

        return completion.choices[0].message.content

In [None]:
# Run configuration: placeholders plus the timestamped path prefix under which
# the YAML interaction logs are written.
response = ''
game_style = 'agentpro'

# One timestamp per kernel session; second resolution.
now = datetime.now()
formatted = now.strftime("%Y-%m-%d %H:%M:%S")
storage_name = f'../Agent-Pro/my_data/DeepSeek R1/{formatted}'

# Blackjack Agent
class LlmAgent(RandomAgent):
    """
    An agent that uses a Large Language Model (LLM) to make decisions in a Blackjack game.

    This agent formulates a prompt based on the current game state, rules,
    and predefined behavioral guidelines and world modeling. It then queries an LLM
    to get a reasoned action (hit or stand).

    Every prompt/response pair is appended to ``<storage_name>.yaml``, where
    ``storage_name`` is a module-level variable read at call time.

    Attributes:
        llm (DEEPSEEKR1): An instance of the LLM used for decision making.
        behavioral_guideline (str): A string defining the agent's playing style
                                    or specific instructions.
        world_modeling (str): A string containing information or assumptions
                              about the game environment or opponent.
    """

    # Suit / rank codes and their prompt-friendly replacements, applied in this order.
    _CARD_REPLACEMENTS = (
        ('C', 'Club '),
        ('S', 'Spade '),
        ('H', 'Heart '),
        ('D', 'Diamond '),
        ('T', '10'),
    )

    # Whole-word "hit"/"stand" (plus simple inflections such as "hits",
    # "hitting", "standing"). Word boundaries keep words like "understand",
    # "standoff" or "architecture" from being mistaken for an action.
    _CHOICE_PATTERN = re.compile(r"\b(hit|stand)(?:s|ting|ing)?\b")

    def __init__(self, num_actions):
        """Create the agent with empty guideline / world-model strings.

        Args:
            num_actions (int): Number of actions, forwarded to ``RandomAgent``.
        """
        super().__init__(num_actions)
        self.llm = DEEPSEEKR1()

        self.behavioral_guideline = ""
        self.world_modeling = ""

    def extract_choice(self, text):
        """Extract the action the LLM settled on from its free-text answer.

        The last whole-word mention of "hit" or "stand" (case-insensitive)
        wins, since the requested output format puts the action at the end.

        Args:
            text (str | None): Raw LLM response.

        Returns:
            str | None: ``"hit"``, ``"stand"``, or ``None`` when the response
            is empty or mentions neither action.
        """
        if not text:
            return None
        mentions = self._CHOICE_PATTERN.findall(self.to_lower(text))
        return mentions[-1] if mentions else None

    def to_lower(self, str):
        """Return the given string converted to lowercase."""
        # Parameter name kept for backward compatibility with keyword callers.
        return str.lower()

    def card2string(self, cardList):
        """Render a list of card codes as a readable, comma-separated string.

        Args:
            cardList (list[str]): Card codes; suit letters C/S/H/D and the
                rank letter T are expanded (e.g. ``'HT'`` -> ``'Heart 10'``).

        Returns:
            str: The joined, expanded description.
        """
        described = ','.join(cardList)
        for code, replacement in self._CARD_REPLACEMENTS:
            described = described.replace(code, replacement)
        return described

    def step(self, state):
        """
        Determines the agent's next action based on the current game state using an LLM.

        This method constructs a detailed prompt for the LLM, including the
        game rules (customized for a target total of 31 and dealer hitting up to 27),
        the dealer's visible card, the agent's current hand, and any specified
        behavioral guidelines or world modeling. The LLM's response is then parsed
        to extract the chosen action ("hit" or "stand"). The interaction history
        (prompt and response) is appended to ``<storage_name>.yaml``.

        The game rules described in the prompt are specific:
        - Target total: 31 (not standard Blackjack).
        - Bust limit: Over 31.
        - Actions: "Stand" or "Hit".
        - Dealer hits until their total is 27 or higher.

        Args:
            state (dict): A dictionary containing the current game state.
                          Expected to have a key 'raw_obs' which itself is a dict
                          containing 'dealer hand' (list of dealer's cards) and
                          'player0 hand' (list of agent's cards).

        Returns:
            int: The chosen action:
                 - 0 for "hit"
                 - 1 for "stand"
                 - -1 if no action could be parsed from the LLM's response.
                   NOTE(review): an environment that indexes its action list
                   with this id would resolve -1 to the last action - confirm
                   this is the intended fallback.

        Raises:
            Exceptions from the LLM client or from writing the log file are
            not caught here and propagate to the caller.
        """
        deal_card = state['raw_obs']['dealer hand']
        hand_card = state['raw_obs']['player0 hand']
        p = []
        begin_info = "I will describe the situation. You have to reason through this in 3-5 steps, then stop. The description begins now. You are an aggressive player of blackjack who likes to take risks to earn high returns. Please beat the dealer and win the game. "

        game_rule = "Game Rule:\n1. Please try to get your card total to as close to 31 as possible, without going over, and still having a higher total than the dealer.\n2. If anyone's point total exceeds 31, he or she loses the game. \n3. You can only choose one of the following two actions: {\"Stand\", \"Hit\"}. If you choose to Stand, you will stop taking cards and wait for the dealer to finish. If you choose to Hit, you can continue to take a card, but there is also the risk of going over 31. \n4. After all players have completed their hands, the dealer reveals their hidden card. Dealers must hit until their cards total 27 or higher.\n"
        game_info = "The dealer's current card is {" + self.card2string(deal_card) + "}. The dealer has another hidden card. You don't know what it is. Your current cards are {" + self.card2string(hand_card) + "}. "

        game_info += "Behavioral guideline: " + self.behavioral_guideline + "\n"
        game_info += "World modeling: " + self.world_modeling + "\n"
        game_info += "Please read the behavioral guideline and world modeling carefully. Then you should analyze your own cards and your strategies in Self-belief and then analyze the dealer cards in World-belief. Lastly, please select your action from {\"Stand\",\"Hit\"}.### Output Format: Self-Belief is {Belief about youself}. World-Belief is {Belief about the dealer}. My action is {Your action}. Please output in the given format. Do not write anything else."

        p.append({"role": "user", "content": begin_info + game_rule + game_info})
        llm_res = self.llm.response(p)
        p.append({"role": "assistant", "content": llm_res})

        # The prompts contain non-ASCII characters and are dumped with
        # allow_unicode=True, so the log must not depend on the platform's
        # default encoding.
        filename = storage_name + '.yaml'
        with open(filename, "a", encoding="utf-8") as yaml_file:
            yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)

        # Parse once and map the textual action to its numeric id.
        parsed = self.extract_choice(llm_res)
        if parsed == "hit":
            return 0
        if parsed == "stand":
            return 1
        return -1

In [6]:
# Results table
# results[player_total][dealer_card] -> list of recorded actions.
# Player totals run from 27 down to 8; dealer cards are 2..10 plus 'A'.
# Every cell gets its own fresh list.
results = {
    hand_total: {upcard: [] for upcard in [*range(2, 11), 'A']}
    for hand_total in range(27, 7, -1)
}

In [7]:
# DeepSeek agent: LLM-backed Blackjack player with two actions (0 = hit, 1 = stand)
llm_agent = LlmAgent(num_actions=2)

In [None]:
# World Modeling and Behavioral Guidelines, taken from the final iteration.
# Both strings are inserted verbatim into the prompt built by LlmAgent.step,
# so any edit to their wording changes what the LLM is told.

llm_agent.behavioral_guideline = """
**Behavioral Guideline**  
1. **Goal**: Maximize aggression within mathematically validated thresholds, prioritizing survival in 4-card hands while targeting 28-31 totals. Enforce unbreakable stand points to neutralize dealer compounding risks.  
2. **Strategy**:  
   - **Fractured Aggression Protocol**:  
     - 3-card hands: Hit to 28+ against Q/K upcards (was 27+) to counter dealer’s 17-20 initial total threat  
     - 4-card hands: **Mandatory stand at 29+** (no exceptions), with immediate termination of aggression upon reaching threshold  
   - **Arithmetic Supremacy Clause**:  
     - Triple-verify totals after every hit (upgraded from double-check) to eliminate 10% error risk  
     - Treat 30/31 as "fortress totals" – instant stand regardless of dealer upcard  
   - **Dealer Ambush Override**:  
     - Against Q/K upcards, assume dealer’s hidden card creates initial 17+ total until proven otherwise through hand progression  
3. **Demonstration**: Preserve *4-Card Standoff* (this game) as canonical example: 29 total with 4 cards forced stand despite dealer’s K upcard, demonstrating protocol adherence overrides outcome ambiguity.
"""

llm_agent.world_modeling = """
**World Modeling**1. **Rule-Description**:  
   - **Dealer’s Compounding Horizon**: Dealers must hit until reaching 27+, creating 18-23% bust risk per additional card drawn beyond 3  
   - **4-Card Fragility Window**: Player 4-card hands at 24-28 have 52-68% bust risk on next hit – prohibited against Q/K upcards  
   - **Hidden Card Quantum Lock**: K upcard dealers have 33% probability of hidden card being 2-7 (initial 12-17), requiring players to build ≥28 totals  
2. **Strategic Recognition**:  
   - **Upcard Entanglement**:  
     - K upcard creates dual reality: immediate dealer stand threat (initial 20) vs delayed compounding (initial 12-17)  
     - Player must resolve uncertainty by reaching 28+ with ≤4 cards  
   - **Threshold Elasticity 4.0**:  
     - For every confirmed dealer initial total point below 18, reduce player stand threshold by 1.0 (not 0.75) to counter compounding acceleration  
3. **Calibration Matrix**:  
   - **K-Upcard Response**: 28+ (3 cards)/29+ (4 cards) with triple total verification  
   - **Catastrophic Zones**:  
     - 4-card 25-28 totals banned against K upcards (58-72% bust risk)  
     - Dealer initial totals 18-20 require player minimum 29 to win 
"""

In [None]:
def play_game(env):
    """
    Simulates a game using the provided environment, records actions taken
    during the game, logs the final outcome, and returns the game payoffs.

    The trajectory of the first player (trajectories[0]) is read as an
    alternating sequence of states (even positions) and actions (odd
    positions), followed by the final state. For every action whose preceding
    player total lies in 8..27, the action is appended to the module-level
    ``results`` table under ``results[player_total][dealer_value]``.

    The dealer's card value is interpreted as its numerical value, or 'A' (Ace)
    if the observed value is greater than 10.

    The final game state (dealer's and player's final standing) and the outcome
    (win, draw, lose) are appended to ``<storage_name>.yaml``, where
    ``storage_name`` is a module-level variable read at call time.

    Args:
        env: An environment object that has a 'run' method to simulate the game
             and a 'get_payoffs' method to retrieve the results.

    Returns:
        list: A list of payoffs for the players involved in the game,
              as returned by 'env.get_payoffs()'.

    Raises:
        ValueError: If the environment returns an empty trajectory for the
            first player, so there is no final state to log.
    """

    trajectories, payoffs = env.run(is_training=False)
    trajectory = trajectories[0]
    if len(trajectory) == 0:
        # Previously this surfaced later as a NameError on an unbound local.
        raise ValueError("env.run() returned an empty trajectory for player 0")

    # Raw observation of the last state; holds the final hands for the log.
    final_raw_obs = trajectory[-1]['raw_obs']

    last_hand_value = 0
    dealer_value = None
    for i, situation in enumerate(trajectory[:-1]):
        if i % 2 == 0:  # State
            last_hand_value = int(situation['obs'][0])
            dealer_value = int(situation['obs'][1])
            dealer_value = dealer_value if dealer_value <= 10 else 'A'
        else:  # Action
            action = int(situation)
            # Only totals covered by the results table are recorded.
            if 8 <= last_hand_value <= 27:
                results[last_hand_value][dealer_value].append(action)

    res_str = ('dealer {}, '.format(final_raw_obs['state'][1]) +
               'player {}, '.format(final_raw_obs['state'][0]))
    if payoffs[0] == 1:
        final_res = "win."
    elif payoffs[0] == 0:
        final_res = "draw."
    elif payoffs[0] == -1:
        final_res = "lose."
    else:
        # Keep logging instead of failing on an unbound local.
        final_res = "unknown payoff {}.".format(payoffs[0])

    p = [{"final cards": res_str, "final results": final_res}]
    filename = storage_name + '.yaml'
    # Explicit encoding: the dump uses allow_unicode=True.
    with open(filename, "a", encoding="utf-8") as yaml_file:
        yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)
    return env.get_payoffs()

In [None]:
# Play 50 single-player games, each on a freshly created, randomly seeded environment
for i in range(1, 51):
    # New microsecond-resolution log prefix per game; storage_name is the
    # module-level name that play_game reads when it appends to the YAML log.
    now = datetime.now()
    formatted = now.strftime("%Y-%m-%d %H:%M:%S.%f")
    storage_name = f'../Agent-Pro/my_data/31_run_post_training/{formatted}'

    env = rlcard.make(
        'blackjack',
        config={'game_num_players': 1, "seed": random.randint(0, 10**10)},
    )
    env.set_agents([llm_agent])

    play_game(env)
    print(f"Current game: {i}")

In [None]:
# Store results
# `id` is never assigned in this notebook, so it normally resolves to Python's
# builtin function and the file would be named "id_<built-in function id>.pkl".
# If an earlier cell did rebind `id` to a real run identifier, keep using it;
# otherwise fall back to a timestamp so each run gets a distinct, sane filename.
run_id = id if not callable(id) else datetime.now().strftime("%Y%m%d_%H%M%S")
with open(f"../dicts/31_run_post_training/id_{run_id}.pkl", "wb") as f:
    pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)