In [None]:
# Imports
import random
import re
from datetime import datetime

import rlcard
import yaml
from openai import OpenAI
from rlcard.agents import RandomAgent

In [None]:
# Base path (without extension) of the game-log files, stamped with the
# second-resolution time at which this cell was executed.
now = datetime.now()
formatted = f"{now:%Y-%m-%d %H:%M:%S}"
storage_name = f"../Agent-Pro/data/training/{formatted}"

In [3]:
# DeepSeek R1 model
class DEEPSEEKR1:
    """
    Chat client for the DeepSeek R1 model, accessed through the OpenAI SDK.

    The API key, model id and provider base URL are all read from
    '../config.yaml' (sections "Keys", "IDs" and "Providers").

    Attributes:
        model: Model id taken from config["IDs"]["DEEPSEEKR1PAID"].
        client (OpenAI): Client built with the configured base URL and API key.
    """

    def __init__(self) -> None:
        with open("../config.yaml", "r") as config_file:
            settings = yaml.safe_load(config_file)

        secret = settings["Keys"]["DEEPSEEKR1PAID"]
        self.model = settings["IDs"]["DEEPSEEKR1PAID"]
        self.client = OpenAI(
            base_url=settings["Providers"]["DEEPSEEK"],
            api_key=secret,
        )

    def response(self, mes):
        """
        Send a chat-completion request and return the text of the first choice.

        Args:
            mes: The message list forwarded unchanged as 'messages'.

        Returns:
            The 'content' of the first choice's message.
        """
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=mes,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content

In [None]:
# Blackjack Agent
class LlmAgent(RandomAgent):
    """
    An agent that uses a Large Language Model (LLM) to make decisions in a Blackjack game.

    This agent formulates a prompt based on the current game state, rules,
    and predefined behavioral guidelines and world modeling. It then queries an LLM
    to get a reasoned action (hit or stand).

    Attributes:
        llm (DEEPSEEKR1): An instance of the LLM used for decision making.
        behavioral_guideline (str): A string defining the agent's playing style
                                    or specific instructions.
        world_modeling (str): A string containing information or assumptions
                              about the game environment or opponent.
    """

    def __init__(self, num_actions):
        super().__init__(num_actions)
        self.llm = DEEPSEEKR1()

        # Both policies start empty; the training loop overwrites them.
        self.behavioral_guideline = ""
        self.world_modeling = ""

    def extract_choice(self, text):
        """
        Return the last action word ("hit" or "stand") mentioned in 'text'.

        Matching is case-insensitive and only accepts the words when they are
        not embedded in a longer alphabetic word, so "understand", "standard"
        or "white" are not mistaken for an action.

        Args:
            text (str | None): The raw LLM reply.

        Returns:
            str | None: "hit" or "stand", or None when 'text' is empty/None or
            mentions neither action.
        """
        if not text:
            return None
        # Letter-only boundaries (instead of \b) so markdown such as
        # "__Hit__" or "{Stand}" is still recognised.
        mentions = re.findall(r"(?<![a-z])(hit|stand)(?![a-z])", self.to_lower(text))
        return mentions[-1] if mentions else None

    def to_lower(self, str):
        """Return 'str' lower-cased (parameter name kept for backward compatibility)."""
        return str.lower()

    def card2string(self, cardList):
        """
        Render a list of card codes as one comma-separated, readable string.

        The suit letters C/S/H/D are expanded to "Club ", "Spade ", "Heart ",
        "Diamond " and the letter T is rendered as "10"; for example
        ['SA', 'HT'] becomes 'Spade A,Heart 10'.

        Args:
            cardList (list[str]): Card codes to render.

        Returns:
            str: The readable description.
        """
        # Order matters: 'T' is handled last, and none of the inserted suit
        # names contains an upper-case letter that a later rule would rewrite.
        replacements = (
            ('C', 'Club '),
            ('S', 'Spade '),
            ('H', 'Heart '),
            ('D', 'Diamond '),
            ('T', '10'),
        )
        text = ','.join(cardList)
        for code, readable in replacements:
            text = text.replace(code, readable)
        return text

    def step(self, state):
        """
        Determines the agent's next action based on the current game state using an LLM.

        This method constructs a detailed prompt for the LLM, including the
        game rules (customized for a target total of 31 and dealer hitting up to 27),
        the dealer's visible card, the agent's current hand, and any specified
        behavioral guidelines or world modeling. The LLM's response is then parsed
        to extract the chosen action ("hit" or "stand"). The interaction
        (prompt and response) is appended to the YAML file named by the global
        'storage_name' plus '.yaml'.

        The game rules described in the prompt are specific:
        - Target total: 31 (not standard Blackjack).
        - Bust limit: Over 31.
        - Actions: "Stand" or "Hit".
        - Dealer hits until their total is 27 or higher.

        Args:
            state (dict): A dictionary containing the current game state.
                          Expected to have a key 'raw_obs' which itself is a dict
                          containing 'dealer hand' (list of dealer's cards) and
                          'player0 hand' (list of agent's cards).

        Returns:
            int: The chosen action:
                 - 0 for "hit"
                 - 1 for "stand"
                 - -1 if no action could be extracted from the LLM's response.

        Note:
            Exceptions raised by the LLM call or by writing the log file are
            not caught here and propagate to the caller.
        """

        deal_card = state['raw_obs']['dealer hand']
        hand_card = state['raw_obs']['player0 hand']
        begin_info = "I will describe the situation. You have to reason through this in 3-5 steps, then stop. The description begins now. You are an aggressive player of blackjack who likes to take risks to earn high returns. Please beat the dealer and win the game. "

        game_rule = "Game Rule:\n1. Please try to get your card total to as close to 31 as possible, without going over, and still having a higher total than the dealer.\n2. If anyone's point total exceeds 31, he or she loses the game. \n3. You can only choose one of the following two actions: {\"Stand\", \"Hit\"}. If you choose to Stand, you will stop taking cards and wait for the dealer to finish. If you choose to Hit, you can continue to take a card, but there is also the risk of going over 31. \n4. After all players have completed their hands, the dealer reveals their hidden card. Dealers must hit until their cards total 27 or higher.\n"
        game_info = "The dealer's current card is {" + self.card2string(deal_card) + "}. The dealer has another hidden card. You don't know what it is. Your current cards are {" + self.card2string(hand_card) + "}. "

        game_info += "Behavioral guideline: " + self.behavioral_guideline + "\n"
        game_info += "World modeling: " + self.world_modeling + "\n"
        game_info += "Please read the behavioral guideline and world modeling carefully. Then you should analyze your own cards and your strategies in Self-belief and then analyze the dealer cards in World-belief. Lastly, please select your action from {\"Stand\",\"Hit\"}.### Output Format: Self-Belief is {Belief about youself}. World-Belief is {Belief about the dealer}. My action is {Your action}. Please output in the given format. Do not write anything else."

        p = [{"role": "user", "content": begin_info + game_rule + game_info}]
        llm_res = self.llm.response(p)
        p.append({"role": "assistant", "content": llm_res})

        # Append this exchange to the per-game log; play() and train() use the same file.
        filename = storage_name + '.yaml'
        with open(filename, "a") as yaml_file:
            yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)

        # Parse the reply once and map the word to its action id (-1 = unclear).
        action_ids = {"hit": 0, "stand": 1}
        return action_ids.get(self.extract_choice(llm_res), -1)

In [5]:
# DeepSeek agent: the LLM-driven player. Its step() answers with 0 (hit),
# 1 (stand) or -1 when no action can be parsed from the model's reply.
llm_agent = LlmAgent(num_actions=2)

In [6]:
# Separate DeepSeek client (independent of llm_agent.llm); it is handed to
# train() for the post-game reflection prompts.
llm = DEEPSEEKR1()

In [None]:
def play(env):
    """
    Runs a single game episode using the provided environment and appends
    the final cards and the outcome to a YAML log file.

    The function executes a game via 'env.run(is_training=False)', takes the
    last entry of the first player's trajectory and formats the two elements
    of its 'raw_obs'['state'] into a string (element 1 is labelled "dealer",
    element 0 is labelled "player"). The outcome for the first player is
    derived from the sign of 'payoffs[0]': positive is a win, zero a draw and
    negative a loss.

    The record is appended to the file named by the global variable
    'storage_name' plus '.yaml'.

    Args:
        env: An environment object that simulates the game. It must have a
             'run(is_training=...)' method returning '(trajectories, payoffs)'.

    Returns:
        None

    Raises:
        ValueError: If the first player's trajectory is empty, so there is no
                    final state to log.
    """

    trajectories, payoffs = env.run(is_training=False)

    # Without a final state there is nothing meaningful to record; fail with
    # an explicit message instead of an unbound-variable error further down.
    if len(trajectories[0]) == 0:
        raise ValueError("play(): the first player's trajectory is empty; no final state to log")

    final_obs = trajectories[0][-1]['raw_obs']
    res_str = ('dealer {}, '.format(final_obs['state'][1]) +
               'player {}, '.format(final_obs['state'][0]))

    # Classify by sign so payoffs other than exactly 1 / 0 / -1 still get a label.
    payoff = payoffs[0]
    if payoff > 0:
        final_res = "win."
    elif payoff < 0:
        final_res = "lose."
    else:
        final_res = "draw."

    p = [{"final cards": res_str, "final results": final_res}]

    filename = storage_name + '.yaml'
    with open(filename, "a") as yaml_file:
        yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)


In [None]:
def _split_policy(res, previous_world_modeling):
    """Split an LLM policy reply into (behavioral guideline, world modeling)."""
    # Preferred split point: a line that starts with "World Modeling",
    # optionally preceded by markdown decoration such as '**', '#' or '>'.
    heading = re.search(r"^[ \t>#*_]*World Modeling", res, flags=re.IGNORECASE | re.MULTILINE)
    if heading is not None:
        cut = heading.start()
    else:
        # Fall back to the bold marker appearing anywhere in the text.
        cut = res.find("**World Modeling**")
    if cut == -1:
        # No world-modeling section at all: treat the whole reply as the
        # guideline and keep the previous world model rather than crash.
        return res.strip(), previous_world_modeling
    return res[:cut].strip(), res[cut:].strip()


def train(results, llm, old_policy):
    """
    Start a learning process for an LLM-based Blackjack agent by making it
    reflect on a past game and refine its policies.

    The function performs the following steps:
    1. Reads the raw text of a game log (file 'results + '.yaml'').
    2. Prompts the LLM to analyze the game record, focusing on why the game
       was lost, problematic beliefs/actions, and underlying reasons. The game
       rules (target 31, dealer hits to 27) are provided as context.
    3. Prompts the LLM again to distill new "Behavioral Guidelines" and
       "World Modeling" based on its analysis and the 'old_policy'.
    4. Appends both conversations (prompts and LLM responses) to the file
       'results + '_train.yaml''.
    5. Splits the LLM's final response at the "World Modeling" heading into
       the new behavioral guideline and the new world modeling.
    6. Returns these updated policy strings.

    Args:
        results (str): The base name for the YAML log files.
        llm: An LLM interface object with a 'response(messages)' method that
             takes a list of message dictionaries and returns a string response.
        old_policy (tuple or list): A sequence containing two strings:
            - 'old_policy[0]' (str): The previous behavioral guideline.
            - 'old_policy[1]' (str): The previous world modeling.

    Returns:
        tuple: A tuple containing two strings:
            - 'behavioral_guideline' (str): The text before the
              "World Modeling" heading, stripped.
            - 'world_modeling' (str): The text from the "World Modeling"
              heading onwards, stripped. If the response contains no such
              heading, the whole response is returned as the guideline and
              'old_policy[1]' is returned unchanged.
    """

    with open(results + '.yaml', 'r') as file:
        content = file.read()

    game_rule = "Game Rule:\n1. Please try to get your card total to as close to 31 as possible, without going over, and still having a higher total than the dealer.\n2. If anyone's point total exceeds 31, he or she loses the game. \n3. You can only choose one of the following two actions: {\"Stand\", \"Hit\"}. If you choose to Stand, you will stop taking cards and wait for the dealer to finish. If you choose to Hit, you can continue to take a card, but there is also the risk of going over 31. \n4. After all players have completed their hands, the dealer reveals their hidden card. Dealers must hit until their cards total 27 or higher.\n"
    setup = "You are a seasoned blackjack expert, and you need to carefully reflect on the following record of this losing game: "
    game_record = f"Game Record: {content}\n"

    analysis_setup = """
    Correctness: Whether its beliefs about yourself, the game, and the dealer align with the final results.
    Consistency: Whether each belief and action is self - contradictory.
    Reasons: Reflect on why you lost to your dealer, which beliefs and actions are
    problematic, and what the underlying reasons are.
    ### Output Format: I analyze this game as follows: { Your analysis about the game and belief }.
    """

    # First exchange: reflection on the recorded game.
    p = [{"role": "user", "content": game_rule + setup + game_record + analysis_setup}]
    res = llm.response(p)
    p.append({"role": "assistant", "content": res})

    reflection = "Policy-Level Reflection: " + res + "\n"
    guidelines = f"""
    Following the previous rigorous analysis, you should distill and articulate a set of
    Behavoiral Guidelines and World Modeling. The Behavoiral Guideline is about what
    you consider to be a more reasonable and effective behavioral strategy and
    suggestions. World Modeling is about the description of the game and the dealer.

    Here are some suggestions for you:

    Behavoiral Guideline
        1-Goal: Please summarize the detailed goal based on your reflection ...
        2-Strategy: What kind of strategy can lead you to win in similar games ...
        3-Demonstration: Can this game be considered a typical example to be preserved for
        future reference ...
    World Modeling
        1-Rule-Description: Based on the recent reflection , describe any game rules or details
        that are easy to overlook ...


    Update previous policies that you used to have to better match your current understanding.

    Previous Behavoiral Guideline:
    {old_policy[0]}

    Previous World Modeling:
    {old_policy[1]}

    Always start description of new Behavoiral Guideline with words Behavoiral Guideline and new World Modeling with words World Modeling.

    """

    # Second exchange: a fresh conversation that distills the updated policy.
    n = [{"role": "user", "content": game_record + reflection + guidelines}]
    res = llm.response(n)
    n.append({"role": "assistant", "content": res})

    # Log both conversations before parsing so they are kept even if parsing fails.
    filename = results + '_train' + '.yaml'
    with open(filename, "a") as yaml_file:
        yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)
        yaml.dump(n, yaml_file, default_flow_style=False, allow_unicode=True)

    behavioral_guideline, world_modeling = _split_policy(res, old_policy[1])

    return behavioral_guideline, world_modeling

In [None]:
# Random games
for i in range(13, 51):
    # A fresh, microsecond-resolution log name for every game; step(), play()
    # and train() all write to / read from files derived from it.
    now = datetime.now()
    formatted = f"{now:%Y-%m-%d %H:%M:%S.%f}"
    storage_name = f"../Agent-Pro/data/training/{formatted}"

    # Single-player blackjack with a random seed, played by the LLM agent.
    env = rlcard.make(
        'blackjack',
        config={'game_num_players': 1, "seed": random.randint(0, 10**10)},
    )
    env.set_agents([llm_agent])
    play(env)

    # Reflect on the game just played and install the refined policy.
    behavioral_guideline, world_modeling = train(
        storage_name,
        llm,
        [llm_agent.behavioral_guideline, llm_agent.world_modeling],
    )
    llm_agent.behavioral_guideline, llm_agent.world_modeling = (
        behavioral_guideline,
        world_modeling,
    )

    print(f"Current game: {i}")

In [19]:
# Show the policy the agent ended up with: guideline first, then world model.
print(llm_agent.behavioral_guideline, llm_agent.world_modeling, sep="\n")

**Behavioral Guideline**  
1. **Goal**: Maximize aggression within mathematically validated thresholds, prioritizing survival in 4-card hands while targeting 28-31 totals. Enforce unbreakable stand points to neutralize dealer compounding risks.  
2. **Strategy**:  
   - **Fractured Aggression Protocol**:  
     - 3-card hands: Hit to 28+ against Q/K upcards (was 27+) to counter dealer’s 17-20 initial total threat  
     - 4-card hands: **Mandatory stand at 29+** (no exceptions), with immediate termination of aggression upon reaching threshold  
   - **Arithmetic Supremacy Clause**:  
     - Triple-verify totals after every hit (upgraded from double-check) to eliminate 10% error risk  
     - Treat 30/31 as "fortress totals" – instant stand regardless of dealer upcard  
   - **Dealer Ambush Override**:  
     - Against Q/K upcards, assume dealer’s hidden card creates initial 17+ total until proven otherwise through hand progression  
3. **Demonstration**: Preserve *4-Card Standoff* (thi