In [1]:
# Imports
import pickle
import random
import rlcard
import rlcard.envs
import yaml

from datetime import datetime
from rlcard.agents import RandomAgent
from openai import OpenAI

In [2]:
# Session-wide log prefix: the DeepSeek R1 data directory plus a
# second-resolution timestamp. Writers append '.yaml' to this prefix.
now = datetime.now()
formatted = f"{now:%Y-%m-%d %H:%M:%S}"
storage_name = f"../Agent-Pro/my_data/DeepSeek R1/{formatted}"

In [3]:
# DeepSeek R1 model
class DEEPSEEKR1:
    """Chat-completion client configured from ``../config.yaml``.

    The config file is expected to provide ``Keys.DEEPSEEKR1FREE`` (API key),
    ``IDs.DEEPSEEKR1FREE`` (model id) and ``Providers.OPENROUTER`` (base URL).
    """

    def __init__(self) -> None:
        # Parse the whole config first; the file handle is not needed afterwards.
        with open("../config.yaml", "r") as config_file:
            settings = yaml.safe_load(config_file)

        secret = settings["Keys"]["DEEPSEEKR1FREE"]
        self.model = settings["IDs"]["DEEPSEEKR1FREE"]
        self.client = OpenAI(
            base_url=settings["Providers"]["OPENROUTER"],
            api_key=secret,
        )

    def response(self, mes):
        """Send the message list ``mes`` to the model and return the first choice's content."""
        completion = self.client.chat.completions.create(
            model=self.model,
            messages=mes,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content

In [4]:
# Blackjack Agent
class LlmAgent(RandomAgent):
    """Blackjack agent that asks the LLM whether to hit or stand.

    Each call to :meth:`step` builds a single-turn prompt from the visible
    cards plus the current ``behavioral_guideline`` / ``world_modeling`` text,
    sends it to the model, appends the exchange to the YAML log and maps the
    reply to an action id.
    """

    # Action ids returned by step() for each parsed choice.
    _ACTION_IDS = {"hit": 0, "stand": 1}

    def __init__(self, num_actions):
        super().__init__(num_actions)
        self.llm = DEEPSEEKR1()

        # Policy text injected into every prompt; empty until a caller assigns it.
        self.behavioral_guideline = ""
        self.world_modeling = ""

    def extract_choice(self, text):
        """Return ``"hit"`` or ``"stand"``, whichever is mentioned last in ``text``.

        Matching is case-insensitive and substring-based. Returns ``None`` when
        neither word occurs, or when ``text`` is not a string (a chat completion
        can carry ``None`` as its content).
        """
        if not isinstance(text, str):
            return None

        lowered = self.to_lower(text)
        last_hit_index = lowered.rfind("hit")
        last_stand_index = lowered.rfind("stand")
        if last_hit_index > last_stand_index:
            return "hit"
        if last_stand_index > last_hit_index:
            return "stand"
        # Equal indices only happen when both are -1, i.e. neither word was found.
        return None

    def to_lower(self, str):
        """Return the lower-cased copy of ``str``."""
        # The parameter name shadows the builtin but is kept for keyword callers.
        return str.lower()

    def card2string(self, cardList):
        """Join the card codes in ``cardList`` with commas and spell out suits.

        ``C``/``S``/``H``/``D`` become ``Club ``/``Spade ``/``Heart ``/``Diamond ``
        and ``T`` becomes ``10``; the replacements run in that order.
        """
        replacements = (
            ('C', 'Club '),
            ('S', 'Spade '),
            ('H', 'Heart '),
            ('D', 'Diamond '),
            ('T', '10'),
        )
        text = ','.join(cardList)
        for code, label in replacements:
            text = text.replace(code, label)
        return text

    def step(self, state):
        """Query the LLM for the current ``state`` and return an action id.

        Reads ``state['raw_obs']['dealer hand']`` and
        ``state['raw_obs']['player0 hand']``, and appends the prompt/reply pair
        to ``storage_name + '.yaml'`` (``storage_name`` is the module-level
        prefix, looked up at call time).

        Returns:
            0 for "hit", 1 for "stand", -1 when the reply could not be parsed.
        """
        deal_card = state['raw_obs']['dealer hand']
        hand_card = state['raw_obs']['player0 hand']
        begin_info = "I will describe the situation. You have to reason through this in 3-5 steps, then stop. The description begins now. You are an aggressive player of blackjack who likes to take risks to earn high returns. Please beat the dealer and win the game. "

        game_rule = "Game Rule:\n1. Please try to get your card total to as close to 31 as possible, without going over, and still having a higher total than the dealer.\n2. If anyone's point total exceeds 31, he or she loses the game. \n3. You can only choose one of the following two actions: {\"Stand\", \"Hit\"}. If you choose to Stand, you will stop taking cards and wait for the dealer to finish. If you choose to Hit, you can continue to take a card, but there is also the risk of going over 31. \n4. After all players have completed their hands, the dealer reveals their hidden card. Dealers must hit until their cards total 27 or higher.\n"
        game_info = "The dealer's current card is {" + self.card2string(deal_card) + "}. The dealer has another hidden card. You don't know what it is. Your current cards are {" + self.card2string(hand_card) + "}. "

        game_info += "Behavioral guideline: " + self.behavioral_guideline + "\n"
        game_info += "World modeling: " + self.world_modeling + "\n"
        game_info += "Please read the behavioral guideline and world modeling carefully. Then you should analyze your own cards and your strategies in Self-belief and then analyze the dealer cards in World-belief. Lastly, please select your action from {\"Stand\",\"Hit\"}.### Output Format: Self-Belief is {Belief about youself}. World-Belief is {Belief about the dealer}. My action is {Your action}. Please output in the given format. Do not write anything else."

        messages = [{"role": "user", "content": begin_info + game_rule + game_info}]
        llm_res = self.llm.response(messages)
        messages.append({"role": "assistant", "content": llm_res})

        # Log the exchange before parsing so it is kept even if the reply is unusable.
        filename = storage_name + '.yaml'
        with open(filename, "a") as yaml_file:
            yaml.dump(messages, yaml_file, default_flow_style=False, allow_unicode=True)

        # Parse the reply once.
        # NOTE(review): how the environment interprets -1 is not visible here — confirm.
        return self._ACTION_IDS.get(self.extract_choice(llm_res), -1)

In [5]:
# DeepSeek agent
# Shared agent instance with two actions. Constructing it also builds its own
# DEEPSEEKR1 client, which reads ../config.yaml.
llm_agent = LlmAgent(num_actions=2)

In [6]:
def play(env):
    """Run one evaluation game on ``env`` and append its outcome to the YAML log.

    Calls ``env.run(is_training=False)``, takes the last entry of player 0's
    trajectory and appends a one-item list with the final cards and the result
    to ``storage_name + '.yaml'`` (``storage_name`` is the module-level prefix,
    looked up at call time).

    Args:
        env: object whose ``run(is_training=False)`` returns
            ``(trajectories, payoffs)``.

    Raises:
        ValueError: if player 0's trajectory is empty, so there is no final
            state to report.
    """
    trajectories, payoffs = env.run(is_training=False)

    if len(trajectories[0]) == 0:
        raise ValueError("play(): env.run() returned an empty trajectory for player 0")

    # Final entry of player 0's trajectory; index 1 of its 'state' is reported
    # as the dealer's cards and index 0 as the player's.
    final_obs = trajectories[0][-1]['raw_obs']
    res_str = ('dealer {}, '.format(final_obs['state'][1]) +
               'player {}, '.format(final_obs['state'][0]))

    payoff = payoffs[0]
    if payoff == 1:
        final_res = "win."
    elif payoff == 0:
        final_res = "draw."
    elif payoff == -1:
        final_res = "lose."
    else:
        # Keep the record of an already-played game rather than failing on an
        # unexpected payoff value.
        final_res = "unknown (payoff {}).".format(payoff)

    p = [{"final cards": res_str, "final results": final_res}]

    filename = storage_name + '.yaml'
    with open(filename, "a") as yaml_file:
        yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)


In [11]:
def train(results, llm, old_policy):
    """Reflect on a logged game and derive updated policy text from the LLM.

    Two independent single-turn requests are made: the first asks for an
    analysis of the game record, the second asks for updated Behavioral
    Guidelines and World Modeling given that analysis and the previous policy.
    Both exchanges are appended to ``results + '_train.yaml'`` before the
    second reply is parsed, so the transcript is kept even if parsing falls
    back.

    Args:
        results: path prefix of the game log; ``results + '.yaml'`` is read.
        llm: object exposing ``response(messages)`` that returns the reply text.
        old_policy: indexable pair ``[behavioral_guideline, world_modeling]``
            holding the previous policy text.

    Returns:
        ``(behavioral_guideline, world_modeling)``. When the second reply
        contains the ``**World Modeling**`` marker, the text before it is the
        guideline and the marker plus everything after it is the world
        modeling. When the marker is missing, the whole reply is used as the
        guideline (or the previous guideline if the reply is blank) and the
        previous world modeling is kept.
    """
    with open(results + '.yaml', 'r') as file:
        content = file.read()

    game_rule = "Game Rule:\n1. Please try to get your card total to as close to 31 as possible, without going over, and still having a higher total than the dealer.\n2. If anyone's point total exceeds 31, he or she loses the game. \n3. You can only choose one of the following two actions: {\"Stand\", \"Hit\"}. If you choose to Stand, you will stop taking cards and wait for the dealer to finish. If you choose to Hit, you can continue to take a card, but there is also the risk of going over 31. \n4. After all players have completed their hands, the dealer reveals their hidden card. Dealers must hit until their cards total 27 or higher.\n"
    setup = "You are a seasoned blackjack expert, and you need to carefully reflect on the following record of this losing game: "
    game_record = f"Game Record: {content}\n"

    analysis_setup = """
    Correctness: Whether its beliefs about yourself, the game, and the dealer align with the final results.
    Consistency: Whether each belief and action is self - contradictory.
    Reasons: Reflect on why you lost to your dealer, which beliefs and actions are
    problematic, and what the underlying reasons are.
    ### Output Format: I analyze this game as follows: { Your analysis about the game and belief }.
    """

    # Request 1: analysis of the recorded game.
    p = [{"role": "user", "content": game_rule + setup + game_record + analysis_setup}]
    res = llm.response(p)
    p.append({"role": "assistant", "content": res})

    reflection = "Policy-Level Reflection: " + res + "\n"
    guidelines = f"""
    Following the previous rigorous analysis, you should distill and articulate a set of
    Behavoiral Guidelines and World Modeling. The Behavoiral Guideline is about what
    you consider to be a more reasonable and effective behavioral strategy and
    suggestions. World Modeling is about the description of the game and the dealer.

    Here are some suggestions for you:

    Behavoiral Guideline
        1-Goal: Please summarize the detailed goal based on your reflection ...
        2-Strategy: What kind of strategy can lead you to win in similar games ...
        3-Demonstration: Can this game be considered a typical example to be preserved for
        future reference ...
    World Modeling
        1-Rule-Description: Based on the recent reflection , describe any game rules or details
        that are easy to overlook ...


    Update previous policies that you used to have to better match your current understanding.

    Previous Behavoiral Guideline:
    {old_policy[0]}

    Previous World Modeling:
    {old_policy[1]}
    """

    # Request 2: a fresh conversation that carries the analysis in its prompt.
    n = [{"role": "user", "content": game_record + reflection + guidelines}]
    res = llm.response(n)
    n.append({"role": "assistant", "content": res})

    # Persist both exchanges first; parsing below must not cost us the transcript.
    filename = results + '_train' + '.yaml'
    with open(filename, "a") as yaml_file:
        yaml.dump(p, yaml_file, default_flow_style=False, allow_unicode=True)
        yaml.dump(n, yaml_file, default_flow_style=False, allow_unicode=True)

    # Split on the first marker only, so a repeated marker does not drop text.
    marker = "**World Modeling**"
    head, found, tail = res.partition(marker)
    if found:
        behavioral_guideline = head.strip()
        world_modeling = marker + tail.strip()
    else:
        # The model did not use the expected heading: keep what we can.
        behavioral_guideline = res.strip() or old_policy[0]
        world_modeling = old_policy[1]

    return behavioral_guideline, world_modeling

In [None]:
# Random games
# Each iteration plays one game into its own timestamped log, then asks the
# LLM to refine the agent's policy text from that log.
for i in range(0, 1):
    now = datetime.now()
    formatted = now.strftime("%Y-%m-%d %H:%M:%S.%f")
    # Rebinding the module-level prefix redirects both the agent's and play()'s logging.
    storage_name = '../Agent-Pro/my_data/test/' + formatted
    env = rlcard.make('blackjack', config={'game_num_players': 1, "seed": random.randint(0, 10**10)})
    env.set_agents([llm_agent])
    play(env)

    # train() needs the LLM client and the current policy in addition to the
    # log prefix; reuse the agent's own client and its current policy text.
    behavioral_guideline, world_modeling = train(
        storage_name,
        llm_agent.llm,
        [llm_agent.behavioral_guideline, llm_agent.world_modeling],
    )
    llm_agent.behavioral_guideline = behavioral_guideline
    llm_agent.world_modeling = world_modeling

    print(f"Current game: {i}")

Current game: 0


In [8]:
# Standalone client for calling train() by hand; this is a separate instance
# from the one llm_agent holds in its .llm attribute.
llm = DEEPSEEKR1()

In [13]:
# Manual policy update from a previously logged game. The returned
# (behavioral_guideline, world_modeling) tuple is only displayed as the cell
# output; it is not assigned back to llm_agent here.
train("../Agent-Pro/my_data/test/2025-03-25 17:48:06.868309", llm, [llm_agent.behavioral_guideline, llm_agent.world_modeling])

('**Behavioral Guidelines**  \n1. **Goal**: Aim to maximize your total as close to 31 as possible without exceeding it, while ensuring your total surpasses the dealer’s (who must reach at least 27). Prioritize aggressive growth in early stages but shift to risk-averse decisions near the upper limit.  \n2. **Strategy**:  \n   - **Aggressive Hitting**: Hit when your total is ≤24, as this allows room to grow toward 31 while managing bust risk.  \n   - **Cautious Standing**: Stand at 25+ if the risk of busting exceeds the potential gain (e.g., at 30, prioritize preserving a strong total).  \n   - **Dealer Risk Assessment**: Factor in the dealer’s obligation to hit until 27+, which increases their likelihood of busting.  \n3. **Demonstration**: This game demonstrates the balance between aggression (hitting at 10, 20) and caution (standing at 30). Preserve these scenarios as references for similar high-stakes, risk-reward decisions.',
 '**World Modeling**1. **Rule-Description**:  \n   - The 