In [None]:
!pip3 install axelrod==4.11.0

Collecting axelrod==4.11.0
  Downloading Axelrod-4.11.0.tar.gz (205 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/205.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m112.6/205.3 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.3/205.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: axelrod
  Building wheel for axelrod (setup.py) ... [?25l[?25hdone
  Created wheel for axelrod: filename=Axelrod-4.11.0-py2.py3-none-any.whl size=191298 sha256=34a37522d6e5a623f42ac822b992684894f53f5632835c356943f781de80baf3
  Stored in directory: /root/.cache/pip/wheels/af/02/89/cf954fed58b376279061979389ba6808c9f22ad1accccdadd9
Successfully built axelrod
Installing collected packages: axelrod
Successfully installed axelrod-4.11.0


# **Normal Q-Learning Tournament**

In [None]:
import axelrod as axl
import pandas as pd
import numpy as np

# Define the strategies you want to include in the tournament
strategies = [
    axl.Cooperator(),
    axl.Defector(),
    axl.TitForTat(),
    axl.Grudger(),
    axl.Random(),
    axl.AntiTitForTat(),
    axl.SecondByChampion(),
    axl.Gradual(),
    axl.AdaptiveTitForTat(),
    axl.Cycler("CDCD"),
    axl.Calculator(),
    axl.CycleHunter(),
    axl.Pi(),
]

# Create a tournament with the specified strategies
tournament = axl.Tournament(players=strategies, turns=200, repetitions=10)

# Run the tournament
results = tournament.play()

# Convert scores to a NumPy array
scores_array = np.array(results.scores)

# Calculate mean and median scores for each strategy
mean_scores = np.mean(scores_array, axis=1)
median_scores = np.median(scores_array, axis=1)

# Create a DataFrame
df = pd.DataFrame({
    "Strategy": results.ranked_names,
    "Mean Score": mean_scores,
    "Median Score": median_scores
})

# Sort the DataFrame by "Mean Score" in descending order
df = df.sort_values(by="Mean Score", ascending=False)

# Display the DataFrame
print(df)


Playing matches: 100%|██████████| 91/91 [00:02<00:00, 32.93it/s]
Analysing: 100%|██████████| 25/25 [00:00<00:00, 51.63it/s]

                     Strategy  Mean Score  Median Score
3                  Calculator      7184.0        7186.0
7          Second by Champion      7011.1        7070.5
1                     Gradual      6496.0        6502.0
10               Cycler: CDCD      6217.1        6140.0
8                 Random: 0.5      6051.6        5983.0
2                    Defector      6025.6        5981.5
5                 Tit For Tat      5787.5        5908.0
6            Anti Tit For Tat      5772.5        5866.0
4   Adaptive Tit For Tat: 0.5      5492.4        5619.5
12                 Cooperator      5347.0        5345.5
9                       $\pi$      4750.3        4752.5
11               Cycle Hunter      4612.7        4563.0
0                     Grudger      4250.4        4249.5





# **Q - Learning**


In [None]:
import axelrod as axl

In [None]:
import numpy as np
import random

In [None]:
C = axl.Action.C
D = axl.Action.D

In [None]:
class TitForTat(axl.Player):
    """
    A player starts by cooperating and then mimics the previous action of the
    opponent.

    This strategy was referred to as the *'simplest'* strategy submitted to
    Axelrod's first tournament. It came first.

    Note that the code for this strategy is written in a fairly verbose
    way. This is done so that it can serve as an example strategy for
    those who might be new to Python.

    Names:

    - Rapoport's strategy: [Axelrod1980]_
    - TitForTat: [Axelrod1980]_
    """

    # These are various properties for the strategy
    name = "Tit For Tat"
    classifier = {
        "memory_depth": 1,  # Four-Vector = (1.,0.,1.,0.)
        "stochastic": False,
        "long_run_time": False,
        "inspects_source": False,
        "manipulates_source": False,
        "manipulates_state": False,
    }
    def strategy(self, opponent):
        """This is the actual strategy"""
        # First move
        if not self.history:
            return C
        # React to the opponent's last move
        print(opponent.history[-1])
        if opponent.history[-1][0] == D:
            return D
        return C

In [None]:
import operator
class QLearningAgent(axl.Player):
    # Actions possible
    actions = [C,D]
    # History of self actions
    history = []

    def __init__(self, learning_rate, discount_factor, exploration_prob):
        super().__init__()
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.q_table = {}
        # for s in [((C, C),), ((C, D),), ((D, C),), ((D, D),)]:
        #   for a in [C,D]:
        #     self.q_table[(s,a)] = 0

        #self.q_table[((C,D),),C]=1
        # for s in [C,D]:
        #   self.q_table[s]=0

        #self.q_table[D]=0.1
        self.last_action = None

    # Reward
    # C:C -> 50, C:D -> -100, D:C -> 100, D:D -> -50
    def reward(self, state, action):
        s=action
        o=state[0][1]

        if s == C:
            return 50 if o == C else -100
        else:
            return 100 if o == C else -50

    # Q-table
    def update_q_table(self, state, action, reward, next_state, opponent):

        """
        Update the Q-values in the Q-table based on the observed reward and next state.
        """
        #print(state)
        #print(action)
        current_q = self.q_table.get((state,action),0)
        #print(current_q)  # Get the current Q-value for the state-action pair
        # Calculate the new Q-value using the Bellman equation
        max_future_q = max([self.q_table.get(((next_state,),a),0) for a in (C, D)])
        #print(max_future_q) # Max Q-value for next state
        #new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward(self.last_action, opponent.last_action) + self.discount_factor * max_future_q)
        #print(reward(state,action))
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward(state, action) + self.discount_factor * max_future_q)
        #print(new_q)
        #print(state)
        #print(action)
        self.q_table[(state, action)] = new_q  # Update the Q-value in the Q-table

    # def update(self,opponent):
    #   print("HEllo")
    #   state=(self.history[-1],)
    #   action=self.last_action

    #   next_state=(action,opponent.history[-1])

    #   self.update_q_table(state,action,self.reward,next_state,opponent)


    def strategy(self, opponent):
        if not self.history:
            return C
        #print(history)
        action=self.last_action
        if action==None:
          action=D

        state = (self.history[-1],)  # Get the current state
        next_state=(action,opponent.history[-1])


        self.update_q_table(state,action,self.reward,next_state,opponent)

        if random.random() < self.exploration_prob:
            action = random.choice([C, D])
            #print(action)
        else:
            # Choose action based on Q-values
            #print(self.q_table)
            action = max(self.q_table.items(), key=operator.itemgetter(1))[0][1]
            #print(action)

        # Update action for the next round
        self.last_action = action
        return action

    def update_history(self, play, coplay):
        """
        Update the QLearningAgent's history with their action and their coplayer's action.
        """
        self.history.append((play, coplay))
        #print((play, coplay))
        #self.history=list(zip(*self.history))

player1 = TitFor2Tats()
player2 = TitForTat()
#Create a match with the QLearningAgent player
match = axl.Match(players=[QLearningAgent(learning_rate=0.5, discount_factor=0.9, exploration_prob=0.1), player2],turns=100)

# Run the tournament
results = match.play()
# # Display the results
print(results)

print(match.final_score())

(C, C)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(C, C)
(D, C)
(D, D)
(D, D)
(C, D)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
[(C, C), (D, C), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (C, D), (C, C), (D, C), (D, D), (D, D), (C, D), (D, C), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D

# **DQN**


In [None]:
import tensorflow as tf

In [None]:
#history = []
C = axl.Action.C
D = axl.Action.D

class DQNAgent(axl.Player):
    history=[]
    def __init__(self, learning_rate, discount_factor, exploration_prob):
      super().__init__()
      self.learning_rate = learning_rate
      self.discount_factor = discount_factor
      self.exploration_prob = exploration_prob
      self.model = self._build_model()
      self.memory = []
      self.last_action = None
      self.agent_history = []  # Initialize agent history

    def _build_model(self):
        model = tf.keras.Sequential([
            tf.keras.layers.Dense(24, input_dim=4, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(2, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        #print("hello")
        if np.random.rand() <= self.exploration_prob:
            return random.choice([C, D])
        else:
            #print(q_values)
            q_values = self.model.predict(state)
            #print(q_values)
            if np.argmax(q_values[0])==0:
              return C
            else:
              return D

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.discount_factor * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

    def update_history(self, play, coplay):
        """
        Update the QLearningAgent's history with their action and their coplayer's action.
        """
        self.history.append((play, coplay))

    def strategy(self, opponent):
        if self.history==[]:
            #print("Hello")
            return C

        #print("Hello")

        action=self.last_action
        if action==None:
          action=C


        state = (self.history[-1],)  # Get the current state
        next_state=(action,opponent.history[-1])

        if random.random() < self.exploration_prob:
            action = random.choice([C, D])
        else:
            #print(self.history[-1])
            state = np.array([self._encode_state(self.history[-1])])
            action = self.choose_action(state)

        self.last_action = action
        self.remember(state, action, self.reward(action, opponent.history[-1]), np.array([self._encode_state((action, opponent.history[-1]))]), False)

        return action

    def reward(self, action, coplay):
        """
        Define rewards based on the agent's action and its coplayer's action.
        """
        rewards = {
            (C, C): 50,
            (C, D): -100,
            (D, C): 100,
            (D, D): -50
        }
        return rewards[(action, coplay)]

    def _encode_state(self, state):
        """
        Encode the current state for the neural network input.
        """
        state_mapping = {
            (C, C): [1, 0, 0, 0],
            (C, D): [0, 1, 0, 0],
            (D, C): [0, 0, 1, 0],
            (D, D): [0, 0, 0, 1]
        }
        return state_mapping[state]

# Create a match with the DQNAgent player
agent = axl.Player()


player1 = TitForTat()
match = axl.Match(players=[DQNAgent(learning_rate=0.1, discount_factor=0.7, exploration_prob=0.3), player1],turns=100)

# Run the tournament
results = match.play()

# Display the results
print(results)
print(match.final_score())

(C, C)
(C, C)
(C, C)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(D, C)
(C, D)
(D, C)
(C, D)
(C, C)
(D, C)
(D, D)
(D, D)
(C, D)
(D, C)
(D, D)
(C, D)
(C, C)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(C, C)
(C, C)
(D, C)
(C, D)
(C, C)
(C, C)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(D, C)
(C, D)
(C, C)
(D, C)
(C, D)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(C, C)
(D, C)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(D, D)
(C, D)
(C, C)
(D, C)
(C, D)
(C, C)
(C, C)
(C, C)
(D, C)
(D, D)
(D, D)
(C, D)
(C, C)
(C, C)
(D, C)
(D, D)
[(C, C), (C, C), (C, C), (D, C), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (D, D), (C, D), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (C, C), (D, C), (C, D), (D, C), (C, D), (C, C), (D, C), (D, D), (D, D), (C, D), (D, C), (D, D), (C, D), (C

# **Tournament**

# **Examples**

In [None]:
import numpy as np

class ProfitAgent:
    def __init__(self, epsilon=0.1, learning_rate=0.1, discount_factor=0.95):
        self.epsilon = epsilon  # Exploration rate
        self.learning_rate = learning_rate  # Learning rate
        self.discount_factor = discount_factor  # Discount factor for future rewards
        self.q_values = {}  # Q-values dictionary

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            # Explore: choose a random action
            return np.random.choice([0, 1])  # 0: no advertising, 1: advertising
        else:
            # Exploit: choose the action with the highest Q-value
            return np.argmax(self.q_values.get(state, [0, 0]))  # 0: no advertising, 1: advertising

    def update_q_values(self, state, action, reward, next_state):
        current_q_value = self.q_values.get(state, [0, 0])[action]
        next_max_q_value = np.max(self.q_values.get(next_state, [0, 0]))
        new_q_value = current_q_value + self.learning_rate * (reward + self.discount_factor * next_max_q_value - current_q_value)
        self.q_values[state] = [0, 0]
        self.q_values[state][action] = new_q_value

def simulate_competition(num_episodes=1000):
    agent_coca_cola = ProfitAgent()
    agent_pepsi = ProfitAgent()

    for _ in range(num_episodes):
        # State: combination of actions (0: no advertising, 1: advertising) of both companies
        state_coca_cola = tuple([agent_coca_cola.choose_action(())])
        state_pepsi = tuple([agent_pepsi.choose_action(())])

        # Let's simplify the reward function: both companies get a reward of 1 if they both advertise, 0 otherwise
        if state_coca_cola[0] == 1 and state_pepsi[0] == 1:
            reward_coca_cola = 1
            reward_pepsi = 1
        else:
            reward_coca_cola = 0
            reward_pepsi = 0

        agent_coca_cola.update_q_values(state_coca_cola, state_coca_cola[0], reward_coca_cola, ())
        agent_pepsi.update_q_values(state_pepsi, state_pepsi[0], reward_pepsi, ())

    return agent_coca_cola.q_values, agent_pepsi.q_values

# Simulate the competition
coca_cola_q_values, pepsi_q_values = simulate_competition()

# Display the Q-values learned by each company
print("Coca-Cola Q-values:", coca_cola_q_values)
print("Pepsi Q-values:", pepsi_q_values)


Coca-Cola Q-values: {(0,): [0.0, 0], (1,): [0, 0.039939300418256204]}
Pepsi Q-values: {(0,): [0.0, 0], (1,): [0, 0.05539693995449392]}


In [None]:
import numpy as np

class ProfitAgent:
    def __init__(self, epsilon=0.1, learning_rate=0.1, discount_factor=0.95):
        self.epsilon = epsilon  # Exploration rate
        self.learning_rate = learning_rate  # Learning rate
        self.discount_factor = discount_factor  # Discount factor for future rewards
        self.q_values = {}  # Q-values dictionary

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            # Explore: choose a random action
            return np.random.choice([0, 1])  # 0: cooperate (C), 1: defect (D)
        else:
            # Exploit: choose the action with the highest Q-value
            return np.argmax(self.q_values.get(state, [0, 0]))  # 0: cooperate (C), 1: defect (D)

    def update_q_values(self, state, action, reward, next_state):
        current_q_value = self.q_values.get(state, [0, 0])[action]
        next_max_q_value = np.max(self.q_values.get(next_state, [0, 0]))
        new_q_value = current_q_value + self.learning_rate * (reward + self.discount_factor * next_max_q_value - current_q_value)
        self.q_values[state] = [0, 0]
        self.q_values[state][action] = new_q_value

def simulate_competition(num_episodes=1000):
    agent_coca_cola = ProfitAgent()
    agent_pepsi = ProfitAgent()

    coca_cola_profit = 100
    pepsi_profit = 100

    for _ in range(num_episodes):
        state_coca_cola = tuple([agent_coca_cola.choose_action(())])
        state_pepsi = tuple([agent_pepsi.choose_action(())])

        if state_coca_cola[0] == state_pepsi[0]:  # Both companies take the same action
            if state_coca_cola[0] == 0:  # Both cooperate (C)
                reward_coca_cola = 5
                reward_pepsi = 5
            else:  # Both defect (D)
                reward_coca_cola = -5
                reward_pepsi = -5
        else:  # Different actions
            if state_coca_cola[0] == 0:  # Coca-Cola cooperates (C), Pepsi defects (D)
                reward_coca_cola = 5
                reward_pepsi = -5
            else:  # Coca-Cola defects (D), Pepsi cooperates (C)
                reward_coca_cola = -5
                reward_pepsi = 5

        # Update profits
        coca_cola_profit += reward_coca_cola
        pepsi_profit += reward_pepsi

        agent_coca_cola.update_q_values(state_coca_cola, state_coca_cola[0], reward_coca_cola, ())
        agent_pepsi.update_q_values(state_pepsi, state_pepsi[0], reward_pepsi, ())

    return coca_cola_profit, pepsi_profit

# Simulate the competition
coca_cola_profit, pepsi_profit = simulate_competition()

# Display the profits of Coca-Cola and Pepsi
print("Coca-Cola Profit:", coca_cola_profit)
print("Pepsi Profit:", pepsi_profit)


Coca-Cola Profit: 4640
Pepsi Profit: 4560


# **Q-Learning **

Here we depict the profit margin competition between COCO-COLA vs Pepsi where both companies are represented by Q-learning

In [None]:
import numpy as np

class ProfitAgent:
    def __init__(self, name, epsilon=0.1, learning_rate=0.1, discount_factor=0.95):
        self.name = name
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_values = {}
        self.wins = 0  # Initialize win count to zero

    def choose_action(self, state):
        if np.random.rand() < self.epsilon:
            return np.random.choice(['C', 'D'])
        else:
            state_q_values = self.q_values.get(state)
            if state_q_values is None:
                # Initialize Q-values for the state
                self.q_values[state] = {'C': 0, 'D': 0}
                state_q_values = self.q_values[state]
            return max(state_q_values, key=state_q_values.get)

    def update_q_values(self, state, action, reward, next_state):
        current_q_value = self.q_values.get(state, {'C': 0, 'D': 0})[action]
        next_max_q_value = max(self.q_values.get(next_state, {'C': 0, 'D': 0}).values())
        new_q_value = current_q_value + self.learning_rate * (
                    reward + self.discount_factor * next_max_q_value - current_q_value)
        self.q_values[state] = {'C': 0, 'D': 0}
        self.q_values[state][action] = new_q_value

    def record_win(self):
        self.wins += 1  # Increment win count

def simulate_competition(num_episodes=1000):
    coca_cola = ProfitAgent(name='Coca-Cola')
    pepsi = ProfitAgent(name='Pepsi')

    coca_cola_profit = 100
    pepsi_profit = 100

    for episode in range(1, num_episodes + 1):
        state_coca_cola = ()
        state_pepsi = ()

        action_coca_cola = coca_cola.choose_action(state_coca_cola)
        action_pepsi = pepsi.choose_action(state_pepsi)

        if action_coca_cola == action_pepsi:
            reward_coca_cola = 5 if action_coca_cola == 'C' else -5
            reward_pepsi = 5 if action_pepsi == 'C' else -5
        elif action_coca_cola == 'C':
            reward_coca_cola = -10
            reward_pepsi = 10
        else:
            reward_coca_cola = 10
            reward_pepsi = -10

        coca_cola_profit += reward_coca_cola
        pepsi_profit += reward_pepsi

        coca_cola.update_q_values(state_coca_cola, action_coca_cola, reward_coca_cola, state_coca_cola)
        pepsi.update_q_values(state_pepsi, action_pepsi, reward_pepsi, state_pepsi)

        # Determine the winner of the episode and record the win
        if coca_cola_profit > pepsi_profit:
            coca_cola.record_win()
        elif pepsi_profit > coca_cola_profit:
            pepsi.record_win()

        print(f"Episode {episode}: Coca-Cola Action: {action_coca_cola}, Pepsi Action: {action_pepsi}, "
              f"Coca-Cola Profit: {coca_cola_profit}, Pepsi Profit: {pepsi_profit}")

    return coca_cola.q_values, pepsi.q_values, coca_cola.wins, pepsi.wins

# Simulate the competition
coca_cola_q_values, pepsi_q_values, coca_cola_wins, pepsi_wins = simulate_competition()

# Display the Q-values learned by each company
print("Coca-Cola Q-values:", coca_cola_q_values)
print("Pepsi Q-values:", pepsi_q_values)

# Display the number of wins for each company
print("Coca-Cola Wins:", coca_cola_wins)
print("Pepsi Wins:", pepsi_wins)


Episode 1: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 105, Pepsi Profit: 105
Episode 2: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 110, Pepsi Profit: 110
Episode 3: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 115, Pepsi Profit: 115
Episode 4: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 120, Pepsi Profit: 120
Episode 5: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 125, Pepsi Profit: 125
Episode 6: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 130, Pepsi Profit: 130
Episode 7: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 135, Pepsi Profit: 135
Episode 8: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 140, Pepsi Profit: 140
Episode 9: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 145, Pepsi Profit: 145
Episode 10: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 150, Pepsi Profit: 150
Episode 11: Coca-Cola Action: C, Pepsi Action: C, Coca-Cola Profit: 155, Pepsi Profit: 155
Episode 

In [None]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from collections import deque

class DQNAgent:
    def __init__(self, state_size, action_size, learning_rate=0.001, discount_factor=0.95, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.memory = deque(maxlen=2000)
        self.model = self._build_model()
        self.wins = 0  # Initialize win count to zero

    def _build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            return np.argmax(self.model.predict(state)[0])

    def replay(self, batch_size):
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.discount_factor * np.amax(self.model.predict(next_state)[0])
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

def simulate_competition(agent1, agent2, num_episodes=1000):
    agent1_wins = 0
    agent2_wins = 0

    for episode in range(num_episodes):
        state1 = np.array([[random.uniform(0, 1)]])  # State representing the action of agent1
        state2 = np.array([[random.uniform(0, 1)]])  # State representing the action of agent2

        action1 = agent1.choose_action(state1)
        action2 = agent2.choose_action(state2)

        reward1, reward2 = 0, 0

        if action1 == action2:  # Both agents cooperate or both defect
            if action1 == 1:  # Both cooperate
                reward1, reward2 = 5, 5
            else:  # Both defect
                reward1, reward2 = -5, -5
        else:  # One agent cooperates, the other defects
            if action1 == 1:  # Agent1 cooperates, agent2 defects
                reward1, reward2 = -10, 10
                agent2_wins += 1
            else:  # Agent1 defects, agent2 cooperates
                reward1, reward2 = 10, -10
                agent1_wins += 1

        next_state1 = np.array([[random.uniform(0, 1)]])
        next_state2 = np.array([[random.uniform(0, 1)]])

        agent1.remember(state1, action1, reward1, next_state1, False)
        agent2.remember(state2, action2, reward2, next_state2, False)

        state1, state2 = next_state1, next_state2

        if episode > 32:
            agent1.replay(32)
            agent2.replay(32)

        print(f"Episode {episode + 1}/{num_episodes}, Agent 1 Profit: {reward1}, Agent 2 Profit: {reward2}")

    return agent1_wins, agent2_wins

# Create DQNAgents
agent1 = DQNAgent(state_size=1, action_size=2)
agent2 = DQNAgent(state_size=1, action_size=2)

# Simulate the competition
agent1_wins, agent2_wins = simulate_competition(agent1, agent2, num_episodes=100)

print("Agent 1 Wins:", agent1_wins)
print("Agent 2 Wins:", agent2_wins)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Episode 62/100, Agent 1 Profit: 5, Agent 2 Profit: 5
Episode 63/100, Agent 1 Profit: 10, Agent 2 Profit: -10
Episode 64/100, Agent 1 Profit: 10, Agent 2 Profit: -10
Episode 65/100, Agent 1 Profit: -10, Agent 2 Profit: 10
Episode 66/100, Agent 1 Profit: -10, Agent 2 Profit: 10
Episode 67/100, Agent 1 Profit: 5, Agent 2 Profit: 5
Episode 68/100, Agent 1 Profit: -5, Agent 2 Profit: -5
Episode 69/100, Agent 1 Profit: 10, Agent 2 Profit: -10
Episode 70/100, Agent 1 Profit: -5, Agent 2 Profit: -5
Episode 71/100, Agent 1 Profit: -5, Agent 2 Profit: -5
Episode 72/100, Agent 1 Profit: -5, Agent 2 Profit: -5
Episode 73/100, Agent 1 Profit: 10, Agent 2 Profit: -10
Episode 74/100, Agent 1 Profit: 5, Agent 2 Profit: 5
Episode 75/100, Agent 1 Profit: -10, Agent 2 Profit: 10
Episode 76/100, Agent 1 Profit: 5, Agent 2 Profit: 5
Episode 77/100, Agent 1 Profit: -5, Agent 2 Profit: -5
Episode 78/100, Agent 1 Profit: 5, Agent 2 Profit: 5
Epi