In [2]:
import random
from time import time

import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
from keras.initializers import RandomUniform, Constant, Zeros, RandomNormal
from plotly.subplots import make_subplots
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

from sankey import make_sankey






In [3]:
# Turn off Keras' interactive logging so the many predict()/fit() calls made
# during training do not flood the notebook output.
tf.keras.utils.disable_interactive_logging()

In [4]:
def encode(num, length):
    """Return a one-hot float vector of size ``length`` with position ``num`` set to 1."""
    one_hot = np.zeros(length)
    one_hot[num] = 1.0
    return one_hot

In [161]:
class RLAgent:
    """An agent that learns via reinforcement and chooses actions with a neural network.

    The network has two hidden layers and a softmax output, so its prediction for a
    one-hot encoded input is a probability distribution over ``output_size`` actions.
    """

    def __init__(self, input_size, output_size):
        self.input_size = input_size  # number of distinct inputs the agent can observe
        self.output_size = output_size  # number of outputs the agent has to choose from
        self.model = self.build_model()

    def build_model(self):
        """Build and compile the policy network (24 -> 16 ReLU units, softmax output)."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.input_size, activation='relu'))
        model.add(Dense(16, activation='relu'))
        model.add(Dense(self.output_size, activation='softmax', kernel_initializer='uniform'))
        model.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.CategoricalCrossentropy())
        return model

    def _encode_input(self, in_var):
        """One-hot encode ``in_var`` and shape it as a single-row batch for the model."""
        return encode(in_var, self.input_size).reshape(1, -1)

    def get_probs(self, in_var):
        """Return the model's prediction for ``in_var``; row 0 holds the action probabilities."""
        return self.model.predict(self._encode_input(in_var))

    def choose_action_det(self, in_var):
        """Deterministic policy: return the index of the highest-probability action."""
        return np.argmax(self.get_probs(in_var))

    def choose_action(self, in_var):
        """Stochastic policy: sample an action index according to the predicted probabilities."""
        action_probs = self.get_probs(in_var)[0]
        return np.random.choice(len(action_probs), p=action_probs)

    def update_model(self, state_or_message, action, reward, iteration=1):
        """Take one training step that reinforces ``action`` in proportion to ``reward``.

        The training target is the one-hot encoding of the chosen action scaled by the
        reward, fitted with the categorical cross-entropy loss the model was compiled
        with. A reward of 0 therefore yields an all-zero target.

        Arguments:
        state_or_message: the input (state for a sender, message for a receiver) that was observed
        action: index of the action that was taken
        reward: payoff received for that action
        iteration: accepted for caller compatibility; not used by the update
        """
        inputs = self._encode_input(state_or_message)

        # Reward-weighted one-hot target. No forward pass is needed here: the
        # previous implementation ran a full predict() whose result was never used.
        targets = (encode(action, self.output_size) * reward).reshape(1, -1)

        self.model.fit(inputs, targets, epochs=1, verbose=0)

In [162]:
""" matrices for 2x2 and 3x3 common interest Lewis signalling games """
# Payoff tables are indexed as table[state][action] -> (sender payoff, receiver payoff).
# Both players earn 1 when the action index matches the state index and 0 otherwise.
basic_lsg = np.array([[(1, 1) if state == action else (0, 0) for action in range(2)]
                      for state in range(2)])

three_lsg = [[(1, 1) if state == action else (0, 0) for action in range(3)]
             for state in range(3)]

In [197]:
def train_sender(utility_table, receiver_map, num_messages = 2, num_iterations = 100,
                 state_probs=None, existing_sender=None, store_probs = False, starting_episode=0):
    """Train an RLAgent to be a sender, against a fixed or a learning receiver.

    arguments:
    utility_table: the payoff matrix for the version of the lsg you wish to train,
                indexed as utility_table[state][action] -> (sender payoff, receiver payoff)
    receiver_map: a dictionary mapping messages to actions, or an agent object exposing
                choose_action / update_model (e.g. an RLAgent), which is then trained too
    num_messages: the number of messages in the game
    num_iterations: how many episodes the model will train for
    state_probs: listlike of relative weights used to sample the state each episode;
                defaults to a uniform distribution over the states
    existing_sender: allows you to start with an already instantiated RLAgent object
    store_probs: if True, record after every episode the probability that the sender
                picks message 1 in state 0 and in state 1 (2-state, 2-message games only)
    starting_episode: offset added to the episode counter passed to the sender's update

    returns:
    sender when store_probs is False; otherwise the tuple (sender, receiver_map, df_probs) where
    sender: the trained RLAgent
    receiver_map: the trained receiver, or just the dictionary you passed in
    df_probs: dataframe with columns 'State 0' and 'State 1'; row 0 is the untrained
                sender and row k the sender after k episodes
    """
    num_states = len(utility_table)

    if store_probs:
        assert num_states == num_messages == 2, 'must be a simple game to use store_probs'

    # `is None` rather than `== None`: comparing an ndarray to None is elementwise
    # and cannot be used as an if-condition.
    if state_probs is None:
        state_probs = np.ones(num_states) / num_states
    else:
        assert num_states == len(state_probs), 'state_probs length does not match utility table'

    sender = RLAgent(num_states, num_messages) if existing_sender is None else existing_sender
    receiver_is_fixed = isinstance(receiver_map, dict)

    def message_1_probs():
        # probability of sending message 1 in state 0 and in state 1
        return [float(sender.get_probs(0)[0][1]), float(sender.get_probs(1)[0][1])]

    # Collect snapshots in a list and build the dataframe once at the end.
    prob_history = [message_1_probs()] if store_probs else None

    # Training loop
    for episode in tqdm(range(num_iterations)):

        # get random state
        state = random.choices(range(num_states), state_probs)[0]

        # Sender chooses a message
        message = sender.choose_action(state)

        # Receiver chooses an action
        if receiver_is_fixed:
            action = receiver_map[message]
        else:
            action = receiver_map.choose_action(message)

        # utility lookup
        utilities = utility_table[state][action]

        # Update sender (and a learning receiver) based on the reward
        sender.update_model(state, message, utilities[0], iteration=episode+1+starting_episode)

        if not receiver_is_fixed:
            receiver_map.update_model(message, action, utilities[1])

        if store_probs:
            prob_history.append(message_1_probs())

    if store_probs:
        df_probs = pd.DataFrame(prob_history, columns=['State 0', 'State 1'])
        return sender, receiver_map, df_probs

    return sender

In [204]:
# Train a single sender against a fixed receiver that maps message i to action i.
# store_probs=True so that `df_probs`, plotted in the next cell, exists on a fresh
# kernel; in that mode train_sender returns (sender, receiver_map, df_probs).
s, _receiver_map, df_probs = train_sender(basic_lsg, {0: 0, 1: 1}, num_messages=2,
                                          store_probs=True, num_iterations=500)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:26<00:00,  3.42it/s]


In [202]:
""" this code creates a line graph showing how an agent evolved over time """
# Expects `df_probs` as returned by train_sender(..., store_probs=True):
# one row per episode, columns 'State 0' and 'State 1'.
fig = px.line(df_probs, x=df_probs.index, y=['State 0', 'State 1'], title='Tracking Neural Network Preferences',
              labels={'value': 'Probability of Selecting Message 1'},
              line_shape='linear', render_mode='svg')

# horizontal reference line at 0.5 (no preference between the two messages)
fig.add_shape(type='line', x0=df_probs.index.min(), x1=df_probs.index.max(),
              y0=0.5, y1=0.5, line=dict(color='gray', width=1), xref='x', yref='y')

fig.update_layout(legend_title_text='Environment State', yaxis=dict(range=[0, 1]))

# Title the x axis directly: the old `labels` entry was keyed on 'x_values',
# which is not a column of df_probs, so it never took effect.
fig.update_xaxes(title_text='Training episode')

fig.show()

In [93]:
""" main learning function. allows Nature to pick states in a biased way using state_probs """
def run_learning(utility_table, num_messages = 2, num_iterations = 100, state_probs=None, store_probs=False):
    """Train a fresh sender and receiver together on a signalling game.

    arguments:
    utility_table: payoff matrix indexed as utility_table[state][action] -> (sender payoff, receiver payoff)
    num_messages: the number of messages available to the sender
    num_iterations: how many episodes to train for
    state_probs: listlike of relative weights Nature uses to pick the state each episode;
                defaults to a uniform distribution over the states
    store_probs: accepted for interface compatibility; not used (tracking is decided by game size)

    returns:
    sender, receiver: the two trained RLAgents
    emp_data: dataframe with one row per episode holding the probability of output 1 for
                each sender state and each receiver message; only filled for games with
                2 states and 2 messages, otherwise it has the columns but no rows
    """
    num_states = len(utility_table)
    # `is None` rather than `== None`: comparing an ndarray to None is elementwise
    # and cannot be used as an if-condition.
    if state_probs is None:
        state_probs = np.ones(num_states) / num_states
    else:
        assert num_states == len(state_probs), 'state_probs length does not match utility table'

    num_actions = len(utility_table[0])

    # Create sender and receiver agents
    sender = RLAgent(num_states, num_messages)
    receiver = RLAgent(num_messages, num_actions)

    columns = ['Sender State 0', 'Sender State 1', 'Receiver Message 0', 'Receiver Message 1']
    track_probs = num_states == 2 == num_messages
    history = []  # one record per episode; turned into a dataframe once at the end

    # Training loop
    for episode in tqdm(range(num_iterations)):

        # get random state
        state = random.choices(range(num_states), state_probs)[0]

        # Sender chooses a message, receiver chooses an action
        message = sender.choose_action(state)
        action = receiver.choose_action(message)

        # utility lookup
        utilities = utility_table[state][action]

        # Update sender and receiver models based on the reward
        sender.update_model(state, message, utilities[0])
        receiver.update_model(message, action, utilities[1])

        if track_probs:
            # Query the agents directly so this function does not rely on helper
            # functions defined further down the notebook.
            history.append([float(agent.get_probs(signal)[0][1])
                            for agent in (sender, receiver) for signal in (0, 1)])

    emp_data = pd.DataFrame(history, columns=columns)
    return sender, receiver, emp_data

In [129]:
# Train a fresh sender/receiver pair on the 2x2 game; emp_data holds the per-episode
# probabilities recorded by run_learning.
sender, receiver, emp_data = run_learning(basic_lsg, num_iterations=500, num_messages=2)

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [03:21<00:00,  2.48it/s]


In [130]:
# Two stacked panels sharing the episode axis: sender on top, receiver below.
# Expects `emp_data` as returned by run_learning for a 2-state, 2-message game.
fig = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=("Sender", "Receiver"))

# (subplot row, emp_data columns to draw, y-axis title)
panels = [
    (1, ['Sender State 0', 'Sender State 1'], "Prob of sending message 1"),
    (2, ['Receiver Message 0', 'Receiver Message 1'], "Prob of taking action 1"),
]

first_episode = emp_data.index.min()
last_episode = emp_data.index.max()

for panel_row, panel_columns, y_title in panels:
    # Let plotly express build the traces, then move them onto the shared figure.
    line_fig = px.line(emp_data, x=emp_data.index, y=panel_columns,
                       line_shape='linear', render_mode='svg')
    for trace in line_fig.data:
        fig.add_trace(trace, row=panel_row, col=1)

    # Reference line at 0.5, added to the subplot itself: shapes drawn on the
    # temporary px figure are not carried over with its traces.
    fig.add_shape(type='line', x0=first_episode, x1=last_episode, y0=0.5, y1=0.5,
                  line=dict(color='gray', width=1), row=panel_row, col=1)

    fig.update_yaxes(title_text=y_title, range=[0, 1], row=panel_row, col=1)

# Only the bottom panel carries the shared x-axis title.
fig.update_xaxes(title_text='Training episode', row=2, col=1)

# Update layout for better visibility
fig.update_layout(height=600, width=1000)
fig.update_layout(title_text='How Agents Evolve During a Game')

# Show the plot
fig.show()

In [8]:
def test_case(sender, receiver, state, full=False):
    
    sender_choice = sender.choose_action(state)
    sender_probs =  sender.get_probs(state)[0]
    print("sender chooses", sender_choice, "with confidence", sender_probs[sender_choice])
    if full:
        print("full probability array:", sender_probs)
    
    print()
    receiver_choice = receiver.choose_action(sender_choice)
    receiver_probs = receiver.get_probs(sender_choice)[0]
    print("receiver chooses", receiver_choice, "with confidence", receiver_probs[receiver_choice])
    if full:
        print("full probability array:", receiver_probs)
        print()
        print("Rewards = ", basic_lsg[state][receiver_choice])
    

## Tracking agent choices over time

In [9]:
def test_data(agent, state):
    return agent.get_probs(state)[0]

In [10]:
def state_str(state):
    """Label for a state node, e.g. ``"state 0"``."""
    return "state {}".format(state)

def message_str(message):
    """Label for a message node, e.g. ``"message 1"``."""
    return "message {}".format(message)

def action_str(action):
    """Label for an action node, e.g. ``"action 2"``."""
    return "action {}".format(action)

In [11]:
def collect_data(sender, receiver, state_probs=None):
    """Build the state -> message -> action flow table for a sender/receiver pair.

    arguments:
    sender, receiver: agents exposing output_size and get_probs
    state_probs: listlike containing the relative probabilities of each state occurring;
                if None, defaults to as many states as the receiver has actions, all equally likely

    returns:
    dataframe with columns 'src', 'targ' and 'frequency'. State -> message rows hold
    pr(state) * pr(message | state); message -> action rows hold
    pr(message) * pr(action | message), where pr(message) is the total flow entering
    that message node. Rows are ordered states first, then messages.
    """
    num_messages = sender.output_size
    num_actions = receiver.output_size
    # `is None` rather than `== None`: comparing an ndarray to None is elementwise
    # and cannot be used as an if-condition.
    if state_probs is None:
        num_states = num_actions
        state_probs = np.ones(num_states) / num_states
    else:
        num_states = len(state_probs)
        # the weights are relative, so normalise them into a distribution
        state_probs = np.asarray(state_probs, dtype=float)
        state_probs = state_probs / state_probs.sum()

    records = []  # (src, targ, frequency) rows; the dataframe is built once at the end
    message_inflow = np.zeros(num_messages)

    # state -> message edges: a state's outgoing flow is weighted by how likely that
    # state is (multiplied by its probability, not divided by it)
    for state in range(num_states):
        message_given_state = test_data(sender, state)
        for message in range(num_messages):
            flow = float(state_probs[state] * message_given_state[message])
            message_inflow[message] += flow
            records.append((state_str(state), message_str(message), flow))

    # message -> action edges: rescale so what leaves a message node equals what entered it
    for message in range(num_messages):
        action_given_message = [float(p) for p in test_data(receiver, message)[:num_actions]]
        outflow = sum(action_given_message)
        scale = message_inflow[message] / outflow if outflow else 0.0
        for action in range(num_actions):
            records.append((message_str(message), action_str(action),
                            float(action_given_message[action] * scale)))

    return pd.DataFrame(records, columns=['src', 'targ', 'frequency'])

In [131]:
# Flow table (src / targ / frequency) for the trained pair, with the default state probabilities.
d = collect_data(sender, receiver)

In [132]:
# make_sankey comes from the local `sankey` module; presumably it draws a Sankey
# diagram with one link per row of `d` — confirm against that module.
make_sankey(d, 'src', 'targ', vals='frequency', title='decision flow for full signalling game')

In [86]:
def single_agent_sankey(agent, num_inputs, input_probs=None):
    """Build the input -> output flow table for one agent and pass it to make_sankey.

    arguments:
    agent: agent exposing output_size and get_probs
    num_inputs: how many distinct inputs the agent can receive
    input_probs: listlike of relative probabilities of each input occurring;
                defaults to all inputs being equally likely

    Each row's frequency is pr(input) * pr(output | input).
    """
    num_outputs = agent.output_size

    # `is None` rather than `== None`: comparing an ndarray to None is elementwise
    # and cannot be used as an if-condition.
    if input_probs is None:
        input_probs = np.ones(num_inputs) / num_inputs
    else:
        assert num_inputs == len(input_probs), 'input_probs length does not match num_inputs'
        # the weights are relative, so normalise them into a distribution
        input_probs = np.asarray(input_probs, dtype=float)
        input_probs = input_probs / input_probs.sum()

    records = []  # (src, targ, frequency) rows; the dataframe is built once at the end
    for input_idx in range(num_inputs):
        output_given_input = test_data(agent, input_idx)
        for output_idx in range(num_outputs):
            # an input's outgoing flow is weighted by how likely that input is
            # (multiplied by its probability, not divided by it)
            flow = float(input_probs[input_idx] * output_given_input[output_idx])
            records.append((f'input {input_idx}', f'output {output_idx}', flow))

    df_data = pd.DataFrame(records, columns=['src', 'targ', 'frequency'])

    make_sankey(df_data, 'src', 'targ', vals='frequency', title='decision flow for single agent')

In [205]:
# Show the decision flow of the single trained sender `s` over its 2 possible inputs.
single_agent_sankey(s, 2)

# Code to evaluate trained agents

In [90]:
def evaluate_agents(sender, receiver, utility_table):
    """Print each agent's average reward when both play deterministically, one round per state.

    For the basic_lsg game, the messages carry no information if the rewards are about .5.
    """
    num_states = len(utility_table)

    # one payoff pair per state: sender picks its top message, receiver its top action
    payoffs = [
        utility_table[state][receiver.choose_action_det(sender.choose_action_det(state))]
        for state in range(num_states)
    ]

    sender_total = sum(pair[0] for pair in payoffs)
    receiver_total = sum(pair[1] for pair in payoffs)

    print('average sender reward:', sender_total / num_states)
    print('average receiver reward:', receiver_total / num_states)

In [92]:
# Score the trained pair under deterministic (argmax) play, one round per state of the 2x2 game.
evaluate_agents(sender, receiver, basic_lsg)

average sender reward: 0.5
average receiver reward: 0.5


# Subplot Loop for Single Agent

In [13]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [80]:
# Train nine independent senders against the same fixed receiver and draw each
# run's learning curve in its own cell of a 3x3 grid.
df_probs_list = list()

for _ in range(9):
    # with store_probs=True train_sender returns (sender, receiver_map, df_probs)
    s, _receiver_map, df_probs = train_sender(basic_lsg, {0: 0, 1: 1}, store_probs=True, num_iterations=1000)
    df_probs_list.append(df_probs)


fig = make_subplots(rows=3, cols=3, shared_xaxes=True, shared_yaxes=True)

for i, run_probs in enumerate(df_probs_list):

    row_num = i // 3 + 1
    col_num = i % 3 + 1

    # every run draws the same two series, so only the first subplot feeds the legend
    show_legend = True if (row_num == col_num == 1) else False

    # Let plotly express build the traces, then move them onto the grid.
    line_fig = px.line(run_probs, x=run_probs.index, y=['State 0', 'State 1'],
                       line_shape='linear', render_mode='svg')

    for trace in line_fig.data:
        trace.showlegend = show_legend
        fig.add_trace(trace, row=row_num, col=col_num)

    # Reference line at 0.5, added to the subplot itself: shapes drawn on the
    # temporary px figure are not carried over with its traces.
    fig.add_shape(type='line', x0=run_probs.index.min(), x1=run_probs.index.max(),
                  y0=0.5, y1=0.5, line=dict(color='gray', width=1), row=row_num, col=col_num)

    # Update subplot layout to show xticks and yticks only on the edges
    x_ticks_visibility = True if row_num == 3 else False
    y_ticks_visibility = True if col_num == 1 else False

    if x_ticks_visibility:
        fig.update_xaxes(showticklabels=x_ticks_visibility, row=row_num, col=col_num, title_text='Training episode')
    else:
        fig.update_xaxes(showticklabels=x_ticks_visibility, row=row_num, col=col_num)

    if y_ticks_visibility:
        fig.update_yaxes(showticklabels=y_ticks_visibility, row=row_num, col=col_num, range=[0,1], title_text='Prob of M1')
    else:
        fig.update_yaxes(showticklabels=y_ticks_visibility, row=row_num, col=col_num, range=[0,1])


# Update layout for better visibility
fig.update_layout(height=600, width=800, legend_title_text='Environment State')
fig.update_layout(title_text='Training a Sender Neural Network')

# Show the plot
fig.show()

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:20<00:00,  4.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:18<00:00,  5.03it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:27<00:00,  4.82it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:37<00:00,  4.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:36<00:00,  4.61it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:15<00:00,  5.13it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:12<00:00,  5.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [03:25<00:00,  4.87it/s]
100%|███████████████████████████████████

In [88]:
# Decision flow of `s`, the sender left over from the last training run above, over its 2 inputs.
single_agent_sankey(s, 2)