In [1]:
from rl_optimal_search.environment import *
from tqdm.notebook import tqdm
import numpy as np
from scipy.special import zeta
import pathlib


In [12]:
import matplotlib.pyplot as plt


In [3]:
def powerlaw(beta, l):
    """
    Discrete power-law probability of a step of length ``l``.

    The weight ``l ** (-1 - beta)`` is scaled by ``1 / zeta(beta + 1, q=1)``,
    i.e. by the inverse of the sum of those weights over l = 1, 2, 3, ...
    """
    normalization = 1 / zeta(beta + 1, q=1)
    weight = l ** (-1 - beta)
    return normalization * weight

def double_exp(parameters, l):
    """
    Discrete two-component exponential probability of a step of length ``l``.

    ``parameters`` unpacks to ``(d_int, d_ext, p)``. The first exponential has
    decay rate ``1 / d_int`` and weight ``p``; the second has decay rate
    ``1 / d_ext`` and weight ``1 - p``. Each component carries the factor
    ``1 - exp(-rate)`` so that it sums to one over l = 1, 2, 3, ...
    """
    d_int, d_ext, p = parameters
    rate_int, rate_ext = 1 / d_int, 1 / d_ext
    steps_past_first = l - 1

    int_term = p * (1 - np.exp(-rate_int)) * np.exp(-rate_int * steps_past_first)
    ext_term = (1 - p) * (1 - np.exp(-rate_ext)) * np.exp(-rate_ext * steps_past_first)
    return int_term + ext_term

In [2]:
from rl_optimal_search.theoretical_policy import get_ps

In [4]:
def policy_from_distr(parameters, max_length, model):
    """
    Gets policy from a given probability distribution.

    For each length l = 1..max_length the policy entry is
    ``1 - P(l) / S(l)``, where ``P`` is the chosen step-length distribution and
    ``S(l)`` is the product of all previous policy entries (``S(1) = 1``).

    Parameters
    ----------
    parameters : (list for double_exp, float for powerlaw) parameters of the distribution
    max_length : (int) maximum length value for which the policy is computed
    model : (str) Theoretical model. Options: 'double_exp', 'powerlaw'

    Returns
    -------
    policy : (list) policy starting from the counter at l=1. It has
        ``max_length`` entries (empty if ``max_length`` < 1).

    Raises
    ------
    ValueError : if ``model`` is not one of the supported options.

    Notes
    -----
    NOTE(review): ``S(l)`` is accumulated as a floating-point running product, so
    entries at lengths where ``S(l)`` has become very small may be inaccurate —
    confirm whether such lengths matter for the intended use.
    """
    # Fail early instead of silently returning an empty policy.
    if model not in ('powerlaw', 'double_exp'):
        raise ValueError(
            f"Unknown model {model!r}; expected 'powerlaw' or 'double_exp'."
        )

    policy = []
    # Running product of the policy entries computed so far; replaces a full
    # np.prod over the list at every iteration.
    survival = 1.0
    for length in range(1, max_length + 1):
        if model == 'powerlaw':
            prob_length = powerlaw(parameters, length)
        else:
            prob_length = double_exp(parameters, length)

        keep_direction = 1 - prob_length / survival
        policy.append(keep_direction)
        survival = survival * keep_direction

    return policy

In [4]:
def walk_from_policy(config):
    '''
    Walk of foragers given a probability distribution of step lengths. Performance is evaluated as the number of targets found in a fixed time.

    Each agent moves one unit per time step in its current direction inside a
    square world of side L (positions are wrapped with a modulo). After every
    step the agent either keeps its direction, with the probability given by the
    policy at its internal clock, or draws a new random direction and resets the
    clock. Finding a target always triggers such a reset.

    Input
    -------
    config: (dict) config dict with parameters

    Description of parameters in config
    ------------------------------------
    time_ep: (int) Number of steps (decisions)
    n: (int) number of agents
    kick: (float) agent is displaced a distance of kick from the target when it finds it
    model: (str) type of model. Options: 'powerlaw', 'double_exp'
    beta: (float) parameter of the power law, read when model is 'powerlaw'
    d_int, d_ext, p: parameters of the double exponential, read when model is 'double_exp'
    Nt: (int) number of targets
    L: (int) world size
    at:  (int/float) radius of the targets
    destructive:  (bool) True if targets are destructive

    Returns
    -------
    reward: (list) one entry per agent with the number of targets it found

    Raises
    ------
    ValueError: if config['model'] is not one of the supported options
    '''

    # get parameters of the distributions depending on the chosen model
    model = config['model']
    if model == 'powerlaw':
        parameters = config['beta']
    elif model == 'double_exp':
        parameters = [config['d_int'], config['d_ext'], config['p']]
    else:
        raise ValueError(
            f"Unknown model {model!r}; expected 'powerlaw' or 'double_exp'."
        )

    time_ep = config['time_ep']
    world_size = config['L']

    # get policy: probability of staying in same direction at each time.
    policy = policy_from_distr(parameters, max_length=time_ep, model=model)

    reward = [0] * config['n']

    for ag in tqdm(range(config['n'])):
        # initialize agents clock, position and direction, as well as targets in the env.
        pos = np.zeros((time_ep, 2))
        pos[0] = np.random.rand(2) * world_size
        direction = np.random.rand() * 2 * np.pi

        internal_clock = 0  # which corresponds to l=1
        target_positions = np.random.rand(config['Nt'], 2) * world_size

        for t in range(1, time_ep):

            # update position (unit step, wrapped into the world)
            pos[t, 0] = pos[t-1, 0] + np.cos(direction)
            pos[t, 1] = pos[t-1, 1] + np.sin(direction)
            pos[t] %= world_size

            # check reward
            # NOTE(review): get_encounters is assumed to return a boolean mask with
            # one entry per target — confirm against its definition.
            encounters = get_encounters(pos[t-1], pos[t], target_positions, world_size, config['at'])
            hit_indices = np.flatnonzero(encounters)

            found_target = False

            if hit_indices.size > 0:

                # Only one target is handled per step: keep a single scalar index so
                # that several simultaneous hits neither break the scalar position
                # assignments below nor move several targets to the same new spot.
                first_encounter = int(hit_indices[0])

                if config['destructive']:
                    # target is destroyed, sample position for a new target.
                    target_positions[first_encounter] = np.random.rand(2) * world_size
                else:
                    # ----KICK----
                    # If there was an encounter, we place the agent at a distance
                    # config['kick'] from the target, in a random direction.
                    kick_direction = np.random.rand() * 2 * np.pi
                    target = target_positions[first_encounter]
                    pos[t, 0] = (target[0] + config['kick'] * np.cos(kick_direction)) % world_size
                    pos[t, 1] = (target[1] + config['kick'] * np.sin(kick_direction)) % world_size
                    # ------------

                reward[ag] += 1
                found_target = True

            # The random draw is evaluated first so it is consumed on every step,
            # whether or not a target was found.
            if np.random.rand() > policy[internal_clock] or found_target:
                internal_clock = 0
                direction = np.random.rand() * 2 * np.pi
            else:
                internal_clock += 1

    return reward

In [8]:
# Settings dictionary passed to walk_from_policy.
config = dict(
    # double-exponential parameters (read only when model == 'double_exp')
    d_int=0.5,
    d_ext=40,
    p=0.5,
    # power-law parameter (read only when model == 'powerlaw')
    beta=0.8,
    # which step-length distribution to use: 'powerlaw' or 'double_exp'
    model='double_exp',
    # steps per agent and number of agents
    time_ep=3000,
    n=5000,
    # displacement from the target after a (non-destructive) encounter
    kick=10.0,
    # environment: number of targets, world size, target radius
    Nt=100,
    L=100,
    at=0.5,
    # whether a found target is relocated instead of kicking the agent
    destructive=False,
)

In [13]:
# Run the simulation with the settings in `config`; `results` receives the list
# returned by walk_from_policy (one entry per agent).
# NOTE(review): the saved progress bar for this cell shows 500 iterations while
# config['n'] is 5000 — the output looks stale; re-run on a fresh kernel to confirm.
results = walk_from_policy(config)

  0%|          | 0/500 [00:00<?, ?it/s]

In [10]:
# Draws config['Nt'] random 2-D positions scaled by config['L'], using the same
# expression walk_from_policy uses internally for its targets.
# NOTE(review): nothing visible in this notebook reads this variable — it looks
# like leftover scratch; confirm before removing.
target_positions = np.random.rand(config['Nt'],2) * config['L']