In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# --- Ensemble, box and time discretisation ---
N = 50      # number of particles simulated in parallel
L = 150     # side length of the periodic square box (positions are wrapped modulo L)
dt = 0.01   # integration time step
T = 100     # duration referenced by the `int(T/dt)` note on `time_steps`

# --- Particle kinematics ---
v0 = 20     # self-propulsion speed along the heading
w0 = 13.3   # magnitude of the angular velocity applied by an action (+w0 / -w0)

# --- Q-table dimensions ---
num_states = 2    # rows: predator on the left / on the right
num_actions = 2   # columns: turn left / turn right

In [3]:
# --- Training schedule ---
episodes = 1000        # number of training episodes
learning_steps = 20    # Q-table updates performed per episode
# Predator location. NOTE(review): the visible state and reward code work
# directly with the origin and do not read this variable.
pred_pos = np.array([0.0,0.0])
# Integration sub-steps per learning step. Derived from T and dt (defined in
# the constants cell) instead of being hard-coded, so it stays in sync if
# either is changed; round() guards against floating-point truncation.
time_steps = int(round(T/dt))

# --- Q-learning hyper-parameters ---
epsilon_0 = 0.995      # base of the exploration schedule: epsilon = epsilon_0**ep
alpha_0 = 0.8          # learning-rate schedule: alpha = alpha_0 / (alpha_0 + ep)
gamma = 0.99           # discount factor applied to the next-state value

# --- Noise parameters ---
D0 = 1                 # translational diffusion coefficient
# Rotational relaxation time entering the angular noise amplitude.
# NOTE(review): the origin of the 1.10688 constant is not shown in this
# notebook -- confirm and document.
tau_r = (1.10688**2)/(3*D0)

In [4]:
# Q-table: one row per state, one column per action, every entry starting at 1.0.
Q_matrix = np.ones(shape=(num_states, num_actions))

In [5]:
Q_matrix

array([[1., 1.],
       [1., 1.]])

# States -- Rows

### 0 - cross_prod is positive ---> predator is to the left of the particle's heading
### 1 - cross_prod is negative (or zero) ---> predator is to the right of the particle's heading


# Actions -- Columns

### 0 - (+w0) - turn left
### 1 - (-w0) - turn right

In [6]:
def cross_sign_from_orientation_and_position(positions, phi, pred_pos=None):
    """Classify on which side of each particle's heading the predator lies.

    For every particle the 2-D cross product between its heading unit vector
    ``(cos(phi), sin(phi))`` and the vector pointing from the particle to the
    predator is computed. A positive value means the predator is to the left
    of the heading; a negative or zero value is reported as "right".

    Parameters
    ----------
    positions : array-like, shape (n, 2)
        Particle x/y coordinates.
    phi : array-like
        Heading angles in radians; flattened to one angle per particle.
    pred_pos : array-like of 2 floats, optional
        Predator coordinates. Defaults to the origin, which reproduces the
        original behaviour of this function.

    Returns
    -------
    numpy.ndarray of int, shape (n,)
        0 where the predator is on the left, 1 where it is on the right
        (including the exactly-aligned case where the cross product is 0).
    """
    phi = np.asarray(phi).flatten()
    positions = np.asarray(positions)

    # Vector from each particle to the predator.
    if pred_pos is None:
        to_predator = -positions
    else:
        to_predator = np.asarray(pred_pos) - positions

    # z-component of heading x to_predator.
    cross = np.cos(phi) * to_predator[:, 1] - np.sin(phi) * to_predator[:, 0]

    return (cross <= 0).astype(int)


# 0 - predator is on the left
# 1 - predator is on the right

In [7]:
# Tabular Q-learning over an ensemble of N particles.
#
# Each episode starts from random positions/headings. Within an episode, every
# learning step (1) reads each particle's state, (2) picks an epsilon-greedy
# action, (3) integrates the dynamics for `time_steps` sub-steps with that
# action held fixed, and (4) updates Q_matrix in place using the squared
# distance from the origin as the reward.

# Noise amplitudes do not change during training, so compute them once
# instead of on every integration sub-step.
rot_noise_amp = np.sqrt(2*dt/tau_r)
trans_noise_amp = np.sqrt(2*D0*dt)

for ep in range(episodes):

    # Learning rate and exploration probability both decay with the episode index.
    alpha = alpha_0 / (alpha_0 + ep)
    epsilon = epsilon_0**ep

    # Uniform random positions in the box [-L/2, L/2)^2 and headings in [0, 2*pi).
    pos_ = (np.random.rand(N,2) - 0.5) * L
    phi_ = np.random.rand(N) * 2 * np.pi

    for _ in range(learning_steps):

        states_s0 = cross_sign_from_orientation_and_position(pos_ , phi_)

        # Epsilon-greedy: greedy action with probability 1-epsilon, else uniform random.
        best = np.argmax(Q_matrix[states_s0] , axis = 1)
        rand = np.random.rand(N)
        a = np.where(rand < (1-epsilon) , best , np.random.randint(0,2,N))

        # Action 0 -> +w0 (turn left), action 1 -> -w0 (turn right).
        ws = np.where(a==0 , w0 , -w0)
        # The chosen action is held for the whole integration window, so the
        # deterministic heading increment is constant across sub-steps.
        turn_increment = ws*dt

        for _ in range(time_steps):

            # Heading update: rotational noise plus the commanded turn.
            phi_ += (rot_noise_amp * (np.random.randn(N)) + turn_increment)
            direction_vectors = np.vstack((np.cos(phi_) , np.sin(phi_))).T

            # Position update: self-propulsion at speed v0 plus translational noise,
            # then wrap back into the periodic box.
            pos_ += direction_vectors * v0 * dt + trans_noise_amp*np.random.randn(N,2)
            pos_ = ((pos_ + L/2) % L) - L/2

        states_s1 = cross_sign_from_orientation_and_position(pos_ , phi_)
        # Reward: squared distance from the origin after the integration window.
        reward = np.sum(pos_**2,axis = 1)
        # Bootstrap values are taken from the table as it was before this step's updates.
        q_matrix_max = Q_matrix[states_s1].max(axis = 1)

        # Updates are applied one particle at a time on purpose: particles that
        # share a (state, action) entry see each other's updates sequentially.
        for i in range(N):
            Q_matrix[states_s0[i] , a[i]] += alpha * (reward[i] + gamma * (q_matrix_max[i]) - Q_matrix[states_s0[i] , a[i]])

    if ep%10 == 0:
        print(ep)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990


In [8]:
# Greedy (highest-Q) action learned for each state, i.e. each row of Q_matrix.
a1 = np.argmax(Q_matrix[0])   # state 0: predator on the left
a2 = np.argmax(Q_matrix[1])   # state 1: predator on the right

In [9]:
a1

1

In [10]:
a2

0

In [11]:
Q_matrix

array([[382116.03486762, 382574.47616531],
       [382572.22721697, 382138.61534273]])