In [1]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import pixiedust
from gym.envs.registration import register
from qiskit.aqua.components.optimizers import ADAM
from qiskit import *
from datetime import datetime

Pixiedust database opened successfully


In [2]:
# Load the IBM Quantum account credentials stored on this machine.
# NOTE(review): the circuits in this notebook are executed on the local Aer
# statevector simulator, so this call looks optional -- confirm before removing.
IBMQ.load_account()

<AccountProvider for IBMQ(hub='ibm-q', group='open', project='main')>

In [3]:
# Register a deterministic (non-slippery) 4x4 FrozenLake variant under its own id,
# with episodes capped at 100 steps.
# NOTE(review): presumably re-running this cell in the same kernel fails because the
# id is already registered -- confirm for the installed gym version.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.8196, # optimum = .8196; changing this seems to have no influence
)

In [4]:
# Instantiate the non-slippery FrozenLake environment; `env` is shared by every cell below.
env = gym.make("FrozenLakeNotSlippery-v0")

In [5]:
def get_numStates(env):
    """Return the number of distinct discrete observations of ``env``.

    Parameters
    ----------
    env : environment exposing ``observation_space``
        Either a single space with an ``n`` attribute, or a ``gym.spaces.Tuple``
        (or subclass) whose sub-spaces each expose ``n``.

    Returns
    -------
    int
        ``observation_space.n`` for a non-tuple space, otherwise the product of
        the sub-space sizes (``1`` for an empty tuple).
    """
    space = env.observation_space
    # isinstance (rather than an exact type comparison) also accepts Tuple subclasses.
    if not isinstance(space, gym.spaces.Tuple):
        return space.n
    total = 1
    for sub_space in space:
        # Plain Python ints cannot overflow, unlike a fixed-width numpy product.
        total *= int(sub_space.n)
    return total

In [6]:
def get_Statehash(env,state):
    """Map an observation of ``env`` to a single flat state index.

    Parameters
    ----------
    env : environment exposing ``observation_space``
    state : int or sequence
        The observation. For a non-tuple observation space it is returned
        unchanged. For a ``gym.spaces.Tuple`` (or subclass) it must have one
        component per sub-space.

    Returns
    -------
    The unchanged ``state`` for a non-tuple space, otherwise an ``int`` equal to
    the row-major (mixed-radix) index
    ``sum(state[i] * prod(n[i+1:]))`` over all components.

    Raises
    ------
    ValueError
        If a tuple observation does not have as many components as the space.
    """
    space = env.observation_space
    # isinstance (rather than an exact type comparison) also accepts Tuple subclasses.
    if not isinstance(space, gym.spaces.Tuple):
        return state
    if len(state) != len(space):
        raise ValueError(
            "state has %d components but the observation space has %d"
            % (len(state), len(space))
        )
    index = 0
    # Horner evaluation of the mixed-radix number: same value as summing
    # state[i] * prod(dims[i+1:]), without recomputing the suffix products.
    for component, sub_space in zip(state, space):
        index = index * int(sub_space.n) + int(component)
    return index
    

In [7]:
def QLearning(iterations,alpha, gamma, epsilon ,env):
    """Tabular epsilon-greedy Q-learning with shaped rewards.

    Parameters
    ----------
    iterations : int
        Number of episodes to run.
    alpha : float
        Learning rate of the temporal-difference update.
    gamma : float
        Discount factor.
    epsilon : float
        Probability of taking a uniformly random action.
    env : environment
        Must follow the classic gym API used here: ``reset()`` returns the
        state and ``step(action)`` returns ``(state, reward, done, info)``.

    Returns
    -------
    (returnlist, Q)
        ``returnlist[k]`` is the discounted shaped return
        ``sum_t gamma**t * r_t`` of episode ``k``; ``Q`` is the learned
        ``(num_states, num_actions)`` table.
    """
    returnlist = []
    num_actions = env.action_space.n
    num_states = get_numStates(env)
    Q = np.zeros((num_states,num_actions))
    for it in range(iterations):
        state = env.reset()
        R = 0.0
        discount = 1.0  # gamma**t for the current time step t

        done = False
        while not done:
            state_h = get_Statehash(env,state)
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # Break ties between equally good actions at random so the
                # all-zero initial table does not always pick action 0.
                candidates = np.where(Q[state_h] == np.max(Q[state_h]))[0]
                action = np.random.choice(candidates)
            statep, reward, done, info = env.step(action)
            # Reward shaping: small penalty per step, larger penalty when the
            # episode ends without reward, +1 for any non-zero reward.
            if reward == 0:
                reward = -0.2 if done else -0.01
            else:
                reward = 1.0
            # Discount later rewards more strongly (the previous
            # `R = gamma*R + reward` discounted the *earliest* rewards instead).
            R += discount*reward
            discount *= gamma
            statep_h = get_Statehash(env,statep)
            Q[state_h,action] += alpha*(reward + gamma*Q[statep_h].max() -Q[state_h,action])
            state = statep

        returnlist.append(R)
        if it%10000 == 0:
            # R is a float; '%d' would truncate it towards zero.
            print('Iteration %d, Reward: %.3f'%(it,R))

    return returnlist, Q
        
        
        



In [None]:
# Train the tabular baseline; keyword arguments make the hyper-parameters self-describing.
rl, Q = QLearning(iterations=100000, alpha=1e-3, gamma=0.99, epsilon=0.25, env=env)

In [None]:
#     backend_sim = Aer.get_backend('qasm_simulator')

#     result = execute(qc, backend_sim, shots = shots).result()

#     counts = result.get_counts()
#     expect = np.zeros(nqbits)
#     for c in counts :
#         for n in range(nqbits):
#             expect[n] += int(c[n])*counts[c]/shots
#     #print(counts)
#     return np.array(expect)

In [15]:
def get_Qvalues(s_list,theta):
    """Evaluate the parameterised 4-qubit circuit for a batch of states.

    For every state a circuit is built that (1) basis-encodes the state's
    4-bit binary representation with X gates, (2) applies a CNOT chain
    ``q[0]->q[1]->q[2]->q[3]``, and (3) applies one ``u3`` rotation per qubit.
    All circuits are run in a single job on the Aer statevector simulator.

    Parameters
    ----------
    s_list : sequence
        States to evaluate; each entry is converted with ``int``.
    theta : array-like with 12 entries
        Rotation angles, reshaped to ``(4, 3)``: one ``(theta, phi, lambda)``
        triple per qubit.

    Returns
    -------
    list of numpy.ndarray
        One length-4 array per state. Entry ``k`` is the total probability of
        the basis states whose index has bit ``k`` set. An empty ``s_list``
        yields an empty list without contacting the simulator.
    """
    nqbits = 4
    num_rep = 1

    # Nothing to simulate; avoid assembling an empty job.
    if len(s_list) == 0:
        return []

    # The angle layout does not depend on the state, so reshape once up front.
    theta = np.asarray(theta).reshape((nqbits,3))

    qc_list = []
    for s in s_list:
        q = QuantumRegister(nqbits)
        c = ClassicalRegister(nqbits)
        qc = QuantumCircuit(q,c)

        # Basis encoding: character j of the binary string drives qubit j.
        for j, bit in enumerate(np.binary_repr(int(s),nqbits)):
            if bit == '1':
                qc.x(q[j])

        for rep in range(num_rep):
            for i in range(1, nqbits):
                qc.cx(q[i-1], q[i])

            for i in range(nqbits):
                qc.u3( theta[i][0], theta[i][1], theta[i][2],q[i])

        qc_list.append(qc)

    backend_sim = Aer.get_backend('statevector_simulator')
    qobj = assemble(qc_list,backend_sim)
    result_list = backend_sim.run(qobj).result()

    expect_list = []
    for result in result_list.results:
        proba = abs(np.array(result.data.statevector))**2
        basis = np.arange(len(proba))
        # Summing the probabilities of all basis indices with bit k set gives
        # the same values the former per-character loop produced after np.flip.
        expect = np.array([proba[((basis >> k) & 1) == 1].sum() for k in range(nqbits)])
        expect_list.append(expect)

    return expect_list

In [16]:
def loss(theta, t):
    """Sum of squared errors between targets and the circuit's Q-values.

    ``t`` is a 2-D array whose rows are ``[target, state, action]``. The
    circuit is evaluated once for all states in column 1, and for each row the
    Q-value of the stored action is compared with the target.
    """
    predictions = get_Qvalues(t[:,1],theta)

    total = 0
    for row, q_row in zip(t, predictions):
        target = row[0]
        action = int(row[2])
        error = target - q_row[action]
        total += error**2
    return total

In [17]:
def _shape_reward(r, done):
    """Map the raw environment reward to the shaped reward used for training."""
    if r == 0:
        # Zero reward: heavier penalty if the episode ended, small step cost otherwise.
        return -0.2 if done else -0.01
    return 1.0


def DQN(iterations,alpha, gamma, epsilon ,env,N,batchsize):
    """Q-learning with experience replay, using the quantum circuit as Q-function.

    Parameters
    ----------
    iterations : int
        Number of training episodes; the parameters are optimised once after
        each episode.
    alpha : float
        Learning rate handed to the ADAM optimizer.
    gamma : float
        Discount factor of the bootstrapped target.
    epsilon : float
        Initial exploration probability; decays as ``epsilon * 0.99**it``.
    env : environment
        Must follow the classic gym API used here: ``reset()`` returns the
        state and ``step(action)`` returns ``(state, reward, done, info)``.
    N : int
        Size of the replay memory (filled with random-policy transitions first).
    batchsize : int
        Number of transitions sampled without replacement per update
        (must not exceed ``N``).

    Returns
    -------
    list
        Total shaped reward collected in each training episode.
    """
    nqbits = 4

    # Pre-fill the replay memory with transitions from a uniformly random policy.
    D = []
    while len(D) < N:
        s = env.reset()
        done = False
        while not done and len(D) < N:
            a = env.action_space.sample()
            s1, r, done,_ = env.step(a)
            r = _shape_reward(r, done)
            D.append((s,a,r,s1,done))
            s = s1

    total_rewards = []
    # Three rotation angles per qubit, drawn uniformly from [0, 2*pi).
    theta = 2*np.pi*np.random.random(3*nqbits)
    for it in range(iterations):
        s = env.reset()
        done = False
        total_reward = 0
        while not done:
            #env.render()
            if np.random.random() < epsilon*(0.99**it):
                a = env.action_space.sample()
            else:
                a = np.argmax(get_Qvalues([s],theta)[0])
            s1,r,done,_ = env.step(a)
            r = _shape_reward(r, done)
            total_reward += r
            # Replace the oldest transition so the memory keeps exactly N entries.
            D.pop(0)
            D.append((s,a,r,s1,done))
            # Advance to the new state; without this every action was chosen
            # from (and every transition stored for) the reset state.
            s = s1

        mB_ind = np.random.choice(range(N),size = batchsize,replace = False)
        batch = [D[j] for j in mB_ind]

        # Evaluate all next states in one simulator job instead of one job each.
        q_next = get_Qvalues([transition[3] for transition in batch], theta)

        t = []
        for (s_j, a_j, r_j, _, done_j), q_j in zip(batch, q_next):
            if done_j:
                y_j = r_j
            else:
                # Circuit outputs lie in [0, 1]; 2*q - 1 maps them to [-1, 1]
                # before they are combined with the reward.
                y_j = r_j + gamma*(2*max(q_j)-1)
            # Map the target back to the [0, 1] scale of the circuit outputs.
            y_j = y_j/2 + 0.5
            t.append([y_j,s_j,a_j])

        t = np.array(t)

        adam = ADAM(maxiter = 10, lr = alpha)
        start = datetime.now()
        print(theta)
        theta,_,_ = adam.optimize(3*nqbits,lambda x: loss(x,t), initial_point = theta)
        print(datetime.now()-start)
        print('Iteration : ', it, 'Total reward: ', total_reward)
        total_rewards.append(total_reward)

    return total_rewards

In [51]:
# Preview a geometric exploration schedule: start at 0.9 and multiply by 0.95 per step.
for it in range(50):
    epsilon_it = 0.9 * 0.95 ** it
    print(epsilon_it)

0.9
0.855
0.81225
0.7716374999999999
0.7330556249999999
0.6964028437499998
0.6615827015624999
0.6285035664843748
0.597078388160156
0.5672244687521482
0.5388632453145408
0.5119200830488138
0.486324078896373
0.4620078749515543
0.4389074812039766
0.41696210714377774
0.39611400178658884
0.3763083016972594
0.35749288661239637
0.33961824228177656
0.3226373301676877
0.30650546365930337
0.29118019047633814
0.27662118095252125
0.26279012190489515
0.2496506158096504
0.23716808501916783
0.22530968076820942
0.21404419672979894
0.20334198689330898
0.19317488754864354
0.18351614317121134
0.17434033601265078
0.16562331921201826
0.15734215325141732
0.14947504558884644
0.14200129330940411
0.13490122864393392
0.1281561672117372
0.12174835885115033
0.11566094090859282
0.10987789386316317
0.104383999170005
0.09916479921150476
0.09420655925092951
0.08949623128838302
0.08502141972396388
0.08077034873776569
0.07673183130087738
0.07289523973583352


In [18]:
# Train the circuit-based agent; keyword arguments make the hyper-parameters self-describing.
total_rewards = DQN(
    iterations=100,
    alpha=1e-1,
    gamma=0.99,
    epsilon=0.9,
    env=env,
    N=100,
    batchsize=10,
)

[3.56696459 5.41427603 2.08831096 0.2568303  1.76691217 2.0836809
 6.19940205 3.84015422 5.37933764 5.8995501  0.79210071 2.01212239]
0:00:04.692186
Iteration :  0 Total reward:  -0.26
[3.62516871 4.9778428  2.08831096 1.26518722 1.66034877 2.0836809
 6.05595446 3.84015422 5.37933764 4.89373813 0.39694386 2.01212239]
0:00:04.615707
Iteration :  1 Total reward:  -0.3
[ 4.63336862  4.84994164  2.24327792  2.2175636   2.02891895  1.76901703
  5.04926946  4.12350982  5.313981    4.7992705  -0.03352896  2.248401  ]
0:00:04.593378
Iteration :  2 Total reward:  -0.28
[ 4.0502229   4.1878645   2.24327792  1.40240464  1.81997219  1.99456092
  4.59627951  3.66454584  4.86312984  4.03563313 -0.75103727  1.6065768 ]
0:00:04.465707
Iteration :  3 Total reward:  -0.42000000000000004
[ 4.10638583  4.55034465  2.55107878  2.28087858  1.86678028  1.64883066
  5.03534788  4.2173304   5.44138749  4.51068098 -0.58035824  2.09304225]
0:00:04.750228
Iteration :  4 Total reward:  -0.21000000000000002
[ 4.021

0:00:04.305643
Iteration :  40 Total reward:  -0.25
[ 4.03263881 10.02367036  2.7487734   1.71381684  3.60267876  1.31189118
  4.55557286  3.92115199  5.96894379  3.97954714 -0.85203579  2.59848529]
0:00:04.434275
Iteration :  41 Total reward:  -0.23
[ 4.22586877 10.13268232  2.7487734   2.46829307  4.26562689  1.31189118
  3.81480364  4.57353029  5.96894379  4.03398454 -0.74770789  2.59848529]
0:00:04.231026
Iteration :  42 Total reward:  -0.27
[ 3.62201848  9.92599501  2.7487734   2.2601805   4.1508711   1.31189118
  3.86975011  4.52675256  5.96894379  3.82943371 -0.6295866   2.59848529]
0:00:04.311075
Iteration :  43 Total reward:  -0.27
[ 3.67866054  9.67522503  2.7487734   2.70819473  3.83088145  1.31189118
  3.57265799  4.8155346   5.96894379  3.6551297  -0.39111827  2.59848529]
0:00:04.421256
Iteration :  44 Total reward:  -0.27
[ 3.58946922  9.65498313  2.7487734   2.60112688  3.66692899  1.31189118
  3.63149445  5.04210456  5.96894379  3.59103275 -0.43077934  2.59848529]
0:00:

0:00:04.273721
Iteration :  80 Total reward:  -0.23
[ 4.47552453 10.62877492  2.7487734   1.98036855  0.37685266  1.31189118
  3.06651205  4.99053984  5.96894379  2.7747746  -3.27636345  2.59848529]
0:00:04.200032
Iteration :  81 Total reward:  -0.27
[ 4.14902651 10.87099431  2.7487734   2.63493858  0.70636778  1.31189118
  2.77876825  5.45785328  5.96894379  3.15474721 -3.62073851  2.59848529]
0:00:04.176406
Iteration :  82 Total reward:  -0.6700000000000003
[ 3.71419476 11.53193814  2.7487734   1.94142554  0.47857573  1.31189118
  2.71549749  5.62364293  5.96894379  3.57452704 -3.98324556  2.59848529]
0:00:04.249305
Iteration :  83 Total reward:  -0.5300000000000001
[ 3.76960874 11.1186918   2.7487734   1.10666512  0.09504174  1.31189118
  2.65138358  5.34519964  5.96894379  3.63140048 -3.32887383  2.59848529]
0:00:04.666791
Iteration :  84 Total reward:  -0.5200000000000001
[ 3.71273274 10.75601904  2.7487734   0.62834118 -0.3572405   1.31189118
  2.08689956  5.14364919  5.96894379 

In [None]:
# Dump the per-iteration total shaped rewards returned by DQN.
print(total_rewards)

In [None]:
# Scatter the total shaped reward of each DQN training iteration.
# Axis labels and a title let the figure stand on its own when the notebook is skimmed.
plt.plot(total_rewards,'x')
plt.xlabel('Iteration')
plt.ylabel('Total reward')
plt.title('DQN training: total reward per iteration')
plt.show()

In [36]:
# Manual exploration: start a fresh episode; the returned value is the initial state.
env.reset()

0

Action encoding:

0 - left
1 - down
2 - right
3 - up

State indices on the 4x4 grid (row by row):

0|1|2|3
4|5|6|7
8|9|10|11
12|13|14|15

In [41]:
# Manual exploration: draw the current map with the agent's position highlighted.
env.render()

  (Left)
[41mS[0mFFF
FHFH
FFFH
HFFG


In [42]:
# Manual exploration: take action 1 ("down" in the action legend) and show the
# (state, reward, done, info) tuple returned by the environment.
env.step(1)


(4, 0.0, False, {'prob': 1.0})

In [46]:
# Scratch arithmetic.
# NOTE(review): looks like a one-step backup gamma * value - step penalty with
# gamma = 0.99 and the -0.01 step cost used in training; 0.92 is presumably a
# next-state value -- confirm.
0.99*0.92 - 0.01

0.9008