In [12]:
import numpy as np
import matplotlib.pyplot as plt
import gym
import pixiedust
from gym.envs.registration import register
from qiskit.aqua.components.optimizers import ADAM
from qiskit.aqua.components.variational_forms.ry import RY
from qiskit import *
from datetime import datetime

In [2]:
# Load the IBM Quantum account; the recorded output shows the resulting provider.
# NOTE(review): the cells visible in this notebook only use the local Aer
# simulator, so this call may be unnecessary — confirm before removing.
IBMQ.load_account()

<AccountProvider for IBMQ(hub='ibm-q', group='open', project='main')>

In [3]:
# Register a 4x4 FrozenLake variant with slipping disabled under a custom id,
# capped at 100 steps per episode.
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name' : '4x4', 'is_slippery': False},
    max_episode_steps=100,
    reward_threshold=0.8196, # optimum = .8196; changing this seems to have no influence
)

In [4]:
# Instantiate the environment under the custom id 'FrozenLakeNotSlippery-v0'.
env = gym.make("FrozenLakeNotSlippery-v0")

In [5]:
def get_numStates(env):
    """Return the number of discrete states exposed by ``env``.

    When the observation space is not exactly a ``gym.spaces.Tuple`` its ``n``
    attribute is returned as-is. For a ``Tuple`` space the result is the
    product of the ``n`` attributes of all its sub-spaces.
    """
    obs_space = env.observation_space
    if type(obs_space) != gym.spaces.tuple.Tuple:
        return obs_space.n
    # One entry per sub-space; the joint state count is their product.
    sizes = np.array([sub.n for sub in obs_space])
    return sizes.prod()

In [6]:
def get_Statehash(env,state):
    """Collapse ``state`` into a single index.

    For any observation space that is not exactly a ``gym.spaces.Tuple`` the
    state is returned unchanged. For a ``Tuple`` space the components of
    ``state`` are combined as a mixed-radix number: component ``k`` is
    weighted by the product of the sizes of all sub-spaces after it, and the
    last component is added with weight one.
    """
    obs_space = env.observation_space
    if type(obs_space) != gym.spaces.tuple.Tuple:
        return state
    sizes = np.array([sub.n for sub in obs_space])
    # Contribution of every component except the last one.
    leading = sum(
        state[k] * sizes[k + 1:].prod()
        for k in range(len(sizes) - 1)
    )
    return leading + state[-1]
    

In [7]:
def QLearning(iterations,alpha, gamma, epsilon ,env):
    """Tabular epsilon-greedy Q-learning with shaped rewards.

    Parameters
    ----------
    iterations : int
        Number of episodes to run.
    alpha : float
        Step size of the Q-table update.
    gamma : float
        Discount factor used in the update target.
    epsilon : float
        Probability of taking a uniformly random action instead of a greedy one.
    env : gym environment
        Must provide ``reset()``, ``step(action)`` returning
        ``(state, reward, done, info)`` and a discrete ``action_space``.

    Returns
    -------
    (returnlist, Q)
        ``returnlist`` holds one accumulated return per episode and ``Q`` is
        the learned table of shape ``(num_states, num_actions)``.

    Notes
    -----
    The environment reward is replaced before use: any non-zero reward becomes
    1.0, a zero reward becomes -0.2 on the final step of an episode and -0.01
    otherwise.
    """
    returnlist = []
    num_actions = env.action_space.n
    num_states = get_numStates(env)
    Q = np.zeros((num_states,num_actions))
    for it in range(iterations):
        state = env.reset()
        R = 0

        done = False
        while not done:
            state_h = get_Statehash(env,state)
            if np.random.random() < epsilon:
                action = env.action_space.sample()
            else:
                # Break ties between equally valued actions at random.
                candidates = np.where(Q[state_h] == np.max(Q[state_h]))[0]
                action = np.random.choice(candidates)
            statep, reward, done, info = env.step(action)
            if reward == 0:
                if done:
                    reward = -0.2
                else:
                    reward = -0.01
            else:
                reward = 1.0
            # NOTE(review): the running value is scaled by gamma before each new
            # reward is added, so earlier rewards carry the higher powers of
            # gamma — confirm this weighting is the intended diagnostic.
            R *= gamma
            R += reward
            statep_h = get_Statehash(env,statep)
            Q[state_h,action] += alpha*(reward + gamma*Q[statep_h].max() -Q[state_h,action])
            state = statep

        returnlist.append(R)
        if it%10000 == 0:
            # R is fractional; '%d' would truncate it toward zero (-0.2 -> 0).
            print('Iteration %d, Reward: %.3f'%(it,R))

    return returnlist, Q
        
        
        



In [None]:
# Tabular baseline: 100000 episodes, alpha=1e-3, gamma=0.99, epsilon=0.25.
rl, Q = QLearning(100000,1e-3,0.99,0.25,env)

In [None]:
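# Unused shot-based variant (qasm_simulator + measurement counts) of the
# expectation-value computation, kept commented out for reference only.
#     backend_sim = Aer.get_backend('qasm_simulator')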

#     result = execute(qc, backend_sim, shots = shots).result()

#     counts = result.get_counts()
#     expect = np.zeros(nqbits)
#     for c in counts :
#         for n in range(nqbits):
#             expect[n] += int(c[n])*counts[c]/shots
#     #print(counts)
#     return np.array(expect)

In [30]:
def get_Qvalues(s_list,theta):
    """Evaluate the 4-qubit variational circuit for every state in ``s_list``.

    Each state index is basis-encoded with X gates (bit ``k`` of the index,
    least-significant bit first, flips qubit ``k``), followed by a depth-1
    ``RY`` variational form with linear CX entanglement parameterised by
    ``theta``. All circuits are run in a single job on the Aer statevector
    simulator.

    Parameters
    ----------
    s_list : sequence
        State indices; each is converted with ``int`` and must fit in 4 bits.
    theta : array-like
        Rotation angles for the variational form; flattened before use.
        Callers in this notebook pass 8 values — presumably the number the
        depth-1 form expects; verify against the RY documentation.

    Returns
    -------
    list of numpy.ndarray
        One length-4 array per input state, in input order. Entry ``n`` is
        the probability that qubit ``n`` is in state 1.
    """
    nqbits = 4
    # ndarray.flatten() returns a copy, so its result has to be assigned.
    theta = np.asarray(theta).flatten()

    qc_list = []
    for s in s_list:
        q = QuantumRegister(nqbits)
        c = ClassicalRegister(nqbits)
        qc = QuantumCircuit(q,c)

        # binary_repr is most-significant-bit first, hence the reversed qubit index.
        d = np.binary_repr(int(s),nqbits)
        for j,i in enumerate(d):
            if i == '1':
                qc.x(q[nqbits - j - 1])

        qc += RY(nqbits,1, [[0,1],[1,2],[2,3]], 'linear', None, 'cx').construct_circuit(theta,q)
        qc_list.append(qc)

    backend_sim = Aer.get_backend('statevector_simulator')
    qobj = assemble(qc_list,backend_sim)
    result_list = backend_sim.run(qobj).result()

    # bit_table[c, n] is bit n of basis-state index c, so proba @ bit_table sums
    # the probabilities of all basis states in which qubit n is 1.
    basis_index = np.arange(2**nqbits)
    bit_table = (basis_index[:, None] >> np.arange(nqbits)) & 1

    expect_list = []
    for result in result_list.results:
        proba = abs(np.array(result.data.statevector))**2
        expect_list.append(proba @ bit_table)

    return expect_list

In [31]:
def loss(theta, t):
    """Sum of squared errors between targets and the circuit's Q-values.

    Every row of ``t`` is read as ``(target, state, action)``: column 1 is
    passed to ``get_Qvalues`` together with ``theta``, and for each row the
    value selected by the (integer-cast) action in column 2 is compared with
    the target in column 0.
    """
    q_rows = get_Qvalues(t[:,1],theta)
    return sum(
        (row[0] - q_rows[k][int(row[2])])**2
        for k, row in enumerate(t)
    )

In [34]:
def _shape_reward(r, done):
    """Map the raw environment reward to the shaped reward used for training.

    A non-zero reward becomes 1.0; a zero reward becomes -0.2 when the episode
    ended on this step and -0.01 otherwise.
    """
    if r != 0:
        return 1.0
    return -0.2 if done else -0.01


def DQN(iterations,alpha, gamma, epsilon ,env,N,batchsize):
    """Q-learning with experience replay, using the variational circuit as Q-function.

    Parameters
    ----------
    iterations : int
        Number of training episodes.
    alpha : float
        Learning rate handed to the ADAM optimizer.
    gamma : float
        Discount factor of the bootstrap target.
    epsilon : float
        Initial exploration probability; multiplied by ``0.99**it`` in episode ``it``.
    env : gym environment
        Must provide ``reset()``, ``step(action)`` returning
        ``(state, reward, done, info)`` and ``action_space.sample()``.
    N : int
        Size of the replay memory; it is pre-filled with random-policy
        transitions and then kept at this size (oldest entry dropped first).
    batchsize : int
        Number of transitions sampled without replacement after every episode.

    Returns
    -------
    list
        Sum of shaped rewards collected in each episode.
    """
    nqbits = 4

    # Pre-fill the replay memory with N transitions from a uniformly random policy.
    D = []
    while len(D) < N:
        s = env.reset()
        done = False
        while not done and len(D) < N:
            a = env.action_space.sample()
            s1, r, done,_ = env.step(a)
            r = _shape_reward(r, done)
            D.append((s,a,r,s1,done))
            s = s1

    total_rewards = []
    theta = 2*np.pi*np.random.random(2*nqbits)
    for it in range(iterations):
        s = env.reset()
        done = False
        total_reward = 0
        while not done:
            if np.random.random() < epsilon*(0.99**it):
                a = env.action_space.sample()
            else:
                a = np.argmax(get_Qvalues([s],theta)[0])
            s1,r,done,_ = env.step(a)
            r = _shape_reward(r, done)
            total_reward += r
            # Record the transition BEFORE advancing the state; advancing first
            # would store the successor state in both the s and s1 slots.
            D.pop(0)
            D.append((s,a,r,s1,done))
            s = s1

        # Sample the minibatch from the list itself so the tuple fields keep
        # their types (np.array(D) would coerce everything to float).
        mB_ind = np.random.choice(range(N),size = batchsize,replace = False)
        mB = [D[k] for k in mB_ind]

        # One simulator job for all successor states instead of one per transition.
        next_Qs = get_Qvalues([transition[3] for transition in mB],theta)

        t = []
        for (s_j, a_j, r_j, _, done_j), q_next in zip(mB, next_Qs):
            if done_j:
                y_j = r_j
            else:
                # Circuit outputs lie in [0, 1]; 2*q - 1 rescales them to [-1, 1].
                y_j = r_j + gamma*(2*max(q_next)-1)
            # Map the target back with the inverse scaling (y/2 + 0.5) so it is
            # comparable with the raw circuit output used in loss().
            y_j = y_j/2 + 0.5
            t.append([y_j,s_j,a_j])

        t = np.array(t)

        # A fresh optimizer (10 iterations) is fitted to this minibatch each episode.
        adam = ADAM(maxiter = 10, lr = alpha)
        start = datetime.now()
        print(theta)
        theta,_,_ = adam.optimize(2*nqbits,lambda x: loss(x,t), initial_point = theta)
        print(datetime.now()-start)
        print('Iteration : ', it, 'Total reward: ', total_reward)
        total_rewards.append(total_reward)

    return total_rewards

In [None]:
# Train the circuit-based agent: 100 episodes, alpha=0.1, gamma=0.99,
# initial epsilon=0.9, replay memory N=100, minibatch size 10.
total_rewards = DQN(100,1e-1,0.99,0.9,env,100,10)



[5.84927497 1.14986342 3.54230691 6.18838833 0.47717541 0.05057074
 5.60634159 2.24829141]
0:00:18.199401
Iteration :  0 Total reward:  -0.3
[ 5.87111368  2.13795325  4.28703621  7.09273692  1.41072874 -0.9041351
  4.88065385  1.91895234]
0:00:14.460910
Iteration :  1 Total reward:  -0.3
[ 6.60866871  2.67624726  4.85991204  7.89027748  2.15594666 -0.51780066
  4.38572373  1.18437063]
0:00:09.740103
Iteration :  2 Total reward:  -0.27
[ 6.3693413   2.97792161  4.627046    7.93893425  2.0935005  -0.23874476
  4.72314689  1.21801426]
0:00:09.843921
Iteration :  3 Total reward:  -0.32
[ 7.01630895  2.02484246  4.31866509  7.80788782  2.82180655 -0.29014308
  5.02293523  1.80842972]
0:00:09.458441
Iteration :  4 Total reward:  -0.24000000000000002
[ 6.39863114  2.5844638   4.90843465  7.93371349  2.82170724 -0.91635717
  4.50885008  1.55067347]
0:00:09.424773
Iteration :  5 Total reward:  -0.37
[ 6.28935226  2.53075192  5.37382173  8.01164945  3.19535886 -0.31361686
  4.01286654  1.3932746

[ 6.16140621  3.04663121  2.86637034  8.2442972   3.23699321 -0.15263937
  2.74666189  0.85918534]
0:00:13.483129
Iteration :  54 Total reward:  -0.23
[ 6.24444912  3.22061461  2.81758004  8.40912667  3.24852848 -0.05054925
  3.0742939   1.12283147]
0:00:13.152826
Iteration :  55 Total reward:  -0.27
[6.07167529 3.2196529  3.09855167 8.49465971 3.10976333 0.05197543
 3.04695314 0.95714017]
0:00:10.562608
Iteration :  56 Total reward:  -0.27
[5.28360549 4.14212861 2.18315678 7.61969607 2.41568409 0.98299681
 2.34025976 1.7782641 ]
0:00:10.361359
Iteration :  57 Total reward:  -0.6600000000000003
[5.40190179 4.46277998 2.26064687 7.97698436 2.13480838 0.42936842
 2.42610668 1.58952314]
0:00:09.869981
Iteration :  58 Total reward:  -0.23
[ 6.10211121  3.79941716  1.60486521  8.20801225  3.0521088  -0.44174778
  1.72364788  1.47658913]
0:00:11.331196
Iteration :  59 Total reward:  -0.43000000000000005
[ 6.39693297  3.68271229  1.80430785  7.71493437  3.30448428 -0.58936548
  1.56277835  1.

In [None]:
# Raw per-episode totals of the shaped reward returned by DQN.
print(total_rewards)

In [None]:
# Per-episode total of the shaped reward from the DQN run; labelled so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(total_rewards,'x')
ax.set(xlabel='Episode', ylabel='Total shaped reward', title='DQN on FrozenLakeNotSlippery-v0')
plt.show()