In [1]:
import gym
import numpy as np

def run_episode(env, parameters, max_steps=200):
    """Play one episode with a linear threshold policy and return its reward.

    At every step the action is 0 when the product of ``parameters`` and the
    current observation is negative, and 1 otherwise.

    Args:
        env: Environment exposing ``reset()`` (returns the first observation)
            and ``step(action)`` (returns ``(observation, reward, done, info)``).
        parameters: Weight vector multiplied with each observation.
        max_steps: Upper bound on the number of steps played. Defaults to 200,
            the cap that used to be hard-coded here.

    Returns:
        The sum of the rewards collected until ``done`` is reported or
        ``max_steps`` steps have been played.
    """
    observation = env.reset()
    totalreward = 0
    for _ in range(max_steps):
        action = 0 if np.matmul(parameters, observation) < 0 else 1
        observation, reward, done, _info = env.step(action)
        totalreward += reward
        if done:
            break
    return totalreward
def randomsearch(env):
    """Random search over linear policy parameters.

    Draws up to 10000 parameter vectors uniformly from [-1, 1)^4, scores each
    one with a single call to ``run_episode`` and keeps the best, printing
    every improvement. Stops early once an episode reaches a reward of 200.

    Args:
        env: Environment forwarded unchanged to ``run_episode``.

    Returns:
        Tuple ``(bestparams, bestreward)``. ``bestparams`` is ``None`` when no
        candidate scored above 0.
    """
    bestparams = None
    bestreward = 0
    for _ in range(10000):
        parameters = np.random.rand(4) * 2 - 1
        reward = run_episode(env, parameters)
        if reward > bestreward:
            bestreward = reward
            bestparams = parameters
            print("better parameters:")
            print(bestparams)
            print("bestreward:")
            print(bestreward)
            # considered solved if the agent lasts 200 timesteps
            if reward == 200:
                break
    # Hand the result back so callers can reuse it instead of reading stdout.
    return bestparams, bestreward
def hillclimbing(env):
    """Hill climbing over linear policy parameters.

    Starts from a vector drawn uniformly from [-1, 1)^4. Each of up to 10000
    iterations perturbs the current vector with uniform noise scaled by
    ``noise_scaling``, scores the candidate with one call to ``run_episode``
    and adopts it when it beats the best reward so far, printing every
    improvement. Stops as soon as a candidate reaches a reward of 200.

    Args:
        env: Environment forwarded unchanged to ``run_episode``.

    Returns:
        Tuple ``(parameters, bestreward)`` holding the last adopted parameter
        vector and the reward it achieved (0 if no candidate was adopted).
    """
    noise_scaling = 0.1
    parameters = np.random.rand(4) * 2 - 1
    bestreward = 0
    for _ in range(10000):
        newparams = parameters + (np.random.rand(4) * 2 - 1) * noise_scaling
        reward = run_episode(env, newparams)

        if reward > bestreward:
            bestreward = reward
            parameters = newparams
            print("better parameters:")
            print(parameters)
            print("bestreward:")
            print(bestreward)
        # considered solved if the agent lasts 200 timesteps
        if reward == 200:
            break
    # Hand the result back so callers can reuse it instead of reading stdout.
    return parameters, bestreward

def main():
    """Run random search, then hill climbing, each on a fresh CartPole-v0 env."""
    randomsearch(gym.make("CartPole-v0"))
    print("Hill climbing:#######################")
    hillclimbing(gym.make("CartPole-v0"))


main()

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
better parameters:
[-0.60998024 -0.33600103  0.93460941  0.85103124]
bestreward:
178.0
better parameters:
[0.54676226 0.0980618  0.94049795 0.99577082]
bestreward:
200.0
Hill climbing:#######################
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
better parameters:
[-0.62657577 -1.0338177   0.9212405   0.24599501]
bestreward:
87.0
better parameters:
[-0.52862169 -1.05704701  1.0004506   0.30510291]
bestreward:
94.0
better parameters:
[-0.59276638 -0.98431025  1.05937998  0.38799486]
bestreward:
103.0
better parameters:
[-0.60155914 -0.90372944  1.02527021  0.4259723 ]
bestreward:
109.0
better parameters:
[-0.63487944 -0.80876368  0.97310605  0.52534324]
bestreward:
200.0


# http://chenrudan.github.io/blog/2016/09/04/cartpole.html

# http://chenrudan.github.io/blog/2016/08/03/reinforcementlearninglesssion7.html

In [2]:
import gym
import numpy as np
import time
def generate_episode(env, weight):
    """Roll out one training episode with ``choose_action`` and record each step.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``.
        weight: Policy weights handed to ``choose_action`` on every step.

    Returns:
        List of ``[observation, action, pi, reward]`` entries, one per step,
        where ``observation`` is the observation the action was chosen from.
    """
    trajectory = []
    state = env.reset()
    # At most 1001 steps: the loop ends on `done` or once the step count
    # exceeds 1000.
    for _ in range(1001):
        pi, action = choose_action(weight, state)
        next_state, reward, done, info = env.step(action)
        trajectory.append([state, action, pi, reward])
        state = next_state
        if done:
            break
    return trajectory

def evaluate_given_parameter_sigmoid(env, weight, render=True, delay=0.4, max_steps=1000):
    """Play one episode with the greedy sigmoid policy and return its reward.

    On each step the sigmoid of ``dot(weight, observation)`` is computed and
    action 1 is taken when it exceeds 0.5, otherwise action 0.

    Args:
        env: Environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is true, ``render()``.
        weight: Policy weight vector.
        render: Call ``env.render()`` before every step. Defaults to True,
            matching the previous unconditional rendering; pass False for
            headless runs.
        delay: Seconds to sleep before every step so a rendered episode can
            be watched. Defaults to 0.4 (the previous fixed pause); values
            <= 0 disable the pause.
        max_steps: Upper bound on the number of steps played (default 1000).

    Returns:
        Total reward collected, as a float.
    """
    observation = env.reset()
    total_reward = 0.
    for _ in range(max_steps):
        if render:
            env.render()
        weighted_sum = np.dot(weight, observation)
        pi = 1 / (1 + np.exp(-weighted_sum))
        action = 1 if pi > 0.5 else 0
        if delay > 0:
            time.sleep(delay)
        observation, reward, done, _info = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward

def monte_carlo_policy_gradient(env):
    """Train a linear sigmoid policy from whole episodes, then evaluate it once.

    Runs 2000 training episodes produced by ``generate_episode``. After each
    episode the weight vector is adjusted once per recorded step; finally the
    weights are printed and scored with ``evaluate_given_parameter_sigmoid``.

    Args:
        env: Environment forwarded to ``generate_episode`` and
            ``evaluate_given_parameter_sigmoid``.
    """
    learning_rate = -0.0001

    weight = np.random.rand(4)

    for _ in range(2000):
        for observation, _action, pi, reward in generate_episode(env, weight):
            # pi > 0.5 is the condition under which choose_action picks
            # action 1; the step direction is flipped otherwise (sign flip
            # for action 0 added by czc). The negative learning rate and the
            # negated observation cancel, so for action 1 the step is
            # +0.0001 * (1 - pi) * observation * reward.
            # NOTE(review): for a Bernoulli sigmoid policy the action-0 score
            # would be -pi * observation rather than -(1 - pi) * observation,
            # and `reward` here is the per-step reward, not the episode
            # return -- confirm this is intended before relying on it.
            direction = 1 if pi > 0.5 else -1
            weight += learning_rate*direction*(1-pi)*np.transpose(-observation)*reward

    print('weight', weight)
    cur_reward = evaluate_given_parameter_sigmoid(env, weight)
    print('Monte-Carlo policy gradient get reward', cur_reward)


def softmax(x):
    """Return the softmax of ``x`` taken along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (and the quotient from becoming ``nan``) for large inputs.

    Args:
        x: Array-like of scores; for 2-D input each column is normalised
            independently.

    Returns:
        ``numpy.ndarray`` of the same shape as ``x`` whose entries along
        axis 0 sum to 1. Empty input yields an empty array.
    """
    values = np.asarray(x)
    if values.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(values)
    exps = np.exp(values - np.max(values, axis=0))
    return exps / np.sum(exps, axis=0)
# Quick check: print the softmax of three sample scores.
scores = [3.0,1.0, 0.2]
print(softmax(scores))
def choose_actionbysoftmax(weight, observation):
    """Pick the greedy action of a two-action softmax policy.

    Each action has its own weight column; the two scores
    ``dot(weight[:, k], observation)`` are turned into probabilities with
    ``softmax``.

    Args:
        weight: 2-D array-like with one column per action (columns 0 and 1
            are used), e.g. shape ``(4, 2)``.
        observation: Observation vector matching the rows of ``weight``.

    Returns:
        Tuple ``(pi, action)`` where ``pi`` is the softmax probability of
        action 1 and ``action`` is 1 when ``pi > 0.5``, otherwise 0 -- the
        same convention ``choose_action`` uses.
    """
    weight = np.asarray(weight)
    # Column slices: the previous `weight[4, k]` indexed a single element
    # (out of bounds for four rows) instead of the per-action weight vector.
    weighted_sum1 = np.dot(weight[:, 0], observation)
    weighted_sum2 = np.dot(weight[:, 1], observation)
    epower = softmax([weighted_sum1, weighted_sum2])

    pi = epower[1]  # probability assigned to action 1
    action = 1 if pi > 0.5 else 0
    return pi, action

def choose_action(weight, observation):
    """Score an observation with a sigmoid policy and pick the greedy action.

    Args:
        weight: Policy weight vector.
        observation: Observation vector of matching length.

    Returns:
        Tuple ``(pi, action)``: ``pi`` is the sigmoid of
        ``dot(weight, observation)`` and ``action`` is 1 when ``pi`` exceeds
        0.5, otherwise 0.
    """
    logit = np.dot(weight, observation)
    pi = 1 / (1 + np.exp(-logit))  # sigmoid function
    return pi, (1 if pi > 0.5 else 0)
# 
import tensorflow as tf
from numpy.random import RandomState

def actor_critic_policy_gradientDeepQN(env):
    """Actor-critic training with a two-action softmax actor and a linear critic.

    The actor keeps one weight column per action (shape ``(4, 2)``) and acts
    greedily on the softmax of its two scores; the critic is a linear function
    of the observation. After 1000 training episodes the weights are printed
    and the greedy policy is evaluated once with
    ``evaluate_given_parameter_sigmoid``.

    Args:
        env: Environment exposing ``reset()`` and ``step(action)``; also
            passed to ``evaluate_given_parameter_sigmoid``.
    """
    gamma = 1

    # One weight column per action. (The original literal used a full-width
    # comma here, which is a syntax error.)
    p_weight = np.random.rand(4, 2)

    # weight for value function
    v_weight = np.random.rand(4)

    p_learning_rate = -0.0001
    v_learning_rate = -0.0001

    def _policy(observation):
        """Return (action probabilities, greedy action) for ``observation``."""
        logits = np.dot(observation, p_weight)
        exps = np.exp(logits - np.max(logits))  # shift keeps exp from overflowing
        probs = exps / np.sum(exps)
        return probs, (1 if probs[1] > 0.5 else 0)

    done = True

    for _ in range(1000):
        while 1:
            if done:
                # Start a new episode and prime the "previous step" variables.
                pre_observation = env.reset()
                pre_probs, pre_action = _policy(pre_observation)

                pre_phi = pre_observation
                pre_q = np.dot(v_weight, pre_phi)

            observation, reward, done, info = env.step(pre_action)

            probs, action = _policy(observation)

            phi = observation
            q = np.dot(v_weight, phi)

            delta = reward + gamma*q - pre_q

            # Softmax score function: d log pi(a|s) / dW = outer(s, onehot(a) - probs).
            # The negative learning rate and the negated observation cancel,
            # matching the sign convention of actor_critic_policy_gradient.
            score = -pre_probs
            score[pre_action] += 1
            p_weight += p_learning_rate*np.outer(-pre_observation, score)*pre_q

            # NOTE(review): with a negative v_learning_rate this moves
            # v_weight against delta * pre_phi, the opposite of the usual
            # TD(0) step; kept identical to actor_critic_policy_gradient --
            # confirm the intended sign.
            v_weight += v_learning_rate*delta*np.transpose(pre_phi)

            pre_probs = probs
            pre_observation = observation
            pre_q = q
            pre_phi = phi
            pre_action = action

            if done:
                break
    print('p_weight', p_weight)
    print('v_weight', v_weight)
    # For two actions, P(action 1) = sigmoid((w_1 - w_0) . s), so the column
    # difference is the weight vector the sigmoid evaluator expects.
    cur_reward = evaluate_given_parameter_sigmoid(env, p_weight[:, 1] - p_weight[:, 0])
    print('Actor critic policy gradient get reward', cur_reward)
    def actor_critic_policy_gradient(env):
    gamma = 1

    p_weight = np.random.rand(4)
        
    #weight for value function
    v_weight = np.random.rand(4)

    p_learning_rate = -0.0001
    v_learning_rate = -0.0001

    done = True

    for iiter in range(1000):

        t = 0
        while 1:
            if done:
                #print ('start new training...')
                #print ('p_weight', p_weight)
                #print ('v_weight', v_weight)

                pre_observation = env.reset()
                pre_pi, pre_action = choose_action(p_weight, pre_observation)
        
                pre_phi = pre_observation
                pre_q = np.dot(v_weight, pre_phi)

            #env.render()

            observation, reward, done, info = env.step(pre_action)

            pi, action = choose_action(p_weight, observation)
            
            phi = observation
            q = np.dot(v_weight, phi)

            delta = reward + gamma*q - pre_q
            p_weight += p_learning_rate*(1-pre_pi)*np.transpose(-pre_observation)*pre_q
            #if pi > 0.5:
            #    p_weight += p_learning_rate*(1-pre_pi)*np.transpose(-pre_observation)*pre_q
            #else:
            #    p_weight += p_learning_rate*(-1)*(1-pre_pi)*np.transpose(-pre_observation)*pre_q
            

            v_weight += v_learning_rate*delta*np.transpose(pre_phi)

            pre_pi = pi
            pre_observation = observation
            pre_q = q
            pre_phi = phi
            pre_action = action

            t += 1
            if done:
                break
    print ('p_weight', p_weight)
    print ('v_weight', v_weight)
    cur_reward = evaluate_given_parameter_sigmoid(env, p_weight)
    print ('Actor critic policy gradient get reward', cur_reward)

# Create the CartPole-v0 environment used by the run below.
env = gym.make('CartPole-v0')

# Disabled alternative: actor-critic run bracketed by env.monitor start/close calls.
#env.monitor.start('cartpole-hill/', force=True)
#actor_critic_policy_gradient(env)
#env.monitor.close()

# Train the Monte-Carlo policy-gradient agent; the function prints the learned
# weights and the reward of one evaluation episode.
monte_carlo_policy_gradient(env)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
weight [ 1.09364854 -0.76305567  0.67415771  2.83642659]
Monte-Carlo policy gradient get reward 180.0


#https://www.jianshu.com/p/2ccbab48414b
#https://github.com/princewen/tensorflow_practice/tree/master/

In [3]:
import numpy as np
def printa():
    """Print four samples drawn with ``np.random.rand``."""
    print(np.random.rand(4))


printa()

[0.1855881  0.82261147 0.96411283 0.73660345]
