In [1]:
import numpy as np
import gym
import matplotlib.pyplot as plt
import time

In [8]:
class Q_learning:
    """Tabular Q-learning on a discretized 4-dimensional observation space.

    Each of the four observation components is mapped to a bin index, and the
    action values are stored in a 5-dimensional table indexed by
    ``(bin0, bin1, bin2, bin3, action)``.

    Parameters
    ----------
    env : environment exposing ``reset()``, ``step(action)`` and
        ``action_space.n`` (the 5-tuple ``step`` / 2-tuple ``reset`` API).
    alpha : step size used in the Q-value update.
    gamma : discount factor applied to the bootstrapped next-state value.
    epsilon : probability of taking a random action in the epsilon-greedy phase.
    numberEpisodes : number of training episodes run by ``simulateEpisodes``.
    numberBins : sequence of 4 ints, number of grid points per observation component.
    lowerBounds : sequence of 4 lower bounds used to build the discretization grid.
    upperBound : sequence of 4 upper bounds used to build the discretization grid.
    """

    def __init__(self, env, alpha, gamma, epsilon, numberEpisodes, numberBins, lowerBounds, upperBound):
        self.env = env
        self.alpha = alpha
        self.epsilon = epsilon
        self.gamma = gamma
        self.numberEpisodes = numberEpisodes
        self.numberBins = numberBins
        self.lowerBounds = lowerBounds
        self.upperBound = upperBound
        self.actionNumber = env.action_space.n
        # sum of rewards collected in each training episode
        self.sumofRewards = []
        # Size the table from the constructor argument (not from a notebook
        # global), so the class works on a fresh kernel and with any bin counts.
        self.Qmatrix = np.random.uniform(
            low=0, high=1,
            size=(numberBins[0], numberBins[1], numberBins[2], numberBins[3], self.actionNumber))

    def returnIndexState(self, state):
        """Return the 4-tuple of bin indices that discretizes ``state``.

        For every component a grid of ``numberBins[i]`` evenly spaced points
        between ``lowerBounds[i]`` and ``upperBound[i]`` is built; the index is
        ``np.digitize(value, grid) - 1`` floored at 0, so values below the
        lower bound map to bin 0 and values at/above the upper bound map to
        the last bin ``numberBins[i] - 1``.
        """
        indices = []
        for dim in range(4):
            grid = np.linspace(self.lowerBounds[dim], self.upperBound[dim], self.numberBins[dim])
            indices.append(max(int(np.digitize(state[dim], grid)) - 1, 0))
        return tuple(indices)

    def selectAction(self, state, index):
        """Choose an action for ``state`` during training episode ``index``.

        * ``index < 500``: uniformly random action (pure exploration).
        * otherwise: epsilon-greedy; ties between maximal Q-values are broken
          uniformly at random.

        Side effect: once ``index > 7000`` every call multiplies
        ``self.epsilon`` by 0.9.
        """
        if index < 500:
            return np.random.choice(self.actionNumber)

        randomNumber = np.random.random()

        # NOTE(review): this method is called once per time step, so epsilon
        # shrinks per step (not per episode) after episode 7000 -- confirm
        # this is the intended schedule.
        if index > 7000:
            self.epsilon = 0.9*self.epsilon

        if randomNumber < self.epsilon:
            return np.random.choice(self.actionNumber)

        # greedy action; the state index is computed once and reused
        qValues = self.Qmatrix[self.returnIndexState(state)]
        return np.random.choice(np.where(qValues == np.max(qValues))[0])

    def simulateEpisodes(self):
        """Run ``numberEpisodes`` training episodes and update ``Qmatrix`` in place.

        The sum of rewards of every episode is appended to ``sumofRewards``.
        An episode ends when the environment reports ``terminated`` or
        ``truncated``. Only a true termination uses the non-bootstrapped
        target ``reward``; a truncation (time limit) still bootstraps from the
        next state, because the next state is not terminal.
        """
        for indexEpisode in range(self.numberEpisodes):
            rewardsEpisode = []
            (stateS, _) = self.env.reset()
            stateS = list(stateS)
            print('Episode {}'.format(indexEpisode))
            episodeOver = False
            while not episodeOver:
                stateSIndex = self.returnIndexState(stateS)
                actionA = self.selectAction(stateS, indexEpisode)
                (stateSprime, reward, terminated, truncated, _) = self.env.step(actionA)
                rewardsEpisode.append(reward)
                stateSprime = list(stateSprime)

                if terminated:
                    # terminal state has no future value
                    target = reward
                else:
                    QmaxPrime = np.max(self.Qmatrix[self.returnIndexState(stateSprime)])
                    target = reward + self.gamma*QmaxPrime

                entry = stateSIndex + (actionA,)
                error = target - self.Qmatrix[entry]
                self.Qmatrix[entry] = self.Qmatrix[entry] + self.alpha*error

                stateS = stateSprime
                # stop on truncation as well, otherwise a time-limited episode
                # would keep being stepped after the environment ended it
                episodeOver = terminated or truncated
            print("Sum of rewards {}".format(np.sum(rewardsEpisode)))
            self.sumofRewards.append(np.sum(rewardsEpisode))

    def simulateLearnedStrategy(self):
        """Play one rendered CartPole-v1 episode with the greedy policy.

        Creates a new ``'CartPole-v1'`` environment with ``render_mode='human'``
        and follows the greedy action of ``Qmatrix`` for at most 1000 steps or
        until the episode is terminated or truncated.

        Returns
        -------
        (obtainedRewards, env1) : list of per-step rewards and the environment,
            which the caller is expected to close.
        """
        env1 = gym.make('CartPole-v1', render_mode='human')
        (currentState, _) = env1.reset()
        env1.render()
        timeSteps = 1000
        # obtained rewards at every time step
        obtainedRewards = []

        for timeIndex in range(timeSteps):
            print(timeIndex)
            # select greedy actions (ties broken at random)
            qValues = self.Qmatrix[self.returnIndexState(currentState)]
            actionInStateS = np.random.choice(np.where(qValues == np.max(qValues))[0])
            currentState, reward, terminated, truncated, info = env1.step(actionInStateS)
            obtainedRewards.append(reward)
            time.sleep(0.05)
            if terminated or truncated:
                time.sleep(1)
                break
        return obtainedRewards, env1

    def simulateRandomStrategy(self):
        """Run 100 CartPole-v1 episodes with uniformly random actions.

        Each episode lasts at most 1000 steps or until it is terminated or
        truncated.

        Returns
        -------
        (sumRewardsEpisodes, env2) : list with the sum of rewards of every
            episode and the environment, which the caller is expected to close.
        """
        env2 = gym.make('CartPole-v1')
        # number of simulation episodes
        episodeNumber = 100
        # time steps in every episode
        timeSteps = 1000
        # sum of rewards in each episode
        sumRewardsEpisodes = []

        for episodeIndex in range(episodeNumber):
            rewardsSingleEpisode = []
            env2.reset()
            print(episodeIndex)
            for timeIndex in range(timeSteps):
                random_action = env2.action_space.sample()
                observation, reward, terminated, truncated, info = env2.step(random_action)
                rewardsSingleEpisode.append(reward)
                if terminated or truncated:
                    break
            sumRewardsEpisodes.append(np.sum(rewardsSingleEpisode))
        return sumRewardsEpisodes, env2


In [9]:
env=gym.make('CartPole-v1')
(state,_)=env.reset()

# here define the parameters for state discretization
# Copy the bound arrays: assigning into observation_space.high/low directly
# would silently modify the environment's own observation space.
upperBounds=env.observation_space.high.copy()
lowerBounds=env.observation_space.low.copy()
# the velocity components are replaced by finite limits chosen here
cartVelocityMin=-3
cartVelocityMax=3
poleAngleVelocityMin=-10
poleAngleVelocityMax=10
upperBounds[1]=cartVelocityMax
upperBounds[3]=poleAngleVelocityMax
lowerBounds[1]=cartVelocityMin
lowerBounds[3]=poleAngleVelocityMin

numberOfBinsPosition=30
numberOfBinsVelocity=30
numberOfBinsAngle=30
numberOfBinsAngleVelocity=30
numberOfBins=[numberOfBinsPosition,numberOfBinsVelocity,numberOfBinsAngle,numberOfBinsAngleVelocity]

# define the parameters
alpha=0.1
gamma=1
epsilon=0.2
numberEpisodes=15000

# create an object
Q1=Q_learning(env,alpha,gamma,epsilon,numberEpisodes,numberOfBins,lowerBounds,upperBounds)
# run the Q-Learning algorithm
Q1.simulateEpisodes()
# simulate the learned strategy
(obtainedRewardsOptimal,env1)=Q1.simulateLearnedStrategy()

plt.figure(figsize=(12, 5))
# plot the figure and adjust the plot parameters
# (the per-episode reward sums are stored in Q1.sumofRewards)
plt.plot(Q1.sumofRewards,color='blue',linewidth=1)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.yscale('log')
# save before show(): once the figure has been shown it may be closed, and
# savefig would then write an empty image
plt.savefig('convergence.png')
plt.show()


# close the environment
env1.close()
# get the sum of rewards
print(np.sum(obtainedRewardsOptimal))

# now simulate a random strategy
(obtainedRewardsRandom,env2)=Q1.simulateRandomStrategy()
env2.close()
plt.figure()
plt.hist(obtainedRewardsRandom)
plt.xlabel('Sum of rewards')
# NOTE(review): plt.hist plots counts by default, not percentages -- confirm the label
plt.ylabel('Percentage')
plt.savefig('histogram.png')
plt.show()

# run this several times and compare with a random learning strategy
(obtainedRewardsOptimal,env1)=Q1.simulateLearnedStrategy()
env1.close()

Episode 0
Sum of rewards 25.0
Episode 1
Sum of rewards 24.0
Episode 2
Sum of rewards 16.0
Episode 3
Sum of rewards 26.0
Episode 4
Sum of rewards 18.0
Episode 5
Sum of rewards 15.0
Episode 6
Sum of rewards 49.0
Episode 7
Sum of rewards 16.0
Episode 8
Sum of rewards 22.0
Episode 9
Sum of rewards 48.0
Episode 10
Sum of rewards 19.0
Episode 11
Sum of rewards 11.0
Episode 12
Sum of rewards 15.0
Episode 13
Sum of rewards 12.0
Episode 14
Sum of rewards 31.0
Episode 15
Sum of rewards 27.0
Episode 16
Sum of rewards 19.0
Episode 17
Sum of rewards 37.0
Episode 18
Sum of rewards 14.0
Episode 19
Sum of rewards 14.0
Episode 20
Sum of rewards 70.0
Episode 21
Sum of rewards 17.0
Episode 22
Sum of rewards 27.0
Episode 23
Sum of rewards 18.0
Episode 24
Sum of rewards 14.0
Episode 25
Sum of rewards 20.0
Episode 26
Sum of rewards 15.0
Episode 27
Sum of rewards 29.0
Episode 28
Sum of rewards 18.0
Episode 29
Sum of rewards 16.0
Episode 30
Sum of rewards 14.0
Episode 31
Sum of rewards 13.0
Episode 32
Sum of 

KeyboardInterrupt: 