In [0]:
import numpy as np 
from matplotlib import pyplot as plt



# Question 2

In [0]:
class GridWorldClass():
    """5x5 gridworld with two teleporting states, solved by synchronous value backups.

    Any action taken in state A (``Aposition``) yields +10 and moves the agent
    to ``APrimePosition``; any action taken in state B (``BPosition``) yields
    +5 and moves it to ``BPrimePosition``. In every other state a move yields
    0, except that a move that would leave the grid yields -1 and keeps the
    agent where it is.
    """

    def __init__(self):
        """Configure grid size, discounting, stopping criteria, special states and actions."""
        self.worldSize = 5
        self.discountFactor = 0.9
        # Upper bound on the number of sweeps run by the simulate* methods.
        self.totalEpochs = 1000
        # Convergence threshold on the summed absolute change between two sweeps.
        self.error = 1e-9
        self.Aposition = [0, 1]
        self.APrimePosition = [4, 1]
        self.BPosition = [0, 3]
        self.BPrimePosition = [2, 3]
        # Actions are [row delta, column delta]; row 0 is the top of the grid.
        self.right = [0, 1]
        self.left = [0, -1]
        self.up = [-1, 0]
        self.down = [1, 0]
        self.actionList = [self.right, self.left, self.up, self.down]

    def _stepFunction(self, action, currentState):
        """Apply ``action`` in ``currentState`` and return ``(reward, nextState)``.

        Parameters
        ----------
        action : sequence of two ints
            ``[row delta, column delta]``.
        currentState : sequence of two ints
            ``[row, column]``; lists and tuples are both accepted.

        Returns
        -------
        tuple
            ``(reward, nextState)`` where ``nextState`` is a fresh list, so
            mutating it never alters the configured special positions.
        """
        state = list(currentState)

        # Special states ignore the chosen action entirely.
        if state == list(self.Aposition):
            return 10, list(self.APrimePosition)
        if state == list(self.BPosition):
            return 5, list(self.BPrimePosition)

        row = state[0] + action[0]
        column = state[1] + action[1]
        if 0 <= row < self.worldSize and 0 <= column < self.worldSize:
            return 0, [row, column]

        # Bumping into the border: penalty, and the agent does not move.
        return -1, state

    def _backedUpValues(self, state, values):
        """Return ``reward + discount * value(next state)`` for each action, in ``actionList`` order."""
        backups = []
        for action in self.actionList:
            reward, nextState = self._stepFunction(action, state)
            backups.append(reward + self.discountFactor * values[nextState[0], nextState[1]])
        return backups

    def _iterateValues(self, combine):
        """Run synchronous sweeps until the value grid stops changing.

        ``combine`` reduces the list of per-action backed-up values of one
        state to that state's new value. Prints the initial (all-zero) grid
        and the final grid rounded to one decimal, and returns the unrounded
        final grid. At most ``totalEpochs`` sweeps are performed; a
        ``RuntimeWarning`` is issued if the threshold was not reached by then.
        """
        import warnings

        values = np.zeros((self.worldSize, self.worldSize))
        print(values)

        converged = False
        for _ in range(self.totalEpochs):
            # Synchronous update: every new value is computed from the previous sweep only.
            updated = np.zeros((self.worldSize, self.worldSize))
            for row in range(self.worldSize):
                for column in range(self.worldSize):
                    updated[row, column] = combine(self._backedUpValues([row, column], values))

            change = np.sum(np.abs(updated - values))
            values = updated
            if change < self.error:
                converged = True
                break

        if not converged:
            warnings.warn(
                "value iteration stopped after totalEpochs sweeps without reaching the error threshold",
                RuntimeWarning,
            )

        print(np.round(values, 1))
        return values

    def simulateLinear(self):
        """Evaluate the policy that picks every action in ``actionList`` with equal probability.

        Returns
        -------
        numpy.ndarray
            ``worldSize x worldSize`` grid of state values (also printed, rounded).
        """
        probability = 1.0 / len(self.actionList)

        def expectedBackup(backups):
            total = 0.0
            for backup in backups:
                total += probability * backup
            return total

        return self._iterateValues(expectedBackup)

    def simulateOptimal(self):
        """Compute state values using the best (maximum) backed-up action value in every state.

        Returns
        -------
        numpy.ndarray
            ``worldSize x worldSize`` grid of state values (also printed, rounded).
        """
        return self._iterateValues(max)
		 


In [3]:
# Instantiate the gridworld and run the equal-weight (0.25 per action) value sweep.
# The recorded output below is 5x5, i.e. it was produced with the first
# GridWorldClass definition in this notebook.
gridWorldObject = GridWorldClass()
gridWorldObject.simulateLinear()

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[[ 3.3  8.8  4.4  5.3  1.5]
 [ 1.5  3.   2.3  1.9  0.5]
 [ 0.1  0.7  0.7  0.4 -0.4]
 [-1.  -0.4 -0.4 -0.6 -1.2]
 [-1.9 -1.3 -1.2 -1.4 -2. ]]


# Question 4

In [4]:
# Reuse the same gridworld instance, this time taking the maximum over actions in each backup.
gridWorldObject.simulateOptimal()

[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]
[[22.  24.4 22.  19.4 17.5]
 [19.8 22.  19.8 17.8 16. ]
 [17.8 19.8 17.8 16.  14.4]
 [16.  17.8 16.  14.4 13. ]
 [14.4 16.  14.4 13.  11.7]]


In [0]:
import numpy as np 
from matplotlib import pyplot as plt



class GridWorldClass():
    """Square gridworld with terminal top-left and bottom-right corners.

    Every move from a non-terminal state costs -1; a move that would leave the
    grid keeps the agent in place. Terminal states yield 0 and never change.
    Provides iterative policy evaluation, policy iteration and value iteration.

    Note: this notebook also defines an earlier class with the same name;
    running this cell rebinds ``GridWorldClass`` to this definition.
    """

    def __init__(self):
        """Configure grid size, discount factor, convergence threshold and actions."""
        self.worldSize = 4
        self.discountFactor = 0.9

        # Actions are [row delta, column delta]; row 0 is the top of the grid.
        self.right = [0, 1]
        self.left = [0, -1]
        self.up = [-1, 0]
        self.down = [1, 0]
        # Sweeps stop once the largest single-state change drops below theta.
        self.theta = 1e-4

        # Policies store indices into this list: 0=left, 1=up, 2=right, 3=down.
        self.actionList = [self.left, self.up, self.right, self.down]

    def _isTerminal(self, state):
        """Return True for the top-left and bottom-right corner states."""
        last = self.worldSize - 1
        return (state[0] == 0 and state[1] == 0) or (state[0] == last and state[1] == last)

    def _stepFunction(self, action, currentState):
        """Apply ``action`` in ``currentState`` and return ``(reward, nextState)``.

        Terminal states return ``(0.0, currentState)`` for every action. All
        other moves return a reward of ``-1.0``; a move off the grid leaves the
        state unchanged.
        """
        if self._isTerminal(currentState):
            return 0.0, currentState

        nextState = [currentState[0] + action[0], currentState[1] + action[1]]
        insideGrid = (0 <= nextState[0] < self.worldSize
                      and 0 <= nextState[1] < self.worldSize)
        if not insideGrid:
            nextState = currentState

        return -1.0, nextState

    def _actionValues(self, currentRow, currentColumn, V):
        """Return the one-step backed-up value of each action, in ``actionList`` order."""
        allValues = np.zeros(len(self.actionList))
        for index, action in enumerate(self.actionList):
            reward, nextState = self._stepFunction(action, [currentRow, currentColumn])
            allValues[index] = reward + self.discountFactor * V[nextState[0], nextState[1]]
        return allValues

    def policyEvaluation(self, policy):
        """Evaluate a deterministic policy by in-place iterative sweeps.

        Parameters
        ----------
        policy : 2-D array-like
            ``policy[row, column]`` is the index into ``actionList`` of the
            action taken in that state.

        Returns
        -------
        numpy.ndarray
            The state-value grid, which is also stored in ``self.newStateValues``.
        """
        self.newStateValues = np.zeros((self.worldSize, self.worldSize))

        while True:
            delta = 0.0
            for row in range(self.worldSize):
                for column in range(self.worldSize):
                    previous = self.newStateValues[row, column]
                    action = self.actionList[int(policy[row, column])]
                    reward, nextState = self._stepFunction(action, [row, column])
                    # In-place update: later states in this sweep already see the new value.
                    updated = reward + self.discountFactor * self.newStateValues[nextState[0], nextState[1]]
                    self.newStateValues[row, column] = updated
                    delta = max(delta, np.abs(previous - updated))

            if delta < self.theta:
                return self.newStateValues

    def oneStepLookAheaFunction(self, currentRow, currentColumn, V):
        """Return the index of the greedy action in a state with respect to ``V``.

        Ties are resolved in favour of the lowest action index (``np.argmax``).
        """
        return np.argmax(self._actionValues(currentRow, currentColumn, V))

    def policyImprovement(self, initialPolicy=None):
        """Run policy iteration and return the resulting stable policy.

        Parameters
        ----------
        initialPolicy : 2-D array-like, optional
            Starting policy of action indices with shape
            ``(worldSize, worldSize)``. It is copied, not modified. When
            omitted, a random policy is drawn with ``np.random.choice``.

        Returns
        -------
        numpy.ndarray
            Grid of action indices (stored as floats) that is greedy with
            respect to its own value function. The policy is also printed.

        Raises
        ------
        ValueError
            If ``initialPolicy`` does not have shape ``(worldSize, worldSize)``.
        """
        gridShape = (self.worldSize, self.worldSize)

        if initialPolicy is None:
            policy = np.zeros(gridShape)
            actionIndex = list(range(len(self.actionList)))
            for row in range(self.worldSize):
                for column in range(self.worldSize):
                    policy[row, column] = np.random.choice(actionIndex)
        else:
            policy = np.array(initialPolicy, dtype=float)
            if policy.shape != gridShape:
                raise ValueError("initialPolicy must have shape %s" % (gridShape,))

        print("INTIAL POLICY:\n", policy)
        print("*************************")

        while True:
            V = self.policyEvaluation(policy)
            policyStable = True

            for row in range(self.worldSize):
                for column in range(self.worldSize):
                    chosenAction = policy[row, column]
                    actionTaken = self.oneStepLookAheaFunction(row, column, V)
                    policy[row, column] = actionTaken
                    if chosenAction != actionTaken:
                        policyStable = False

            if policyStable:
                print("***********FINAL POLICY***********")
                print(policy)
                return policy

    def valueIterations(self):
        """Run in-place value iteration, then extract and return the greedy policy.

        The converged state values are left in ``self.newStateValues``. The
        policy grid is printed once before extraction (all zeros) and once
        after.

        Returns
        -------
        numpy.ndarray
            Grid of greedy action indices (stored as floats).
        """
        self.newStateValues = np.zeros((self.worldSize, self.worldSize))

        while True:
            delta = 0.0
            for row in range(self.worldSize):
                for column in range(self.worldSize):
                    previous = self.newStateValues[row, column]
                    best = self._actionValues(row, column, self.newStateValues).max()
                    self.newStateValues[row, column] = best
                    delta = max(delta, np.abs(previous - best))

            if delta < self.theta:
                break

        policy = np.zeros((self.worldSize, self.worldSize))

        print("INTIAL POLICY:\n", policy)
        print("*************************")

        for row in range(self.worldSize):
            for column in range(self.worldSize):
                policy[row, column] = self.oneStepLookAheaFunction(row, column, self.newStateValues)

        print(policy)
        return policy
		



# Question 6 (a)

In [0]:
# Rebind gridWorldObject to a fresh instance; with the cells run in order this is
# the second GridWorldClass definition (worldSize = 4), matching the 4x4 outputs below.
gridWorldObject = GridWorldClass()


In [7]:
# Policy iteration from a randomly drawn initial policy (no seed is set, so the
# printed initial policy differs between runs).
gridWorldObject.policyImprovement()

INTIAL POLICY:
 [[3. 2. 3. 3.]
 [2. 2. 3. 2.]
 [0. 1. 2. 3.]
 [3. 0. 0. 0.]]
*************************
***********FINAL POLICY***********
[[0. 0. 0. 0.]
 [1. 0. 0. 3.]
 [1. 0. 2. 3.]
 [1. 2. 2. 0.]]


# Question 6 (b)

In [9]:
# Value iteration on the same instance; the first grid printed is the all-zero
# placeholder policy, the second is the greedy policy extracted from the converged values.
gridWorldObject.valueIterations()


INTIAL POLICY:
 [[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
*************************
[[0. 0. 0. 0.]
 [1. 0. 0. 3.]
 [1. 0. 2. 3.]
 [1. 2. 2. 0.]]
