# Monte Carlo code for Policy II

In [1]:
import numpy as np
import random
from random import sample
import math 
from collections import defaultdict
import itertools 

In [2]:
# Per-component model parameters. Position k in every tuple belongs to
# component k, in the order used by the sections of this notebook:
# 0 tire, 1 transmission, 2 wheel rim, 3 coupling, 4 motor, 5 brake,
# 6 steering wheel, 7 shifting gears.

# Passed as the first (scale, "alpha") argument of random.weibullvariate.
weibull_scale = (
    2365.08,  # tire
    996.88,   # transmission
    713.55,   # wheel rim
    1406.84,  # coupling
    343.76,   # motor
    3933.12,  # brake
    828.19,   # steering wheel
    2040.95,  # shifting gears
)

# Passed as the second (shape, "beta") argument of random.weibullvariate.
weibull_shape = (
    414.16,  # tire
    109.25,  # transmission
    79.81,   # wheel rim
    115.21,  # coupling
    169.81,  # motor
    143.60,  # brake
    43.83,   # steering wheel
    296.48,  # shifting gears
)

# Enters the failure cost in env() as ceil(tf / time_interval).
# Presumably the repair time after a failure -- TODO confirm units.
tf = (2, 6.5, 2.5, 6, 5, 3.5, 3, 3.5)

# Enters every replacement cost in env() as time_interval / tp.
# Presumably the preventive-replacement time -- TODO confirm units.
tp = (0.4, 5.42, 0.625, 0.857, 1.25, 0.7, 0.429, 0.875)

# Length of one decision step and total simulated horizon of an episode
# (same time unit as the component ages).
time_interval = 5
running_time = 100000

In [4]:
def reset():
    """Return the initial state: a tuple of 16 zeros.

    env() and choose_action() read slot ``i`` as the age counter of
    component ``i`` and slot ``i + 8`` as its failure flag.
    """
    return (0,) * 16

In [6]:
def env(action,st,i): 
    """Advance component ``i`` by one decision step and return the outcome.

    Parameters
    ----------
    action : int
        0 = keep running, 1 = replace the component.
    st : sequence of 16 numbers
        Current state. ``st[i]`` is the age counter of component ``i`` and
        ``st[i + 8]`` its failure flag (1 = failed). The argument is copied,
        so the caller's object is never modified and tuples are accepted.
    i : int
        Component index (0..7).

    Returns
    -------
    (tuple, number)
        The next state as a tuple and the reward of the step.

    Raises
    ------
    ValueError
        If ``action`` is neither 0 nor 1.

    Notes
    -----
    * Keep running (0): a Weibull lifetime is drawn; if it does not exceed
      the current age the component fails (flag set, failure cost charged),
      otherwise the age grows by ``time_interval`` and the reward is
      ``time_interval``.
    * Replace (1): age and flag are reset to 0. The cost is the failure cost
      when the component was flagged as failed, otherwise
      ``(time_interval / tp[i]) * tp[i]``.
    * NOTE(review): a fresh lifetime is sampled on every step rather than
      once per installed component -- confirm this is the intended model.
    """
    if action != 0 and action != 1:
        raise ValueError("action must be 0 (run) or 1 (replace), got %r" % (action,))

    # Work on a private copy so callers keep their own state untouched.
    st = list(st)

    # Drawn for both actions so the random stream matches per-step usage
    # regardless of the branch taken.
    f = random.weibullvariate(weibull_scale[i],weibull_shape[i])

    # Cost charged whenever a failure is incurred or repaired.
    failure_cost = (time_interval / tp[i])*time_interval * math.ceil(tf[i]/time_interval)

    if action == 0:
        if f <= st[i]:
            # Sampled lifetime already exceeded: the component fails now.
            st[i+8] = 1
            reward = -failure_cost
        else:
            # Survived one more step of length time_interval.
            st[i] += time_interval
            reward = time_interval
    else:
        if st[i+8] == 0:
            # Preventive replacement of a working component.
            reward = -(time_interval / tp[i])*tp[i]
        else:
            # Corrective replacement of a failed component.
            reward = -failure_cost
        st[i] = 0
        st[i+8] = 0

    return (tuple(st) , reward)

In [7]:
# Smoke check of env(): replace (action 1) component 2 in a hand-made state.
# Slot 2 + 8 = 10 holds 1, i.e. the component is flagged as failed, so this
# exercises the corrective-replacement branch.
env(1, [1, 1, 1, 1, 1, 1, 11, 1, 1, 11, 1, 1, 0, 1, 1, 1], 2)

((1, 1, 0, 1, 1, 1, 11, 1, 1, 11, 0, 1, 0, 1, 1, 1), -40.0)

# pi function 

In [8]:
# Action encoding used throughout the notebook: 0 = keep running, 1 = replace.
ALL_POSSIBLE_ACTIONS = [0, 1]


def policy_using_pi(St, pi):
    """Sample an action for state ``St`` from the stochastic policy ``pi``.

    ``pi`` maps ``(state, action)`` to the probability of taking ``action``
    in ``state``; the probabilities for ``St`` are looked up for every entry
    of ALL_POSSIBLE_ACTIONS and handed to ``np.random.choice``.
    """
    action_probs = []
    for candidate in ALL_POSSIBLE_ACTIONS:
        action_probs.append(pi[(St, candidate)])
    return np.random.choice(ALL_POSSIBLE_ACTIONS, p=action_probs)

In [9]:
def choose_action(state , pi,i):
    """Pick the action for component ``i`` in the full 16-slot ``state``.

    A component whose failure flag (``state[i + 8]``) is 1 is always
    replaced (action 1). Otherwise the action is sampled from ``pi`` using
    the reduced state ``(state[i], state[i + 8])``.
    """
    failed_flag = state[i+8]
    if failed_flag != 1:
        reduced_state = (state[i], failed_flag)
        return policy_using_pi(reduced_state, pi)  # epsilon-soft sampling
    return 1

# Episode functions

In [10]:
# Discount factor applied when accumulating returns.
GAMMA = 0.50
def play_episode(pi,i):
    """Simulate one fixed-horizon episode for component ``i`` under ``pi``.

    The episode starts from ``reset()`` and performs
    ``running_time // time_interval + 1`` environment steps, choosing every
    action with ``choose_action``.

    Parameters
    ----------
    pi : mapping
        Policy table, ``pi[(reduced_state, action)] -> probability``.
    i : int
        Component index.

    Returns
    -------
    list of (state, action, G)
        One entry per decision, in chronological order (earliest first).
        ``G`` is the discounted return (discount ``GAMMA``) that followed
        taking ``action`` in ``state``. The terminal state is not included
        because no action is taken there.
    """
    # Sequence layout: S0, A0, R1, S1, A1, R2, ...
    s = reset()
    a = choose_action(s, pi ,i)

    # r(t) results from taking action a(t-1) in s(t-1) and landing in s(t);
    # the start state therefore carries a dummy reward of 0.
    states_actions_rewards = [(s, a, 0)]

    last_step = running_time//time_interval
    for j in range(last_step + 1):
        s , r = env(a,list(s),i)
        if j == last_step:
            # Terminal state: no further action is chosen.
            states_actions_rewards.append((s, None, r))
        else:
            a = choose_action(s, pi ,i)
            states_actions_rewards.append((s, a, r))

    # Compute the returns by working backwards from the terminal state,
    # whose value is 0 by definition. The terminal entry itself is skipped,
    # and the G left over after the loop is discarded because it does not
    # correspond to any move.
    G = 0
    states_actions_returns = []
    first = True
    for s, a, r in reversed(states_actions_rewards):
        if first:
            first = False
        else:
            states_actions_returns.append((s, a, G))
        G = r + GAMMA*G

    # The list was built back-to-front; put it in the order the states were
    # visited so that a caller's "first-visit" filter really keeps the first
    # (not the last) occurrence of each state-action pair.
    states_actions_returns.reverse()
    return states_actions_returns


# Run all episodes

# for tire

In [31]:
# Epsilon-soft on-policy Monte Carlo control for the tire (component 0).
i = 0

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share

        

In [32]:
# Greedy roll-out of the learned Q-table for the tire (component 0),
# collecting the ages at which the policy itself decides to replace.
i=0
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([2350])

# for transmission

In [14]:
# Epsilon-soft on-policy Monte Carlo control for the transmission (component 1).
i = 1

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [30]:
# Greedy roll-out of the learned Q-table for the transmission (component 1),
# collecting the ages at which the policy itself decides to replace.
i=1
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([985])

# for wheel rim

In [33]:
# Epsilon-soft on-policy Monte Carlo control for the wheel rim (component 2).
i = 2

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [34]:
# Greedy roll-out of the learned Q-table for the wheel rim (component 2),
# collecting the ages at which the policy itself decides to replace.
i=2
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([695])

# for coupling

In [36]:
# Epsilon-soft on-policy Monte Carlo control for the coupling (component 3).
i = 3

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [38]:
# Greedy roll-out of the learned Q-table for the coupling (component 3),
# collecting the ages at which the policy itself decides to replace.
i=3
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([1375])

# for motor

In [39]:
# Epsilon-soft on-policy Monte Carlo control for the motor (component 4).
i = 4

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [40]:
# Greedy roll-out of the learned Q-table for the motor (component 4),
# collecting the ages at which the policy itself decides to replace.
i=4
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([345])

# for brake

In [41]:
# Epsilon-soft on-policy Monte Carlo control for the brake (component 5).
i = 5

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [42]:
# Greedy roll-out of the learned Q-table for the brake (component 5),
# collecting the ages at which the policy itself decides to replace.
i=5
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([3860])

# for steering wheel

In [43]:
# Epsilon-soft on-policy Monte Carlo control for the steering wheel (component 6).
i = 6

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [47]:
# Greedy roll-out of the learned Q-table for the steering wheel (component 6),
# collecting the ages at which the policy itself decides to replace.
i=6
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([780])

# for shifting gears

In [48]:
# Epsilon-soft on-policy Monte Carlo control for the shifting gears (component 7).
i = 7

# Policy table (uniform for any state not updated yet) and running
# statistics. The arrays are indexed [state[i], state[i+8], action].
pi = defaultdict(lambda: 1/len(ALL_POSSIBLE_ACTIONS))
Q = np.zeros((100000, 2, 2))
returns = np.zeros((100000, 2, 2))
N = np.zeros((100000, 2, 2))

for epi in range(1001):
    # Exploration rate shrinks as 1, 1/2, 1/3, ...
    epsilon = 1/(epi+1)
    explore_share = epsilon / len(ALL_POSSIBLE_ACTIONS)

    # Generate one episode with the current policy.
    states_actions_returns = play_episode(pi,i)

    visited = set()
    for state, action, G in states_actions_returns:
        key = (state[i], state[i+8])
        if (key, action) in visited:
            # Use each (state, action) pair once per episode: the first
            # occurrence in the sequence returned by play_episode.
            continue
        visited.add((key, action))

        # Running average of the sampled returns.
        returns[key][action] += G
        N[key][action] += 1
        Q[key][action] = returns[key][action] / N[key][action]

        # Epsilon-greedy improvement of the policy in this state.
        best = np.argmax(Q[key])
        for candidate in ALL_POSSIBLE_ACTIONS:
            if candidate == best:
                pi[(key, candidate)] = 1 - epsilon + explore_share
            else:
                pi[(key, candidate)] = explore_share


In [50]:
# Greedy roll-out of the learned Q-table for the shifting gears (component 7),
# collecting the ages at which the policy itself decides to replace.
i=7
time_replace = []
current_state = reset()
for _ in range(50000):
    age, failed = current_state[i], current_state[i+8]
    if failed == 1:
        # Same rule as choose_action: a failed component is always replaced.
        # Q[age, 1, 0] is never updated during training, so a plain argmax
        # could prefer "keep running" and leave the part failed forever.
        # These forced replacements are not recorded in time_replace.
        action = 1
    else:
        action = np.argmax(Q[(age, failed)])
        if action == 1:
            time_replace.append(age)
    current_state, _reward = env(action, list(current_state), i)
np.unique(time_replace)

array([2020])