In [64]:
from amalearn.reward import GaussianReward, RewardBase
from amalearn.agent import AgentBase
from amalearn.environment import EnvironmentBase
import numpy as np
from math import *
import matplotlib.pyplot as plt
import scipy.stats as stats
import gym
from random import randrange

In [97]:
class Reward(RewardBase):
    def __init__(self, stay_prob, base, p, min_value, max_value):
        super(Reward, self).__init__()
        self.inc = p[0]
        self.dec = p[1]
        self.value = base
        self.min = min_value
        self.max = max_value
        self.stay_prob = stay_prob
        self.past_value = 0
    
    def get_reward(self):
        return self.value - self.self.value
    
    def update(self, base_change):
        self.past_value = self.value
        rand = random.uniform(0, 1)
        if self.value < self.min + 1:
            if rand > self.stay_prob:
                self.value = self.value + self.base_change
        elif self.value > self.max - 1:
            if rand  > self.stay_prob:
                self.value = self.value - self.base_change
        else:
            if rand < self.inc:
                self.value = self.value + self.base_change
            elif rand < self.inc + self.dec:
                self.value = self.value - self.base_change

class NArmedBanditEnvironment(EnvironmentBase):
    def __init__(self, action_count, state_count, id, initial_money, goal_money, base_change, base_value, change_prob, stay_prob, min_value, max_value, terminal, actions, container=None):
        state_space = gym.spaces.Discrete(state_count)
        action_space = gym.spaces.Discrete(action_count)
        super(NArmedBanditEnvironment, self).__init__(action_space, state_space, id, container)
        self.state_count = state_count
        self.action_count = action_count
        self.money = initial_money
        self.initial_money = initial_money
        self.goal_money = goal_money
        self.base_change = base_change
        self.stay_prob = stay_prob
        self.min_value = min_value
        self.max_value = max_value
        self.base_value = base_value
        self.change_prob = change_prob
        self.rewards = [Reward(b, stay_prob, cp, min_value, max_value) for (b, cp) in zip(base_value, change_prob)]
        self.actions = actions
        self.terminal = terminal
        self.state = 0
        self.p_table = []
    
    def calculate_reward(self, action):
        for r in self.rewards:
            r.update(self.base_change)
        change = 0
        if action != 0:
            for a in self.actions[action]:
                change = change + self.rewards[a].get_reward()
        return change

    def claculate_p_tabel(self, sp, r, s, a):
        if abs(sp - s) > 1:
            return 0
        if abs(r) != self.base_change and r != 0:
            return 0
        if sp == s:
            if r == 0:
                p = 1
                for ai in a:
                    p= p *(1-self.rewards[ai].inc - self.rewards[ai].dec)
                return p
            else:
                return 0
        elif sp < s:
            if r == -1*self.base_change:
                if len(a) == 0:
                    return 0
                p = 1
                for ai in a:
                    p = p *(self.rewards[ai].dec)
                return p
            else:
                return 0
        else:           
            if r == self.base_change:
                if len(a) == 0:
                    return 0
                p = 1
                for ai in a:
                    p = p *(self.rewards[ai].inc)
                return p
            else:
                return 0
    def init_p_table(self):
        for a in self.actions:
            temp = []
            for s in range(self.state_count):
                p0 = []
                p1 = []
                p2 = []
                for j in range(self.state_count):
                    p0.append(self.claculate_p_tabel(j, -5 , i, a))
                    p1.append(self.claculate_p_tabel(j, 0 , i, a))
                    p2.append(self.claculate_p_tabel(j, 5 , i, a))
                temp.append([p0,p1,p2])
            self.p_table.append(temp)
    
    def claculate_p(self, sp, r, s, a):
        action_index = self.action.index(a)
        return self.p_table[action_index][s][int((r+5)/5)][sp]
    
    def terminated(self):
        if self.state == self.terminal:
            return False
        return True

    def observe(self):
        return 

    def available_actions(self):
        return self.action_space.n

    def next_state(self, action):
        v = self.money + self.rewards[action].get_reward()
        if v < 0:
            self.state = 0
            return
        if v == self.goal_money:
            self.state = self.terminal
            return
        self.state = int(v/50) 
        return #(0-50)(50-100)...(950-1000)+Terminal

    def reset(self):
        self.rewards = [Reward(b, self.stay_prob, cp, self.min_value, self.max_value) for (b, cp) in zip(self.base_value, self.change_prob)]
        self.money = self.initial_money
        return

    def render(self, mode='human'):
        #print('{}:\taction={}'.format(self.state['length'], self.state['last_action']))
        return 

    def close(self):
        return
    
class Agent(AgentBase):
    def __init__(self, id, environment, discount, theta):
        #initialize a random policy and V(s) = 0 for each state
        self.V = np.zeros(environment.state_count)
        self.policy = [randrange(environment.action_count) for i in range(environment.state_count)]
        super(Agent, self).__init__(id, environment)
        self.discount = discount
        self.theta = theta
        self.environment.init_p_table()
    def policy_evaluation(self):
        delta = 0
        while True:
            for s in range(self.environment.state_count):
                vp = self.V[s]
                for sp in range(self.environment.state_count):
                    self.V[s] = claculate_p(sp, 0, s, self.environment.actions[self.policy[s]])*(0 + self.discount*self.V[sp]) + claculate_p(sp, self.environment.base_change, s, self.environment.actions[self.policy[s]])*(self.environment.base_change + self.discount*self.V[sp]) + claculate_p(sp, -1*self.environment.base_change, s, self.environment.actions[self.policy[s]])*(-1*self.environment.base_change + self.discount*self.V[sp])
                delta = max(delta, abs(vp - self.V[s]))
            #print(delta)
            if delta < self.theta:
                break
        return
    def policy_iteration(self):
        stable = True
        for s in range(self.environment.state_count):
            old_action = self.policy[s]
            self.policy[s] = np.argmax([np.sum([claculate_p(sp, 0, s, self.environment.actions[i])*(0 + self.discount*self.V[sp]) + claculate_p(sp, self.environment.base_change, s, self.environment.actions[i])*(self.environment.base_change + self.discount*self.V[sp])+ claculate_p(sp, -1*self.environment.base_change, s, self.environment.actions[i])*(-1*self.environment.base_change + self.discount*self.V[sp]) for sp in range(self.environment.state_count)]) for i in range(self.environment.action_count)])
            if old_action != self.policy[s]:
                stable = False
        return stable
    def take_action(self) -> (object, float, bool, object):
        index_selected_arm = 0
        obs, Ri, d, i = self.environment.step(index_selected_arm)
        self.environment.money = self.environment.money + Ri
        #self.environment.update_selected_arm(index_selected_arm, Ri)
        self.environment.render()
        return t

In [103]:
goal_money = 1000
initial_money = 20
base_change = 5
base_value = [10, 20, 10, 35] # 10+2, 15+5, 10+2, 25+9
change_prob = [[0.2, 0.4], [0.4, 0.3], [0.1, 0.1], [0.2, 0.7]]
company_index = ['A', 'B', 'C', 'D']
actions = [[], [0], [1], [2], [3], [0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
stay_prob = 0.25
min_value = 5
max_value = 100
env = NArmedBanditEnvironment(len(actions), int(goal_money/5), 1, initial_money, goal_money, base_change, base_value, change_prob, stay_prob, min_value, max_value, int(goal_money/5), actions)
agent = Agent('1', env, 0.9, 5)
print(np.shape(env.p_table))
it = 0
while True:
    print('in')
    agent.policy_evaluation()
    print('out')
    if agent.policy_iteration():
        break
    it = it + 1
    print(it)
#for step in range(10):
#    agent.take_action()

(11, 200, 3, 200)
in
out
1
in
out
2
in
out
3
in
out
4
in
out
5
in
out
6
in
out
7
in
out
8
in
out
9
in
out
10
in
out
11
in
out
12
in
out
13
in
out
14
in
out
15
in
out
16
in
out
17
in
out
18
in
out
19
in
out
20
in
out
21
in
out
22
in
out
23
in
out
24
in
out
25
in
out
26
in
out
27
in
out
28
in
out
29
in
out
30
in
out
31
in
out
32
in
out
33
in
out
34
in
out
35
in
out
36
in
out
37
in
out
38
in
out
39
in
out
40
in
out
41
in
out
42
in
out
43
in
out
44
in
out
45
in
out
46
in
out
47
in
out
48
in
out
49
in
out
50
in
out
51
in
out
52
in
out
53
in
out
54
in
out
55
in
out
56
in
out
57
in
out
58
in
out
59
in
out
60
in
out
61
in
out
62
in
out
63
in
out
64
in
out
65
in
out
66
in
out
67
in
out
68
in
out
69
in
out
70
in
out
71
in
out
72
in
out
73
in
out
74
in
out
75
in
out
76
in
out
77
in
out
78
in
out
79
in
out
80
in
out
81
in
out
82
in
out
83
in
out
84
in
out
85
in
out
86
in
out
87
in
out
88
in
out
89
in
out
90
in
out
91
in
out
92
in
out
93
in
out
94
in
out
95
in
out
96
in
out
97
in
out
98
in
out
99
i

KeyboardInterrupt: 

In [66]:
rewards = [[0.2, 0.4], [0.4, 0.3], [0.1, 0.1], [0.2, 0.7]]
def claculate_p(sp, r, s, a):
    if abs(sp - s) > 1:
        return 0
    if abs(r) != 5 and r != 0:
        return 0
    if sp == s:
        if r == 0:
            p = 1
            for ai in a:
                p= p *(1-rewards[ai][0] - rewards[ai][1])
            return p
        else:
            return 0
    elif sp < s:
        if r == -5:
            if len(a) == 0:
                return 0
            p = 1
            for ai in a:
                p = p *(rewards[ai][1])
            return p
        else:
            return 0
    else:           
        if r == 5:
            if len(a) == 0:
                return 0
            p = 1
            for ai in a:
                p = p *(rewards[ai][0])
            return p
        else:
            return 0
#0-5/ 5-10/ 10-15/ 15-20
actions = [[], [0], [1], [2], [3], [0,1], [0,2], [0,3], [1,2], [1,3], [2,3]]
state = [i for i in range(200)]
for i in state:
    p = []
    for j in state:
        p.append(claculate_p(j, 5 , i, actions[5]))
    print(p)

[0, 0.08000000000000002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0.08000000000000002, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,