In [None]:
import numpy as np
import math
import functools

In [None]:
def state_reward(state, items):
    """Return the total value carried by `state`.

    `state` holds one count per item and column 0 of `items` holds the
    per-item value, so the result is the count-weighted sum of that column.

    The reduction is done with `np.dot` instead of the builtin `sum`, which
    would walk the array element by element in Python.
    """
    return np.dot(state, items[:, 0])

def state_weight(state, items):
    """Return the total weight carried by `state`.

    `state` holds one count per item and column 1 of `items` holds the
    per-item weight, so the result is the count-weighted sum of that column.

    The reduction is done with `np.dot` instead of the builtin `sum`, which
    would walk the array element by element in Python.
    """
    return np.dot(state, items[:, 1])

def action_weight(action, items):
    """Return the weight change that applying `action` would cause.

    `action[0]` selects the item row and `action[1]` is the signed number of
    copies; the result is that item's weight (column 1) times the count, so
    a negative count yields a negative weight.
    """
    item_index = action[0]
    unit_weight = items[item_index, 1]
    return unit_weight * action[1]

def available_actions(state, items, actions, max_weight):
    """Return the subset of `actions` that may be applied to `state`.

    An action is kept when the affected item count stays non-negative and
    the action's weight does not exceed the remaining capacity. The order of
    `actions` is preserved in the result.
    """
    # Remaining capacity, never below zero even if the state is overweight.
    capacity = max(0, max_weight - state_weight(state, items))

    allowed = []
    for candidate in actions:
        if state[candidate[0]] + candidate[1] >= 0:
            if action_weight(candidate, items) <= capacity:
                allowed.append(candidate)

    return allowed

In [None]:
class EarlyStopping:
    """Report when a monitored value has stopped improving.

    The best value seen so far starts at 0, so only values strictly above 0
    can count as the first improvement.
    """

    def __init__(self, patience = 0):
        # Best value observed so far.
        self.value = 0
        # Consecutive `validate` calls since the last improvement.
        self.step = 0
        # Number of non-improving calls tolerated before stopping.
        self.patience = patience

    def validate(self, value):
        """Record `value` and return True once patience is exhausted.

        A value strictly greater than the best so far resets the counter and
        becomes the new best; any other value leaves the counter incremented.
        """
        self.step += 1

        improved = value > self.value
        if improved:
            self.step, self.value = 0, value

        return bool(self.step > self.patience)

# Actor-Critic

In [None]:
def actor_critic(items, epsilon = 0.3, max_weight = 65, penalty = -1000):
    """Build a tabular actor-critic style learner for a knapsack instance.

    A state is a vector with one count per item. An action `(n, t)` adds
    (`t = 1`) or removes (`t = -1`) one copy of item `n`. The reward of a
    state is its total value (`state_reward`), or `penalty` when its total
    weight (`state_weight`) exceeds `max_weight`.

    Parameters:
        items: 2-D array with one row per item; column 0 is read as the
            value and column 1 as the weight.
        epsilon: probability of choosing a random legal action while learning.
        max_weight: capacity of the knapsack.
        penalty: reward given to any state heavier than `max_weight`.

    Returns:
        The `learn` function. Calling it trains the tables and returns an
        `optimize(episodes = 20)` callable that yields
        `(best_state, best_reward)`.
    """
    actions = [(n, t) for n in range(len(items)) for t in [-1, 1]]

    init_state = lambda: np.zeros(len(items))

    def calc_reward(state):
        """Total value of `state`, or `penalty` if it is over capacity."""
        w = state_weight(state, items)
        return state_reward(state, items) if w <= max_weight else penalty

    def step(state, action):
        """Apply `action` to a copy of `state`; return (next_state, reward)."""
        next_state = state.copy()

        next_state[action[0]] += action[1]

        return next_state, calc_reward(next_state)

    def policy(Q, state):
        """Pick an action epsilon-greedily among the legal ones.

        Returns None when no action is legal (for example an empty knapsack
        in which no item fits), instead of failing inside
        `np.random.randint(0)`.
        """
        acts = available_actions(state, items, actions, max_weight)

        if not acts:
            return None

        st = tuple(state)

        # Draw first so the exploration coin is flipped on every call.
        explore = np.random.random() < epsilon

        # Legal actions that already have a learned preference in this state,
        # in table order so ties resolve to the earliest recorded action.
        known = [a for a in Q.get(st, ()) if a in acts]

        if explore or not known:
            return acts[np.random.randint(len(acts))]

        return max(known, key = Q[st].get)

    def optimize(Q, episodes = 20):
        """Follow the learned preferences greedily from the empty state.

        `episodes` is the maximum number of greedy steps. The walk stops
        early when it reaches a state with no learned legal action. Only
        legal actions are considered, so the walk never removes an absent
        item or overfills the knapsack.

        Returns the best state visited and its reward.
        """
        state = init_state()
        best_state, best_reward = state, calc_reward(state)

        for _ in range(episodes):
            st = tuple(state)

            if st not in Q:
                break

            legal = available_actions(state, items, actions, max_weight)
            known = [a for a in Q[st] if a in legal]

            if not known:
                break

            act = max(known, key = Q[st].get)

            state, reward = step(state, act)

            if reward > best_reward:
                best_state, best_reward = state.copy(), reward

        return best_state, best_reward

    def learn(episodes = 1000, max_loop = 100, patience = 10, gamma = 0.9, learning_rate = 0.1):
        """Train the action-preference table Q and the state-value table V.

        Parameters:
            episodes: number of training episodes, each starting from the
                empty state.
            max_loop: maximum number of steps per episode.
            patience: steps without a new best reward tolerated before the
                episode is cut short (see `EarlyStopping`).
            gamma: discount applied to the next state's value.
            learning_rate: step size for both table updates.

        Returns:
            `optimize` with the trained Q table already bound.
        """
        Q = {}
        V = {}

        for _ in range(episodes):
            state = init_state()

            stopper = EarlyStopping(patience)

            for _ in range(max_loop):
                act = policy(Q, state)

                if act is None:
                    # No legal move from this state; nothing more to learn here.
                    break

                next_state, reward = step(state, act)

                st = tuple(state)
                n_st = tuple(next_state)

                # Seed only the (state, action) pair that was actually taken.
                # Registering `act` under the next state as well would create
                # preferences for actions never tried (or not even legal) there.
                Q.setdefault(st, {}).setdefault(act, 0)
                V.setdefault(st, 0)
                V.setdefault(n_st, 0)

                # Temporal-difference error of the critic.
                td = reward + (gamma * V[n_st]) - V[st]

                Q[st][act] += learning_rate * td
                V[st] += learning_rate * td

                state = next_state

                if stopper.validate(reward):
                    break

        return functools.partial(optimize, Q)

    return learn

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
from collections import Counter

def test(func, items, n = 100, episodes = 1000, max_weight = 65):
    """Train with `func` `n` times and plot how often each best reward occurs.

    On every run a new learner is built with `func(items, max_weight = ...)`,
    trained for `episodes` episodes, and asked for its best result. The run
    index is printed as progress, then the sorted (reward, count) table is
    printed and drawn as a bar chart.
    """
    values = []

    for run in range(n):
        print(run)

        optimizer = func(items, max_weight = max_weight)(episodes = episodes)
        _, best_value = optimizer()

        values.append(best_value)

    # Rows of (reward, number of runs that reached it), ordered by reward.
    histogram = sorted(Counter(values).items())
    ds = np.array(histogram)

    print(ds)

    rewards, counts = ds[:, 0], ds[:, 1]
    plt.bar(rewards, counts)

In [None]:
# Knapsack items, one row each: column 0 is the value, column 1 the weight.
items1 = np.array(
    [
        (120, 10),
        (130, 12),
        (80, 7),
        (100, 9),
        (250, 21),
        (185, 16),
    ]
)

In [None]:
# Train the actor-critic learner 100 times on items1 (10000 episodes per run)
# and plot how often each best reward was reached.
test(actor_critic, items1, n = 100, episodes = 10000)