In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class Bot(object):
    # our states can be either "ROCK, PAPER or SCISSORS"
    state_space = 3
    # three actions by our player
    action_space = 3
    q_table = np.random.uniform(low = -2, high = 5, size = (3, 3))
    total_reward, reward = 0,0
    avg_rewards_list = []
    avg_reward = 0
    result = 'DRAW'
    tags = ["ROCK", "PAPER", "SCISSORS"]
    # looses to map
    loses_to = {
       "0": 1, # rock loses to paper
       "1": 2,  # paper loses to scissor
       "2": 0  # scissor loses to rock
    }
    
    def __init__(self, alpha=0.5, gamma=0.2, epsilon=0.8, min_eps=0, episodes=50):
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.min_eps = min_eps
        self.episodes = episodes
        # Calculate episodic reduction in epsilon
        self.reduction = (epsilon - min_eps) / episodes
        
    # either explore or exploit, any which ways return the next action
    def bot_move(self, player_move):
        action = 0
        # Determine next action - epsilon greedy strategy
        if np.random.random() < 1 - self.epsilon:
            print("Exploiting....")
            action = np.argmax(self.q_table[player_move])
        else:
            print("Exploring.....")
            action = np.random.randint(0, self.action_space)
        # Decay epsilon
        if self.epsilon > self.min_eps:
            self.epsilon -= self.reduction
        print("choose ",self.tags[action])
        return action
    
    def get_reward(self, player, bot):
        reward = 0
        if self.get_result(player, bot) == 'WIN':
            reward = 5
        
        elif self.get_result(player, bot) == 'LOSE':
            reward = -2
            
        else:
            # Draw case
            reward = 4
            
        return reward
    
    # update q_table
    def update_experience(self, state, action, reward):
        delta = self.alpha * (reward + self.gamma * np.max(self.q_table[action]) - self.q_table[state,action])
        self.q_table[state,action] += delta
    
    def print_stats(self, player, bot, reward):
        print("Player move : {0}, bot: {1}, reward: {2}, result: {3}, total_reward: {4}".format(self.tags[player], self.tags[bot], reward, self.result, self.total_reward))
        print(self.q_table)
        pass
    
    # returns either a WIN, LOSE or a DRAW to indicate the same.
    def get_result(self, player_move, bot_move):
        if bot_move == player_move:
            self.result = 'DRAW'
        elif self.loses_to[str(bot_move)] == player_move :
            self.result = 'LOSE'
        else:
            self.result = 'WIN'
            
        return self.result    
            
    def get_avg_rewards(self):
        return self.avg_rewards_list
    
    def play(self, player_move):
        # add reward
        bot_move = self.bot_move(player_move)
        reward = self.get_reward(player_move, bot_move)
        self.total_reward += reward
        self.avg_rewards_list.append(reward)
        # update experience
        self.update_experience(player_move, bot_move, reward)
        self.print_stats(player_move, bot_move, reward)

class Game(object):

    def __init__(self, bot, episodes=200):
        self.bot = bot
        self.episodes = episodes
    
    def begin(self):
        player_moves = self.generate_fake_moves(True)
        for idx in range(0, self.episodes):
            self.bot.play(player_moves[idx])
        # Plot Rewards
#         plt.plot(np.arange(len(self.bot.get_avg_rewards())) + 1)
#         plt.savefig('rewards.png')     
#         plt.close()

    def generate_fake_moves(self, fake_moves):
        player_moves = []
        for i in range(0, self.episodes):
            if fake_moves:
                player_moves.append(np.random.randint(0, 3))
            else:
                self.read_player_move()
        return player_moves

    def read_player_move(self):
        return int(input("Enter your move (0- Rock, 1 - paper, 2- scissors): "))

game = Game(Bot())
game.begin()

Exploiting....
choose  ROCK
Player move : SCISSORS, bot: ROCK, reward: 5, result: WIN, total_reward: 5
[[ 4.78310516  0.27437983  2.79928639]
 [ 1.91141204  1.21894438 -1.42904574]
 [ 4.89040196  3.26894488 -1.50504946]]
Exploring.....
choose  ROCK
Player move : SCISSORS, bot: ROCK, reward: 5, result: WIN, total_reward: 10
[[ 4.78310516  0.27437983  2.79928639]
 [ 1.91141204  1.21894438 -1.42904574]
 [ 5.4235115   3.26894488 -1.50504946]]
Exploring.....
choose  SCISSORS
Player move : ROCK, bot: SCISSORS, reward: -2, result: LOSE, total_reward: 8
[[ 4.78310516  0.27437983  0.94199435]
 [ 1.91141204  1.21894438 -1.42904574]
 [ 5.4235115   3.26894488 -1.50504946]]
Exploring.....
choose  ROCK
Player move : ROCK, bot: ROCK, reward: 4, result: DRAW, total_reward: 12
[[ 4.8698631   0.27437983  0.94199435]
 [ 1.91141204  1.21894438 -1.42904574]
 [ 5.4235115   3.26894488 -1.50504946]]
Exploring.....
choose  ROCK
Player move : ROCK, bot: ROCK, reward: 4, result: DRAW, total_reward: 16
[[ 4.92191

[[ 5.          3.12008748 -0.40890928]
 [ 0.45102109  5.          2.35587189]
 [ 6.         -0.02724943  3.51540923]]
Exploiting....
choose  PAPER
Player move : PAPER, bot: PAPER, reward: 4, result: DRAW, total_reward: 825
[[ 5.          3.12008748 -0.40890928]
 [ 0.45102109  5.          2.35587189]
 [ 6.         -0.02724943  3.51540923]]
