In [2]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer

# Tic Tac Toe environment

Our first game is the famous Tic Tac Toe. You can read about the game and its rules here: https://en.wikipedia.org/wiki/Tic-tac-toe

We implemented the game as an environment in the style of games in the [Python GYM library](https://gym.openai.com/). The commented source code is available in the file "tic_env.py". Here, we give a brief introduction to the environment and how it can be used.

### Initialization and attributes

You can initialize the environment / game as following:

In [None]:
env = TictactoeEnv()

Which then has the following attributes with the corresponding initial values:

In [None]:
env.__dict__

The game is played by two players: player 'X' and player 'O'. The attribute 'current_player' shows whose turn it is. We assume that player 'X' always plays first.

The attribute 'grid' is a 3x3 numpy array that represents the board in the real game and the state $s_t$ in reinforcement learning terms. Each element can take a value in {0, 1, -1}:
     0 : place unmarked
     1 : place marked with X 
    -1 : place marked with O 
        
The attribute 'end' shows if the game is over or not, and the attribute 'winner' shows the winner of the game: either "X", "O", or None.  

You can use function 'render' to visualize the current position of the board:

In [None]:
env.render()

### Taking actions

The game environment receives actions from the two players in turn and updates the grid. At each time step, one player takes the action $a_t$, where $a_t$ can either be an integer between 0 and 8 or a tuple, corresponding to one of the 9 possible positions on the board.

Function 'step' is used to receive the action of the player and update the grid:

In [None]:
env.step(6)

In [None]:
env.render()

In [None]:
env.__dict__

In [None]:
env.step((1,1))

In [None]:
env.render()

In [None]:
env.__dict__

But not all actions are available at each time: One cannot choose a place which has been taken before. There is an error if an unavailable action is taken:

In [None]:
env.step((0,2))

### Reward

The reward is always 0 until the end of the game. When the game is over, the reward is 1 if you win the game, -1 if you lose, and 0 otherwise. Function 'observe' can be used after each step to receive the new state $s_t$, whether the game is over, and the winner, and function 'reward' can be used to get the reward value $r_t$:

In [None]:
env.observe()

In [None]:
env.reward(player='X')

In [None]:
env.reward(player='O')

An example of finishing the game:

In [None]:
env.step(0)
env.step(3)
env.step(1)

In [None]:
env.render()

In [None]:
env.observe()

In [None]:
env.reward(player='X')

In [None]:
env.reward(player='O')

# Optimal policy for Tic Tac Toe environment

Fortunately, we know the exact optimal policy for Tic Tac Toe. We have implemented an $\epsilon$-greedy version of the optimal policy which you can use for the project.

In [None]:
env.reset();

In [None]:
opt_player = OptimalPlayer(epsilon = 0., player = 'X')

In [None]:
opt_player.act(env.grid)

In [None]:
opt_player.player

### An example of optimal player playing against random player

In [None]:
# Demo: five games between a fully greedy optimal player (epsilon = 0) and a
# fully random one (epsilon = 1). Seats ('X' / 'O') are reshuffled before
# every game.
Turns = np.array(['X', 'O'])
for i in range(5):
    env.reset()
    grid, _, __ = env.observe()
    # Shuffle who plays 'X' and who plays 'O' for this game.
    Turns = Turns[np.random.permutation(2)]
    player_opt = OptimalPlayer(epsilon=0., player=Turns[0])
    player_rnd = OptimalPlayer(epsilon=1., player=Turns[1])
    for j in range(9):
        # Whoever owns the current turn picks the move.
        mover = player_opt if env.current_player == player_opt.player else player_rnd
        move = mover.act(grid)

        grid, end, winner = env.step(move, print_grid=False)

        if not end:
            continue

        # Game over: report the outcome and show the final board.
        print('-------------------------------------------')
        print(f'Game end, winner is player {winner}')
        print(f'Optimal player = {Turns[0]}')
        print(f'Random player = {Turns[1]}')
        env.render()
        env.reset()
        break


### An example of optimal player playing against optimal player

In [None]:
# Demo: five games between two fully greedy optimal players (epsilon = 0).
# Seats ('X' / 'O') are reshuffled before every game.
Turns = np.array(['X','O'])
for i in range(5):
    env.reset()
    grid, _, __ = env.observe()
    Turns = Turns[np.random.permutation(2)]
    player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[0])
    player_opt_2 = OptimalPlayer(epsilon=0., player=Turns[1])
    for j in range(9):
        # Compare against player_opt_1 (created just above for THIS game), not
        # against a player object left over from a previous cell: the seat of
        # such a leftover object need not match the current shuffle.
        if env.current_player == player_opt_1.player:
            move = player_opt_1.act(grid)
        else:
            move = player_opt_2.act(grid)

        grid, end, winner = env.step(move, print_grid=False)

        if end:
            print('-------------------------------------------')
            print('Game end, winner is player ' + str(winner))
            print('Optimal player 1 = ' +  Turns[0])
            print('Optimal player 2 = ' +  Turns[1])
            env.render()
            env.reset()
            break


# Q Learning implementation

### Q-Learning algorithm

In [None]:
import numpy as np
from tic_env import TictactoeEnv, OptimalPlayer
env = TictactoeEnv()

from collections import defaultdict
import random
class Q_Player:
    '''
    Description:
        An epsilon-greedy tabular Q-learning player for Tic-tac-toe.
        The player does not own the Q-table: the caller passes the Q-value
        dictionary and the state-ID dictionary to `act` on every move and can
        read them back with `return_dicts`. The learning update itself
        (learning rate, discount factor) is performed by the caller.

    Parameters:
        epsilon: float, in [0, 1]. Probability of playing a random available
            move instead of the move with the highest Q-value.
        player: string, "X" or "O"

    '''
    def __init__(self, epsilon=0.2, player='X'):
        '''
        Description:
            Initialize the parameters

        Parameters:
            epsilon: float, in [0, 1]. This is a value between 0-1 that indicates the
                probability of making a random action instead of the optimal action
                at any given time.
            player: string, "X" or "O"

        '''
        self.epsilon = epsilon
        self.player = player

    def set_player(self, player = 'X', j=-1):
        '''Set the player; if j is given (j != -1), even j means "X" and odd j means "O".'''
        self.player = player
        if j != -1:
            self.player = 'X' if j % 2 == 0 else 'O'

    def empty(self, grid):
        '''Return all empty positions of the grid as (row, column) tuples.'''
        return [pos for pos in (divmod(i, 3) for i in range(9)) if grid[pos] == 0]

    def eps_greedyMove(self, grid, assignment, qvals, val = None):
        """
        Return a move chosen according to an epsilon-greedy policy.

        Parameters:
            grid: 3x3 array, current board (0 marks an empty cell)
            assignment: dict mapping str(grid) to a state ID; looking up an
                unseen state must create its ID (e.g. a defaultdict)
            qvals: dict mapping a state ID to an array of 9 Q-values
            val: unused, kept only for backward compatibility

        Returns:
            Either an int in 0..8 (greedy move) or a (row, column) tuple
            (random move).

        Side effects:
            Stores `assignment` and `qvals` on the instance and overwrites the
            Q-values of occupied cells of this state with NaN, so that they
            are ignored by nanargmax / nanmax.
        """
        # keep references to the dictionaries so that return_dicts can hand them back
        self.qvals = qvals
        self.assignment = assignment
        grid_id = self.assignment[str(grid)]
        # mask of invalid (occupied) cells: start with everything invalid and
        # clear the flag for every empty cell
        invalid = np.full(9, True)
        for row, col in self.empty(grid):
            invalid[row * 3 + col] = False
        # set to NaN invalid positions
        self.qvals[grid_id][invalid] = np.nan
        # move with the highest Q-value among the valid ones
        best_move = int(np.nanargmax(self.qvals[grid_id]))
        if random.random() < self.epsilon:
            return self.randomMove(grid)
        return best_move

    def return_dicts(self):
        """Return the (assignment, qvals) dictionaries seen by the last call to eps_greedyMove / act."""
        return self.assignment, self.qvals

    def randomMove(self, grid):
        """Choose a random move, as a (row, column) tuple, from the available options."""
        avail = self.empty(grid)
        return avail[random.randint(0, len(avail)-1)]

    def act(self, grid, assignment, qvals, **kwargs):
        """
        Return the move in an epsilon-greedy manner (see eps_greedyMove).
        Extra keyword arguments are accepted and ignored.
        """
        return self.eps_greedyMove(grid, assignment, qvals)

In [3]:
def convert(move):
    """
    Convert a move in the tuple format to the int format.

    A tuple (or tuple subclass such as a namedtuple) is mapped to
    move[0]*3 + move[1] % 3; anything else is returned unchanged.
    """
    # isinstance (rather than an exact type comparison) so that tuple
    # subclasses are converted as well
    if isinstance(move, tuple):
        return move[0]*3 + move[1] % 3
    return move

## Q1 & Q2

In [None]:
# Q1 & Q2: train a tabular Q-learning player against an epsilon-greedy optimal
# player (epsilon = 0.5). The experiment is repeated 10 times, each time with a
# fresh Q-table; results[k] holds, for repetition k, the reward obtained by the
# Q-player in each training game.
results = []

for k in range(10): # we perform multiple iterations to have robust results
    Turns = np.array(['X','O'])
    n_games = 20000
    winners = [] # winner of every game of this repetition (not stored in results)
    count = [] # reward of the Q-player for every game of this repetition
    qvals = defaultdict(lambda: np.zeros(9)) # Store the Q-values: state ID -> 9 action values
    assignments = defaultdict(lambda: len(assignments)) # Assign a unique ID to each state (keyed by str(grid))
    alpha = 0.05 # learning rate
    gamma = 0.99 # discount factor
    max_eps = 0.8 # min and max epsilon for dynamic epsilon (only used by the Q2 formula below)
    min_eps = 0.1
    n_star = 20000 # Variable to be changed for Q2, in [1, 1000, 5000, 10000, 20000, 40000]
    for i in range(n_games):
        eps = 0.1 # fixed epsilon for Q1; use max(min_eps, max_eps*(1-((i+1)/n_star))) for Q2
        print(k,i)
        env.reset()
        grid, _, __ = env.observe()
        # Initialize the two players; the seats alternate with the parity of i
        player_opt_1 = OptimalPlayer(epsilon=0.5, player=Turns[i%2])
        player_q = Q_Player(epsilon=eps, player=Turns[1 - i%2])
        for j in range(9):
            # If current player is the optimal player
            if env.current_player == player_opt_1.player:
                move = player_opt_1.act(grid)
                grid, end, winner = env.step(move, print_grid=False)
            
            # If current player is the Q-player   
            else:
                move = player_q.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q.return_dicts()
                # Save Q-player's last move (converted to int) and last grid (state)
                last_move_q = convert(move)
                last_grid_q = assignments[str(grid)] # Assign/retrieve the current state's ID
                grid, end, winner = env.step(move, print_grid=False)
                
            # Q-learning update of Q(last state, last move) of the Q-player, using the
            # grid returned by the latest step as next state. It is applied when it is
            # the Q-player's turn again or when the game has ended; j != 0 presumably
            # skips the opening move of the opponent, before the Q-player has played.
            if (env.current_player == player_q.player and j!=0) or end:
                qvals[last_grid_q][last_move_q] += alpha*(env.reward(player=Turns[1 - i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q][last_move_q])
            
            # If the match has ended break the loop
            if end:
                # Append winner to winners list
                winners.append(winner)
                # Record the final reward from the Q-player's point of view
                count.append(env.reward(player=Turns[1 - i%2]))
                env.reset()

                break
    results.append(count)

## Q3

### Calculations of $M_{opt}$ and $M_{rand}$

In [None]:
# Q3: train a tabular Q-learning player with a decaying exploration rate against
# an epsilon-greedy optimal player (epsilon = 0.5). Every 250 training games the
# current Q-table is evaluated greedily over 500 test games. results[k] holds,
# for repetition k, one list of 500 winners per evaluation.
results = []

for k in range(10): # we perform multiple iterations to have robust results
    Turns = np.array(['X','O'])
    n_games = 20000
    winners = [] # one list of test-game winners per evaluation
    count = []
    qvals = defaultdict(lambda: np.zeros(9))  # Store the Q-values: state ID -> 9 action values
    assignments = defaultdict(lambda: len(assignments)) # Assign a unique ID to each state (keyed by str(grid))
    alpha = 0.05 # learning rate
    gamma = 0.99 # discount factor
    max_eps = 0.8 # min and max epsilon for dynamic epsilon
    min_eps = 0.1
    n_star = 30000 # Variable to be changed, in [1, 1000, 5000, 10000, 20000, 40000]
    for i in range(1, n_games+1):
        # TESTING PHASE
        if i%250==0:
            print("Testing...")
            current_testing = []
            for w in range(500):
                #Reset the environment
                env.reset()
                grid, _, __ = env.observe()
                #Initialize the two players: Q-player with eps = 0 and Opt with eps = 0/1 depending on
                #whether the current run is for Mopt or Mrand (as written, epsilon=1.)
                player_opt_1 = OptimalPlayer(epsilon=1., player=Turns[w%2])
                player_q = Q_Player(epsilon=0., player=Turns[1 - w%2])
                for j in range(9):
                    if env.current_player == player_opt_1.player:
                        move = player_opt_1.act(grid)
                        grid, end, winner = env.step(move, print_grid=False)

                    else:
                        # No Q-learning update is performed in the testing phase
                        move = player_q.act(grid, assignments, qvals)
                        grid, end, winner = env.step(move, print_grid=False)

                    # If the match has ended break the loop
                    if end:
                        current_testing.append(winner)
                        env.reset()
                        break
            winners.append(current_testing)
        # END OF TESTING               
        eps = max(min_eps, max_eps*(1-((i)/n_star))) # Dynamic epsilon update: linear decay in i, floored at min_eps
        print(k,i)
        env.reset()
        grid, _, __ = env.observe()
        # Training part; the seats alternate with the parity of i
        player_opt_1 = OptimalPlayer(epsilon=0.5, player=Turns[i%2])
        player_q = Q_Player(epsilon=eps, player=Turns[1 - i%2]) # Use the dynamic epsilon
        for j in range(9):
            # If current player is Opt
            if env.current_player == player_opt_1.player:
                move = player_opt_1.act(grid)
                grid, end, winner = env.step(move, print_grid=False)
               
            # If current player is Q-player
            else:
                move = player_q.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q.return_dicts()
                # Save Q-player's last move (converted to int) and last grid (state)
                last_move_q = convert(move)
                last_grid_q = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)

            # Q-values update of Q(last state, last move) of the Q-player, applied when it
            # is the Q-player's turn again or when the game has ended; j != 0 presumably
            # skips the opening move of the opponent, before the Q-player has played.
            if (env.current_player == player_q.player and j!=0) or end:
                qvals[last_grid_q][last_move_q] += alpha*(env.reward(player=Turns[1 - i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q][last_move_q])
            
            # If the match has ended break the loop
            if end:
                env.reset()
                break
    results.append(winners)

## Q4

### Calculations of $M_{rand}$ and $M_{opt}$

In [None]:
# Q4: train a tabular Q-learning player against an optimal player whose
# exploration rate is eps_opt. Every 250 training games the current Q-table is
# evaluated greedily over 500 test games. results[k] holds, for repetition k,
# one list of 500 winners per evaluation.
results = []

for k in range(10):
    # Usual values initialization
    Turns = np.array(['X','O'])
    n_games = 20000
    winners = [] # one list of test-game winners per evaluation
    count = []
    qvals = defaultdict(lambda: np.zeros(9))  # state ID -> 9 action values
    assignments = defaultdict(lambda: len(assignments)) # str(grid) -> unique state ID
    alpha = 0.05 # learning rate
    gamma = 0.99 # discount factor
    max_eps = 0.8
    min_eps = 0.1
    n_star = 1 # Optimal n* found in the previous question
    eps_opt = 0 # value to be changed for the optimal player, in [0, 0.1, 0.25, 0.5, 0.75, 0.9]
    for i in range(1, n_games+1):
        # TESTING PHASE
        if i%250==0:
            print("Testing...")
            current_testing = []
            for w in range(500):
                #Reset the environment
                env.reset()
                grid, _, __ = env.observe()
                #Initialize the two players with epsilon 0 for Q-player and 0/1 for Optimal player
                #(as written, epsilon=1)
                player_opt_1 = OptimalPlayer(epsilon=1, player=Turns[w%2])
                player_q = Q_Player(epsilon=0., player=Turns[1 - w%2])
                for j in range(9):
                    # If optimal player is current player
                    if env.current_player == player_opt_1.player:
                        move = player_opt_1.act(grid)
                        grid, end, winner = env.step(move, print_grid=False)

                    # If Q-player is current player (no Q-learning update while testing)
                    else:
                        move = player_q.act(grid, assignments, qvals)
                        grid, end, winner = env.step(move, print_grid=False)

                    # If the match has ended break the loop
                    if end:
                        # Append winner to winners list if the game is ended
                        current_testing.append(winner)
                        env.reset()
                        break
            winners.append(current_testing)
        # END OF TESTING               
        # eps for Q-player; with n_star = 1 and i >= 1 this evaluates to min_eps for every game
        eps = max(min_eps, max_eps*(1-((i)/n_star)))
        print(k,i)
        env.reset()
        grid, _, __ = env.observe()
        # Initialize the two players with the correct epsilon values; the seats alternate with the parity of i
        player_opt_1 = OptimalPlayer(epsilon=eps_opt, player=Turns[i%2])
        player_q = Q_Player(epsilon=eps, player=Turns[1 - i%2])
        for j in range(9):
            # If optimal player is current player
            if env.current_player == player_opt_1.player:
                move = player_opt_1.act(grid)
                grid, end, winner = env.step(move, print_grid=False)
                
            # If player-Q is current player   
            else:
                move = player_q.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q.return_dicts()
                # Save Q-player's last move (converted to int) and last grid (state)
                last_move_q = convert(move)
                last_grid_q = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)

            # Q-values update of Q(last state, last move) of the Q-player, applied when it
            # is the Q-player's turn again or when the game has ended; j != 0 presumably
            # skips the opening move of the opponent, before the Q-player has played.
            if (env.current_player == player_q.player and j!=0) or end:
                qvals[last_grid_q][last_move_q] += alpha*(env.reward(player=Turns[1 - i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q][last_move_q])
            # If the match has ended break the loop
            if end:
                env.reset()
                break
    results.append(winners)

## Q7

### Calculations of $M_{rand}$ and $M_{opt}$

In [None]:
# Q7: self-play. Two Q-players sharing the same Q-table (qvals / assignments)
# play against each other with a fixed exploration rate eps. Every 250 training
# games the current Q-table is evaluated greedily over 500 test games.
# results[k] holds, for repetition k, one list of 500 winners per evaluation.
results = []

for k in range(10):
    # Usual values initialization
    Turns = np.array(['X','O'])
    n_games = 20000
    winners = [] # one list of test-game winners per evaluation
    count = []
    qvals = defaultdict(lambda: np.zeros(9))  # state ID -> 9 action values
    assignments = defaultdict(lambda: len(assignments)) # str(grid) -> unique state ID
    alpha = 0.05 # learning rate
    gamma = 0.99 # discount factor
    eps = 0.05 # Value to be changed, in [0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9]
    for i in range(1, n_games+1):
        # TESTING PHASE
        if i%250==0:
            print("Testing...")
            current_testing = []
            for w in range(500):
                #Reset the environment
                env.reset()
                grid, _, __ = env.observe()
                #Initialize the two players with epsilon 0 for Q-player and 0/1 for Optimal player
                #(as written, epsilon=1)
                player_opt_1 = OptimalPlayer(epsilon=1, player=Turns[w%2])
                player_q = Q_Player(epsilon=0., player=Turns[1 - w%2])
                for j in range(9):
                    # If optimal player is current player
                    if env.current_player == player_opt_1.player:
                        move = player_opt_1.act(grid)
                        grid, end, winner = env.step(move, print_grid=False)

                    # If Q-player is current player (no Q-learning update while testing)
                    else:
                        move = player_q.act(grid, assignments, qvals)
                        grid, end, winner = env.step(move, print_grid=False)

                    if end:
                        current_testing.append(winner)
                        env.reset()
                        break
            winners.append(current_testing)
        # END OF TESTING               
        print(k,i)
        env.reset()
        grid, _, __ = env.observe()
        # Initialize two Q-players; the seats alternate with the parity of i
        player_q_1 = Q_Player(epsilon=eps, player=Turns[i%2])
        player_q_2 = Q_Player(epsilon=eps, player=Turns[1 - i%2])
        for j in range(9):
            # If current player is Q-player 1
            if env.current_player == player_q_1.player:
                move = player_q_1.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q_1.return_dicts()
                # Save Q-player 1's last move (converted to int) and last grid (state)
                last_move_q_1 = convert(move)
                last_grid_q_1 = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)
                
            # If current player is Q-player 2
            else:
                move = player_q_2.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q_2.return_dicts()
                # Save Q-player 2's last move (converted to int) and last grid (state)
                last_move_q_2 = convert(move)
                last_grid_q_2 = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)

            # Q-values update for both players' states and actions.
            # While the game is running, only the player whose turn it is again is
            # updated (the two turn conditions are mutually exclusive). When the game
            # ends, BOTH players must receive the terminal reward, so the two updates
            # are independent `if`s: with an `elif`, the `or end` of the first branch
            # would always win and player 2's last move would never be updated.
            if (env.current_player == player_q_1.player and j!=0) or end:
                qvals[last_grid_q_1][last_move_q_1] += alpha*(env.reward(player=Turns[i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q_1][last_move_q_1])
            if (env.current_player == player_q_2.player and j!=0) or end:
                qvals[last_grid_q_2][last_move_q_2] += alpha*(env.reward(player=Turns[1 - i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q_2][last_move_q_2])
            
            # If the match has ended break the loop
            if end:
                env.reset()
                break
    results.append(winners)

## Q8

### Calculations of $M_{rand}$ and $M_{opt}$

In [None]:
# Q8: self-play with a decaying exploration rate. Two Q-players sharing the
# same Q-table (qvals / assignments) play against each other. Every 250
# training games the current Q-table is evaluated greedily over 500 test games.
# results[k] holds, for repetition k, one list of 500 winners per evaluation.
results = []

for k in range(10):
    # Usual values initialization
    Turns = np.array(['X','O'])
    n_games = 10000
    winners = [] # one list of test-game winners per evaluation
    count = []
    qvals = defaultdict(lambda: np.zeros(9))  # state ID -> 9 action values
    assignments = defaultdict(lambda: len(assignments)) # str(grid) -> unique state ID
    alpha = 0.05 # learning rate
    gamma = 0.99 # discount factor
    max_eps = 0.8
    min_eps = 0.1
    n_star = 1 #Value to be changed, in [1 1000 5000 10000 20000 40000]
    for i in range(1, n_games+1):
        # TESTING PHASE
        if i%250==0:
            print("Testing...")
            current_testing = []
            for w in range(500):
                #Reset the environment
                env.reset()
                grid, _, __ = env.observe()
                #Initialize the two players with epsilon 0 for Q-player and 0/1 for Optimal player
                #(as written, epsilon=0.)
                player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[w%2])
                player_q = Q_Player(epsilon=0., player=Turns[1 - w%2])
                for j in range(9):
                    if env.current_player == player_opt_1.player:
                        move = player_opt_1.act(grid)
                        grid, end, winner = env.step(move, print_grid=False)

                    else:
                        # No Q-learning update is performed in the testing phase
                        move = player_q.act(grid, assignments, qvals)
                        grid, end, winner = env.step(move, print_grid=False)

                    if end:
                        current_testing.append(winner)
                        env.reset()
                        break
            winners.append(current_testing)
        # END OF TESTING               
        # dynamic epsilon update: linear decay in i, floored at min_eps
        # (with n_star = 1 and i >= 1 this evaluates to min_eps for every game)
        eps = max(min_eps, max_eps*(1-((i)/n_star)))
        print(k,i)
        env.reset()
        grid, _, __ = env.observe()
        # Initialize two Q-players; the seats alternate with the parity of i
        player_q_1 = Q_Player(epsilon=eps, player=Turns[i%2])
        player_q_2 = Q_Player(epsilon=eps, player=Turns[1 - i%2])
        for j in range(9):
            # If current player is Q-player 1
            if env.current_player == player_q_1.player:
                move = player_q_1.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q_1.return_dicts()
                # Save Q-player 1's last move (converted to int) and last grid (state)
                last_move_q_1 = convert(move)
                last_grid_q_1 = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)
             
            # If current player is Q-player 2
            else:
                move = player_q_2.act(grid, assignments, qvals)
                # set the updated dictionaries as current dictionaries
                assignments, qvals = player_q_2.return_dicts()
                # Save Q-player 2's last move (converted to int) and last grid (state)
                last_move_q_2 = convert(move)
                last_grid_q_2 = assignments[str(grid)]
                grid, end, winner = env.step(move, print_grid=False)

            # Q-values update for both players' states and actions.
            # While the game is running, only the player whose turn it is again is
            # updated (the two turn conditions are mutually exclusive). When the game
            # ends, BOTH players must receive the terminal reward, so the two updates
            # are independent `if`s: with an `elif`, the `or end` of the first branch
            # would always win and player 2's last move would never be updated.
            if (env.current_player == player_q_1.player and j!=0) or end:
                qvals[last_grid_q_1][last_move_q_1] += alpha*(env.reward(player=Turns[i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q_1][last_move_q_1])
            if (env.current_player == player_q_2.player and j!=0) or end:
                qvals[last_grid_q_2][last_move_q_2] += alpha*(env.reward(player=Turns[1 - i%2]) + gamma*np.nanmax(qvals[assignments[str(grid)]]) - qvals[last_grid_q_2][last_move_q_2])
            
            # If the match has ended break the loop
            if end:
                env.reset()
                break
    results.append(winners)

## Deep QL part

The following section has been written following PyTorch's tutorial on reinforcement learning (DQN).

Source: https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

In [4]:
import math
import random
from collections import namedtuple, deque
from itertools import count
from tic_env import *

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


env = TictactoeEnv()

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Define the replay buffer that will be used for learning.

In [5]:
# One step of experience: the state, the action taken in it, the state that
# followed and the reward received.
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])


class ReplayMemory:
    """Bounded buffer of transitions; once full, the oldest entries are dropped."""

    def __init__(self, capacity):
        # A deque with maxlen discards items from the opposite end when it overflows.
        self.memory = deque(maxlen=capacity)

    def push(self, *args):
        """Build a Transition from the given fields and store it."""
        entry = Transition(*args)
        self.memory.append(entry)

    def sample(self, batch_size):
        """Return `batch_size` stored transitions picked at random."""
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

Define the model, following the instructions given in the pdf.

In [6]:
class DQN(nn.Module):
    """Fully connected Q-network: 18 inputs -> 128 -> 128 -> 9 outputs."""

    def __init__(self):
        super().__init__()
        # Collapses every dimension after the batch one, e.g. (batch_size,2,3,3) -> (batch_size,18)
        self.flatten = nn.Flatten()
        # Two hidden layers followed by the output layer
        self.lin1 = nn.Linear(18, 128)
        self.lin2 = nn.Linear(128, 128)
        self.lin3 = nn.Linear(128, 9)

    def forward(self, x):
        """Move x to the module-level `device` and return the 9 output values per sample."""
        flat = self.flatten(x.to(device)).float()
        hidden = F.relu(self.lin1(flat))    # ReLU after the first hidden layer
        hidden = F.relu(self.lin2(hidden))  # ReLU after the second hidden layer
        # The output layer is linear: no activation is applied
        return self.lin3(hidden)

In [7]:
def select_action(state, eps=0.1, target=False):
    """
    Description:
        Pick a move with an epsilon-greedy policy: with probability eps a
        random action index, otherwise the action with the highest value
        predicted by a network.

    Parameters:
        state: Tensor representing the current grid
        eps: float, exploration probability
        target: boolean, set to False during train and to True during test;
            selects the module-level target_net instead of policy_net

    Output:
        Long tensor of shape (1, 1) holding the chosen action index
    """
    if random.random() > eps:
        # Greedy branch: query the requested network without tracking gradients
        net = target_net if target else policy_net
        with torch.no_grad():
            return net(state).max(1)[1].view(1, 1)
    # Exploration branch: uniformly random action index
    random_action = random.randrange(n_actions)
    return torch.tensor([[random_action]], device=device, dtype=torch.long)

Optimization

In [8]:
def optimize_model(policy_net, memory, optimizer, transition = None):
    """
    Description:
        Perform one optimization step of policy_net with the Huber loss between
        the predicted Q-values and the targets reward + GAMMA * max_a Q_target(next_state, a).
        The names target_net, GAMMA, BATCH_SIZE and device are read from module level.

    Parameters:
        policy_net: network being trained
        memory: replay buffer to sample a batch of BATCH_SIZE transitions from,
            or None to train on `transition` only
        optimizer: optimizer bound to policy_net's parameters
        transition: single Transition, used only when memory is None

    Output:
        The loss tensor, or None when the buffer holds fewer than BATCH_SIZE transitions
    """
    # case in which the memory is not used (i.e. only the last transition is considered)
    if memory is None :
        state = transition.state
        action = transition.action
        next_state = transition.next_state
        reward = transition.reward
        state_action_value = policy_net(state)[0][action].reshape(1)
        # a None next state marks the end of a match: its value is 0
        if next_state is None:
            next_state_value = 0.0
        else:
            next_state_value = target_net(next_state)[0].max(0)[0].detach()
        # compute expected value
        expected_state_action_value = (torch.Tensor([next_state_value]) * GAMMA) + reward

        # Huber loss to be optimized
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_value, expected_state_action_value)

        # Optimize the model
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        return loss

    # not enough experience collected yet: skip the update
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    
    # Go from batch-array of Transitions to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))
    
    # Compute a mask of non-final states (next_state is None for final ones)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)

    # Aggregate states, actions and reward from the same batch
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward).view(-1) #reshape the reward to be coherent with other values

    # Compute the predicted Q-values
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Predict with the target net the non-final states; final states keep value 0.
    # torch.cat raises on an empty list, so the prediction is skipped when every
    # sampled transition is final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    if non_final_mask.any():
        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None]).view(-1, 18)
        next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    
    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss

Define what is a state in the current setting.

In [9]:
def get_state(grid, q_player):
    """
    Description:
        Function that returns the board encoded as a two-channel tensor, seen
        from one player's point of view. Channel 0 marks the cells owned by
        the chosen player and channel 1 the cells owned by the opponent
        (1.0 where the mark is present, 0.0 elsewhere).
    
    Parameters:
        grid: 3x3 array-like (numpy array, nested list or tensor) holding
              1 for 'X', -1 for 'O' and 0 for an empty cell
        q_player: int, 0 for X's point of view; any other value (normally 1)
                  gives O's point of view
        
    Output:
        Tensor of size (1,2,3,3) in torch's default floating dtype
    """
    # Convert explicitly instead of relying on implicit Tensor * ndarray arithmetic,
    # which also lets plain lists and tensors be passed as the grid.
    board = torch.as_tensor(grid)
    float_dtype = torch.get_default_dtype()
    x_plane = (board == 1).to(float_dtype)
    o_plane = (board == -1).to(float_dtype)

    # The chosen player's marks always go first
    if q_player == 0:
        planes = (x_plane, o_plane)
    else:
        planes = (o_plane, x_plane)

    # (2,3,3) -> (1,2,3,3): add the batch dimension
    return torch.stack(planes).unsqueeze(0)


## Q11

In [None]:
# Q11: DQN with a replay buffer, trained against OptimalPlayer(epsilon=0.5).
# `results` collects the per-game final rewards and `losses` the per-game loss traces.
results = []
losses = []
for repetition in range(1):
    # optimize_model is called without these, so it reads BATCH_SIZE, GAMMA and target_net as globals
    BATCH_SIZE = 64 #batch size for the memory buffer
    GAMMA = 0.99
    eps = 0.1 # presumably the exploration rate picked up by select_action -- TODO confirm
    n_actions = 9

    #Initialize the policy and target networks
    policy_net = DQN().to(device)
    target_net = DQN().to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()
    
    #Initialize the optimizer and the memory buffer
    optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
    memory = ReplayMemory(10000)

    num_games = 20000
    Turns = np.array(['X','O'])
    rewards = []
    loss_run = [] # save the losses of each run of 20000 games
    for i in range(num_games):
        game_loss = [] # save the losses of the current game
        # Initialize the environment and state
        print("iteration", repetition,", game "+str(i))
        env.reset()
        grid, _, __ = env.observe()
        # Set the initial state from Q-player's point of view
        # (the Q-player is Turns[1 - i%2], so the two sides swap every game)
        state = get_state(grid, 1 - i%2)
        # Initialize the optimal player
        player_opt_1 = OptimalPlayer(epsilon=0.5, player=Turns[i%2])
        for t in range(9):
            # If optimal player is current player
            if env.current_player == player_opt_1.player:
                move = convert(player_opt_1.act(grid))
                grid, end, winner = env.step(move, print_grid=False)
            else:
                # Select an action for Q-player
                move = select_action(state)
                # save Q-player's last move
                last_move_q = move
                # Try to do the chosen move
                try:
                    # If valid, do the move and save the state and the reward
                    last_state_q = get_state(grid, 1 - i%2)
                    grid, end, winner = env.step(int(move), print_grid=False)
                    reward = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                except ValueError:
                    # If invalid move, set reward to -1, end the game and push to memory
                    reward = torch.Tensor([-1])
                    rewards.append(-1)
                    memory.push(last_state_q, last_move_q, None, reward)
                    break

            # When it's Q-player's turn and it's not the first move neither the end, push to memory buffer
            if not end:
                next_state = get_state(grid, 1-i%2)
                # t>0 ensures the Q-player has already played once in this game
                if env.current_player != player_opt_1.player and t>0:
                    memory.push(last_state_q, last_move_q, next_state, reward)

            # If game has ended, push to memory, set to None next state
            else:
                next_state = None
                reward = env.reward(player=Turns[1 - i%2])
                rewards.append(reward)
                memory.push(last_state_q, last_move_q, next_state, torch.Tensor([reward]))
                break

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            loss = optimize_model(policy_net, memory, optimizer)
            # optimize_model returns None while the buffer holds fewer than BATCH_SIZE transitions
            game_loss.append(None if loss is None else loss.item())
                
        loss_run.append(game_loss)
        # Update the target network, copying all weights and biases of policy_net
        if i % 500 == 0:
            target_net.load_state_dict(policy_net.state_dict())

    results.append(rewards)
    losses.append(loss_run)

## Q12

In [1]:
# Q12: same training as Q11 but without a replay buffer -- every update uses
# only the most recent transition, passed explicitly to optimize_model.
results = []
losses = []
for repetition in range(1):
    # Initialize the parameters and the neural networks
    BATCH_SIZE = 1 # In this case batch_size = 1
    GAMMA = 0.99
    eps = 0.1 # presumably the exploration rate picked up by select_action -- TODO confirm

    n_actions = 9

    policy_net = DQN().to(device)
    target_net = DQN().to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
    memory = None # Set memory to None (i. e. use only the last transition)

    num_games = 20000
    Turns = np.array(['X','O'])
    rewards = []
    loss_run = []
    for i in range(num_games):
        loss_game = []
        # Initialize the environment and state
        print("iteration", repetition,", game "+str(i))
        env.reset()
        grid, _, __ = env.observe()
        # Set the initial state from Q-player's point of view
        # (the Q-player is Turns[1 - i%2], so the two sides swap every game)
        state = get_state(grid, 1 - i%2)
        # Initialize the optimal player
        player_opt_1 = OptimalPlayer(epsilon=0.5, player=Turns[i%2])
        for t in range(9):
            # If optimal player is current player
            if env.current_player == player_opt_1.player:
                move = convert(player_opt_1.act(grid))
                grid, end, winner = env.step(move, print_grid=False)
            
            else:
                # Select an action for Q-player
                move = select_action(state)
                # Save Q-player's last move
                last_move_q = move
                try:
                    # If valid, do the move and save the state and the reward
                    last_state_q = get_state(grid, 1 - i%2)
                    grid, end, winner = env.step(int(move), print_grid=False)
                    reward = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                except ValueError:
                    # If invalid move, set reward to -1, end the game and optimize
                    reward = torch.Tensor([-1])
                    rewards.append(-1)
                    # Specify the transition that will be used in the optimization part, as now we use only the last one
                    transition = Transition(last_state_q, last_move_q, None, reward)
                    loss = optimize_model(policy_net, memory, optimizer, transition)
                    # Store a plain float, or None if no loss was returned
                    loss_game.append(None if loss is None else loss.item())
                    break

            # When it's Q-player's turn and it's not the first move neither the end, optimize using last transition
            if not end:
                next_state = get_state(grid, 1-i%2)
                # t>0 ensures the Q-player has already played once in this game
                if env.current_player != player_opt_1.player and t>0:
                    transition = Transition(last_state_q, last_move_q, next_state, reward)
                    loss = optimize_model(policy_net, memory, optimizer, transition)
                    loss_game.append(None if loss is None else loss.item())
            
            # If game has ended, optimize, set to None next state
            else:
                next_state = None
                reward = env.reward(player=Turns[1 - i%2])
                rewards.append(reward)
                transition = Transition(last_state_q, last_move_q, next_state, torch.Tensor([reward]))
                loss = optimize_model(policy_net, memory, optimizer, transition)
                loss_game.append(None if loss is None else loss.item())
                break

            state = next_state

        loss_run.append(loss_game)
        # Update the target network, copying all weights and biases in DQN
        if i % 500 == 0:
            target_net.load_state_dict(policy_net.state_dict())

    results.append(rewards)
    losses.append(loss_run)

NameError: name 'DQN' is not defined

## Q13

### Calculations of $M_{rand}$ and $M_{opt}$

In [None]:
# Q13: train against OptimalPlayer(epsilon=0.5) with a decaying exploration rate,
# once per candidate n*, and evaluate the agent every 250 training games.
for n in [1, 1000, 5000, 10000, 15000, 20000, 40000]:  # possible values of n* to try  
    # NOTE(review): `results` is re-created for every n and never written to disk here,
    # so only the data of the last n survives the loop -- save or plot per n if needed.
    results = []
    for repetition in range(1):
        #set the required parameters and the neural networks
        BATCH_SIZE = 64
        GAMMA = 0.99
        max_eps = 0.8
        min_eps = 0.1
        n_star = n
        TARGET_UPDATE = 500  # number of games between two target-network refreshes

        n_actions = 9

        policy_net = DQN().to(device)
        target_net = DQN().to(device)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        # Initialize the optimizer and the memory buffer
        optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
        memory = ReplayMemory(10000)

        num_games = 20000
        Turns = np.array(['X','O'])
        winners = []  # one list of 500 test-game winners per testing phase
        for i in range(1, num_games+1):
            # Testing phase
            if i%250==0:
                print("Testing...")
                current_testing = []
                for w in range(500):
                    #Reset the environment
                    env.reset()
                    grid, _, __ = env.observe()
                    #Initialize the optimal player (the Q-player takes the other mark)
                    player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[w%2]) #1 if rand 0 if opt
                    for t in range(9):
                        if env.current_player == player_opt_1.player:
                            move = convert(player_opt_1.act(grid))
                            grid, end, winner = env.step(move, print_grid=False)
                        else:
                            # Select an action with eps = 0 and target = True
                            # (the effect of both arguments is defined in select_action)
                            move = select_action(get_state(grid, 1-w%2), eps=0, target = True)
                            # Try the chosen move
                            try:
                                grid, end, winner = env.step(int(move), print_grid=False)
                            except ValueError:
                                # An invalid move counts as a win for the opponent
                                current_testing.append(Turns[w%2])
                                break

                        if end:
                            current_testing.append(winner)
                            break
                winners.append(current_testing)
            # Testing ended
            # Initialize the environment and state
            print("iteration", repetition,", game "+str(i))
            eps = max(min_eps, max_eps*(1-((i)/n_star))) # Dynamic epsilon update
            env.reset()
            grid, _, __ = env.observe()
            # Set state and optimal player
            state = get_state(grid, 1 - i%2)
            player_opt_1 = OptimalPlayer(epsilon=0.5, player=Turns[i%2])
            for t in range(9):
                if env.current_player == player_opt_1.player:
                    move = convert(player_opt_1.act(grid))
                    grid, end, winner = env.step(move, print_grid=False)
                else:
                    # Select and try an action
                    move = select_action(state, eps)
                    last_move_q = move
                    try:
                        last_state_q = get_state(grid, 1 - i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        reward = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                    except ValueError:
                        # Invalid move: reward -1, terminal transition, game over
                        reward = torch.Tensor([-1])
                        memory.push(last_state_q, last_move_q, None, reward)
                        break

                # When it's Q-player's turn and it's not the first move neither the end, push to memory buffer
                if not end:
                    next_state = get_state(grid, 1-i%2)
                    if env.current_player != player_opt_1.player and t>0:
                        memory.push(last_state_q, last_move_q, next_state, reward)

                # If game has ended, push to memory, set to None next state
                else:
                    next_state = None
                    reward = env.reward(player=Turns[1 - i%2])
                    memory.push(last_state_q, last_move_q, next_state, torch.Tensor([reward]))
                    break

                # Update current state
                state = next_state

                # Perform one step of the optimization (on the policy network)
                loss = optimize_model(policy_net, memory, optimizer)

            # Update the target network, copying all weights and biases from policy_net
            if i % TARGET_UPDATE == 0:
                print("loaded")
                target_net.load_state_dict(policy_net.state_dict())

        # Keep the evaluation outcomes of this repetition instead of dropping them
        results.append(winners)

## Q14

### Calculations of $M_{rand}$ and $M_{opt}$

In [12]:
# Q14: train against OptimalPlayer(epsilon) for several opponent epsilons and
# evaluate the agent every 250 training games.
for epsilon in [0, 0.1, 0.25, 0.5, 0.75, 0.9]: # Different values of epsilon to try
    # NOTE(review): `results` is re-created for every epsilon and never written to disk here,
    # so only the data of the last epsilon survives the loop -- save or plot per value if needed.
    results = []
    for repetition in range(1):
        BATCH_SIZE = 64
        GAMMA = 0.99
        max_eps = 0.8
        min_eps = 0.1
        n_star = 1 # Value found in the previous exercise
        TARGET_UPDATE = 500  # number of games between two target-network refreshes

        n_actions = 9

        policy_net = DQN().to(device)
        target_net = DQN().to(device)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
        memory = ReplayMemory(10000)

        num_games = 20000
        Turns = np.array(['X','O'])
        winners = []  # one list of 500 test-game winners per testing phase
        for i in range(1, num_games+1):
            # Testing phase
            if i%250==0:
                print("Testing...")
                current_testing = []
                for w in range(500):
                    #Reset the environment
                    env.reset()
                    grid, _, __ = env.observe()
                    # Set the optimal player (the Q-player takes the other mark)
                    player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[w%2]) #1 if rand 0 if opt
                    for t in range(9):
                        if env.current_player == player_opt_1.player:
                            move = convert(player_opt_1.act(grid))
                            grid, end, winner = env.step(move, print_grid=False)
                        else:
                            # Select an action with eps = 0 and target = True
                            # (the effect of both arguments is defined in select_action)
                            move = select_action(get_state(grid, 1-w%2), eps=0, target = True)
                            # Try the chosen action
                            try:
                                grid, end, winner = env.step(int(move), print_grid=False)
                            except ValueError:
                                # An invalid move counts as a win for the opponent
                                current_testing.append(Turns[w%2])
                                break

                        if end:
                            current_testing.append(winner)
                            break
                winners.append(current_testing)
            # testing ended
            # Initialize the environment and state
            print("iteration", repetition,", game "+str(i))
            # With n_star = 1 this evaluates to min_eps for every i >= 1
            eps = max(min_eps, max_eps*(1-((i)/n_star))) # varying epsilon
            env.reset()
            grid, _, __ = env.observe()
            # Set current state and optimal player 
            state = get_state(grid, 1 - i%2)
            player_opt_1 = OptimalPlayer(epsilon=epsilon, player=Turns[i%2])
            for t in range(9):
                if env.current_player == player_opt_1.player:
                    move = convert(player_opt_1.act(grid))
                    grid, end, winner = env.step(move, print_grid=False)
                else:
                    # Select and try an action
                    move = select_action(state, eps)
                    last_move_q = move
                    try:
                        last_state_q = get_state(grid, 1 - i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        reward = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                    except ValueError:
                        # Invalid move: reward -1, terminal transition, game over
                        reward = torch.Tensor([-1])
                        memory.push(last_state_q, last_move_q, None, reward)
                        break

                # When it's Q-player's turn and it's not the first move neither the end, push to memory buffer
                if not end:
                    next_state = get_state(grid, 1-i%2)
                    if env.current_player != player_opt_1.player and t>0:
                        memory.push(last_state_q, last_move_q, next_state, reward)

                # If game has ended, push to memory, set to None the next state
                else:
                    next_state = None
                    reward = env.reward(player=Turns[1 - i%2])
                    memory.push(last_state_q, last_move_q, next_state, torch.Tensor([reward]))
                    break

                state = next_state

                # Perform one step of the optimization (on the policy network)
                loss = optimize_model(policy_net, memory, optimizer)

            # Update the target network, copying all weights and biases from policy_net
            if i % TARGET_UPDATE == 0:
                print("loaded")
                target_net.load_state_dict(policy_net.state_dict())

        # Keep the evaluation outcomes of this repetition instead of dropping them
        results.append(winners)

iteration 0 , game 1
iteration 0 , game 2
iteration 0 , game 3
iteration 0 , game 4
iteration 0 , game 5
iteration 0 , game 6
iteration 0 , game 7
iteration 0 , game 8
iteration 0 , game 9
iteration 0 , game 10
iteration 0 , game 11
iteration 0 , game 12
iteration 0 , game 13
iteration 0 , game 14
iteration 0 , game 15
iteration 0 , game 16
iteration 0 , game 17
iteration 0 , game 18
iteration 0 , game 19
iteration 0 , game 20
iteration 0 , game 21
iteration 0 , game 22
iteration 0 , game 23
iteration 0 , game 24
iteration 0 , game 25
iteration 0 , game 26
iteration 0 , game 27
iteration 0 , game 28
iteration 0 , game 29
iteration 0 , game 30
iteration 0 , game 31
iteration 0 , game 32
iteration 0 , game 33
iteration 0 , game 34
iteration 0 , game 35
iteration 0 , game 36
iteration 0 , game 37
iteration 0 , game 38
iteration 0 , game 39
iteration 0 , game 40
iteration 0 , game 41
iteration 0 , game 42
iteration 0 , game 43
iteration 0 , game 44
iteration 0 , game 45
iteration 0 , game 

KeyboardInterrupt: 

## Q16

### Calculations of $M_{rand}$ and $M_{opt}$

In [None]:
# Q16: self-play. Both sides pick moves with select_action and feed the same
# replay buffer; the agent is evaluated every 250 training games.
for epsilon in [0, 0.1, 0.25, 0.5, 0.75, 0.9]:  # Different values of epsilon to try
    # NOTE(review): `results` is re-created for every epsilon and never written to disk here,
    # so only the data of the last epsilon survives the loop -- save or plot per value if needed.
    results = []
    for repetition in range(1):
        # Initialize parameters and the neural networks
        BATCH_SIZE = 64
        GAMMA = 0.99
        TARGET_UPDATE = 500  # number of games between two target-network refreshes

        n_actions = 9

        policy_net = DQN().to(device)
        target_net = DQN().to(device)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        # Initialize optimizer and memory buffer
        optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
        memory = ReplayMemory(10000)

        num_games = 20000
        Turns = np.array(['X','O'])
        winners = []  # one list of 500 test-game winners per testing phase
        for i in range(1, num_games+1):
            # Testing phase
            if i%250==0:
                print("Testing...")
                current_testing = []
                for w in range(500):
                    #Reset the environment
                    env.reset()
                    grid, _, __ = env.observe()
                    # Set the optimal player (the Q-player takes the other mark)
                    player_opt_1 = OptimalPlayer(epsilon=0., player=Turns[w%2]) #1 if rand 0 if opt
                    for t in range(9):
                        if env.current_player == player_opt_1.player:
                            move = convert(player_opt_1.act(grid))
                            grid, end, winner = env.step(move, print_grid=False)
                        else:
                            # Select and try an action
                            move = select_action(get_state(grid, 1-w%2), eps=0, target = True)
                            try:
                                grid, end, winner = env.step(int(move), print_grid=False)
                            except ValueError:
                                # An invalid move counts as a win for the opponent
                                current_testing.append(Turns[w%2])
                                break

                        if end:
                            current_testing.append(winner)
                            break
                winners.append(current_testing)
            # Testing ended
            # Initialize the environment and state
            print("iteration", repetition,", game "+str(i))
            env.reset()
            grid, _, __ = env.observe()
            # Set the states from both Q-players' perspectives
            # (player 1 is Turns[i%2], player 2 is Turns[1 - i%2])
            state_1 = get_state(grid, i%2)
            state_2 = get_state(grid, 1 - i%2)
            for t in range(9):
                # For each player, propose a move, try it, and save it and the state as last move and last state
                if env.current_player == Turns[i%2]:
                    move = select_action(state_1, epsilon)
                    last_move_1 = move
                    try:
                        # Save the state from current player's view
                        last_state_1 = get_state(grid, i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        # Save current player's reward
                        reward1 = torch.Tensor([env.reward(player=Turns[i%2])])
                    except ValueError:
                        # Set current player's reward to -1, push to memory and end the game
                        reward1 = torch.Tensor([-1])
                        memory.push(last_state_1, last_move_1, None, reward1)
                        break
                else:
                    move = select_action(state_2, epsilon)
                    last_move_2 = move
                    try:
                        # Save the state from current player's view
                        last_state_2 = get_state(grid, 1 - i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        # Save current player's reward
                        reward2 = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                    except ValueError:
                        # Set current player's reward to -1, push to memory and end the game
                        reward2 = torch.Tensor([-1])
                        memory.push(last_state_2, last_move_2, None, reward2)
                        break

                # If the game has not ended, push the transition of the player who is about to move
                if not end:
                    # After the opening move (t == 0) the player about to move has not played
                    # in this game yet; its last_state_*/last_move_* would be left over from
                    # the previous game (or undefined in the very first one), so skip the push.
                    if t > 0:
                        if env.current_player == Turns[i%2]:
                            next_state = get_state(grid, i%2)
                            memory.push(last_state_1, last_move_1, next_state, reward1)
                        elif env.current_player == Turns[1-i%2]:
                            next_state = get_state(grid, 1-i%2)
                            memory.push(last_state_2, last_move_2, next_state, reward2)

                # If the game has ended push both players' transitions with next state = None
                else:
                    next_state = None
                    memory.push(last_state_2, last_move_2, next_state, torch.Tensor([env.reward(player=Turns[1 - i%2])]))
                    memory.push(last_state_1, last_move_1, next_state, torch.Tensor([env.reward(player=Turns[i%2])]))
                    break

                # Update states
                state_1 = get_state(grid, i%2)
                state_2 = get_state(grid, 1 - i%2)

                # Perform one step of the optimization (on the policy network)
                loss = optimize_model(policy_net, memory, optimizer)

            # Update the target network, copying all weights and biases from policy_net
            if i % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

        # Keep the evaluation outcomes of this repetition instead of dropping them
        results.append(winners)

## Q17

In [None]:
# Q17: self-play with a decaying exploration rate, once per candidate n*.
# The agent is evaluated every 250 training games and the outcomes are saved per n.
for n in [1000, 5000, 10000, 15000, 20000, 40000]: # Different values of n* to try
    results = []
    for repetition in range(1):
        # Initialize parameters and the neural networks
        BATCH_SIZE = 64
        GAMMA = 0.99
        max_eps = 0.8
        min_eps = 0.1
        n_star = n
        TARGET_UPDATE = 500  # number of games between two target-network refreshes

        n_actions = 9

        policy_net = DQN().to(device)
        target_net = DQN().to(device)
        target_net.load_state_dict(policy_net.state_dict())
        target_net.eval()

        # Initialize optimizer and memory buffer
        optimizer = optim.Adam(policy_net.parameters(), lr=0.0005)
        memory = ReplayMemory(10000)

        num_games = 20000
        Turns = np.array(['X','O'])
        winners = []  # one list of 500 test-game winners per testing phase
        for i in range(1, num_games+1):
            # Testing phase
            if i%250==0:
                print("Testing...")
                current_testing = []
                for w in range(500):
                    #Reset the environment
                    env.reset()
                    grid, _, __ = env.observe()
                    # Set the opponent (the Q-player takes the other mark)
                    player_opt_1 = OptimalPlayer(epsilon=1., player=Turns[w%2]) #1 if rand 0 if opt
                    for t in range(9):
                        if env.current_player == player_opt_1.player:
                            move = convert(player_opt_1.act(grid))
                            grid, end, winner = env.step(move, print_grid=False)
                        else:
                            # Select and try an action
                            move = select_action(get_state(grid, 1-w%2), eps=0, target = True)
                            try:
                                grid, end, winner = env.step(int(move), print_grid=False)
                            except ValueError:
                                # An invalid move counts as a win for the opponent
                                current_testing.append(Turns[w%2])
                                break

                        if end:
                            current_testing.append(winner)
                            break
                winners.append(current_testing)
            # Testing ended
            # Initialize the environment and state
            print("iteration", repetition,", game "+str(i))
            env.reset()
            # Exploration rate decays linearly with the game index down to min_eps
            epsilon = max(min_eps, max_eps*(1-((i)/n_star)))
            grid, _, __ = env.observe()
            # Set the states from both Q-players' perspectives
            # (player 1 is Turns[i%2], player 2 is Turns[1 - i%2])
            state_1 = get_state(grid, i%2)
            state_2 = get_state(grid, 1 - i%2)
            for t in range(9):
                # For each player, propose a move, try it, and save it and the state as last move and last state
                if env.current_player == Turns[i%2]:
                    move = select_action(state_1, epsilon)
                    last_move_1 = move
                    try:
                        # Save the state from current player's view
                        last_state_1 = get_state(grid, i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        # Save current player's reward
                        reward1 = torch.Tensor([env.reward(player=Turns[i%2])])
                    except ValueError:
                        # Set current player's reward to -1, push to memory and end the game
                        reward1 = torch.Tensor([-1])
                        memory.push(last_state_1, last_move_1, None, reward1)
                        break
                else:
                    # Select and perform an action
                    move = select_action(state_2, epsilon)
                    last_move_2 = move
                    try:
                        # Save the state from current player's view
                        last_state_2 = get_state(grid, 1 - i%2)
                        grid, end, winner = env.step(int(move), print_grid=False)
                        # Save current player's reward
                        reward2 = torch.Tensor([env.reward(player=Turns[1 - i%2])])
                    except ValueError:
                        # Set current player's reward to -1, push to memory and end the game
                        reward2 = torch.Tensor([-1])
                        memory.push(last_state_2, last_move_2, None, reward2)
                        break

                # If the game has not ended, push the transition of the player who is about to move
                if not end:
                    # After the opening move (t == 0) the player about to move has not played
                    # in this game yet; its last_state_*/last_move_* would be left over from
                    # the previous game (or undefined in the very first one), so skip the push.
                    if t > 0:
                        if env.current_player == Turns[i%2]:
                            next_state = get_state(grid, i%2)
                            memory.push(last_state_1, last_move_1, next_state, reward1)
                        elif env.current_player == Turns[1-i%2]:
                            next_state = get_state(grid, 1-i%2)
                            memory.push(last_state_2, last_move_2, next_state, reward2)

                # If the game has ended push both players' transitions with next state = None
                else:
                    next_state = None
                    reward = env.reward(player=Turns[1 - i%2])
                    memory.push(last_state_2, last_move_2, next_state, torch.Tensor([reward]))
                    memory.push(last_state_1, last_move_1, next_state, torch.Tensor([env.reward(player=Turns[i%2])]))
                    break

                # Update states
                state_1 = get_state(grid, i%2)
                state_2 = get_state(grid, 1 - i%2)

                # Perform one step of the optimization (on the policy network)
                loss = optimize_model(policy_net, memory, optimizer)

            # Update the target network, copying all weights and biases from policy_net
            if i % TARGET_UPDATE == 0:
                target_net.load_state_dict(policy_net.state_dict())

        # Store and persist the evaluation outcomes for this value of n*
        results.append(winners)
        np.save(f"Q17_rand_n{n}_1run", results)