In [None]:
from simulator import Simulator
from q import QPlayer
import matplotlib.pyplot as plt
from collections import defaultdict
import pickle
import numpy as np
from player import *
from mcts import MCTSPlayer

### 1. Training QPlayer with 3 Lv1 QPlayers

In [None]:
# Train a fresh QPlayer (seat 3) against three QPlayers that use previously saved tables.
# Unseen states get their own copy of an int-defaulting inner dict.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
# Context managers close every handle right after it has been read.
with open('q_weights', 'rb') as fh:
    Q_saved = pickle.load(fh)
with open('n_weights', 'rb') as fh:
    N_saved = pickle.load(fh)
# Q2_saved / N2_saved are not used in this cell; they are still loaded so the names stay defined.
with open('q2_weights', 'rb') as fh:
    Q2_saved = pickle.load(fh)
with open('n2_weights', 'rb') as fh:
    N2_saved = pickle.load(fh)

# Win tally indexed by the value returned from run_game().
# Five slots for four seats: presumably the extra slot covers a no-winner result (TODO confirm).
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_1 = []             # seat-3 win percentage per checkpoint window
for i in range(150000):
    # Settings for the three opponents backed by the saved tables.
    # 'learn': False presumably freezes their tables (confirm against QPlayer).
    frozen_params = {'Q': Q_saved, 'N': N_saved,
                     'c': .01,
                     'depth': 100,
                     'num_simulations': 10,
                     'alpha': 0.1,
                     'learn': False}
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 100,
                      'num_simulations': 10,
                      'alpha': 0.05}
    # List multiplication repeats the same dict object, so the opponents share one params dict.
    sim = Simulator.from_start([QPlayer, QPlayer, QPlayer, QPlayer],
                               params=[frozen_params] * 3 + [learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 1000 iterations; the first one fires at i == 0 after a single game.
    if i % 1000 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_1.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_1.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_1[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")
# plt.xticks(range(149), np.arange(1000, 150000, 1000))

### 2. Training QPlayer with 3 RandomPlayers

In [None]:
# Train a fresh QPlayer (seat 3) against three RandomPlayers.
# Unseen states get their own copy of an int-defaulting inner dict.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
last = winners.copy()             # tally snapshot taken at the previous checkpoint
rates_2 = []                      # seat-3 win percentage per checkpoint window
for i in range(150000):
    lineup = [RandomPlayer, RandomPlayer, RandomPlayer, QPlayer]
    learner_cfg = {'Q': Q_new, 'N': N_new,
                   'c': .01,
                   'depth': 100,
                   'num_simulations': 10,
                   'alpha': 0.1}
    sim = Simulator.from_start(lineup, params=[{}, {}, {}, learner_cfg], verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

    # Report only on every 1000th iteration (the first report is at i == 0, after one game).
    if i % 1000 != 0:
        continue

    recent = winners - last
    overall_pct = (winners[3] / np.sum(winners)) * 100
    recent_pct = (recent[3] / np.sum(recent)) * 100
    print(i)
    print(winners)
    print(recent)
    print(overall_pct)
    print(recent_pct)
    rates_2.append(recent_pct)
    last = winners.copy()

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_2.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_2[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")
# plt.xticks(range(149), np.arange(1000, 150000, 1000))

### 3. Training QPlayer with 3 Lv2 QPlayers

In [None]:
# Train a fresh QPlayer (seat 3) against three QPlayers backed by the q3/n3 tables,
# then save the learned tables as q4/n4.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('q3_weights', 'rb') as fh:
    Q_saved = pickle.load(fh)
with open('n3_weights', 'rb') as fh:
    N_saved = pickle.load(fh)

# Win tally indexed by the value returned from run_game().
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_3 = []             # seat-3 win percentage per checkpoint window
for i in range(250000):
    # Settings for the three opponents backed by the saved tables.
    # 'learn': False presumably freezes their tables (confirm against QPlayer).
    frozen_params = {'Q': Q_saved, 'N': N_saved,
                     'c': .01,
                     'depth': 100,
                     'num_simulations': 10,
                     'alpha': 0.1,
                     'learn': False}
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 100,
                      'num_simulations': 10,
                      'alpha': 0.05}
    # List multiplication repeats the same dict object, so the opponents share one params dict.
    sim = Simulator.from_start([QPlayer, QPlayer, QPlayer, QPlayer],
                               params=[frozen_params] * 3 + [learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 1000 iterations; the first one fires at i == 0 after a single game.
    if i % 1000 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_3.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

# Write through context managers so the files are flushed and closed deterministically.
with open('q4_weights', 'wb') as fh:
    pickle.dump(Q_new, fh)
with open('n4_weights', 'wb') as fh:
    pickle.dump(N_new, fh)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_3.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_3[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")

### 4. Training QPlayer with 3 HeuristicPlayers

In [None]:
# Baseline: how often a HeuristicPlayer in seat 3 wins against three RandomPlayers.
winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
lineup = [RandomPlayer, RandomPlayer, RandomPlayer, HeuristicPlayer]
for game_idx in range(10000):
    sim = Simulator.from_start(lineup, params=[{}, {}, {}, {}], verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

# Seat-3 win percentage over all games played.
seat3_pct = (winners[3] / np.sum(winners)) * 100
print(seat3_pct)

In [None]:
# Train a fresh QPlayer (seat 3) against three HeuristicPlayers, then save the learned tables.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# Win tally indexed by the value returned from run_game().
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_4 = []             # seat-3 win percentage per checkpoint window
for i in range(150000):
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 100,
                      'num_simulations': 10,
                      'alpha': 0.05}
    sim = Simulator.from_start([HeuristicPlayer, HeuristicPlayer, HeuristicPlayer, QPlayer],
                               params=[{}, {}, {}, learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 1000 iterations; the first one fires at i == 0 after a single game.
    if i % 1000 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_4.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

# Write through context managers so the files are flushed and closed deterministically.
with open('q_weights_heu', 'wb') as fh:
    pickle.dump(Q_new, fh)
with open('n_weights_heu', 'wb') as fh:
    pickle.dump(N_new, fh)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_4.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_4[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")

### 5. Comparing 3 different levels of trained QPlayers

In [None]:
# Load the three saved table pairs compared below as Lv1 (Q_1/N_1), Lv2 (Q_2/N_2), Lv3 (Q_3/N_3).
# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
# Context managers close every handle right after it has been read.
with open('q_weights', 'rb') as fh:
    Q_1 = pickle.load(fh)
with open('n_weights', 'rb') as fh:
    N_1 = pickle.load(fh)
with open('q3_weights', 'rb') as fh:
    Q_2 = pickle.load(fh)
with open('n3_weights', 'rb') as fh:
    N_2 = pickle.load(fh)
with open('q4_weights', 'rb') as fh:
    Q_3 = pickle.load(fh)
with open('n4_weights', 'rb') as fh:
    N_3 = pickle.load(fh)

In [None]:
### Lv1
# Evaluate the Q_1/N_1 tables: a QPlayer in seat 3 against three RandomPlayers.
winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
lineup = [RandomPlayer, RandomPlayer, RandomPlayer, QPlayer]
for game_idx in range(10000):
    eval_cfg = {'Q': Q_1, 'N': N_1,
                'c': .1,
                'depth': 100,
                'num_simulations': 10,
                'alpha': 0.05,
                'learn': False}
    sim = Simulator.from_start(lineup, params=[{}, {}, {}, eval_cfg], verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

# Seat-3 win percentage over all games played.
seat3_pct = (winners[3] / np.sum(winners)) * 100
print(seat3_pct)

In [None]:
### Lv2
# Evaluate the Q_2/N_2 tables: a QPlayer in seat 3 against three RandomPlayers.
winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
lineup = [RandomPlayer, RandomPlayer, RandomPlayer, QPlayer]
for game_idx in range(10000):
    eval_cfg = {'Q': Q_2, 'N': N_2,
                'c': .1,
                'depth': 100,
                'num_simulations': 10,
                'alpha': 0.05,
                'learn': False}
    sim = Simulator.from_start(lineup, params=[{}, {}, {}, eval_cfg], verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

# Seat-3 win percentage over all games played.
seat3_pct = (winners[3] / np.sum(winners)) * 100
print(seat3_pct)

In [None]:
### Lv3
# Evaluate the Q_3/N_3 tables: a QPlayer in seat 3 against three RandomPlayers.
winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
lineup = [RandomPlayer, RandomPlayer, RandomPlayer, QPlayer]
for game_idx in range(10000):
    eval_cfg = {'Q': Q_3, 'N': N_3,
                'c': .1,
                'depth': 100,
                'num_simulations': 10,
                'alpha': 0.05,
                'learn': False}
    sim = Simulator.from_start(lineup, params=[{}, {}, {}, eval_cfg], verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

# Seat-3 win percentage over all games played.
seat3_pct = (winners[3] / np.sum(winners)) * 100
print(seat3_pct)

In [None]:
### Heuristic vs Lv1 vs Lv2 vs Lv3
# Seat 0 is a HeuristicPlayer; seats 1-3 are QPlayers using the Lv1, Lv2 and Lv3 tables.
winners = np.zeros(5, dtype=int)  # win tally indexed by the value run_game() returns
lineup = [HeuristicPlayer, QPlayer, QPlayer, QPlayer]
table_pairs = ((Q_1, N_1), (Q_2, N_2), (Q_3, N_3))
for game_idx in range(10000):
    # One fresh, identically configured params dict per QPlayer seat.
    q_cfgs = [{'Q': q_table, 'N': n_table,
               'c': .1,
               'depth': 100,
               'num_simulations': 10,
               'alpha': 0.05,
               'learn': False}
              for q_table, n_table in table_pairs]
    sim = Simulator.from_start(lineup, params=[{}] + q_cfgs, verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1

# Win percentage of each of the four seats, in seat order.
for seat in range(4):
    print((winners[seat] / np.sum(winners)) * 100)

In [None]:
### Lv3 vs 3*Lv2
# Evaluate the Lv3 tables (seat 3) against three opponents that use the Lv2 tables.
# Fix: all four seats must be QPlayers. The original lineup used RandomPlayer for seats 0-2,
# which contradicted the cell title and handed Q-table params to the wrong class.
winners = np.array([0, 0, 0, 0, 0])  # win tally indexed by the value run_game() returns
for i in range(10000):
    # Settings for the three opponents backed by Q_2 / N_2.
    lv2_params = {'Q': Q_2, 'N': N_2,
                  'c': .01,
                  'depth': 100,
                  'num_simulations': 10,
                  'alpha': 0.1,
                  'learn': False}
    # Settings for the evaluated player backed by Q_3 / N_3.
    lv3_params = {'Q': Q_3, 'N': N_3,
                  'c': .1,
                  'depth': 100,
                  'num_simulations': 10,
                  'alpha': 0.05,
                  'learn': False}
    # List multiplication repeats the same dict object, so the opponents share one params dict.
    sim = Simulator.from_start([QPlayer, QPlayer, QPlayer, QPlayer],
                               params=[lv2_params] * 3 + [lv3_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
# Seat-3 (Lv3) win percentage over all games played.
print((winners[3] / np.sum(winners)) * 100)

### 6. Training MCTSPlayer with 3 HeuristicPlayers

In [None]:
# Train a fresh MCTSPlayer (seat 3) against three HeuristicPlayers, then save the learned tables.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# Win tally indexed by the value returned from run_game().
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_6 = []             # seat-3 win percentage per checkpoint window
for i in range(1000):
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 5,
                      'num_simulations': 10,
                      'alpha': 0.05}
    sim = Simulator.from_start([HeuristicPlayer, HeuristicPlayer, HeuristicPlayer, MCTSPlayer],
                               params=[{}, {}, {}, learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 10 iterations here; the first one fires at i == 0 after a single game.
    if i % 10 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_6.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

# Write through context managers so the files are flushed and closed deterministically.
with open('q_weights_heu_mcts', 'wb') as fh:
    pickle.dump(Q_new, fh)
with open('n_weights_heu_mcts', 'wb') as fh:
    pickle.dump(N_new, fh)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_6.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_6[1:])
plt.ylim(0, 100)  # values are percentages
# Fix: rates_6 gets one entry every 10 games (not every 1000), so the labels say 10.
plt.xlabel("# of training (x10)")
plt.ylabel("% of winning (over the last 10 games)")

### 7. Training MCTSPlayer with 3 RandomPlayers

In [None]:
# Train a fresh MCTSPlayer (seat 3) against three RandomPlayers, then save the learned tables.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# Win tally indexed by the value returned from run_game().
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_7 = []             # seat-3 win percentage per checkpoint window
for i in range(150000):
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 10,
                      'num_simulations': 10,
                      'alpha': 0.05}
    sim = Simulator.from_start([RandomPlayer, RandomPlayer, RandomPlayer, MCTSPlayer],
                               params=[{}, {}, {}, learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 1000 iterations; the first one fires at i == 0 after a single game.
    if i % 1000 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_7.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

# Write through context managers so the files are flushed and closed deterministically.
with open('q1_weights_mcts', 'wb') as fh:
    pickle.dump(Q_new, fh)
with open('n1_weights_mcts', 'wb') as fh:
    pickle.dump(N_new, fh)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_7.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_7[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")

### 8. Training MCTSPlayer with 3 Lv1 MCTSPlayers

In [None]:
# Train a fresh MCTSPlayer (seat 3) against three MCTSPlayers backed by the q1/n1 MCTS tables,
# then save the learned tables as q2/n2.
Q_new = defaultdict(defaultdict(int).copy)  # action value estimates
N_new = defaultdict(defaultdict(int).copy)  # visit counts

# NOTE(security): pickle.load can execute arbitrary code; only load files you produced yourself.
with open('q1_weights_mcts', 'rb') as fh:
    Q_saved = pickle.load(fh)
with open('n1_weights_mcts', 'rb') as fh:
    N_saved = pickle.load(fh)

# Win tally indexed by the value returned from run_game().
winners = np.array([0, 0, 0, 0, 0])
last = np.copy(winners)  # tally snapshot taken at the previous checkpoint
rates_8 = []             # seat-3 win percentage per checkpoint window
for i in range(150000):
    # Settings for the three opponents backed by the saved tables.
    # 'learn': False presumably freezes their tables (confirm against MCTSPlayer).
    frozen_params = {'Q': Q_saved, 'N': N_saved,
                     'c': .01,
                     'depth': 10,
                     'num_simulations': 10,
                     'alpha': 0.1,
                     'learn': False}
    # Settings for the learner, which writes into Q_new / N_new.
    learner_params = {'Q': Q_new, 'N': N_new,
                      'c': .1,
                      'depth': 10,
                      'num_simulations': 10,
                      'alpha': 0.05}
    # Fix: the imported class is MCTSPlayer; the original spelled it "MCSTPlayer".
    # List multiplication repeats the same dict object, so the opponents share one params dict.
    sim = Simulator.from_start([MCTSPlayer, MCTSPlayer, MCTSPlayer, MCTSPlayer],
                               params=[frozen_params] * 3 + [learner_params],
                               verbosity=0)
    winner = sim.run_game()
    winners[winner] += 1
    # Checkpoint every 1000 iterations; the first one fires at i == 0 after a single game.
    if i % 1000 == 0:
        recent = winners - last  # wins since the previous checkpoint
        print(i)
        print(winners)
        print(recent)
        print((winners[3] / np.sum(winners)) * 100)
        print((recent[3] / np.sum(recent)) * 100)
        rates_8.append((recent[3] / np.sum(recent)) * 100)
        last = np.copy(winners)

# Write through context managers so the files are flushed and closed deterministically.
with open('q2_weights_mcts', 'wb') as fh:
    pickle.dump(Q_new, fh)
with open('n2_weights_mcts', 'wb') as fh:
    pickle.dump(N_new, fh)

In [None]:
# Plot the learner's win percentage per checkpoint window from rates_8.
# The first entry is skipped: it is appended at i == 0, when only one game has been played.
plt.plot(rates_8[1:])
plt.ylim(0, 100)  # values are percentages
plt.xlabel("# of training (x1000)")
plt.ylabel("% of winning (over the last 1000 games)")