# Reproducible experiment

In [1]:
import random
import numpy as np
import torch

def set_seed(seed):
  """Seed every RNG used in this notebook and make cuDNN deterministic.

  Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
  then configures cuDNN for deterministic behaviour.

  Args:
    seed: Integer seed. NumPy integer scalars (such as the values returned by
      ``np.random.randint``) are accepted and converted to a built-in ``int``.
  """
  # random.seed() only supports None, int, float, str, bytes and bytearray.
  # A NumPy integer is none of these: it triggers a DeprecationWarning on
  # Python 3.9-3.10 and a TypeError from 3.11 on, so normalise it up front.
  seed = int(seed)
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  # Ask cuDNN for deterministic algorithms and switch off its benchmark mode.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False


In [2]:
from open_spiel.python import policy
from open_spiel.python.algorithms import expected_game_score
import pyspiel

import open_spiel.python.pytorch.deep_cfr as deep_cfr
from importlib import reload
# Re-execute the deep_cfr module in the running kernel — presumably so that
# local edits to the module are picked up without a kernel restart.
reload(deep_cfr)
# Re-import after the reload. NOTE(review): importlib.reload() re-executes the
# code in the existing module object, so this second import looks redundant —
# confirm before removing.
import open_spiel.python.pytorch.deep_cfr as deep_cfr

Optional module pokerkit_wrapper was not importable: No module named 'pokerkit'


In [3]:
# Load the "kuhn_poker" game through pyspiel; the solver runs and the
# evaluation calls further down all operate on this object.
game = pyspiel.load_game("kuhn_poker")

### Seeds for reproducibility

In [70]:
# Seed the global RNGs first so that the list of per-run seeds is itself
# reproducible, then draw 40 integer seeds in [0, 10000) from NumPy's global
# generator (one seed per Deep CFR run).
set_seed(1234)
seeds = np.random.randint(10000, size=40)

In [71]:
from tqdm import tqdm

# Train one Deep CFR solver per seed. For every run, record the NashConv of
# the resulting average policy (convs) and the expected value of each player
# when both players follow that policy (pv).
convs, pv = [], []
for seed in tqdm(seeds, desc="Running Deep CFR over seeds"):
    # `seeds` holds NumPy integers; random.seed() only supports built-in seed
    # types (DeprecationWarning on Python 3.9-3.10, TypeError from 3.11), so
    # hand over a plain int.
    set_seed(int(seed))
    deep_cfr_solver_i = deep_cfr.DeepCFRSolver(
      game,
      policy_network_layers=(32, 32),
      advantage_network_layers=(16, 16),
      num_iterations=400,
      num_traversals=160,   # Increase
      learning_rate=1e-2,
      batch_size_advantage=512,
      batch_size_strategy=None,
      memory_capacity=int(1e7),
      reinitialize_advantage_networks=False, # switch to False from True (default)
      policy_network_train_steps=100,  # set to 100 to allow policy network trained more iters
      )
    _, advantage_losses, policy_loss = deep_cfr_solver_i.solve()

    # Turn the trained solver into a tabular policy and measure its NashConv.
    average_policy = policy.tabular_policy_from_callable(game, deep_cfr_solver_i.action_probabilities)
    pyspiel_policy = policy.python_policy_to_pyspiel_policy(average_policy)
    conv = pyspiel.nash_conv(game, pyspiel_policy)

    # Expected return of each player when both seats play the learned policy.
    average_policy_values = expected_game_score.policy_value(game.new_initial_state(), [average_policy] * 2)

    # `!s` keeps the str() rendering that the previous "%s" formatting used,
    # so the printed text is unchanged.
    print(f"advantage loss is {advantage_losses[0][-3:]!s}")
    print(f"policy loss is {policy_loss!s}")
    print(f"average_policy_values is {average_policy_values}")
    convs.append(conv)
    pv.append(average_policy_values)

since Python 3.9 and will be removed in a subsequent version. The only 
supported seed types are: None, int, float, str, bytes, and bytearray.
  random.seed(seed)
Running Deep CFR over seeds:   2%| | 1/40 [01:00<39:35, 60

advantage loss is [array(98.68515, dtype=float32), array(120.032814, dtype=float32), array(97.77221, dtype=float32)]
policy loss is 15.358688
average_policy_values is [-0.04672714  0.04672714]


Running Deep CFR over seeds:   5%| | 2/40 [02:03<39:17, 62

advantage loss is [array(94.0074, dtype=float32), array(98.45993, dtype=float32), array(80.653885, dtype=float32)]
policy loss is 12.913969
average_policy_values is [-0.05472087  0.05472087]


Running Deep CFR over seeds:   8%| | 3/40 [03:05<38:16, 62

advantage loss is [array(115.071014, dtype=float32), array(128.20671, dtype=float32), array(107.69765, dtype=float32)]
policy loss is 16.067297
average_policy_values is [-0.06614242  0.06614242]


Running Deep CFR over seeds:  10%| | 4/40 [04:07<37:07, 61

advantage loss is [array(117.10716, dtype=float32), array(110.08977, dtype=float32), array(126.923706, dtype=float32)]
policy loss is 15.309047
average_policy_values is [-0.05357667  0.05357667]


Running Deep CFR over seeds:  12%|▏| 5/40 [05:08<35:58, 61

advantage loss is [array(95.343834, dtype=float32), array(113.370804, dtype=float32), array(116.775085, dtype=float32)]
policy loss is 18.842821
average_policy_values is [-0.0707524  0.0707524]


Running Deep CFR over seeds:  15%|▏| 6/40 [06:10<35:03, 61

advantage loss is [array(118.81397, dtype=float32), array(102.89109, dtype=float32), array(120.329056, dtype=float32)]
policy loss is 15.614019
average_policy_values is [-0.07430902  0.07430902]


Running Deep CFR over seeds:  18%|▏| 7/40 [07:13<34:13, 62

advantage loss is [array(115.12732, dtype=float32), array(128.08911, dtype=float32), array(147.86285, dtype=float32)]
policy loss is 13.150893
average_policy_values is [-0.08381649  0.08381649]


Running Deep CFR over seeds:  20%|▏| 8/40 [08:15<33:05, 62

advantage loss is [array(108.658745, dtype=float32), array(117.1918, dtype=float32), array(118.317116, dtype=float32)]
policy loss is 17.505964
average_policy_values is [-0.07532747  0.07532747]


Running Deep CFR over seeds:  22%|▏| 9/40 [09:18<32:08, 62

advantage loss is [array(96.41442, dtype=float32), array(98.78238, dtype=float32), array(101.968735, dtype=float32)]
policy loss is 18.698242
average_policy_values is [ 0.09026827 -0.09026827]


Running Deep CFR over seeds:  25%|▎| 10/40 [10:20<31:05, 6

advantage loss is [array(62.919018, dtype=float32), array(79.599655, dtype=float32), array(100.22373, dtype=float32)]
policy loss is 16.804344
average_policy_values is [-0.04096034  0.04096034]


Running Deep CFR over seeds:  28%|▎| 11/40 [11:21<29:55, 6

advantage loss is [array(128.17816, dtype=float32), array(114.32915, dtype=float32), array(128.92757, dtype=float32)]
policy loss is 21.223793
average_policy_values is [-0.05920915  0.05920915]


Running Deep CFR over seeds:  30%|▎| 12/40 [12:22<28:41, 6

advantage loss is [array(103.22662, dtype=float32), array(110.124115, dtype=float32), array(113.61027, dtype=float32)]
policy loss is 19.199251
average_policy_values is [-0.06866065  0.06866065]


Running Deep CFR over seeds:  32%|▎| 13/40 [13:24<27:44, 6

advantage loss is [array(83.47174, dtype=float32), array(88.38674, dtype=float32), array(84.73446, dtype=float32)]
policy loss is 13.438033
average_policy_values is [-0.06953076  0.06953076]


Running Deep CFR over seeds:  35%|▎| 14/40 [14:26<26:47, 6

advantage loss is [array(123.59784, dtype=float32), array(121.81669, dtype=float32), array(118.01859, dtype=float32)]
policy loss is 15.790348
average_policy_values is [-0.05051218  0.05051218]


Running Deep CFR over seeds:  38%|▍| 15/40 [15:30<26:03, 6

advantage loss is [array(134.40918, dtype=float32), array(149.8681, dtype=float32), array(140.08505, dtype=float32)]
policy loss is 10.88227
average_policy_values is [-0.06952416  0.06952416]


Running Deep CFR over seeds:  40%|▍| 16/40 [16:32<24:54, 6

advantage loss is [array(95.157715, dtype=float32), array(116.198715, dtype=float32), array(116.78357, dtype=float32)]
policy loss is 16.81294
average_policy_values is [-0.071769  0.071769]


Running Deep CFR over seeds:  42%|▍| 17/40 [17:35<23:56, 6

advantage loss is [array(89.95076, dtype=float32), array(89.01323, dtype=float32), array(94.09124, dtype=float32)]
policy loss is 17.992985
average_policy_values is [-0.07121824  0.07121824]


Running Deep CFR over seeds:  45%|▍| 18/40 [18:37<22:51, 6

advantage loss is [array(145.0452, dtype=float32), array(134.30055, dtype=float32), array(153.02348, dtype=float32)]
policy loss is 20.017063
average_policy_values is [-0.12457526  0.12457526]


Running Deep CFR over seeds:  48%|▍| 19/40 [19:39<21:49, 6

advantage loss is [array(143.53131, dtype=float32), array(146.58792, dtype=float32), array(179.5675, dtype=float32)]
policy loss is 15.548172
average_policy_values is [-0.07518281  0.07518281]


Running Deep CFR over seeds:  50%|▌| 20/40 [20:42<20:49, 6

advantage loss is [array(83.14411, dtype=float32), array(107.783165, dtype=float32), array(106.53581, dtype=float32)]
policy loss is 15.217685
average_policy_values is [-0.05777782  0.05777782]


Running Deep CFR over seeds:  52%|▌| 21/40 [21:43<19:41, 6

advantage loss is [array(105.679756, dtype=float32), array(103.68033, dtype=float32), array(89.41759, dtype=float32)]
policy loss is 18.063078
average_policy_values is [-0.07005487  0.07005487]


Running Deep CFR over seeds:  55%|▌| 22/40 [22:46<18:41, 6

advantage loss is [array(88.81, dtype=float32), array(80.24755, dtype=float32), array(94.58705, dtype=float32)]
policy loss is 17.181295
average_policy_values is [-0.05602407  0.05602407]


Running Deep CFR over seeds:  57%|▌| 23/40 [23:47<17:34, 6

advantage loss is [array(72.89729, dtype=float32), array(83.70134, dtype=float32), array(79.0167, dtype=float32)]
policy loss is 15.132431
average_policy_values is [-0.07203144  0.07203144]


Running Deep CFR over seeds:  60%|▌| 24/40 [24:49<16:29, 6

advantage loss is [array(108.54961, dtype=float32), array(138.19553, dtype=float32), array(128.67801, dtype=float32)]
policy loss is 16.83176
average_policy_values is [-0.07591118  0.07591118]


Running Deep CFR over seeds:  62%|▋| 25/40 [25:51<15:28, 6

advantage loss is [array(128.30237, dtype=float32), array(132.60138, dtype=float32), array(131.3216, dtype=float32)]
policy loss is 19.952253
average_policy_values is [-0.0691448  0.0691448]


Running Deep CFR over seeds:  65%|▋| 26/40 [26:53<14:28, 6

advantage loss is [array(110.54977, dtype=float32), array(107.99482, dtype=float32), array(103.83646, dtype=float32)]
policy loss is 16.812853
average_policy_values is [-0.05552468  0.05552468]


Running Deep CFR over seeds:  68%|▋| 27/40 [27:56<13:30, 6

advantage loss is [array(90.28627, dtype=float32), array(81.328735, dtype=float32), array(90.636154, dtype=float32)]
policy loss is 16.65238
average_policy_values is [-0.06203876  0.06203876]


Running Deep CFR over seeds:  70%|▋| 28/40 [28:58<12:26, 6

advantage loss is [array(70.716446, dtype=float32), array(70.08856, dtype=float32), array(65.59054, dtype=float32)]
policy loss is 15.23427
average_policy_values is [-0.04722884  0.04722884]


Running Deep CFR over seeds:  72%|▋| 29/40 [30:00<11:24, 6

advantage loss is [array(124.600044, dtype=float32), array(148.80331, dtype=float32), array(133.35489, dtype=float32)]
policy loss is 19.123413
average_policy_values is [-0.03831174  0.03831174]


Running Deep CFR over seeds:  75%|▊| 30/40 [31:01<10:18, 6

advantage loss is [array(121.97598, dtype=float32), array(128.4075, dtype=float32), array(110.63724, dtype=float32)]
policy loss is 22.588905
average_policy_values is [-0.07712369  0.07712369]


Running Deep CFR over seeds:  78%|▊| 31/40 [32:04<09:18, 6

advantage loss is [array(90.412186, dtype=float32), array(89.52411, dtype=float32), array(72.58534, dtype=float32)]
policy loss is 17.210566
average_policy_values is [-0.04506529  0.04506529]


Running Deep CFR over seeds:  80%|▊| 32/40 [33:06<08:16, 6

advantage loss is [array(102.65102, dtype=float32), array(94.52271, dtype=float32), array(118.99711, dtype=float32)]
policy loss is 20.287725
average_policy_values is [-0.05716484  0.05716484]


Running Deep CFR over seeds:  82%|▊| 33/40 [34:08<07:15, 6

advantage loss is [array(121.78424, dtype=float32), array(112.82447, dtype=float32), array(118.12863, dtype=float32)]
policy loss is 17.822802
average_policy_values is [-0.01926895  0.01926895]


Running Deep CFR over seeds:  85%|▊| 34/40 [35:11<06:14, 6

advantage loss is [array(86.46878, dtype=float32), array(113.49956, dtype=float32), array(98.29861, dtype=float32)]
policy loss is 15.452558
average_policy_values is [-0.05765064  0.05765064]


Running Deep CFR over seeds:  88%|▉| 35/40 [36:13<05:10, 6

advantage loss is [array(74.58453, dtype=float32), array(78.30571, dtype=float32), array(90.060135, dtype=float32)]
policy loss is 16.989891
average_policy_values is [-0.04153378  0.04153378]


Running Deep CFR over seeds:  90%|▉| 36/40 [37:15<04:08, 6

advantage loss is [array(86.8625, dtype=float32), array(100.4913, dtype=float32), array(79.41649, dtype=float32)]
policy loss is 18.858387
average_policy_values is [-0.06426039  0.06426039]


Running Deep CFR over seeds:  92%|▉| 37/40 [38:18<03:06, 6

advantage loss is [array(78.49953, dtype=float32), array(73.77362, dtype=float32), array(78.79491, dtype=float32)]
policy loss is 16.512838
average_policy_values is [-0.04499551  0.04499551]


Running Deep CFR over seeds:  95%|▉| 38/40 [39:20<02:04, 6

advantage loss is [array(118.24327, dtype=float32), array(108.450516, dtype=float32), array(108.89754, dtype=float32)]
policy loss is 14.246398
average_policy_values is [-0.05864784  0.05864784]


Running Deep CFR over seeds:  98%|▉| 39/40 [40:21<01:02, 6

advantage loss is [array(104.02095, dtype=float32), array(96.678505, dtype=float32), array(86.6669, dtype=float32)]
policy loss is 18.713467
average_policy_values is [-0.06664698  0.06664698]


Running Deep CFR over seeds: 100%|█| 40/40 [41:24<00:00, 6

advantage loss is [array(144.07509, dtype=float32), array(140.85686, dtype=float32), array(123.71324, dtype=float32)]
policy loss is 17.51198
average_policy_values is [-0.04929202  0.04929202]





In [74]:
# Mean expected value per player across all runs (rows = runs, columns = players).
np.array(pv).mean(axis=0)

array([-0.05804862,  0.05804862])

In [75]:
# Spread of the per-run values for each player. Note that ndarray.std()
# defaults to ddof=0, i.e. the population standard deviation.
np.array(pv).std(axis=0)

array([0.02889775, 0.02889775])

## t/z test

In [76]:
import numpy as np
from scipy import stats

In [77]:
# Null-hypothesis mean used by the t-test below.
# NOTE(review): presumably the theoretical value of Kuhn poker for the first
# player (-1/18) — confirm.
mu0 = -1/18

In [78]:
# One-sample t-test: is the mean of the first player's value across runs
# (column 0 of pv) consistent with mu0?
t_stat, p_value = stats.ttest_1samp(np.array(pv)[:,0], mu0)

In [79]:
# Display the p-value returned by the one-sample t-test in the previous cell.
p_value

0.5931088126230109