In [1]:
import numpy as np
import copy
import tensorflow as tf
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras import regularizers
from keras.layers import Dropout
from sklearn.preprocessing import StandardScaler
from abc import ABC, abstractmethod
from random import random
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import os
# Initialise global random seeds / framework state so later cells are repeatable.
# Reset Keras' global state before seeding.
tf.keras.backend.clear_session()
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here probably does not change hashing inside this already-running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Seeded NumPy Generator; later cells draw from it via `rng.random()`.
rng = np.random.default_rng(12345)
# Seed NumPy's legacy global RNG (used by `np.random.*` calls) and TensorFlow's global RNG.
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# Draw one float from the seeded generator `rng` created in the first cell.
rfloat = rng.random()

In [3]:
# Show the sampled value (sanity check that the seeded generator works).
print(rfloat)

0.22733602246716966


In [4]:
class paper_game(ABC):
    """Abstract interface for a paper game played by a Q-learning agent.

    Concrete games must implement the five hooks below.  ``transition`` and
    ``reward`` are pure functions of ``(state, action)`` and are therefore
    declared as static methods, so they can be called both on the class and on
    an instance.  ``policy``, ``Q`` and ``Q_update`` need access to the agent's
    own data and are ordinary instance methods.

    Attributes:
        state: current game state (set from ``start_state``).
        train: truthy while the agent is training.
        train_number: value passed as ``train_number`` (defaults to 2000).
    """

    def __init__(self, train, start_state, train_number=2000):
        self.state = start_state
        self.train = train
        self.train_number = train_number

    @staticmethod
    @abstractmethod
    def transition(state, action):
        """Return the next state reached by applying ``action`` to ``state``."""

    @staticmethod
    @abstractmethod
    def reward(state, action):
        """Return the reward obtained immediately after taking ``action`` in ``state``."""

    @abstractmethod
    def policy(self, state):
        """Return the action chosen for ``state``."""

    @abstractmethod
    def Q(self, state, action):
        """Return the Q value stored for the ``(state, action)`` pair."""

    @abstractmethod
    def Q_update(self, alpha, gamma):
        """Update the Q estimates using the given ``alpha`` and ``gamma``."""

In [5]:
class tit_tac_toe(paper_game):
    """Tic-tac-toe game with a tabular, epsilon-greedy Q-learning agent.

    State: 3x3 array describing the board, with 1 for the agent's own checker,
    -1 for the opponent's checker and 0 for an empty cell.
    Action: ``[row, col]`` of the cell the agent plays.

    Attributes:
        state: current board (a private copy of ``start_state``).
        status_func: callable used to test for a win (``win_status``).
        train: truthy while training; enables exploration in ``policy``.
        train_number: value passed as ``train_number``.
        epsilon: current exploration probability (decays on every ``policy`` call).
        alpha, gamma: Q-learning hyper-parameters (0.5 and 0.8).
        q_table: ``(3**9, 9)`` array indexed by ``ref(state)`` and ``key(action)``.
            It is kept under its own name so that it does not shadow the
            ``Q`` lookup method.
    """

    def __init__(self, train, start_state=np.zeros((3,3), dtype=np.int8), train_number=2000):
        # Copy the board: the default array is built once, when the class is
        # defined, and would otherwise be shared by every instance.
        super().__init__(train, np.array(start_state), train_number)
        self.status_func = self.win_status
        self.epsilon = 0.9
        self.alpha = 0.5
        self.gamma = 0.8
        # One row per board encoding (3 symbols in 9 cells -> 3**9 rows; note that
        # `3^9` would be a bitwise XOR), one column per cell.
        self.q_table = np.zeros((3 ** 9, 9))

    @staticmethod
    def transition(state, action):
        """Return a new board in which the agent (1) has played ``action``.

        ``action`` is ``[row, col]``.  The input board is left untouched.
        """
        new_state = np.array(state)  # np.array copies by default
        new_state[action[0], action[1]] = 1
        return new_state

    @staticmethod
    def action_list(state):
        """Return ``(rows, cols)`` index arrays of the empty cells of ``state``."""
        return np.where(np.asarray(state) == 0)

    @staticmethod
    def win_status(s):
        """Return True when the agent (1) owns a full row, column or diagonal of ``s``."""
        board = np.asarray(s)
        line_sums = np.concatenate((
            board.sum(axis=0),                          # columns
            board.sum(axis=1),                          # rows
            [np.trace(board), np.trace(np.fliplr(board))],  # both diagonals
        ))
        # Cells are in {-1, 0, 1}, so a line sums to 3 only if all three are 1.
        return bool((line_sums == 3).any())

    @staticmethod
    def reward(state, action):
        """Return 100 if playing ``action`` in ``state`` wins the game, else 0."""
        next_state = tit_tac_toe.transition(state, action)
        return 100 if tit_tac_toe.win_status(next_state) else 0

    @staticmethod
    def ref(state):
        """Encode a board as an integer row index into the Q table.

        Each cell value is shifted by one (giving a base-3 digit 0, 1 or 2) and
        the nine digits are weighted by ``3**0 .. 3**8`` in row-major order.
        """
        digits = np.asarray(state, dtype=np.int64).ravel() + 1
        return int(digits @ (3 ** np.arange(9)))

    @staticmethod
    def key(action):
        """Map an action ``[r, c]`` to its column index 0-8 in the Q table."""
        return int(3 * action[0] + action[1])

    def Q(self, state, action):
        """Return the stored Q value for playing ``action`` in ``state``."""
        return self.q_table[self.ref(state), self.key(action)]

    def Q_update(self, alpha, gamma):
        """Placeholder for the Q-learning update; currently does nothing.

        TODO: not implemented yet — the arguments are accepted but unused.
        """
        pass

    def policy(self, state):
        """Choose an action for ``state`` with an epsilon-greedy rule.

        When not training, or when a draw from the notebook-level generator
        ``rng`` exceeds ``epsilon``, the empty cell with the highest Q value is
        returned (the first one wins ties); otherwise a random empty cell is
        returned.  ``epsilon`` is decayed on every call.  Returns ``[row, col]``,
        or ``[]`` when the board has no empty cell.
        """
        rows, cols = self.action_list(state)
        candidates = [[int(r), int(c)] for r, c in zip(rows, cols)]

        action = []
        if candidates:
            exploit = (not self.train) or rng.random() > self.epsilon
            if exploit:
                action = max(candidates, key=lambda move: self.Q(state, move))
            else:
                action = candidates[np.random.randint(len(candidates))]

        if self.epsilon < 0.5:
            self.epsilon *= 0.9999
        else:
            self.epsilon *= 0.99999

        return action
            
        
        

In [6]:
# Scratch check: how np.where lists the empty cells of a blank 3x3 board.
x = np.zeros(shape=(3, 3))
y = np.where(x == 0)  # tuple: (row indices, column indices)
print(y[0].size)
for i in y:
    print(i)
# Element-wise row index + column index for every empty cell.
z = y[0] + y[1]
print(z)

9
[0 0 0 1 1 1 2 2 2]
[0 1 2 0 1 2 0 1 2]
[0 1 2 1 2 3 2 3 4]


In [7]:
# Index of the largest entry of z (the row+column index sums from the cell above).
print (np.argmax(z))

8


In [8]:
# Flatten the 3x3 array x into a length-9 vector.
print(x.flatten())

[0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [9]:
# Show x in its 3x3 layout for comparison with the flattened form.
print(x)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [10]:
# Scratch check: np.zeros with an integer argument gives a 1-D array of that length.
print(np.zeros(4))

[0. 0. 0. 0.]


In [11]:
# Scratch check: how a plain Python list is printed.
print([1, 2, 3])

[1, 2, 3]


In [17]:
# Weights 3**0 .. 3**8 held as a 1x9 np.matrix (the same weights the class's `ref` uses).
a=np.matrix([1,3,3**2,3**3,3**4,3**5,3**6,3**7,3**8])
print(a)
# Weighted sum of z with those weights; with the matrix as first argument the
# result prints as a 1x1 matrix rather than a scalar.
print(np.dot(a,z))

[[   1    3    9   27   81  243  729 2187 6561]]
[[35202]]


In [18]:
# Mutates x in place: row 0, column 1 becomes 2 (later cells see the changed x).
x[0][1]=2
print(x)

[[0. 2. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [19]:
# Indexing a 2-D array with a single integer returns that row.
print(x[0])
print(x[1])

[0. 2. 0.]
[0. 0. 0.]


In [22]:
# Scratch check: a zero-filled 2-D table with 10 rows and 4 columns.
q=np.zeros((10,4))
print(q)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [23]:
# Flatten x again after the in-place edit: the 2 shows up at flat position 1.
print(x.flatten())

[0. 2. 0. 0. 0. 0. 0. 0. 0.]


In [25]:
# Scratch check: indexing the first axis of a 2x2x2 array yields a 2x2 slice.
three_D = np.arange(1, 9).reshape(2, 2, 2)
print(three_D, three_D[0], sep="\n")

[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]
[[1 2]
 [3 4]]
