## Chess AI
<hr>
### Reinforcement Learning
<hr>
### Monte Carlo Method | Bellman Equation | Optimal Policy Iteration
<hr>

The following is a simulation that terminates after a fixed number of games (`num_games`, 5000 by default).
The objective is to train a black chess agent to play chess based purely on `Monte Carlo` sampling, the `Bellman Equation`, and `Optimal Policy Iteration`.

## Environment Setup
<hr>
## Simulation Setup
<hr>

In [None]:
import datetime
import os
# Switch to the data directory: the checkpoint folders written during the
# simulation are created relative to the current working directory.
# NOTE(review): hard-coded, machine-specific absolute path -- adjust before
# running on another machine.
os.chdir("/Users/laurensuarez/Documents/blake/chess/data")
def random_action(theBoard, thePolicy, eps=0.1):
    """Choose a move for the current position with an epsilon-greedy rule.

    The position is identified by ``str(theBoard)``.

    * If the policy has no entry for the position, a uniformly random legal
      move is returned.
    * Otherwise the policy's move is played with probability ``1 - eps`` and
      a uniformly random legal move with probability ``eps``.
    * If the stored move is not among the current legal moves (the same board
      string can occur with different legal moves, e.g. because of check),
      a uniformly random legal move is returned instead.

    Args:
        theBoard: Board object exposing ``legal_moves``; its ``str()`` is
            used as the state key.
        thePolicy: Mapping from state string to the preferred move string.
        eps: Exploration probability.

    Returns:
        A legal move drawn from ``theBoard.legal_moves``.
    """
    # Materialise the legal moves once; every branch below needs them.
    legal_moves = list(theBoard.legal_moves)
    state = str(theBoard)

    # Unexplored state: nothing to exploit yet.
    # (Direct dict membership is O(1); no need to copy the keys into a list.)
    if state not in thePolicy:
        return np.random.choice(legal_moves)

    # Explore with probability eps.
    if np.random.random() >= 1 - eps:
        return np.random.choice(legal_moves)

    # Exploit: play the policy's move if it is legal in this position.
    preferred = thePolicy[state]
    for move in legal_moves:
        if str(move) == preferred:
            return move

    # The remembered move is not legal here, so treat the state as if it
    # had no recommendation and move randomly.
    return np.random.choice(legal_moves)

# Play game as black
def play_game(theBoard, thePolicy, gamma=0.9):
    """Play one game as black and return black's discounted returns.

    White opens and answers every black move with a uniformly random legal
    move. Black chooses its moves with ``random_action`` (epsilon-greedy on
    ``thePolicy``). Only black's decisions are recorded, so the returned
    experience describes the agent being trained and not its opponent.

    Every recorded move has reward 0 except the last one, which receives
    -1 if white won ('1-0'), +1 if black won ('0-1') and 0.001 otherwise.

    Args:
        theBoard: Board to play on; it is mutated (moves are pushed).
        thePolicy: Mapping from board string to preferred move string.
        gamma: Discount factor applied per recorded black move.

    Returns:
        A list of ``(state, action, G)`` tuples in playing order, where
        ``G`` is the discounted return obtained from taking ``action`` in
        ``state``. The list is empty if black never got to move.
    """
    #   //////////////////////
    #  // White Move First //
    # //////////////////////
    theBoard.push(np.random.choice(list(theBoard.legal_moves)))

    #   //////////////
    #  // Game on! //
    # //////////////
    state_action_rewards = []
    while not theBoard.is_game_over():

        # Black: move from the (iteratively updated) policy.
        state = str(theBoard)
        move = random_action(theBoard, thePolicy)
        state_action_rewards.append([state, str(move), 0])
        theBoard.push(move)

        # White: random reply, unless black's move just ended the game.
        if not theBoard.is_game_over():
            theBoard.push(np.random.choice(list(theBoard.legal_moves)))

    # Allocate winner points to black's final move. Use the board that was
    # actually played on; guard against a game in which black never moved.
    if state_action_rewards:
        winner = theBoard.result()
        if winner == '1-0':
            # We lost
            final_reward = -1
        elif winner == '0-1':
            # We won!
            final_reward = 1
        else:
            # Tie...
            final_reward = 0.001
        state_action_rewards[-1][2] = final_reward

    #   /////////////////////////////
    #  // Bellman Update Equation //
    # /////////////////////////////
    # Walk backwards so each return is built from its successor. The reward
    # is stored on the move that earned it, so G must include r *before*
    # the tuple is emitted; otherwise the final move always gets return 0.
    G = 0
    states_actions_returns = []
    for s, a, r in reversed(state_action_rewards):
        G = r + gamma * G
        states_actions_returns.append((s, a, G))
    states_actions_returns.reverse()

    return states_actions_returns
    
# Module-level agent containers.
# NOTE: run_simulation() creates its own local policy/Q/returns with the same
# names, so training does not modify these objects.
policy={}
Q=defaultdict(dict)
returns=defaultdict(list)
def run_simulation(num_games=5000, write_file_mod=200):
    """Train the agent with Monte Carlo control over simulated games.

    For every game a fresh board is played with ``play_game``. Each returned
    ``(state, action, G)`` sample is added to the running average
    ``Q[state][action]``, and the policy of every visited state is set to
    its highest-valued action.

    Args:
        num_games: Number of games to simulate.
        write_file_mod: Write a checkpoint every this many games (never
            before the first game). A falsy value disables checkpoints.

    Returns:
        A tuple ``(Q, policy, V, deltas)``:
        ``Q`` maps state -> {action: mean return};
        ``policy`` maps state -> best known action;
        ``V`` maps state -> value of that best action;
        ``deltas`` holds, per game, the largest absolute change made to any
        Q entry during that game (for convergence plots).
    """
    #   //////////////////////
    #  // Agent Containers //
    # //////////////////////
    policy = {}
    Q = defaultdict(dict)
    returns = defaultdict(list)

    #   //////////////////////
    #  // Simulation Begin //
    # //////////////////////
    deltas = []
    for i in range(num_games):

        # Periodic checkpoint (guarding against write_file_mod == 0).
        if write_file_mod and i != 0 and i % write_file_mod == 0:
            print(i)
            _save_checkpoint(Q, policy)

        # Fresh board
        newBoard = chess.Board()

        # Collect 1 game sequence
        state_action_returns_list = play_game(newBoard, policy)

        #   ///////////////////////////
        #  // Value function update //
        # ///////////////////////////
        biggest_change = 0
        for s, a, G in state_action_returns_list:
            sa = (s, a)
            # A pair that has never been seen has no entry yet -> old value 0.
            old_q = Q[s].get(a, 0)
            returns[sa].append(G)
            Q[s][a] = np.mean(returns[sa])  # rolling mean, Q now `learns`
            biggest_change = max(biggest_change, np.abs(old_q - Q[s][a]))
        # One convergence sample per game.
        deltas.append(biggest_change)

        #   ///////////////////
        #  // Policy Update //
        # ///////////////////
        # Only states visited in this game can have changed, so only those
        # need their greedy action recomputed.
        for s, _, _ in state_action_returns_list:
            policy[s] = max(Q[s], key=Q[s].get)

    # State value = value of the greedy action in that state.
    V = {s: max(Q[s].values()) for s in policy}
    return Q, policy, V, deltas


def _save_checkpoint(Q, policy):
    """Pickle Q, the policy and the derived state values into a timestamped folder."""
    custom_time = str(datetime.datetime.now()).replace(".", "_").replace(":", "_").replace(" ", "")
    os.makedirs(os.path.join(os.getcwd(), custom_time), exist_ok=True)
    V = {s: max(Q[s].values()) for s in policy}
    pickle_dump(Q, custom_time + "/Q_" + custom_time + ".pkl")
    pickle_dump(policy, custom_time + "/policy_" + custom_time + ".pkl")
    pickle_dump(V, custom_time + "/V_" + custom_time + ".pkl")
                                

## Run the sim
### Plot for convergence visualization
<hr>
<hr>

In [None]:
import matplotlib.pyplot as plt
# Train the agent, then plot the recorded Q-value changes to eyeball convergence.
Q, policy, V, deltas = run_simulation(num_games=5000, write_file_mod=200)
plt.plot(deltas)
# `plt.show` without parentheses only references the function; call it.
plt.show()

## Import data
Confirm process working correctly.
<hr><hr>

In [3]:
import os
import pickle
import numpy as np
# Switch to the data directory: the pickle files loaded below are addressed
# relative to the current working directory.
# NOTE(review): hard-coded, machine-specific absolute path -- adjust before
# running on another machine.
os.chdir("/Users/laurensuarez/Documents/blake/chess/data")
class MacOSFile(object):
    """File wrapper that splits very large reads and writes into chunks.

    Every attribute other than ``read`` and ``write`` is forwarded to the
    wrapped file object.

    NOTE(review): judging by the name, this presumably works around the
    macOS failure on single reads/writes of 2 GiB or more -- confirm.

    Args:
        f: The underlying binary file object.
        max_chunk: Largest number of bytes passed to a single underlying
            ``read``/``write`` call. The default, ``1 << 30``, is the value
            the previous expression ``1 << 31 - 1`` actually evaluated to
            (``-`` binds tighter than ``<<``).

    Raises:
        ValueError: If ``max_chunk`` is not positive.
    """

    def __init__(self, f, max_chunk=1 << 30):
        if max_chunk <= 0:
            raise ValueError("max_chunk must be positive")
        self.f = f
        self.max_chunk = max_chunk

    def __getattr__(self, item):
        # Only called for attributes not found on the wrapper itself.
        return getattr(self.f, item)

    def read(self, n):
        """Read up to ``n`` bytes, in chunks when ``n`` exceeds ``max_chunk``.

        Returns whatever the underlying ``read`` returns for small requests,
        and a ``bytearray`` for chunked ones. Fewer than ``n`` bytes are
        returned if the file ends early.
        """
        if n <= self.max_chunk:
            return self.f.read(n)

        buffer = bytearray(n)
        idx = 0
        while idx < n:
            chunk = self.f.read(min(n - idx, self.max_chunk))
            if not chunk:
                # EOF: stop instead of advancing past data we never got.
                break
            buffer[idx:idx + len(chunk)] = chunk
            idx += len(chunk)
        # Drop the unfilled tail so callers never see padding bytes.
        del buffer[idx:]
        return buffer

    def write(self, buffer):
        """Write ``buffer`` in chunks of at most ``max_chunk`` bytes, logging progress."""
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self.max_chunk)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size

def pickle_dump(obj, file_path):
    """Serialise ``obj`` to ``file_path`` with the highest pickle protocol.

    The file is opened in binary write mode and wrapped in ``MacOSFile``
    before being handed to ``pickle.dump``. Returns whatever
    ``pickle.dump`` returns.
    """
    with open(file_path, "wb") as raw_file:
        wrapped = MacOSFile(raw_file)
        outcome = pickle.dump(obj, wrapped, protocol=pickle.HIGHEST_PROTOCOL)
    return outcome

def pickle_load(file_path):
    """Load and return the object pickled at ``file_path``.

    The file is opened in binary read mode and wrapped in ``MacOSFile``
    before being handed to ``pickle.load``.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(file_path, "rb") as raw_file:
        wrapped = MacOSFile(raw_file)
        loaded = pickle.load(wrapped)
    return loaded


In [4]:
# Reload one saved checkpoint (Q table, policy, state values) from the
# current working directory.
Q, policy, V = (
    pickle_load("Q_2018-11-2421_43_58_744997.pkl"),
    pickle_load("policy_2018-11-2421_43_58_744997.pkl"),
    pickle_load("V_2018-11-2421_43_58_744997.pkl"),
)

In [18]:
# Spot check: action values stored for the ninth state in Q.
Q[list(Q)[8]]

{'h7h5': 1.9622177898881668e-14,
 'f5f4': 2.776664336428095e-08,
 'a6a5': 7.943796589169338e-19}