In [1]:
import numpy as np
import math
import random
import pygame as pg

In [2]:
#===========CHANGABLE PARAMETERS=================
REWARD     = 1
PENALTY    = -1
DISCOUNT   = 1
LEARN_RATE = 1


#===========DEFINE CONSTANTS AND DICS=============
WALL_LEN = 1
PADDLE_H = 0.2

init_state = (0.5, 0.5, 0.03, 0.01, 0.5 - PADDLE_H / 2)

ACTION_DIC={0:-0.04, #'UP'
            1:0,     #'STAY'
            2:0.04}  #'DOWN'

BOARD_SIZE = 12
X_VBALL_DIS = [-1,1]
Y_VBALL_DIS = [-1,0,1]

PADDLE_SPACE = 12
PADDLE_X     = 1

STATE_SPACE = (BOARD_SIZE,BOARD_SIZE,len(X_VBALL_DIS),len(Y_VBALL_DIS),PADDLE_SPACE)

X_V_TSH = 0.03
Y_V_TSH = 0.015

In [110]:
#==============DEFINE STATE CLASS===============
class state:
    
    def __init__(self,ball_x,ball_y,velocity_x,velocity_y,paddle_y,reward,end_state = 0):
        self.ball_x = ball_x              #real numbers on the interval [0,1]
        self.ball_y = ball_y
        self.velocity_x = velocity_x
        self.velocity_y = velocity_y
        self.paddle_y = paddle_y
        self.reward = reward
        self.state_tuple = (ball_x,ball_y,velocity_x,velocity_y,paddle_y)
        self.end_state = end_state
        self._extract()
        
    def _extract(self):
        self.x_grid = math.floor(12*self.ball_x)
        self.y_grid = math.floor(12*self.ball_y)
        if(self.velocity_x>0): 
            self.x_v_sign = 1
        else: 
            self.x_v_sign = -1
            
        if(self.velocity_y>0):
            self.y_v_sign = 1
        else: 
            self.y_v_sign = -1
        
        self.paddle_grid = math.floor(12 * self.paddle_y / (1 - PADDLE_H))
        self.space_tuple = (self.x_grid,self.y_grid,self.x_v_sign,self.y_v_sign,self.paddle_grid)

In [128]:
#=======DEFINE MORE HELPER FUNCTIONS===========
def bounce(state,action):
    n_ball_v_x = state.velocity_x
    n_ball_v_y = state.velocity_y
    
    n_ball_x = state.ball_x + n_ball_v_x
    n_ball_y = state.ball_y + n_ball_v_y

    #======paddle should in range(0,0.8)
    n_paddle_y = min(WALL_LEN-PADDLE_H,max(0,state.paddle_y + action))
    
    c_reward = 0
    
    #==the ball is off the top of the screen==
    if n_ball_y < 0:
        n_ball_y = -n_ball_y
        n_ball_v_y = -n_ball_v_y
        
    #==the ball is off the bottom of the screen==
    if n_ball_y > 1:
        n_ball_y = 2*WALL_LEN-n_ball_y
        n_ball_v_y = -n_ball_v_y
        
    #==the ball is off the left edge of the screen==
    if n_ball_x < 0:
        n_ball_x = -n_ball_x
        n_ball_v_x = -n_ball_v_x
        
    #== the ball bouncing off the paddle==
    if n_ball_x > 1 and (n_paddle_y+PADDLE_H <= n_ball_y <= n_paddle_y):
        n_ball_x = 2*PADDLE_X - n_ball_x
        n_ball_v_x, n_ball_v_y = update_speed_rand(n_ball_v_x,n_ball_v_y)
        c_reward = REWARD
    if n_ball_x > 1 and not(n_paddle_y+PADDLE_H <= n_ball_y <= n_paddle_y):
        c_reward = PENALTY
    return n_ball_x,n_ball_y,n_ball_v_x,n_ball_v_y,n_paddle_y,c_reward

def update_speed_rand(velocity_x,velocity_y):
    vx_delta = random.choice(range(-15,16,1))/1000
    vy_delta = random.choice(range(-3,4,1))  /100
    sign_x = velocity_x/abs(velocity_x)
    sign_y = velocity_y/abs(velocity_y)
    n_ball_v_x = sign_x*max(X_V_TSH,abs(vx_delta+velocity_x))
    n_ball_v_y = sign_y*max(Y_V_TSH,abs(vy_delta+velocity_y))
    return n_ball_v_x, n_ball_v_y

def check_termination(state):
    pad_rang = range(state.paddle_y-PADDLE_H,state.paddle_y)
    if state.ball_x>=WALL_LEN and state.ball_y not in pad_rang:
        return True
    else:
        return False
    
def get_act(state):
    pad_center = state.paddle_y + PADDLE_H/2
        
    if pad_center > (state.ball_y + state.velocity_y): return 0 #0 for UP
    elif pad_center == (state.ball_y + state.velocity_y): return 1 #1 for STAY
    else: return 2 #2 for DOWN
    
        
def proceed_one_step(state,action):

    n_ball_x,n_ball_y,n_ball_v_x,n_ball_v_y,n_paddle_y,c_reward = bounce(state,action)
    end = 0
    if c_reward == -1:
        end = 1
    print(n_ball_x,n_ball_y,n_ball_v_x,n_ball_v_y,n_paddle_y,c_reward,end)
    n_state = state(n_ball_x,n_ball_y,n_ball_v_x,n_ball_v_y,n_paddle_y,c_reward,end_state=end)
    return n_state

In [129]:
#=============DEFINE Q-AGENT CLASS==============
class q_agent:
    
    def __init__(self):
        self.q_table = np.zeros(STATE_SPACE+(len(ACTION_DIC),)) 
        self.end_state = 0
        
    def set_table(self,loc,val):
        self.q_table[loc] = val
    
    def get_table(self,loc):
        return q_table[loc]
    
    

In [130]:
#=============DEFINE TRAIN FUCNTION=============
#init_state = (0.5, 0.5, 0.03, 0.01, 0.5 - PADDLE_H / 2)
#self.space_tuple = (self.x_grid,self.y_grid,self.x_v_sign,self.y_v_sign,self.paddle_grid)



def train(epoch_num):
    cur_state = state(0.5, 0.5, 0.03, 0.01, 0.5 - PADDLE_H / 2,0)
    q_ag = q_agent()
    tot_bounce = 0
    for i in range(1,epoch_num+1):
        temp_bounce = 0
        while True:
            
            action = get_act(cur_state)
            n_state = proceed_one_step(cur_state,ACTION_DIC[action])
            old_val = q_ag.get_table(n_state.space_tuple+(action,))


            prd_max = np.max(q_ag.get_table(n_state.space_tuple))
            prd_tuple = (n_state.x_grid,n_state.y_grid,n_state.x_v_sign,n_state.y_v_sign,n_state.paddle_grid,prd_max)

            new_val = (1-LEARN_RATE)*old_val + LEARN_RATE*(n_state.reward + DISCOUNT*prd_max)
            q_ag.set_table(n_state.space_tuple+(action,),new_val)

            if n_state.end_state == 1:
                break
            if n_state.reward == REWARD:
                temp_bounce+=1
            cur_state = n_state
        tot_bounce+=temp_bounce
        if i%1000 == 0:
            print('now average bounce is', tot_bounce/i)
    

In [131]:
train(10000)

0.53 0.51 0.03 0.01 0.44 0 0


TypeError: 'state' object is not callable

In [None]:
class A:
    def __init__(self,val):
        self.val = val
        
    def set_val(self,val):
        self.val = val
        
    def get_Val(self):
        return self.val

In [None]:
def test(clas):
    clas.set_val(clas.get_Val()+100)
    
def test1(clas):
    clas.set_val(11)
    test(clas)

In [None]:
th = A(999)

In [None]:
th.get_Val()

In [None]:
n = 1
if(not(n+1)):
    print('kk')

In [None]:
np.shape(np.zeros(STATE_SPACE+(3,)))

In [None]:
l = np.zeros(STATE_SPACE+(3,))
l[1][1][1][1][1][1] = 1
t = (1,1,1,1,1)
act = 10
l[t+(act,)] = 1000
print(l[1][1][1][1][1][1])

In [None]:
test = np.array([[1,2,3],[4,5,6]])

In [12]:
state(0.5, 0.5, 0.03, 0.01, 0.5 - PADDLE_H / 2,0)

<__main__.state at 0x1121f1d30>

In [60]:
n_ball_x = 0.53
n_ball_y=0.51
n_ball_v_x=0.03
n_ball_v_y=0.01
n_paddle_y=0.44
c_reward=0
end=0
a = state(n_ball_x,n_ball_y,n_ball_v_x,n_ball_v_y,n_paddle_y,c_reward,end_state=end)

In [113]:
a.space_tuple

(6, 6, 1, 1, 6)