### Deep Q-Learning

In [3]:
from collections import deque
import random
import gym
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


class Agent(object):
    """Deep Q-Learning agent with an epsilon-greedy policy and replay memory.

    The agent approximates Q-values with a small dense network
    (``state_size`` inputs, ``action_size`` linear outputs), stores
    transitions in a bounded FIFO buffer, and trains on random samples
    drawn from that buffer.
    """

    # Threshold used by collectData to stop logging rows for each class.
    # NOTE(review): the `<=` comparison admits 10001 rows per file --
    # confirm whether exactly 10000 was intended.
    MAX_CLASSIFIER_ROWS = 10000

    def __init__(self, environment):
        """Set the hyperparameters, the replay memory and the Q-network.

        Args:
            environment: Environment the agent acts in. Only its
                ``action_space.sample()`` is used (for random exploration).
        """
        # Keep a reference so act() does not depend on a module-level global.
        self.environment = environment

        # Parameters / hyperparameters.
        self.state_size = 4
        self.action_size = 2
        # Discount factor: focus on future reward, or settle for the
        # immediate one.
        self.gamma = 0.95
        self.learning_rate = 0.001
        # Exploration rate: exploration is maximal at the beginning.
        self.epsilon = 1
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.1

        # FIFO buffer: the oldest transition is dropped once 1000 are stored.
        self.memory = deque(maxlen=1000)

        self.model = self.build_model()

        # Row counters for the classifier data written by collectData.
        self.counter_done_true = 0
        self.counter_done_false = 0

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled ``Sequential`` model: one hidden ``Dense`` layer of 48
            tanh units followed by a linear layer with one output per action,
            trained with MSE loss and the Adam optimizer.
        """
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation="tanh"))
        model.add(Dense(self.action_size, activation="linear"))
        # NOTE(review): newer Keras releases appear to name this argument
        # `learning_rate` instead of `lr` -- confirm against the installed
        # version before changing it.
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        return model

    @staticmethod
    def _append_row(path, fields):
        """Append one comma-separated line to ``path`` and close the file."""
        with open(path, "a+") as handle:
            handle.write(",".join(fields) + "\n")

    def collectData(self, state, action, done):
        """Log one (state, action, done) sample as a CSV row.

        Rows go to ``classifier_data_done_true.txt`` when ``done`` is truthy
        and to ``classifier_data_done_false.txt`` otherwise; the last column
        is the label (1 or 0). Logging for a class stops once its counter
        exceeds ``MAX_CLASSIFIER_ROWS``.

        Args:
            state: Indexable whose first element holds the ``state_size``
                observation values (the script passes a (1, 4) array).
            action: Action taken in ``state``.
            done: Whether the step ended the episode.
        """
        observation = state[0]
        fields = [str(observation[k]) for k in range(self.state_size)]
        fields.append(str(action))

        if done:
            if self.counter_done_true <= self.MAX_CLASSIFIER_ROWS:
                self.counter_done_true += 1
                self._append_row("classifier_data_done_true.txt",
                                 fields + [str(1)])
        elif self.counter_done_false <= self.MAX_CLASSIFIER_ROWS:
            self.counter_done_false += 1
            self._append_row("classifier_data_done_false.txt",
                             fields + [str(0)])

    def remember(self, state, action, reward, next_state, done):
        """Store one transition tuple in the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action with the epsilon-greedy policy.

        Args:
            state: Input passed to ``self.model.predict`` when exploiting.

        Returns:
            A random sample from the environment's action space with
            probability ``epsilon``; otherwise the index of the largest
            predicted Q-value.
        """
        # Explore: select an action randomly.
        if random.uniform(0, 1) <= self.epsilon:
            return self.environment.action_space.sample()
        # Exploit: select the action with the highest predicted value.
        q_values = self.model.predict(state)
        return np.argmax(q_values)

    def replay(self, batch_size):
        """Fit the network on ``batch_size`` randomly sampled transitions.

        Does nothing until the memory holds at least ``batch_size``
        transitions. Each sampled transition is fitted individually.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            if done:
                # Terminal step: there is no future reward to bootstrap from.
                target = reward
            else:
                best_next = np.amax(self.model.predict(next_state)[0])
                target = reward + self.gamma * best_next

            # Only the taken action's output is moved toward the target; the
            # other outputs keep the network's own current predictions.
            train_target = self.model.predict(state)
            train_target[0][action] = target

            self.model.fit(state, train_target, verbose=0)

    def adaptiveEpsilonGreedy(self):
        """Decay epsilon by ``epsilon_decay`` without undershooting the floor.

        An epsilon that is already at or below ``epsilon_min`` is left
        unchanged.
        """
        if self.epsilon > self.epsilon_min:
            # Clamp so the final decay step cannot fall below epsilon_min.
            self.epsilon = max(self.epsilon_min,
                               self.epsilon * self.epsilon_decay)
      
    
if __name__ == "__main__":

    # Train the agent on CartPole; after each episode, print how many steps
    # it lasted ("Fitness").

    # Define the environment and the agent.
    environment = gym.make("CartPole-v0")
    agent = Agent(environment)

    # Number of episodes to run.
    episodes = 60

    # Number of stored transitions sampled for each training call.
    batch_size = 16

    for episode in range(episodes):

        # Reset the environment and shape the initial observation as a
        # single-row batch, the form it is fed to the network in.
        state = environment.reset()
        state = state.reshape((1, agent.state_size))

        # Steps survived in this episode.
        steps = 0

        while True:
            # Select an action.
            action = agent.act(state)

            # Apply the action and observe the result.
            next_state, reward, done, _ = environment.step(action)
            next_state = next_state.reshape((1, agent.state_size))

            # Optional: collect data for the classifier.
            # agent.collectData(state, action, done)

            # Store the transition in the replay memory.
            agent.remember(state, action, reward, next_state, done)

            # Optional: visualise the episode.
            # environment.render()

            # Advance to the next state.
            state = next_state

            # Train on a random sample of stored transitions.
            agent.replay(batch_size)

            # Decay the exploration rate (once per step, not per episode).
            agent.adaptiveEpsilonGreedy()

            steps += 1

            if done:
                print("\nEpisode: {} | Fitness: {}".format(episode, steps))
                break
                
        
       


Episode: 0 | Fitness: 16

Episode: 1 | Fitness: 24

Episode: 2 | Fitness: 22

Episode: 3 | Fitness: 18

Episode: 4 | Fitness: 23

Episode: 5 | Fitness: 16

Episode: 6 | Fitness: 38

Episode: 7 | Fitness: 40

Episode: 8 | Fitness: 12

Episode: 9 | Fitness: 26

Episode: 10 | Fitness: 57

Episode: 11 | Fitness: 105

Episode: 12 | Fitness: 92

Episode: 13 | Fitness: 81

Episode: 14 | Fitness: 40

Episode: 15 | Fitness: 35

Episode: 16 | Fitness: 82

Episode: 17 | Fitness: 44

Episode: 18 | Fitness: 33

Episode: 19 | Fitness: 32

Episode: 20 | Fitness: 96

Episode: 21 | Fitness: 70

Episode: 22 | Fitness: 46

Episode: 23 | Fitness: 69

Episode: 24 | Fitness: 54

Episode: 25 | Fitness: 72

Episode: 26 | Fitness: 80

Episode: 27 | Fitness: 66

Episode: 28 | Fitness: 36

Episode: 29 | Fitness: 98

Episode: 30 | Fitness: 87

Episode: 31 | Fitness: 94

Episode: 32 | Fitness: 187

Episode: 33 | Fitness: 200

Episode: 34 | Fitness: 200

Episode: 35 | Fitness: 200

Episode: 36 | Fitness: 200

Epis