<a href="https://colab.research.google.com/github/eceak/Reinforcement-Learning-RL-/blob/main/deep_q_learning_cart_pole_v0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:

"""
@author: ece
"""

import gym
import numpy as np
# Keras is imported because the agent approximates the Q-function with a
# neural network (deep Q-learning).
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import random

class DQLAgent:
    """Deep Q-learning agent with an epsilon-greedy policy and replay memory.

    The agent owns a small fully connected network that maps a state to one
    Q-value per action, a bounded buffer of past transitions, and an
    exploration rate (``epsilon``) that is decayed towards ``epsilon_min``.
    """

    def __init__(self, env):
        """Set hyperparameters and build the Q-network for *env*.

        Args:
            env: Environment exposing ``observation_space.shape`` and a
                discrete ``action_space`` with ``n`` and ``sample()``.
        """
        # Keep a reference so act() samples from this environment instead of
        # relying on a module-level global named ``env``.
        self.env = env

        # The network needs state_size input units and action_size outputs.
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.n

        self.gamma = 0.95  # discount factor applied to future reward
        self.learning_rate = 0.001  # optimizer step size

        self.epsilon = 1  # probability of taking a random (exploratory) action
        self.epsilon_decay = 0.995  # multiplicative decay applied per call
        self.epsilon_min = 0.01  # lower bound for epsilon

        # Replay memory; once 1000 transitions are stored the oldest is dropped.
        self.memory = deque(maxlen=1000)

        self.model = self.build_model()

    def build_model(self):
        """Build and compile the Q-network.

        Returns:
            A compiled Keras ``Sequential`` model with one hidden ``tanh``
            layer of 48 units and a linear output of size ``action_size``.
        """
        model = Sequential()
        model.add(Dense(48, input_dim=self.state_size, activation="tanh"))
        model.add(Dense(self.action_size, activation="linear"))
        # ``lr`` is deprecated in favour of ``learning_rate``.
        model.compile(loss="mse", optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action for *state* with an epsilon-greedy rule.

        With probability ``epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        predicted Q-value is returned.
        """
        if random.uniform(0, 1) <= self.epsilon:
            # Explore.
            return self.env.action_space.sample()
        # Exploit the current Q-estimates.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Train the network on a random minibatch of stored transitions.

        Does nothing until at least *batch_size* transitions are stored.
        Each sampled transition is fitted individually, so later targets in
        the same minibatch are computed with the already-updated network.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            if done:
                # Terminal transition: no future reward to bootstrap from.
                target = reward
            else:
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)
            # Only the Q-value of the action actually taken is moved towards
            # the target; the other outputs keep their current prediction.
            train_target = self.model.predict(state, verbose=0)
            train_target[0][action] = target
            self.model.fit(state, train_target, verbose=0)

    def adaptiveEGreedy(self):
        """Decay ``epsilon`` by ``epsilon_decay`` without going below ``epsilon_min``."""
        if self.epsilon > self.epsilon_min:
            # Clamp so the last decay step cannot undershoot the floor.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)
        

if __name__ == "__main__":
    # Training script: run a fixed number of CartPole episodes, learning from
    # replayed transitions after every environment step.
    # NOTE: the unpacking below assumes the classic Gym API, where reset()
    # returns the observation and step() returns a 4-tuple.

    # Create the environment and an agent sized to its spaces.
    env = gym.make("CartPole-v0")
    agent = DQLAgent(env)

    batch_size = 16
    episodes = 40
    for e in range(episodes):

        # Start a new episode. The observation is reshaped to a single-row
        # batch because the network is fed one state at a time.
        state = env.reset()
        state = np.reshape(state, [1, agent.state_size])

        # Step counter for this episode (named ``steps`` so it does not
        # collide with the ``time`` module).
        steps = 0
        while True:

            # Select an action (explore or exploit).
            action = agent.act(state)

            # Apply it to the environment.
            next_state, reward, done, _ = env.step(action)
            next_state = np.reshape(next_state, [1, agent.state_size])

            # Store the transition for later replay.
            agent.remember(state, action, reward, next_state, done)

            # Advance to the new state.
            state = next_state

            # Train on a random minibatch of stored transitions.
            agent.replay(batch_size)

            # Decay the exploration rate after every step.
            agent.adaptiveEGreedy()

            steps += 1

            if done:
                print("Episode: {}, time: {}".format(e, steps))
                break


  "The `lr` argument is deprecated, use `learning_rate` instead.")


Episode: 0, time: 13
Episode: 1, time: 16
Episode: 2, time: 14
Episode: 3, time: 41
Episode: 4, time: 12
Episode: 5, time: 20
Episode: 6, time: 200
Episode: 7, time: 23
Episode: 8, time: 15
Episode: 9, time: 18
Episode: 10, time: 72
Episode: 11, time: 19
Episode: 12, time: 22
Episode: 13, time: 30
Episode: 14, time: 22
Episode: 15, time: 45
Episode: 16, time: 87
Episode: 17, time: 43
Episode: 18, time: 44
Episode: 19, time: 40
Episode: 20, time: 61
Episode: 21, time: 71
Episode: 22, time: 41
Episode: 23, time: 63
Episode: 24, time: 52
Episode: 25, time: 59
Episode: 26, time: 147
Episode: 27, time: 15
Episode: 28, time: 14
Episode: 29, time: 23
Episode: 30, time: 48
Episode: 31, time: 103
Episode: 32, time: 44
Episode: 33, time: 52
Episode: 34, time: 78
Episode: 35, time: 91
Episode: 36, time: 111
Episode: 37, time: 148
Episode: 38, time: 200
Episode: 39, time: 200


In [None]:
import time
# Evaluation cell: run one episode with the trained agent and print the
# number of steps survived. Relies on ``agent`` and ``env`` created by the
# training cell, and on the classic Gym API (4-tuple from step()).
trained_model = agent
state = env.reset()
state = np.reshape(state, [1, trained_model.state_size])
time_t = 0
while True:
    env.render()  # does not work in Colab
    # Act greedily on the learned Q-values; going through act() would still
    # inject random actions with probability epsilon during evaluation.
    q_values = trained_model.model.predict(state, verbose=0)
    action = np.argmax(q_values[0])
    next_state, reward, done, _ = env.step(action)
    next_state = np.reshape(next_state, [1, trained_model.state_size])
    state = next_state
    time_t += 1
    print(time_t)
    # time.sleep(0.4)  # uncomment to slow down playback
    if done:
        break
print("Done")