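Select a TPU runtime before running this notebook. If no TPU is detected, the code falls back to TensorFlow's default distribution strategy.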

In [None]:
!pip install numpy scipy gym pandas keras stable-baselines3 atari_py gym[accept-rom-license] wandb atari-py dopamine-rl shimmy torch gymnasium gymnasium[ale] gymnasium[atari]


: 

In [None]:
import tensorflow as tf

# TPU detection: try to resolve a TPU cluster; a ValueError from the resolver
# is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # succeeds when a TPU is found
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError:
    tpu = None

# Pick the TensorFlow distribution strategy: connect to and initialise the TPU
# when one was resolved, otherwise keep the default strategy.
# NOTE(review): `strategy` is not referenced by any other code visible in this
# file; presumably the model would have to be built inside `strategy.scope()`
# for the TPU to actually be used - confirm.
if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)  # current (non-experimental) TPU strategy API
else:
    strategy = tf.distribute.get_strategy() # no TPU: use the default strategy


In [None]:
# Hyperparameters read as module-level globals by the training cell below.
lr = 0.001  # learning rate passed to the DQN agent (used for its Adam optimizer)
epsilon = 1.0  # initial probability of picking a random action (epsilon-greedy)
epsilon_decay = 0.995  # epsilon is multiplied by this after every training episode
gamma = 0.99  # discount factor applied to the bootstrapped next-state value
training_episodes = 1  # number of episodes passed to DQN.train
some_threshold = 10000  # early-stop threshold on each env's mean reward over its last 100 episodes
num_envs= 2  # number of environment copies wrapped in the DummyVecEnv

: 

In [None]:
import numpy as np
import random
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import relu, linear
from tensorflow.keras.losses import mean_squared_error
from tensorflow.keras.optimizers import Adam
import wandb

class DQN:
    """Deep Q-Network agent with a dense Q-network and a replay buffer.

    The agent drives a vectorised environment (one exposing ``num_envs``,
    ``action_space.n``, ``observation_space.shape``, ``reset()`` and a
    4-tuple ``step()``), flattens every observation into a single vector and
    learns Q-values with one-step temporal-difference targets.
    """

    def __init__(self, env, lr, gamma, epsilon, epsilon_decay):
        """Store the hyperparameters and build the Q-network.

        :param env: Vectorised environment to train on.
        :param lr: Learning rate for the Adam optimizer.
        :param gamma: Discount factor for future rewards.
        :param epsilon: Initial exploration probability.
        :param epsilon_decay: Factor applied to epsilon after each episode.
        """
        self.env = env
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.replay_memory_buffer = deque(maxlen=15000)
        self.batch_size = 64
        self.epsilon_min = 0.01
        self.num_envs = env.num_envs
        self.num_action_space = env.action_space.n
        # Plain int so it can be used both as a layer size and a reshape dim.
        self.num_observation_space = int(np.prod(env.observation_space.shape))
        self.model = self.initialize_model()

    def initialize_model(self):
        """Build and compile the dense Q-network (512 -> 256 -> num actions)."""
        model = Sequential()
        model.add(Dense(512, input_dim=self.num_observation_space, activation=relu))
        model.add(Dense(256, activation=relu))
        model.add(Dense(self.num_action_space, activation=linear))
        model.compile(loss=mean_squared_error, optimizer=Adam(learning_rate=self.lr))
        return model

    def get_action(self, states):
        """Choose one epsilon-greedy action per state.

        :param states: Iterable of states, each shaped ``(1, num_observation_space)``.
        :return: List with one integer action per state.
        """
        actions = []
        for state in states:
            if np.random.rand() < self.epsilon:
                actions.append(random.randrange(self.num_action_space))
            else:
                # predict_on_batch avoids the per-call progress output and
                # dataset setup of predict() for a single-row input.
                q_values = self.model.predict_on_batch(state)
                actions.append(int(np.argmax(q_values[0])))
        return actions

    def add_to_replay_memory(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay buffer."""
        self.replay_memory_buffer.append((state, action, reward, next_state, done))

    def learn_and_update_weights_by_reply(self):
        """Fit the Q-network on one random replay batch and log the loss.

        Does nothing until the buffer holds at least ``batch_size`` transitions.
        """
        if len(self.replay_memory_buffer) < self.batch_size:
            return

        random_sample = self.get_random_sample_from_replay_mem()
        states, actions, rewards, next_states, done_list = self.get_attribues_from_sample(random_sample)

        # Terminal transitions must not bootstrap from the next state.
        not_done = 1.0 - done_list.astype(np.float32)
        next_q_values = self.model.predict_on_batch(next_states)
        targets = rewards + self.gamma * np.amax(next_q_values, axis=1) * not_done

        # Copy so the array is guaranteed writable, then overwrite only the
        # Q-value of the action that was actually taken in each row.
        target_vec = np.array(self.model.predict_on_batch(states))
        target_vec[np.arange(len(actions)), actions] = targets

        history = self.model.fit(states, target_vec, epochs=1, verbose=0)
        loss = history.history['loss'][0]
        wandb.log({'Loss': loss})

    def get_attribues_from_sample(self, random_sample):
        """Split a list of transitions into per-field arrays.

        :param random_sample: Sequence of ``(state, action, reward, next_state, done)``.
        :return: ``(states, actions, rewards, next_states, done_list)`` where
            both state arrays are shaped ``(len(random_sample), num_observation_space)``.
        """
        states = np.array([transition[0] for transition in random_sample])
        actions = np.array([transition[1] for transition in random_sample])
        rewards = np.array([transition[2] for transition in random_sample])
        next_states = np.array([transition[3] for transition in random_sample])
        done_list = np.array([transition[4] for transition in random_sample])
        # Explicit reshape instead of squeeze: squeeze would also drop a batch
        # or feature axis that happens to have length 1.
        row_shape = (len(random_sample), self.num_observation_space)
        states = states.reshape(row_shape)
        next_states = next_states.reshape(row_shape)
        return states, actions, rewards, next_states, done_list

    def get_random_sample_from_replay_mem(self):
        """Return ``batch_size`` transitions drawn without replacement."""
        return random.sample(self.replay_memory_buffer, self.batch_size)

    def save_training_progress(self, rewards_list, episode, epsilon):
        """Append the average reward so far to ``training_progress.txt``.

        :param rewards_list: Either a flat sequence of rewards or a sequence of
            per-environment reward sequences (the form ``train`` passes).
        :param episode: Episode index to record.
        :param epsilon: Current exploration probability to record.
        """
        # train() passes one list per environment; summing those lists
        # directly raises TypeError, so flatten one level first.
        flat_rewards = []
        for entry in rewards_list:
            if isinstance(entry, (list, tuple, np.ndarray)):
                flat_rewards.extend(entry)
            else:
                flat_rewards.append(entry)
        average = sum(flat_rewards) / len(flat_rewards) if flat_rewards else 0.0

        with open('training_progress.txt', 'a') as file:
            file.write(f'Episode: {episode}, Average Reward: {average}, Epsilon: {epsilon}\n')

    def _flatten_observations(self, observations):
        """Flatten each observation to shape ``(1, num_observation_space)`` and stack them."""
        return np.array([
            np.reshape(np.asarray(observation).flatten(), [1, self.num_observation_space])
            for observation in observations
        ])

    def train(self, num_episodes, can_stop=True, reward_threshold=None):
        """Run the training loop and return the per-environment episode rewards.

        One training "episode" ends once every sub-environment has reported
        ``done`` at least once since the last reset.

        :param num_episodes: Maximum number of training episodes.
        :param can_stop: Whether the early-stopping check is enabled.
        :param reward_threshold: Mean-reward threshold for early stopping;
            defaults to the module-level ``some_threshold``.
        :return: List with one list of episode rewards per environment.
        """
        if can_stop and reward_threshold is None:
            reward_threshold = some_threshold

        rewards_list = [[] for _ in range(self.num_envs)]
        for episode in range(num_episodes):
            states = self._flatten_observations(self.env.reset())  # reset every environment

            total_rewards = [0 for _ in range(self.num_envs)]
            # `dones` from step() only describes the current step, and
            # stable-baselines3 vectorised envs reset a finished
            # sub-environment automatically, so completion has to be
            # remembered per environment instead of testing all(dones).
            finished = [False for _ in range(self.num_envs)]

            step = 0
            while not all(finished):
                actions = self.get_action(states)  # one action per environment
                next_states, rewards, dones, _ = self.env.step(actions)
                next_states = self._flatten_observations(next_states)

                # Record every environment's transition.
                for i in range(self.num_envs):
                    self.add_to_replay_memory(states[i], actions[i], rewards[i], next_states[i], dones[i])

                    # Only the first episode of each environment counts toward
                    # its total; later steps belong to an auto-reset episode.
                    if not finished[i]:
                        total_rewards[i] += rewards[i]
                        if dones[i]:
                            finished[i] = True

                    if step % 100 == 0:
                        wandb.log({'Episode': episode, 'Env': i, 'Step': step, 'Total Reward (Step)': total_rewards[i]})

                states = next_states
                self.learn_and_update_weights_by_reply()
                step += 1

                if step % 100 == 0:
                    for i in range(self.num_envs):
                        wandb.log({'Episode': episode, 'Env': i, 'Total Reward (Episode)': total_rewards[i]})

            for i in range(self.num_envs):
                rewards_list[i].append(total_rewards[i])
                wandb.log({'Episode': episode, 'Env': i, 'Total Reward (Episode)': total_rewards[i]})

            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)  # decay exploration

            # Early stopping: every environment's recent mean reward beats the threshold.
            if can_stop and all(np.mean(env_rewards[-100:]) > reward_threshold for env_rewards in rewards_list):
                print(f"Erken durma koşulu {episode} bölümünde karşılandı.")
                break

            if episode % 100 == 0 or episode == num_episodes - 1:
                self.save_training_progress(rewards_list, episode, self.epsilon)

        return rewards_list

: 

In [None]:
import pickle
from matplotlib import pyplot as plt
import pandas as pd
from tensorflow.keras.models import load_model

def plot_df(df, chart_name, title, x_axis_label, y_axis_label):
    """
    Draw a line chart from the given DataFrame and save it to disk.

    :param df: pandas DataFrame whose contents are plotted.
    :param chart_name: File name the chart is saved under.
    :param title: Title of the chart.
    :param x_axis_label: Label for the X axis.
    :param y_axis_label: Label for the Y axis.
    """
    figure, axes = plt.subplots(figsize=(15, 8))
    axes.plot(df)
    axes.set_title(title)
    axes.set_xlabel(x_axis_label)
    axes.set_ylabel(y_axis_label)
    figure.savefig(chart_name)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(figure)

def save_model(model, filename):
    """
    Save the given model under the given file name by calling ``model.save``.

    :param model: Model to save.
    :param filename: File name the model is saved under.
    """
    model.save(filename)

def load_trained_model(filename):
    """
    Load a trained model from the given file name via Keras ``load_model``.

    :param filename: File name of the model to load.
    :return: The loaded model.
    """
    return load_model(filename)

def save_to_pickle(data, filename):
    """
    Serialise the given data in pickle format and write it to a file.

    :param data: Object to store.
    :param filename: Name of the file to write (overwritten if it exists).
    """
    with open(filename, 'wb') as handle:
        serialised = pickle.dumps(data)
        handle.write(serialised)

def load_from_pickle(filename):
    """
    Read back data that was stored in pickle format.

    :param filename: Name of the file to read.
    :return: The object reconstructed from the file.
    """
    # Unpickling can execute arbitrary code; only use this on trusted files.
    with open(filename, 'rb') as handle:
        raw_bytes = handle.read()
    return pickle.loads(raw_bytes)


: 

In [None]:
# train.py
import wandb
import gym
import numpy as np
import pickle
import pandas as pd
from stable_baselines3.common.vec_env import DummyVecEnv

def make_env(env_id='ALE/KungFuMaster-v5', seed=42, render_mode="rgb_array"):
    """Return a zero-argument factory that builds one gym environment.

    The factory form is what ``DummyVecEnv`` is given in ``main``; the
    environment itself is only created when the factory is called.

    :param env_id: Environment id passed to ``gym.make``.
    :param seed: Seed for the environment's action space; ``None`` skips seeding.
    :param render_mode: Render mode passed to ``gym.make``.
    :return: Callable that creates and returns the environment.
    """
    def _init():
        env = gym.make(env_id, render_mode=render_mode)
        if seed is not None:
            env.action_space.seed(seed)
        return env
    return _init

def main():
    """Train the DQN agent, then save the model, the rewards and a reward plot.

    Reads the module-level hyperparameters (``num_envs``, ``lr``, ``gamma``,
    ``epsilon``, ``epsilon_decay``, ``training_episodes``), logs to Weights &
    Biases and writes all artefacts under ``saved_models/``.
    """
    import os  # local import: only needed to create the output directory

    wandb.init(project='RLkungfumaster', entity='fth123bng')

    # Create the vectorised gym environment.
    envs = [make_env() for _ in range(num_envs)]
    vec_env = DummyVecEnv(envs)

    try:
        # Create the DQN agent and train it.
        model = DQN(vec_env, lr, gamma, epsilon, epsilon_decay)

        print("Starting training for DQN model...")
        training_rewards = model.train(training_episodes)
    finally:
        # Release the emulator instances even if training fails.
        vec_env.close()

    save_dir = "saved_models/"
    # The directory is not created anywhere else; saving into a missing
    # directory would fail.
    os.makedirs(save_dir, exist_ok=True)
    model_path = save_dir + "trained_model.h5"
    save_model(model.model, model_path)
    wandb.save(model_path)

    # Store the training rewards and visualise them.
    with open(save_dir + "train_rewards_list.p", "wb") as rewards_file:
        pickle.dump(training_rewards, rewards_file)
    # training_rewards holds one list per environment; transpose so rows are
    # episodes and each environment becomes one line in the plot.
    reward_df = pd.DataFrame(training_rewards).T
    plot_df(reward_df, save_dir + "training_rewards.png", "Training Rewards per Episode", "Episode", "Reward")
    wandb.log({"Training Rewards": wandb.Image(save_dir + "training_rewards.png")})

    print("Training Completed!")

# Run training when this cell/script is executed directly.
if __name__ == "__main__":
    main()

: 

In [None]:
# test.py
from utils import load_trained_model
from utils import plot_df, load_trained_model
import gym
import numpy as np
import pickle
import pandas as pd
import wandb

def test_already_trained_model(trained_model, env, num_episodes=100):
    """Play greedy episodes with a trained model and collect the total rewards.

    Supports both gym step/reset conventions: ``reset()`` returning either the
    observation or an ``(observation, info)`` tuple, and ``step()`` returning
    either a 4-tuple or a 5-tuple.

    :param trained_model: Model whose ``predict`` returns Q-values for a
        ``(1, observation_size)`` input.
    :param env: Single (non-vectorised) gym environment.
    :param num_episodes: Number of episodes to play.
    :return: List with the total reward of each episode.
    """
    observation_size = int(np.prod(env.observation_space.shape))
    test_rewards = []
    for episode in range(num_episodes):
        initial_state = env.reset()
        state = initial_state[0] if isinstance(initial_state, tuple) else initial_state
        state = np.reshape(state.flatten(), [1, observation_size])

        total_reward = 0
        done = False
        while not done:
            # verbose=0 keeps predict() from printing a progress bar every step.
            action = int(np.argmax(trained_model.predict(state, verbose=0)[0]))
            step_result = env.step(action)
            next_state = step_result[0]
            reward = step_result[1]
            if len(step_result) == 5:
                # 5-tuple API: (obs, reward, terminated, truncated, info).
                # A truncated episode is over too; ignoring it can loop forever.
                done = bool(step_result[2]) or bool(step_result[3])
            else:
                done = bool(step_result[2])

            state = np.reshape(next_state.flatten(), [1, observation_size])

            total_reward += reward
        test_rewards.append(total_reward)
        print(f"Episode: {episode}, Total Reward: {total_reward}")
        wandb.log({'Test Episode': episode, 'Total Reward': total_reward})
    return test_rewards

def test_model():
    """Load the saved model, evaluate it, and store and plot the test rewards.

    Reads the model from ``saved_models/trained_model.h5`` and writes the
    reward list and the reward plot into the same directory.
    """
    # Weights & Biases setup. The project name matches the one used for
    # training ('RLkungfumaster'); it was previously spelled 'RKkungfumaster',
    # which sent test runs to a different project.
    wandb.init(project='RLkungfumaster', entity='fth123bng', job_type="testing")

    # Create the gym environment.
    env = gym.make('ALE/KungFuMaster-v5', render_mode="rgb_array")

    # Load the trained model.
    save_dir = "saved_models/"
    model_path = save_dir + "trained_model.h5"
    trained_model = load_trained_model(model_path)

    # Evaluate the model; always release the environment afterwards.
    try:
        test_rewards = test_already_trained_model(trained_model, env)
    finally:
        env.close()

    with open(save_dir + "test_rewards.p", "wb") as rewards_file:
        pickle.dump(test_rewards, rewards_file)
    test_rewards_df = pd.DataFrame(test_rewards)
    plot_df(test_rewards_df, save_dir + "testing_rewards.png", "Testing Rewards per Episode", "Episode", "Reward")
    wandb.log({"Testing Rewards": wandb.Image(save_dir + "testing_rewards.png")})

    print("Testing Completed!")

# Run the evaluation when this cell/script is executed directly.
if __name__ == "__main__":
    test_model()