# Masterarbeit

## Inhaltsverzeichnis

1. Import statements
2. Datenvorbereitung
3. Simulationsmodell
4. Q-Learning Agent
5. Hyperparameter
6. Trainingsloop

## Import statements 

In [22]:
import os
import random
import multiprocessing
import datetime
from collections import deque

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from keras.utils import to_categorical
from keras.models import load_model

from tensorflow import summary, Variable, Session
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop


## Datenvorbereitung

### Wetterdaten

In [23]:
def load_weather(path, start, ende):
    """Read the weather CSV and return the rows of the simulated period.

    The file is indexed by its ``date`` column.  The columns ``Unnamed: 0``,
    ``HauptGruppe`` and ``NebenGruppe`` are discarded.  Only rows whose index
    value is one of the integers ``start .. ende + 2`` (inclusive) are kept;
    the two extra days are needed because a state also contains the weather
    of the following days.

    Args:
        path: Location of the weather CSV file.
        start: First day to keep (same unit as the ``date`` column).
        ende: Last simulated day; two further days are kept after it.

    Returns:
        The remaining columns of the selected rows as a NumPy array.
    """
    unused_columns = ["Unnamed: 0", "HauptGruppe", "NebenGruppe"]
    # Upper bound is exclusive, hence +3 to include ende + 2.
    wanted_days = range(start, ende + 3)

    weather = pd.read_csv(path, index_col="date", memory_map=True)
    weather = weather.drop(columns=unused_columns)
    in_period = weather.index.isin(wanted_days)
    return weather[in_period].to_numpy()


### Preisdaten

In [24]:
def load_prices(path):
    """Read the price CSV into a DataFrame indexed by article number.

    The header row of the file is replaced by the fixed column names
    ``Zeile``, ``Preis``, ``Artikelnummer`` and ``Datum``.  The result is
    sorted by ``Artikelnummer`` (the index) and the ``Zeile`` column is
    removed, leaving ``Preis`` and ``Datum``.

    Args:
        path: Location of the price CSV file.

    Returns:
        DataFrame with index ``Artikelnummer`` and columns ``Preis``, ``Datum``.
    """
    column_names = ["Zeile", "Preis", "Artikelnummer", "Datum"]
    prices = pd.read_csv(
        path,
        names=column_names,
        header=0,
        index_col="Artikelnummer",
        memory_map=True,
    )
    return prices.sort_index().drop(columns=["Zeile"])


### Absatzdaten

In [25]:
def load_sales(path, warengruppen=(8,)):
    """Load the sales table and split it into a 2019 and a 2018 part.

    Rows containing any missing value are dropped, only the requested product
    groups are kept, and each group id is replaced by its position in
    ``warengruppen`` (so the ids become ``0 .. len(warengruppen) - 1``).

    Known product groups: 8 = fruit (general), 13 = fresh milk,
    14 = yoghurt, 69 = tobacco.

    Args:
        path: Location of the sales CSV file.  Its header row is replaced by
            the names ``Zeile, Datum, Artikel, Absatz, Warengruppe, Abteilung``.
        warengruppen: Product group ids to keep, in the order that defines
            their new index.  Defaults to ``(8,)``.

    Returns:
        ``(test_data, train_data)``: the rows of 2019 and of 2018, in that
        order.  Both frames are indexed by ``(Datum, Artikel)`` and have the
        columns ``Absatz``, ``Warengruppe``, ``Datum``, ``Artikel``,
        ``UNIXTag`` (days since 1970-01-01 as float) and ``Jahr``.
    """
    # TODO: move the static article info (Warengruppe, Abteilung) out of the
    # sales table.
    df = pd.read_csv(
        path,
        names=["Zeile", "Datum", "Artikel", "Absatz", "Warengruppe", "Abteilung"],
        header=0,
        parse_dates=[1],
        index_col=[1, 2],
        memory_map=True,
    )
    df = df.dropna(how="any")
    df["Warengruppe"] = df["Warengruppe"].astype(np.uint8)
    df = df.drop(columns=["Abteilung", "Zeile"])

    warengruppen = list(warengruppen)
    # .copy() so the column assignments below act on an independent frame.
    df = df[df["Warengruppe"].isin(warengruppen)].copy()
    # Remap all ids in one step.  Remapping them one after another in place
    # would re-map already converted rows whenever a group id equals an
    # earlier position (e.g. warengruppen=[1, 0]).
    position_of = {wg: i for i, wg in enumerate(warengruppen)}
    df["Warengruppe"] = df["Warengruppe"].map(position_of).astype(np.uint8)

    df["Datum"] = df.index.get_level_values("Datum")
    df["Artikel"] = df.index.get_level_values("Artikel").astype(np.int32)
    # Days since the Unix epoch, independent of the datetime resolution
    # pandas chose while parsing.
    df["UNIXTag"] = (df["Datum"] - pd.Timestamp(0)) / pd.Timedelta(days=1)
    df["Jahr"] = df["Datum"].dt.year
    df = df.sort_index()

    test_data = df[df["Jahr"] == 2019]
    train_data = df[df["Jahr"] == 2018]
    return test_data, train_data


In [26]:
def copy_data_to_numpy(big_df, artikel, start, end):
    """Return the daily sales of one article as a dense NumPy array.

    The rows of ``artikel`` are re-indexed by ``UNIXTag`` onto every day from
    ``start`` to ``end`` (inclusive); days without a sales row are filled
    with zeros.

    Args:
        big_df: Sales frame with at least the columns ``Datum``, ``Artikel``,
            ``Warengruppe``, ``Jahr`` and ``UNIXTag``.
        artikel: Article number to extract.
        start: First day (``UNIXTag``) of the output.
        end: Last day (``UNIXTag``) of the output.

    Returns:
        ``(values, wg)``: the remaining columns as an array with
        ``end - start + 1`` rows, and the product group of the article's
        first row.

    Raises:
        IndexError: If ``big_df`` contains no row for ``artikel``.
    """
    s = big_df[big_df.Artikel == artikel].copy()
    if s.empty:
        raise IndexError("No sales rows found for article {}".format(artikel))

    # Select the column by label first, then take the first row by position.
    # Indexing a label-indexed row with [0] relies on a positional fallback
    # that pandas has deprecated.
    wg = s["Warengruppe"].iloc[0]

    s = s.set_index(s["UNIXTag"])
    s = s.drop(columns=["Datum", "Artikel", "Warengruppe", "Jahr", "UNIXTag"])
    s = s.reindex(range(int(start), int(end + 1)), fill_value=0)

    return s.to_numpy(), wg


## Simulationsmodell

In [27]:
class StockSimulation:
    """Inventory simulation that replays the historical sales of one article.

    One episode corresponds to one pass over the training period with a
    single, randomly drawn article.  ``reset`` starts an episode and
    ``make_action`` advances it by one day.
    """

    # Days are counted since 1970-01-01, which was a Thursday, so a day
    # number with remainder 3 modulo 7 is a Sunday.
    SONNTAG = 3

    def __init__(self, data_dir, time_series_lenght):
        """Load all data from ``data_dir`` and pre-compute per-article arrays.

        Args:
            data_dir: Directory containing ``3 absatz_altforweiler.csv``,
                ``3 preise_altforweiler.csv`` and ``2 wetter_saarlouis.csv``.
            time_series_lenght: Number of consecutive daily states that form
                one observation.

        Raises:
            AssertionError: If a price lookup yields an unexpected type.
        """
        # Only the training split is simulated here.
        _, train_data = load_sales(os.path.join(data_dir, '3 absatz_altforweiler.csv'))

        self.df = train_data

        self.start_tag = int(train_data["UNIXTag"].min())
        self.end_tag = int(train_data["UNIXTag"].max())
        self.kalender_tage = self.end_tag - self.start_tag + 1

        preise = load_prices(os.path.join(data_dir, '3 preise_altforweiler.csv'))

        self.wetter = load_weather(os.path.join(data_dir, '2 wetter_saarlouis.csv'), self.start_tag, self.end_tag)

        self.warengruppen = self.df["Warengruppe"].unique()
        self.anz_wg = len(self.warengruppen)

        self.anfangsbestand = np.random.randint(0, 10)

        self.time_series_lenght = time_series_lenght

        olt = 1  # fixed order lead time for now

        # Episode state; populated by reset().
        self.fertig = None
        self.vergangene_tage = None
        self.akt_prod_bestand = None
        self.akt_prod_absatz = None
        self.akt_prod_wg = None
        self.akt_prod_preis = None
        self.akt_prod_olt = None
        self.time_series_state = None

        # Per-article lookup tables so that reset() does no DataFrame work.
        self.absatz_data = {}
        self.static_state_data = {}
        for artikel in tqdm(self.df["Artikel"].unique()):
            art_df, wg = copy_data_to_numpy(self.df, artikel, self.start_tag, self.end_tag)
            self.absatz_data[artikel] = art_df
            self.static_state_data[artikel] = {
                "Warengruppe": to_categorical(wg, num_classes=self.anz_wg),
                "OrderLeadTime": olt,
                "Preis": self._latest_price(preise.loc[artikel]),
            }

        self.aktueller_tag = self.start_tag
        self.aktuelles_produkt = self.df["Artikel"].sample(1).to_numpy()[0]

    @staticmethod
    def _latest_price(artikel_preis):
        """Reduce the result of a price lookup to a one-element array.

        Args:
            artikel_preis: Result of ``preise.loc[artikel]``: a DataFrame if
                the article has several price rows, a Series if it has one.

        Raises:
            AssertionError: If ``artikel_preis`` has any other type.
        """
        if isinstance(artikel_preis, pd.DataFrame):
            # Several price records: take the one with the greatest date.
            newest = artikel_preis["Datum"].max()
            preis = artikel_preis.loc[artikel_preis["Datum"] == newest, "Preis"].iat[0]
        elif isinstance(artikel_preis, pd.Series):
            preis = artikel_preis["Preis"]
        elif isinstance(artikel_preis, (int, np.integer)):
            preis = artikel_preis
        else:
            raise AssertionError("Unknown Type for Price: {}".format(type(artikel_preis)))
        return np.array([preis])

    def create_new_state(self, wochentag):
        """Build the flat state vector of the current day.

        Layout: current stock, ``wochentag`` (one-hot weekday), product
        group, price, and the weather rows at ``vergangene_tage`` and
        ``vergangene_tage + 1``.
        """
        return np.concatenate(
            [
                np.array([self.akt_prod_bestand]),
                wochentag,
                self.akt_prod_wg,
                self.akt_prod_preis,
                self.wetter[self.vergangene_tage],
                self.wetter[self.vergangene_tage + 1],
            ]
        )

    def reset(self):
        """Start a new episode with a randomly drawn article and stock.

        Returns:
            ``(state, info)``: the observation, an array of
            ``time_series_lenght`` copies of the initial state, and a dict
            with the drawn article number under ``"Artikel"``.
        """
        self.fertig = False
        self.anfangsbestand = np.random.randint(0, 10)
        self.aktueller_tag = self.start_tag
        self.vergangene_tage = 0
        self.aktuelles_produkt = self.df["Artikel"].sample(1).to_numpy()[0]
        self.akt_prod_bestand = self.anfangsbestand
        self.akt_prod_absatz = self.absatz_data[self.aktuelles_produkt]

        static = self.static_state_data[self.aktuelles_produkt]
        self.akt_prod_wg = static["Warengruppe"]
        self.akt_prod_preis = static["Preis"]
        self.akt_prod_olt = static["OrderLeadTime"]

        wochentag = to_categorical(self.aktueller_tag % 7, num_classes=7)
        new_state = self.create_new_state(wochentag)

        # No history exists yet, so the window is filled with the first state.
        self.time_series_state = deque(maxlen=self.time_series_lenght)
        for _ in range(self.time_series_lenght):
            self.time_series_state.append(new_state)
        return np.array(self.time_series_state), {"Artikel": self.aktuelles_produkt}

    def make_action(self, action):
        """Advance the simulation by one day.

        Args:
            action: Ordered quantity; it is added to the stock after the
                day's sales have been subtracted.

        Returns:
            ``(reward, fertig, state)``: the reward for the closing stock,
            whether the episode is over, and the updated observation window.

        Raises:
            AssertionError: If the episode is already finished.
        """
        if self.fertig:
            raise AssertionError("Simulation für diesen Artikel fertig. Simulation zurücksetzen")

        absatz = self.akt_prod_absatz[self.vergangene_tage][0]

        self.aktueller_tag += 1
        self.vergangene_tage += 1

        # Sundays are skipped entirely.
        if self.aktueller_tag % 7 == self.SONNTAG:
            self.aktueller_tag += 1
            self.vergangene_tage += 1

        wochentag = to_categorical(self.aktueller_tag % 7, num_classes=7)

        # During the day: subtract the sales.
        self.akt_prod_bestand -= absatz

        # Afternoon: the order arrives.
        self.akt_prod_bestand += action

        # Evening: the stock is evaluated.
        if self.akt_prod_bestand >= 1:
            # Highest reward (1.0) at a stock of exactly 1, decaying with surplus.
            reward = np.exp((-self.akt_prod_bestand + 1) / 5)
        else:
            reward = np.exp((self.akt_prod_bestand - 1) * 1.5 - 1)
            # Stock cannot be negative.
            self.akt_prod_bestand = 0

        new_state = self.create_new_state(wochentag)
        self.time_series_state.append(new_state)

        # ">=" rather than "==": the Sunday skip advances by two days and
        # could otherwise step over the final index without ending the episode.
        if self.vergangene_tage >= self.kalender_tage - 1:
            self.fertig = True

        return reward, self.fertig, np.array(self.time_series_state)


## Q-Learning Agent

In [28]:
class DQN:
    """Deep Q-learning agent with replay memory and a target network.

    The networks are built with ``tensorflow.keras``; logging uses the
    TensorFlow 1.x graph-mode API (``Session``, ``summary.FileWriter``).
    """

    def __init__(self,
                 memory_size,
                 state_shape,
                 action_space,
                 gamma,
                 learning_rate,
                 batch_size,
                 epsilon,
                 epsilon_decay,
                 epsilon_min,
                 possible_actions,
                 time_series_length
                 ):
        """Create the online/target networks and the TensorBoard summaries.

        Args:
            memory_size: Maximum number of transitions kept for replay.
            state_shape: Number of features per time step of an observation.
            action_space: Number of output units (one Q-value per action).
            gamma: Discount factor applied to the next state's best Q-value.
            learning_rate: Learning rate of the RMSprop optimizer.
            batch_size: Number of transitions sampled per replay step.
            epsilon: Initial probability of choosing a random action.
            epsilon_decay: Factor applied to ``epsilon`` on every ``act`` call.
            epsilon_min: Lower bound for ``epsilon``.
            possible_actions: Sequence a random action is drawn from.
            time_series_length: Number of time steps per observation.
        """
        self.memory_size = memory_size
        self.state_shape = state_shape
        self.action_space = action_space
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.possible_actions = possible_actions
        self.time_series_length = time_series_length
        self.memory = deque(maxlen=memory_size)
        self.model = self.create_model()
        # Read the clock once so date and time cannot come from different
        # instants (e.g. around midnight).
        self.logdir = "./logs/" + datetime.datetime.today().strftime("%Y-%m-%d-%H.%M.%S")
        self.target_model = self.create_model()
        self.sess = Session()
        self.writer = summary.FileWriter(self.logdir, self.sess.graph)
        # Non-trainable variables that only carry values into the summaries.
        self.reward = Variable(0.0, trainable=False, name="vReward")
        self.reward_mean = Variable(0.0, trainable=False, name="vMeanReward")
        self.loss = Variable(0.0, trainable=False, name="vLoss")
        self.accuracy = Variable(0.0, trainable=False, name="vMSE")
        self.summary_reward = summary.scalar("Reward", self.reward)
        self.summary_reward_mean = summary.scalar("MeanReward", self.reward_mean)
        self.summary_loss = summary.scalar("Loss", self.loss)
        self.summary_mse = summary.scalar("Accuracy", self.accuracy)
        self.merged = summary.merge(
            [
                self.summary_reward,
                self.summary_reward_mean,
                self.summary_loss,
                self.summary_mse
            ])

    def create_model(self):
        """Build and compile the Q-network (LSTM followed by dense layers).

        Returns:
            A compiled Keras model mapping a
            ``(time_series_length, state_shape)`` input to ``action_space``
            outputs.
        """
        inputs = Input(shape=(self.time_series_length, self.state_shape))
        x = LSTM(64, activation='relu')(inputs)
        x = Dense(128, activation='relu')(x)
        x = Dense(256, activation='relu')(x)
        # NOTE(review): the ReLU output restricts Q-values to >= 0 - confirm
        # this is intended for the reward function in use.
        predictions = Dense(self.action_space, activation='relu')(x)
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=RMSprop(lr=self.learning_rate), loss='mse', metrics=["accuracy"])

        return model

    def remember(self, state, action, reward, new_state, done):
        """Append one transition to the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        """Train the online network on one random batch from the memory.

        Returns:
            The Keras ``history`` dict of the fit call, or ``None`` if the
            memory holds fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return None

        samples = random.sample(self.memory, self.batch_size)
        states, actions, rewards, new_states, dones = zip(*samples)
        states = np.array(states)
        new_states = np.array(new_states)
        actions = np.asarray(actions)
        rewards = np.asarray(rewards, dtype=np.float64)
        dones = np.asarray(dones, dtype=bool)

        # NOTE(review): the baseline targets come from the target network,
        # so non-selected actions are regressed towards the target network's
        # outputs - confirm this is intended rather than self.model.predict.
        targets = self.target_model.predict(states)
        qs_new_states = self.target_model.predict(new_states)

        # Terminal transitions get the plain reward, all others the
        # discounted best Q-value of the next state on top.
        future = np.where(dones, 0.0, self.gamma * qs_new_states.max(axis=1))
        targets[np.arange(self.batch_size), actions] = rewards + future

        history = self.model.fit(states, targets, epochs=1, verbose=0, callbacks=[])
        return history.history

    def target_train(self):
        """Copy the weights of the online network into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Choose an action epsilon-greedily and decay ``epsilon``.

        Args:
            state: Observation holding ``time_series_length * state_shape``
                values; only used when the action is not random.

        Returns:
            A random element of ``possible_actions`` with probability
            ``epsilon``, otherwise the index of the largest predicted Q-value.
        """
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        if random.random() < self.epsilon:
            return random.choice(self.possible_actions)
        q_values = self.model.predict(
            state.reshape(1, self.time_series_length, self.state_shape)
        )
        return np.argmax(q_values[0])

    def save(self):
        """Write the target network to ``model/model.h5``."""
        # Use this instance, not a module-level "agent" global.
        os.makedirs("model", exist_ok=True)
        self.target_model.save("model/model.h5")

    def load(self):
        """Restore both networks from ``model/model.h5``."""
        # The networks are built with tensorflow.keras, so load them with the
        # matching loader instead of the stand-alone keras one.
        from tensorflow.keras.models import load_model as tf_load_model

        # Load twice: online and target network must be separate objects,
        # otherwise every fit() would change the target network as well.
        self.model = tf_load_model("model/model.h5")
        self.target_model = tf_load_model("model/model.h5")



## Hyperparameter

In [33]:
# --- Run mode ---------------------------------------------------------------
do_train = True           # run the training loop
use_saved_model = False   # load previously saved weights before training

# --- Agent ------------------------------------------------------------------
memory_size = 500 * 364   # capacity of the replay memory (transitions)
gamma = 0.5               # discount factor
epsilon = 1.0             # initial exploration rate
epsilon_min = 0.01        # lower bound of the exploration rate
epsilon_decay = 0.9999    # multiplicative decay per chosen action
learning_rate = 1e-4
tau = 0.05                # NOTE(review): not referenced in the visible code
batch_size = 32
n_step = 64               # replay once every n_step environment steps
log_frequency = 100       # every 100th n_step

epochs = 10000

# Number of environment steps between two target-network updates.
update_target_network = 100 * batch_size

# --- Network dimensions -----------------------------------------------------
state_shape = 24          # features per time step
action_space = 10         # number of discrete actions
time_series_length = 10   # time steps per observation

# --- Actions: the ordered quantity, 0 to 9 units -----------------------------
(
    order_none,
    order_one,
    order_two,
    order_tree,
    order_four,
    order_five,
    order_six,
    order_seven,
    order_eight,
    order_nine,
) = range(10)

possible_actions = [
    order_none, order_one, order_two, order_tree, order_four,
    order_five, order_six, order_seven, order_eight, order_nine,
]


## Trainingsloop

### Initialisieren

In [34]:
# Directory containing the sales, price and weather CSV files.
data_dir = (
    'F:/OneDrive/Dokumente/1 Universität - Master/6. Semester/'
    'Masterarbeit/Implementation/Echtdaten'
)


In [35]:
# Build the environment; this loads all CSV files and pre-computes the
# per-article sales arrays.
simulation = StockSimulation(
    data_dir=data_dir,
    time_series_lenght=time_series_length,
)


  mask |= (ar1 == a)


  0%|                                                                                          | 0/566 [00:00<?, ?it/s]

  2%|█▌                                                                              | 11/566 [00:00<00:05, 105.82it/s]

  5%|███▋                                                                            | 26/566 [00:00<00:04, 115.81it/s]

  7%|█████▉                                                                          | 42/566 [00:00<00:04, 125.68it/s]

 11%|████████▍                                                                       | 60/566 [00:00<00:03, 136.91it/s]

 14%|██████████▉                                                                     | 77/566 [00:00<00:03, 143.95it/s]

 17%|█████████████▎                                                                  | 94/566 [00:00<00:03, 150.48it/s]

 20%|███████████████▍                                                               | 111/566 [00:00<00:02, 154.58it/s]

 23%|██████████████████                                                             | 129/566 [00:00<00:02, 160.12it/s]

 26%|████████████████████▍                                                          | 146/566 [00:00<00:02, 160.65it/s]

 29%|██████████████████████▉                                                        | 164/566 [00:01<00:02, 164.17it/s]

 32%|█████████████████████████▎                                                     | 181/566 [00:01<00:02, 161.64it/s]

 35%|███████████████████████████▋                                                   | 198/566 [00:01<00:02, 157.23it/s]

 38%|█████████████████████████████▊                                                 | 214/566 [00:01<00:02, 152.19it/s]

 41%|████████████████████████████████                                               | 230/566 [00:01<00:02, 153.12it/s]

 43%|██████████████████████████████████▎                                            | 246/566 [00:01<00:02, 151.60it/s]

 46%|████████████████████████████████████▌                                          | 262/566 [00:01<00:02, 150.54it/s]

 49%|██████████████████████████████████████▊                                        | 278/566 [00:01<00:01, 150.66it/s]

 52%|█████████████████████████████████████████                                      | 294/566 [00:01<00:01, 149.06it/s]

 55%|███████████████████████████████████████████▏                                   | 309/566 [00:02<00:01, 148.45it/s]

 58%|█████████████████████████████████████████████▌                                 | 326/566 [00:02<00:01, 152.66it/s]

 61%|████████████████████████████████████████████████                               | 344/566 [00:02<00:01, 158.26it/s]

 64%|██████████████████████████████████████████████████▌                            | 362/566 [00:02<00:01, 162.00it/s]

 67%|████████████████████████████████████████████████████▉                          | 379/566 [00:02<00:01, 163.84it/s]

 70%|███████████████████████████████████████████████████████▍                       | 397/566 [00:02<00:01, 166.02it/s]

 73%|█████████████████████████████████████████████████████████▊                     | 414/566 [00:02<00:00, 165.75it/s]

 76%|████████████████████████████████████████████████████████████▏                  | 431/566 [00:02<00:00, 166.98it/s]

 79%|██████████████████████████████████████████████████████████████▋                | 449/566 [00:02<00:00, 168.29it/s]

 83%|█████████████████████████████████████████████████████████████████▏             | 467/566 [00:02<00:00, 170.66it/s]

 86%|███████████████████████████████████████████████████████████████████▋           | 485/566 [00:03<00:00, 170.44it/s]

 89%|██████████████████████████████████████████████████████████████████████▏        | 503/566 [00:03<00:00, 171.68it/s]

 92%|████████████████████████████████████████████████████████████████████████▋      | 521/566 [00:03<00:00, 172.10it/s]

 95%|███████████████████████████████████████████████████████████████████████████▏   | 539/566 [00:03<00:00, 172.42it/s]

 98%|█████████████████████████████████████████████████████████████████████████████▋ | 557/566 [00:03<00:00, 172.12it/s]

100%|███████████████████████████████████████████████████████████████████████████████| 566/566 [00:03<00:00, 161.21it/s]




In [36]:
# Create the agent from the hyperparameters defined above.
agent = DQN(
    memory_size=memory_size,
    state_shape=state_shape,
    action_space=action_space,
    gamma=gamma,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    possible_actions=possible_actions,
    time_series_length=time_series_length,
)

# Optionally continue from previously saved weights.
if use_saved_model:
    agent.load()


In [37]:
# Training loop: one epoch is one episode, i.e. one article simulated over
# the whole training period.
if do_train:
    global_steps = 0
    stats = {"loss": [], "acc": [], "rew": []}
    try:
        for epoch in range(epochs):
            state, info = simulation.reset()
            print(info)
            current_rewards = []
            fertig = False
            while not fertig:
                action = agent.act(state)
                global_steps += 1
                reward, fertig, new_state = simulation.make_action(action)
                current_rewards.append(reward)
                agent.remember(state, action, reward, new_state, fertig)

                # Train on a replay batch every n_step environment steps.
                if global_steps % n_step == 0:
                    history = agent.replay()
                    if history:
                        stats["loss"].append(history["loss"][0])
                        stats["acc"].append(history["acc"][0])

                if global_steps % update_target_network == 0:
                    agent.target_train()

                state = new_state

            # End of episode: one more replay step, then log and save.
            history = agent.replay()
            if history:
                curr_loss = history["loss"][0]
                curr_acc = history["acc"][0]
            else:
                # replay() returns None while the memory holds fewer than
                # batch_size transitions.
                curr_loss = float("nan")
                curr_acc = float("nan")
            curr_rew = np.sum(current_rewards)
            curr_mean_rew = np.mean(current_rewards)
            stats["rew"].append(curr_rew)

            # NOTE(review): in TF1 graph mode each assign() call adds a new
            # op to the graph; consider creating the assign ops once.
            agent.sess.run(
                [
                    agent.reward.assign(curr_rew),
                    agent.reward_mean.assign(curr_mean_rew),
                    agent.loss.assign(curr_loss),
                    agent.accuracy.assign(curr_acc)
                ]
            )
            # Not named "summary": that would overwrite the imported
            # tensorflow.summary module used by DQN.__init__.
            epoch_summary = agent.sess.run(agent.merged)
            agent.writer.add_summary(epoch_summary, epoch)
            print("Epoch {}".format(epoch))
            print(
                "\tMean reward: {} --- Total Reward: {} --- EXP-EXP: {}".format(curr_mean_rew, curr_rew, agent.epsilon)
            )
            agent.save()
    finally:
        # Release the log writer and session even if training is interrupted.
        agent.writer.close()
        agent.sess.close()


{'Artikel': 185992}




Epoch 0




	Mean reward: 0.0008869737928586575 --- Total Reward: 0.2758488495790425 --- EXP-EXP: 0.9693771228828401




{'Artikel': 5864}




Epoch 1




	Mean reward: 0.0016909270113323709 --- Total Reward: 0.5258783005243673 --- EXP-EXP: 0.9396920063686125




{'Artikel': 129964}




Epoch 2




	Mean reward: 0.0004927519553902271 --- Total Reward: 0.15324585812636063 --- EXP-EXP: 0.9109159335296094




{'Artikel': 129963}




Epoch 3




	Mean reward: 0.000831870110043643 --- Total Reward: 0.258711604223573 --- EXP-EXP: 0.883021066833069




{'Artikel': 15421}




Epoch 4




	Mean reward: 0.000782594347554857 --- Total Reward: 0.24338684208956055 --- EXP-EXP: 0.8559804212115764




{'Artikel': 129964}




Epoch 5




	Mean reward: 0.0010319960565446046 --- Total Reward: 0.320950773585372 --- EXP-EXP: 0.8297678379581195




{'Artikel': 184659}




Epoch 6




	Mean reward: 0.00026758009456602445 --- Total Reward: 0.08321740941003361 --- EXP-EXP: 0.8043579594205564




{'Artikel': 320875}




Epoch 7




	Mean reward: 0.0008054213384654146 --- Total Reward: 0.25048603626274396 --- EXP-EXP: 0.7797262044710112




{'Artikel': 15314}




Epoch 8




	Mean reward: 0.021977670817865353 --- Total Reward: 6.835055624356125 --- EXP-EXP: 0.7558487447264656




{'Artikel': 13117}




Epoch 9




	Mean reward: 0.0008644492507177353 --- Total Reward: 0.2688437169732157 --- EXP-EXP: 0.7327024814975474




{'Artikel': 6972}




Epoch 10




	Mean reward: 0.001816893633808059 --- Total Reward: 0.5650539201143063 --- EXP-EXP: 0.71026502344321




{'Artikel': 168673}




Epoch 11




	Mean reward: 0.0033433477060848516 --- Total Reward: 1.0397811365923888 --- EXP-EXP: 0.6885146649096918




{'Artikel': 6972}




Epoch 12




	Mean reward: 0.0010229511978373353 --- Total Reward: 0.3181378225274113 --- EXP-EXP: 0.6674303649328001




{'Artikel': 340511}




Epoch 13




	Mean reward: 0.0024288226795560167 --- Total Reward: 0.7553638533419212 --- EXP-EXP: 0.6469917268832013




{'Artikel': 5558}




Epoch 14




	Mean reward: -0.025981307654387054 --- Total Reward: -8.080186680514373 --- EXP-EXP: 0.6271789787350371




{'Artikel': 143245}




Epoch 15




	Mean reward: 0.002156005418855799 --- Total Reward: 0.6705176852641535 --- EXP-EXP: 0.6079729539387682




{'Artikel': 225539}




Epoch 16




	Mean reward: 0.00030489234552811135 --- Total Reward: 0.09482151945924262 --- EXP-EXP: 0.5893550728797433




{'Artikel': 6079}




Epoch 17




	Mean reward: 0.0016492684779439218 --- Total Reward: 0.5129224966405597 --- EXP-EXP: 0.5713073249045721




{'Artikel': 153122}




Epoch 18




	Mean reward: 0.0008717613187380904 --- Total Reward: 0.2711177701275461 --- EXP-EXP: 0.5538122508978873




{'Artikel': 13478}




Epoch 19




	Mean reward: -0.005256576923507733 --- Total Reward: -1.634795423210905 --- EXP-EXP: 0.5368529263926637




{'Artikel': 6057}




Epoch 20




	Mean reward: 0.005731897589293268 --- Total Reward: 1.7826201502702066 --- EXP-EXP: 0.5204129451977533




{'Artikel': 11598}




Epoch 21




	Mean reward: 0.0010900741021059362 --- Total Reward: 0.3390130457549462 --- EXP-EXP: 0.504476403526783




{'Artikel': 171669}




Epoch 22




	Mean reward: 0.002522031393536097 --- Total Reward: 0.7843517633897262 --- EXP-EXP: 0.48902788461307556




{'Artikel': 5906}




Epoch 23




	Mean reward: 0.0016755429551312542 --- Total Reward: 0.5210938590458201 --- EXP-EXP: 0.4740524437957048




{'Artikel': 16193}




Epoch 24




	Mean reward: 0.002818954590506364 --- Total Reward: 0.8766948776474792 --- EXP-EXP: 0.4595355940622598




{'Artikel': 5558}




Epoch 25




	Mean reward: -0.02295363874924512 --- Total Reward: -7.138581651015232 --- EXP-EXP: 0.4454632920343298




{'Artikel': 13433}




Epoch 26




	Mean reward: 0.0010645492021376277 --- Total Reward: 0.3310748018648022 --- EXP-EXP: 0.4318219243821574




{'Artikel': 320845}




Epoch 27




	Mean reward: 0.0009000784176934484 --- Total Reward: 0.27992438790266244 --- EXP-EXP: 0.4185982946553079




{'Artikel': 308002}




Epoch 28




	Mean reward: 0.000744912245079173 --- Total Reward: 0.23166770821962282 --- EXP-EXP: 0.4057796105166253




{'Artikel': 234598}




Epoch 29




	Mean reward: -0.03497453972477235 --- Total Reward: -10.877081854404201 --- EXP-EXP: 0.39335347136712556


