# Masterarbeit

## Inhaltsverzeichnis

1. Import statements
2. Datenvorbereitung
3. Simulationsmodell
4. Q-Learning Agent
5. Hyperparameter
6. Trainingsloop

## Import statements 

In [1]:
import os
import random
import multiprocessing
import datetime
from collections import deque

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

from keras.utils import to_categorical
from keras.models import load_model

from tensorflow import summary, Variable, Session
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import RMSprop


Using TensorFlow backend.


## Datenvorbereitung

### Wetterdaten

In [2]:
def load_weather(path, start, ende):
    """Load the weather table and return the rows needed for the simulation.

    The CSV at ``path`` is read with its ``date`` column as index. The columns
    ``Unnamed: 0``, ``HauptGruppe`` and ``NebenGruppe`` are discarded and only
    rows whose index value lies in ``start .. ende + 2`` (inclusive) are kept.

    Args:
        path: Location of the weather CSV file.
        start: First index value (day number) to keep.
        ende: Last simulated day; two further days are kept on top of it.

    Returns:
        numpy array holding the remaining columns, one row per retained CSV row.
    """
    unused_columns = ["Unnamed: 0", "HauptGruppe", "NebenGruppe"]
    # Two extra days, because the weather of tomorrow and of the day after
    # tomorrow is needed as well.
    wanted_days = range(start, ende + 3)

    weather = pd.read_csv(path, index_col="date", memory_map=True).drop(columns=unused_columns)
    in_period = weather.index.isin(wanted_days)
    return weather.loc[in_period].to_numpy()


### Preisdaten

In [3]:
def load_prices(path):
    """Load the price table, indexed and sorted by article number.

    The header row of the CSV is replaced by the column names ``Zeile``,
    ``Preis``, ``Artikelnummer`` and ``Datum``; ``Artikelnummer`` becomes the
    index and the ``Zeile`` column is dropped.

    Args:
        path: Location of the price CSV file.

    Returns:
        DataFrame with the columns ``Preis`` and ``Datum``, sorted by its
        ``Artikelnummer`` index.
    """
    column_names = ["Zeile", "Preis", "Artikelnummer", "Datum"]
    prices = pd.read_csv(
        path,
        names=column_names,
        header=0,
        index_col="Artikelnummer",
        memory_map=True,
    )
    return prices.sort_index().drop(columns=["Zeile"])


### Absatzdaten

In [4]:
def load_sales(path, warengruppen=(8,)):
    """Load the sales table and split it into 2019 (test) and 2018 (train) rows.

    The CSV header is replaced by the names ``Zeile``, ``Datum``, ``Artikel``,
    ``Absatz``, ``Warengruppe`` and ``Abteilung``; ``Datum`` is parsed as a date
    and ``(Datum, Artikel)`` becomes the index. Rows with any missing value are
    dropped, as are the ``Abteilung`` and ``Zeile`` columns. Only rows whose
    product group is listed in ``warengruppen`` are kept, and the group id is
    replaced by its position in that list.

    Added columns: ``Datum`` and ``Artikel`` (copies of the index levels),
    ``UNIXTag`` (days since 1970-01-01 as float) and ``Jahr``.

    Known product groups (from the original notes): 13 fresh milk, 14 yoghurt,
    69 tobacco, 8 fruit (general).

    TODO: move static article information (product group, department) out of
    the sales table.

    Args:
        path: Location of the sales CSV file.
        warengruppen: Product-group ids to keep, in the order that defines
            their new index. Defaults to ``(8,)``.

    Returns:
        Tuple ``(test_data, train_data)`` - note the order: the rows of 2019
        come first, the rows of 2018 second.
    """
    df = pd.read_csv(
        path,
        names=["Zeile", "Datum", "Artikel", "Absatz", "Warengruppe", "Abteilung"],
        header=0,
        parse_dates=[1],
        index_col=[1, 2],
        memory_map=True,
    )
    df = df.dropna(how="any")
    df["Warengruppe"] = df["Warengruppe"].astype(np.uint8)
    df = df.drop(columns=["Abteilung", "Zeile"])

    warengruppen = list(warengruppen)
    # Explicit copy: the frame is modified below and must not be a view of the
    # unfiltered table.
    df = df[df["Warengruppe"].isin(warengruppen)].copy()
    # Remap all groups in one step. Rewriting them one after another in place
    # corrupts the data as soon as a group id equals an index that was already
    # assigned (e.g. warengruppen=[1, 0]).
    wg_to_index = {wg: i for i, wg in enumerate(warengruppen)}
    df["Warengruppe"] = df["Warengruppe"].map(wg_to_index).astype(np.uint8)

    datum = df.index.get_level_values("Datum")
    df["Datum"] = datum
    df["Artikel"] = df.index.get_level_values("Artikel").astype(np.int32)
    # Days since the Unix epoch; timedelta arithmetic does not depend on the
    # internal resolution of the datetime values.
    df["UNIXTag"] = (datum - pd.Timestamp("1970-01-01")) / pd.Timedelta(days=1)
    df["Jahr"] = datum.year
    df = df.sort_index()

    test_data = df[df["Jahr"] == 2019]
    train_data = df[df["Jahr"] == 2018]
    return test_data, train_data


In [5]:
def copy_data_to_numpy(big_df, artikel, start, end):
    """Extract the daily sales of one article as a gap-free numpy array.

    The rows of ``artikel`` are selected from ``big_df``, indexed by their
    ``UNIXTag`` value and re-indexed to every day in ``start .. end``
    (inclusive). Days without a sales row are filled with zeros.

    Args:
        big_df: Sales table with at least the columns ``Artikel``, ``UNIXTag``,
            ``Warengruppe``, ``Datum`` and ``Jahr``.
        artikel: Article number to extract.
        start: First day number of the period.
        end: Last day number of the period.

    Returns:
        Tuple ``(sales, wg)``: ``sales`` has ``end - start + 1`` rows and one
        column per remaining data column; ``wg`` is the ``Warengruppe`` value
        of the article's first row.

    Raises:
        IndexError: If ``big_df`` holds no row for ``artikel``.
    """
    rows = big_df[big_df.Artikel == artikel]
    if rows.empty:
        raise IndexError("No sales rows found for article {}".format(artikel))

    # Select the column first and then the position: integer keys on a
    # label-indexed Series are deprecated as positional access.
    wg = rows["Warengruppe"].iloc[0]

    daily = (
        rows.set_index("UNIXTag")
        .drop(columns=["Datum", "Artikel", "Warengruppe", "Jahr"])
        .reindex(range(int(start), int(end + 1)), fill_value=0)
    )
    return daily.to_numpy(), wg


## Simulationsmodell

In [6]:
class StockSimulation:
    """Inventory simulation for a single article driven by historical sales.

    One episode corresponds to one pass over the whole training period with
    one randomly drawn article. Each step removes the day's sales from the
    stock, adds the ordered quantity and rates the resulting stock level.
    """

    def __init__(self, data_dir, time_series_lenght):
        """Load all data from ``data_dir`` and build the simulation model.

        Args:
            data_dir: Directory containing ``3 absatz_altforweiler.csv``,
                ``3 preise_altforweiler.csv`` and ``2 wetter_saarlouis.csv``.
            time_series_lenght: Number of consecutive daily states that make
                up one observation.

        Raises:
            AssertionError: If the price lookup for an article yields an
                unsupported type.
        """
        # Only the training split is used by the simulation.
        _, train_data = load_sales(os.path.join(data_dir, '3 absatz_altforweiler.csv'))

        self.df = train_data

        self.start_tag = int(min(train_data["UNIXTag"]))
        self.end_tag = int(max(train_data["UNIXTag"]))
        self.kalender_tage = self.end_tag - self.start_tag + 1

        preise = load_prices(os.path.join(data_dir, '3 preise_altforweiler.csv'))

        self.wetter = load_weather(os.path.join(data_dir, '2 wetter_saarlouis.csv'), self.start_tag, self.end_tag)

        self.warengruppen = self.df["Warengruppe"].unique()
        self.anz_wg = len(self.warengruppen)

        self.anfangsbestand = np.random.randint(0, 10)

        self.time_series_lenght = time_series_lenght

        olt = 1  # order lead time: fixed to one day for now

        # Episode state; populated by reset().
        self.fertig = None
        self.vergangene_tage = None
        self.akt_prod_bestand = None
        self.akt_prod_absatz = None
        self.akt_prod_wg = None
        self.akt_prod_preis = None
        self.akt_prod_olt = None
        self.time_series_state = None

        # Per-article lookup tables: daily sales and static state features.
        self.absatz_data = {}
        self.static_state_data = {}
        for artikel in tqdm(self.df["Artikel"].unique()):
            art_df, wg = copy_data_to_numpy(self.df, artikel, self.start_tag, self.end_tag)
            self.absatz_data[artikel] = art_df
            wg = to_categorical(wg, num_classes=self.anz_wg)
            artikel_preis = self._select_price(preise.loc[artikel])
            self.static_state_data[artikel] = {"Warengruppe": wg, "OrderLeadTime": olt, "Preis": artikel_preis}

        self.aktueller_tag = self.start_tag
        self.aktuelles_produkt = self.df["Artikel"].sample(1).to_numpy()[0]

    @staticmethod
    def _select_price(artikel_preis):
        """Reduce the result of a price lookup to a one-element numpy array.

        A DataFrame (several price rows) yields the price of the row with the
        largest ``Datum``; a Series (single row) yields its ``Preis``; a plain
        integer is wrapped as is.

        Raises:
            AssertionError: For any other type.
        """
        if isinstance(artikel_preis, pd.DataFrame):
            latest = artikel_preis[artikel_preis.Datum == max(artikel_preis.Datum)]
            return np.array([latest["Preis"].iat[0]])
        if isinstance(artikel_preis, pd.Series):
            return np.array([artikel_preis["Preis"]])
        if isinstance(artikel_preis, (int, np.integer)):
            return np.array([artikel_preis])
        raise AssertionError("Unknown Type for Price: {}".format(type(artikel_preis)))

    def create_new_state(self, wochentag):
        """Assemble the feature vector describing the current day.

        Concatenates, in this order: current stock, the weekday vector passed
        in, the product-group vector, the price and two consecutive weather
        rows (index ``vergangene_tage`` and ``vergangene_tage + 1``).

        Args:
            wochentag: One-hot encoded weekday.

        Returns:
            One-dimensional numpy array.
        """
        new_state = np.concatenate(
            [
                np.array([self.akt_prod_bestand]),
                wochentag,
                self.akt_prod_wg,
                self.akt_prod_preis,
                self.wetter[self.vergangene_tage],
                self.wetter[self.vergangene_tage + 1],
            ]
        )
        return new_state

    def reset(self):
        """Start a new episode with a randomly drawn article.

        The stock is set to a random value in ``0..9``, the calendar is rewound
        to the first day and the observation window is filled with copies of
        the initial state.

        Returns:
            Tuple ``(observation, info)`` where ``observation`` stacks
            ``time_series_lenght`` states and ``info`` is
            ``{"Artikel": <article number>}``.
        """
        self.fertig = False
        self.anfangsbestand = np.random.randint(0, 10)
        self.aktueller_tag = self.start_tag
        self.vergangene_tage = 0
        self.aktuelles_produkt = self.df["Artikel"].sample(1).to_numpy()[0]
        self.akt_prod_bestand = self.anfangsbestand
        self.akt_prod_absatz = self.absatz_data[self.aktuelles_produkt]

        static = self.static_state_data[self.aktuelles_produkt]
        self.akt_prod_wg = static["Warengruppe"]
        self.akt_prod_preis = static["Preis"]
        self.akt_prod_olt = static["OrderLeadTime"]

        wochentag = to_categorical(self.aktueller_tag % 7, num_classes=7)
        new_state = self.create_new_state(wochentag)

        self.time_series_state = deque(maxlen=self.time_series_lenght)
        for _ in range(self.time_series_lenght):
            self.time_series_state.append(new_state)
        return np.array(self.time_series_state), {"Artikel": self.aktuelles_produkt}

    def make_action(self, action):
        """Advance the simulation by one (shop) day.

        Args:
            action: Ordered quantity; it is added to the stock after the day's
                sales were deducted.

        Returns:
            Tuple ``(reward, fertig, observation)``.

        Raises:
            AssertionError: If the episode is already finished and ``reset``
                has not been called.
        """
        if self.fertig:
            raise AssertionError("Simulation für diesen Artikel fertig. Simulation zurücksetzen")

        absatz = self.akt_prod_absatz[self.vergangene_tage][0]

        self.aktueller_tag += 1
        self.vergangene_tage += 1

        # Day numbers count from 1970-01-01 (a Thursday), so remainder 3 is a
        # Sunday; Sundays are skipped.
        if self.aktueller_tag % 7 == 3:
            self.aktueller_tag += 1
            self.vergangene_tage += 1

        wochentag = to_categorical(self.aktueller_tag % 7, num_classes=7)

        # During the day: deduct the sales.
        self.akt_prod_bestand -= absatz

        # Afternoon: the order arrives.
        self.akt_prod_bestand += action

        # Evening: the stock level is rated.
        if self.akt_prod_bestand >= 1:
            reward = np.exp((-self.akt_prod_bestand + 1) / 5)
        else:
            reward = np.exp((self.akt_prod_bestand - 1) * 1.5 - 1)
            # The stock can never be negative.
            self.akt_prod_bestand = 0

        new_state = self.create_new_state(wochentag)
        self.time_series_state.append(new_state)

        # ">=" instead of "==": skipping a Sunday advances the counter by two
        # and could otherwise step over the last day without ending the episode.
        if self.vergangene_tage >= self.kalender_tage - 1:
            self.fertig = True

        return reward, self.fertig, np.array(self.time_series_state)


## Q-Learning Agent

In [7]:
class DQN:
    """Deep Q-learning agent with replay memory, target network and TensorBoard logging."""

    def __init__(self,
                 memory_size,
                 state_shape,
                 action_space,
                 gamma,
                 learning_rate,
                 batch_size,
                 epsilon,
                 epsilon_decay,
                 epsilon_min,
                 possible_actions,
                 time_series_length
                 ):
        """Build the online and target networks and the summary writer.

        Args:
            memory_size: Maximum number of transitions kept in the replay memory.
            state_shape: Number of features of a single daily state.
            action_space: Number of network outputs (one Q-value per action).
            gamma: Discount factor for the bootstrapped target.
            learning_rate: Learning rate passed to RMSprop.
            batch_size: Number of transitions sampled per replay step.
            epsilon: Initial exploration probability.
            epsilon_decay: Factor applied to ``epsilon`` on every ``act`` call.
            epsilon_min: Lower bound for ``epsilon``.
            possible_actions: Actions to draw from while exploring.
            time_series_length: Number of stacked states per observation.
        """
        self.memory_size = memory_size
        self.state_shape = state_shape
        self.action_space = action_space
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.possible_actions = possible_actions
        self.time_series_length = time_series_length
        self.memory = deque(maxlen=memory_size)
        self.model = self.create_model()
        # One timestamp for the whole name: "./logs/YYYY-MM-DD-HH.MM.SS".
        self.logdir = "./logs/" + datetime.datetime.today().strftime("%Y-%m-%d-%H.%M.%S")
        self.target_model = self.create_model()
        self.sess = Session()
        self.writer = summary.FileWriter(self.logdir, self.sess.graph)
        # Non-trainable variables that only carry values into the summaries.
        self.reward = Variable(0.0, trainable=False, name="vReward")
        self.reward_mean = Variable(0.0, trainable=False, name="vMeanReward")
        self.loss = Variable(0.0, trainable=False, name="vLoss")
        self.accuracy = Variable(0.0, trainable=False, name="vMSE")
        self.summary_reward = summary.scalar("Reward", self.reward)
        self.summary_reward_mean = summary.scalar("MeanReward", self.reward_mean)
        self.summary_loss = summary.scalar("Loss", self.loss)
        self.summary_mse = summary.scalar("Accuracy", self.accuracy)
        self.merged = summary.merge(
            [
                self.summary_reward,
                self.summary_reward_mean,
                self.summary_loss,
                self.summary_mse
            ])

    def create_model(self):
        """Create and compile the LSTM Q-network.

        Input shape is ``(time_series_length, state_shape)``; the output layer
        has ``action_space`` units.
        """
        inputs = Input(shape=(self.time_series_length, self.state_shape))
        x = LSTM(32, activation='relu')(inputs)
        x = Dense(32, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        # NOTE(review): a ReLU output restricts the Q-values to be non-negative;
        # confirm this is intended (a linear output is the usual choice).
        predictions = Dense(self.action_space, activation='relu')(x)
        model = Model(inputs=inputs, outputs=predictions)
        model.compile(optimizer=RMSprop(lr=self.learning_rate), loss='mse', metrics=["accuracy"])

        return model

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay memory."""
        self.memory.append([state, action, reward, new_state, done])

    def replay(self):
        """Train the online network on one random batch from the memory.

        For every sampled transition the target network's prediction for the
        state is taken as the base, and the entry of the chosen action is
        replaced by ``reward`` (terminal) or
        ``reward + gamma * max(Q_target(new_state))`` (non-terminal).

        Returns:
            The Keras training history dict, or ``None`` if the memory holds
            fewer than ``batch_size`` transitions.
        """
        if len(self.memory) < self.batch_size:
            return None

        samples = random.sample(self.memory, self.batch_size)

        states = np.array([sample[0] for sample in samples])
        actions = np.array([sample[1] for sample in samples], dtype=np.intp)
        rewards = np.array([sample[2] for sample in samples], dtype=np.float64)
        new_states = np.array([sample[3] for sample in samples])
        dones = np.array([sample[4] for sample in samples], dtype=bool)

        # np.array(...) makes a private copy that is safe to modify in place.
        targets = np.array(self.target_model.predict(states))
        qs_new_states = np.asarray(self.target_model.predict(new_states))

        # Terminal transitions get no bootstrapped future value.
        future = np.where(dones, 0.0, self.gamma * qs_new_states.max(axis=1))
        targets[np.arange(self.batch_size), actions] = rewards + future

        history = self.model.fit(states, targets, epochs=1, verbose=0, callbacks=[])
        return history.history

    def target_train(self):
        """Copy the weights of the online network into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def act(self, state):
        """Choose an action epsilon-greedily and decay epsilon.

        Args:
            state: Observation that can be reshaped to
                ``(1, time_series_length, state_shape)``.

        Returns:
            A random element of ``possible_actions`` with probability
            ``epsilon``, otherwise the index of the largest predicted Q-value.
        """
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)
        if random.random() < self.epsilon:
            return random.choice(self.possible_actions)
        q_values = self.model.predict(
            state.reshape(1, self.time_series_length, self.state_shape)
        )[0]
        return int(np.argmax(q_values))

    def save(self, path="model/model.h5"):
        """Save the target network to ``path``, creating the directory if needed."""
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        self.target_model.save(path)

    def load(self, path="model/model.h5"):
        """Load a saved network from ``path`` into both the online and target model.

        The file is loaded twice so that the two networks are independent
        objects; sharing one object would make the target network follow
        every training step of the online network.
        """
        self.model = load_model(path)
        self.target_model = load_model(path)



## Hyperparameter

In [8]:
# --- Run control -------------------------------------------------------------
do_train = True
use_saved_model = False

# --- Q-learning --------------------------------------------------------------
gamma = 0.5
learning_rate = 1e-3
tau = 5e-2  # not referenced by the code visible in this notebook

# --- Exploration -------------------------------------------------------------
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.99999

# --- Replay memory and schedule ----------------------------------------------
memory_size = 500 * 364  # presumably 500 episodes of 364 days each - TODO confirm
batch_size = 128
n_step = 300
log_frequency = 100  # every 100th n_step
epochs = 10000
update_target_network = 100 * batch_size

# --- Observation layout ------------------------------------------------------
state_shape = 24
time_series_length = 10

# --- Actions: the action value is the ordered quantity -----------------------
(
    order_none,
    order_one,
    order_two,
    order_tree,
    order_four,
    order_five,
    order_six,
    order_seven,
    order_eight,
    order_nine,
) = range(10)

possible_actions = [
    order_none, order_one, order_two, order_tree, order_four,
    order_five, order_six, order_seven, order_eight, order_nine,
]

# One network output per possible action.
action_space = len(possible_actions)


## Trainingsloop

### Initialisieren

In [13]:
# Select the data directory depending on the machine the notebook runs on.
# The value of the OS variable is compared explicitly; merely evaluating
# os.environ["OS"] == "Windows_NT" only checked that the variable exists.
if os.environ.get("OS") == "Windows_NT":
    # Running on the own desktop machine
    data_dir = 'F:/OneDrive/Dokumente/1 Universität - Master/6. Semester/Masterarbeit/Implementation/Echtdaten'
else:
    # Running on the EC2 Linux machine
    data_dir = './data'

In [10]:
# Build the simulation: the constructor loads the sales, price and weather files from data_dir.
simulation = StockSimulation(data_dir, time_series_length)


  mask |= (ar1 == a)
100%|██████████| 566/566 [00:03<00:00, 188.66it/s]


In [11]:
# Create the agent from the hyperparameters defined above; keyword arguments
# keep the eleven settings from being mixed up by position.
agent = DQN(
    memory_size=memory_size,
    state_shape=state_shape,
    action_space=action_space,
    gamma=gamma,
    learning_rate=learning_rate,
    batch_size=batch_size,
    epsilon=epsilon,
    epsilon_decay=epsilon_decay,
    epsilon_min=epsilon_min,
    possible_actions=possible_actions,
    time_series_length=time_series_length,
)

# Optionally continue from the previously saved network.
if use_saved_model:
    agent.load()


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [12]:
if do_train:
    global_steps = 0
    # Loss/accuracy of every replay step and the total reward of every episode.
    stats = {"loss": [], "acc": [], "rew": []}
    try:
        for epoch in range(epochs):
            state, info = simulation.reset()
            current_rewards = []
            fertig = False

            # One episode: step until the simulation reports that it is finished.
            while not fertig:
                action = agent.act(state)
                global_steps += 1
                reward, fertig, new_state = simulation.make_action(action)
                current_rewards.append(reward)
                agent.remember(state, action, reward, new_state, fertig)

                if global_steps % n_step == 0:
                    history = agent.replay()
                    if history:
                        # The metric key is "acc" or "accuracy" depending on the Keras version.
                        acc_key = "acc" if "acc" in history else "accuracy"
                        stats["loss"].append(history["loss"][0])
                        stats["acc"].append(history[acc_key][0])

                if global_steps % update_target_network == 0:
                    agent.target_train()

                state = new_state

            # End of episode: one more replay step, then log the episode.
            history = agent.replay()
            if history:
                acc_key = "acc" if "acc" in history else "accuracy"
                curr_loss = history["loss"][0]
                curr_acc = history[acc_key][0]
            else:
                # replay() returns None while the memory is smaller than a batch.
                curr_loss = float("nan")
                curr_acc = float("nan")
            curr_rew = np.sum(current_rewards)
            curr_mean_rew = np.mean(current_rewards)
            stats["rew"].append(curr_rew)

            # NOTE(review): every .assign() call adds new ops to the graph, so
            # the graph grows with each epoch - consider creating them once.
            agent.sess.run(
                [
                    agent.reward.assign(curr_rew),
                    agent.reward_mean.assign(curr_mean_rew),
                    agent.loss.assign(curr_loss),
                    agent.accuracy.assign(curr_acc)
                ]
            )
            # Must not be called "summary": that would overwrite the imported
            # tensorflow.summary module and break any later DQN() construction.
            epoch_summary = agent.sess.run(agent.merged)
            agent.writer.add_summary(epoch_summary, epoch)

            if epoch % 10 == 0:
                print("Epoch {}".format(epoch))
                print(
                    "\tMean reward: {} --- Total Reward: {} --- EXP-EXP: {}".format(curr_mean_rew, curr_rew, agent.epsilon)
                )
                agent.save()
    finally:
        # Release the writer and the session even if training is interrupted.
        agent.writer.close()
        agent.sess.close()


Instructions for updating:
Use tf.cast instead.
Epoch 0
	Mean reward: 0.0008779494955179615 --- Total Reward: 0.273042293106086 --- EXP-EXP: 0.9968948155387197
Epoch 10
	Mean reward: 0.0023456116688027598 --- Total Reward: 0.7294852289976583 --- EXP-EXP: 0.9663683806346631
Epoch 20
	Mean reward: 0.0004982307282028126 --- Total Reward: 0.15494975647107473 --- EXP-EXP: 0.9367767115789445
Epoch 30
	Mean reward: 0.0009730354791500945 --- Total Reward: 0.3026140340156794 --- EXP-EXP: 0.9080911844200968
Epoch 40
	Mean reward: 0.0002833072845165684 --- Total Reward: 0.08810856548465278 --- EXP-EXP: 0.8802840517155684
Epoch 50
	Mean reward: 0.001437969463922863 --- Total Reward: 0.4472085032800104 --- EXP-EXP: 0.8533284156916687
Epoch 60
	Mean reward: 0.004156401558316673 --- Total Reward: 1.2926408846364854 --- EXP-EXP: 0.8271982022254514
Epoch 70
	Mean reward: 0.0039489630625457635 --- Total Reward: 1.2281275124517324 --- EXP-EXP: 0.8018681356232514
Epoch 80
	Mean reward: 0.00185079377629158

Epoch 690
	Mean reward: 0.0006602619083619587 --- Total Reward: 0.20534145350056915 --- EXP-EXP: 0.1165982813232902
Epoch 700
	Mean reward: 0.004973215603145374 --- Total Reward: 1.5466700525782113 --- EXP-EXP: 0.1130278646762574
Epoch 710
	Mean reward: 0.06790791762424409 --- Total Reward: 21.11936238113991 --- EXP-EXP: 0.10956677961532287
Epoch 720
	Mean reward: 0.0006287981258414761 --- Total Reward: 0.19555621713669907 --- EXP-EXP: 0.10621167824109574
Epoch 730
	Mean reward: 0.0008573713941310737 --- Total Reward: 0.26664250357476393 --- EXP-EXP: 0.10295931517195399
Epoch 740
	Mean reward: 0.007871798045699659 --- Total Reward: 2.4481291922125936 --- EXP-EXP: 0.09980654440479556
Epoch 750
	Mean reward: 0.00062068581979014 --- Total Reward: 0.19303328995473354 --- EXP-EXP: 0.09675031627191585
Epoch 760
	Mean reward: 0.0033381301148562425 --- Total Reward: 1.0381584657202914 --- EXP-EXP: 0.093787674491072
Epoch 770
	Mean reward: 0.11850922195200295 --- Total Reward: 36.85636802707292

Epoch 1410
	Mean reward: 0.051183098545440855 --- Total Reward: 15.917943647632105 --- EXP-EXP: 0.01242267383312865
Epoch 1420
	Mean reward: 0.030173484839858204 --- Total Reward: 9.383953785195901 --- EXP-EXP: 0.012042272673256684
Epoch 1430
	Mean reward: 0.1420352655536554 --- Total Reward: 44.172967587186825 --- EXP-EXP: 0.011673519975251732
Epoch 1440
	Mean reward: 0.0761225467086146 --- Total Reward: 23.674112026379138 --- EXP-EXP: 0.011316059045501467
Epoch 1450
	Mean reward: 0.054959492867378154 --- Total Reward: 17.092402281754605 --- EXP-EXP: 0.010969544112894174
Epoch 1460
	Mean reward: 0.13773805708736195 --- Total Reward: 42.83653575416957 --- EXP-EXP: 0.010633639994355395
Epoch 1470
	Mean reward: 0.1007026888232828 --- Total Reward: 31.31853622404095 --- EXP-EXP: 0.010308021770625902
Epoch 1480
	Mean reward: 0.0036049748100821684 --- Total Reward: 1.1211471659355543 --- EXP-EXP: 0.01
Epoch 1490
	Mean reward: 0.07377998203181663 --- Total Reward: 22.94557441189497 --- EXP-E

KeyboardInterrupt: 