In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
DATA_DIR = "//Natalie/Dokumente/TempMasterarbeit/Raw Data"
NAME_WA = "0 Warenausgang.Time.csv"
NAME_WGR = "0 Warengruppenstamm.csv"
NAME_ART = "0 ArtikelstammV4.csv"

# Grundidee
Alle Daten in richtiger Reihenfolge vorhalten. 
Daten als Numpy Array in einen tf.DataSet umwandeln. 
DataSet komplett in den Arbeitsspeicher laden und Shufflen mit ganzem DataSet als Buffer und dann batchen. 

In [4]:
df = pd.read_csv(os.path.join(DATA_DIR, NAME_WA))
backup = df # backup sort bei Fehlberechnungen dafür, dass der Datensatz nicht neu geladen werden muss.

Für Debugging wird der Datensatz etwas verkleinert.

In [5]:
artikel_mask = [421607]
df = df[df.ARTIKEL.isin(artikel_mask)]

### Daten formatieren & bereinigen

In [6]:
df["DATUM"] = pd.to_datetime(df.DATUM, format="%d.%m.%y")
df = df[df.BELEGTYP.isin(["UMSATZ_SCANNING", "UMSATZ_AKTION", "ABSCHRIFT"])]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [7]:
print(df.shape, len(df.ARTIKEL.unique()), len(df.DATUM.unique()))

(2, 5) 1 2


### Neue Index setzen

In [8]:
verkaufstage = pd.tseries.offsets.CustomBusinessDay(n=1, weekmask='Mon Tue Wed Thu Fri Sat')
dates_index = pd.date_range('01.01.2016', '30.06.2019', freq=verkaufstage)

In [9]:
products = df.ARTIKEL.unique()

In [10]:
multi_index = pd.MultiIndex.from_product([dates_index, products], names=["DATUM", "ARTIKEL"])

In [11]:
df.set_index(["DATUM", "ARTIKEL"], inplace=True)

In [12]:
df = pd.pivot_table(df, values='MENGE', index=['DATUM', 'ARTIKEL'],

                    columns=['BELEGTYP'], aggfunc=np.sum)

In [13]:
df = df.reindex(multi_index, fill_value=0)

In [14]:
df = df.sort_values(["ARTIKEL", "DATUM"])

### unabhängige Variablen erzeugen

In [15]:
df["PREIS"] = 1.4
df["MENGE"] = df["UMSATZ_SCANNING"]
df["X"] = list(zip(df["MENGE"], df["PREIS"]))
df["X"] = df["X"].apply(list)

In [50]:
# Unelegante Lösung. Besser wäre df.UMSATZ_SCANNING.rolling(lag).apply(list), aber das unterstützt pandas aktuell noch nicht, 
# da die __iter__ Methode des Rolling-Window nicht implementiert ist. 
timeseries_length = 6
df['VORWOCHE'] = list(zip(*[df.X.shift(lag +1 ).fillna(method='backfill') for lag in range(timeseries_length)]))
df['VORVORWOCHE'] = list(zip(*[df.X.shift(lag + 6 + 1).fillna(method='backfill') for lag in range(timeseries_length)]))

In [51]:
df['VORWOCHE'] = df['VORWOCHE'].apply(list)
df['VORVORWOCHE'] = df['VORVORWOCHE'].apply(list)

In [34]:
df.reset_index(inplace=True)

In [35]:
df['y_wochenanfang'] = df.MENGE.rolling(3).sum().shift(-3)

In [36]:
df['Y_wochenende'] = df.MENGE.rolling(2).sum().shift(-5)

In [37]:
df['Y_samstag'] = df.MENGE.shift(-6)

In [38]:
df['Wochentag'] = df['DATUM'].dt.day_name()

In [53]:
df.head(15)

BELEGTYP,index,DATUM,ARTIKEL,UMSATZ_SCANNING,PREIS,MENGE,X,VORWOCHE,VORVORWOCHE,y_wochenanfang,Y_wochenende,Y_samstag,Wochentag
0,0,2016-01-01,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Friday
1,1,2016-01-02,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Saturday
2,2,2016-01-04,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Monday
3,3,2016-01-05,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Tuesday
4,4,2016-01-06,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Wednesday
5,5,2016-01-07,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Thursday
6,6,2016-01-08,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Friday
7,7,2016-01-09,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Saturday
8,8,2016-01-11,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Monday
9,9,2016-01-12,421607,0.0,1.4,0.0,"[0.0, 1.4]","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...","[[0.0, 1.4], [0.0, 1.4], [0.0, 1.4], [0.0, 1.4...",0.0,0.0,0.0,Tuesday


In [201]:
# df = backup

In [73]:
learning_frame = df[df.Wochentag == 'Saturday']

In [76]:
print(learning_frame.shape)
learning_frame.dropna(how='any', axis='index', inplace=True)
print(learning_frame.shape)



(183, 13)
(182, 13)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Direktes konvertieren in einen Numpy Array klappt nicht, aufgrund der geschachtelten Listen (die werden als dtype: Object behandelt). 
Auflösen in eine Liste von Listen und anschließendes konvertieren in einen nd-array löst das Problem.

In [120]:
arr_vw = learning_frame.VORWOCHE.to_list()
arr_vvw = learning_frame.VORVORWOCHE.to_list()

y_wa = learning_frame.y_wochenanfang.to_numpy(copy=True)
y_we = learning_frame.Y_wochenende.to_numpy(copy=True)
y_sa = learning_frame.Y_samstag.to_numpy(copy=True)

In [121]:
arr_vw = np.array(arr_vw)
arr_vvw = np.array(arr_vvw)

In [88]:
arr_vw.shape

(182, 6, 2)

# Testing if suitable for TF

In [83]:
import tensorflow as tf

In [147]:
vw_dataset = tf.data.Dataset.from_tensor_slices(arr_vw)
vvw_dataset = tf.data.Dataset.from_tensor_slices(arr_vvw)
x_dataset = tf.data.Dataset.zip((vw_dataset, vvw_dataset)).cache()

wa_dataset = tf.data.Dataset.from_tensor_slices(y_wa)
we_dataset = tf.data.Dataset.from_tensor_slices(y_we)
sa_dataset = tf.data.Dataset.from_tensor_slices(y_sa)
y_dataset = tf.data.Dataset.zip((wa_dataset, we_dataset, sa_dataset)).cache()

dataset = tf.data.Dataset.zip((x_dataset, y_dataset)).shuffle(buffer_size=182).batch(32, drop_remainder=True)


In [151]:
vw_input = tf.keras.layers.Input(shape=(6, 2))
vvw_input = tf.keras.layers.Input(shape=(6, 2))

shared_LSTM = tf.keras.layers.LSTM(10, return_sequences=True, unroll=True) # unroll for speedup on CPU

encoded_ts_vw = shared_LSTM(vw_input)
encoded_ts_vvw = shared_LSTM(vvw_input)

joined_encoding = tf.keras.layers.concatenate([encoded_ts_vw, encoded_ts_vvw], axis=2)
modulator = tf.keras.layers.Dense(15, activation='elu')(joined_encoding)

master_lstm = tf.keras.layers.LSTM(10, return_sequences=False, unroll=True)(modulator)

head_wa = tf.keras.layers.Dense(10)(master_lstm)
head_we = tf.keras.layers.Dense(10)(master_lstm)
head_sa = tf.keras.layers.Dense(10)(master_lstm)

model = tf.keras.models.Model(inputs=[vw_input, vvw_input], outputs=[head_wa, head_we, head_sa])

model.compile(optimizer='Adam', loss='mse', loss_weights=[1., 1., 1.], metrics=[])

In [152]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 6, 2)]       0                                            
__________________________________________________________________________________________________
input_14 (InputLayer)           [(None, 6, 2)]       0                                            
__________________________________________________________________________________________________
lstm_12 (LSTM)                  (None, 6, 10)        520         input_13[0][0]                   
                                                                 input_14[0][0]                   
__________________________________________________________________________________________________
concatenate_5 (Concatenate)     (None, 6, 20)        0           lstm_12[0][0]              

In [153]:
model.fit(x=dataset.prefetch(tf.data.experimental.AUTOTUNE), epochs=10)

Train for 5 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1f2edf5c1c8>