# Getting Started

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import tensorflow as tf

from sklearn.metrics import r2_score

In [3]:
data = pd.read_csv("data/fakta_penjualan_barangg_202112081901.csv")
data

Unnamed: 0,sk,nonota,barcode,hari,bulan,tahun,kuartal,namabarang,satuan,qty,...,keuntungan,keuntungan_persen,subtotal,kodesubkategori,kodecabang,kodeoperator,isbkp,kodecustomer,kodedistributor,kodedistributordivisi
0,101060,22001000618,4902430396028,1,1,2020,1,H&S SHP SS EOL BTL 180ML,BTL,2.0,...,3600.00,15.22,54500.0,03031203,3,OP0045,1,CU00000001,D0045,1056
1,101061,22001000620,8998866606158,1,1,2020,1,NUVO ACTIVE COOL 110GR*72,PCS,10.0,...,240.00,9.23,28400.0,03030602,3,OP0037,1,CU00000001,D0388,128
2,101062,22001000618,8997015390122,1,1,2020,1,SEKAR JAGAT LULUR BENGKUANG,PCS,1.0,...,3000.00,37.50,11000.0,03030607,3,OP0045,1,CU00000001,D0615,181
3,101063,22001000620,011747234191,1,1,2020,1,DOLPIN GARAM BESAR 500GR*24,PCS,2.0,...,500.00,8.85,12300.0,01011101,3,OP0037,0,CU00000001,D0235,82
4,101064,22001000618,089686441581,1,1,2020,1,INDOFOOD BUMBU NASI GORENG PDS,PCS,2.0,...,350.00,7.78,9700.0,01011103,3,OP0045,1,CU00000001,D0134,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349298,101055,22001000618,8999999401238,1,1,2020,1,RINSO ANTINODA 770GR*12,PCS,1.0,...,1250.00,6.93,19300.0,03040101,3,OP0045,1,CU00000001,D0378,124
349299,101056,22001000620,8999999390198,1,1,2020,1,SUNLIGHT LIME 12*755ML REF,PCS,1.0,...,1050.00,7.34,15350.0,03040201,3,OP0037,1,CU00000001,D0378,124
349300,101057,22001000620,8999777010638,1,1,2020,1,NIVEA DEO DEEP 50ML,PCS,1.0,...,2475.00,16.36,17600.0,03031207,3,OP0037,1,CU00000001,D0536,1031
349301,101058,22001000620,8992304009143,1,1,2020,1,GARNIER MEN TURBILIGHT OIL FOM 50M,PCS,1.0,...,2460.00,16.09,17750.0,03031201,3,OP0037,1,CU00000001,D0079,1072


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349303 entries, 0 to 349302
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   sk                     349303 non-null  int64  
 1   nonota                 349303 non-null  int64  
 2   barcode                349303 non-null  object 
 3   hari                   349303 non-null  int64  
 4   bulan                  349303 non-null  int64  
 5   tahun                  349303 non-null  int64  
 6   kuartal                349303 non-null  int64  
 7   namabarang             349303 non-null  object 
 8   satuan                 349303 non-null  object 
 9   qty                    349303 non-null  float64
 10  hargajual              349303 non-null  float64
 11  hargabeli              349303 non-null  float64
 12  diskon                 349303 non-null  float64
 13  hargajual2             349303 non-null  float64
 14  keuntungan             349303 non-nu

# Preprocessing

In [None]:
def encode_dates(df, column):
    """Replace a date-like column with separate year, month and day columns.

    Args:
        df: Input DataFrame. It is not modified; a copy is made first.
        column: Name of the column to parse with ``pd.to_datetime``.

    Returns:
        A new DataFrame in which ``column`` has been removed and the columns
        ``<column>_year``, ``<column>_month`` and ``<column>_day`` have been
        appended, in that order.
    """
    df = df.copy()
    parsed = pd.to_datetime(df[column])
    # The vectorised ``.dt`` accessor extracts each component in one pass
    # instead of calling a Python lambda once per row.
    df[column + '_year'] = parsed.dt.year
    df[column + '_month'] = parsed.dt.month
    df[column + '_day'] = parsed.dt.day
    return df.drop(column, axis=1)

def onehot_encode(df, column):
    """Expand ``column`` into one indicator column per distinct value.

    The indicator columns are named ``<column>_<value>`` and are placed after
    the remaining columns; ``column`` itself is removed. The input DataFrame
    is left untouched.
    """
    indicator_cols = pd.get_dummies(df[column], prefix=column)
    remaining = df.drop(column, axis=1)
    return pd.concat([remaining, indicator_cols], axis=1)

In [17]:
def preprocess_inputs(df):
    """Return a new DataFrame without the columns that are not used as inputs.

    Every column not named in ``unused_columns`` is passed through unchanged
    and in its original order. The input DataFrame is not modified.

    Args:
        df: DataFrame containing every column named in ``unused_columns``.

    Returns:
        A new DataFrame with those columns removed.

    Raises:
        KeyError: If any of the listed columns is missing from ``df``.
    """
    # Drop the columns that are not needed; each label is listed exactly once.
    unused_columns = [
        'sk',
        'nonota',
        'namabarang',
        'hargajual',
        'hargabeli',
        'diskon',
        'hargajual2',
        'keuntungan',
        'keuntungan_persen',
        'subtotal',
        'kodesubkategori',
        'kodecabang',
        'kodeoperator',
        'isbkp',
        'kodecustomer',
        'kodedistributor',
        'kodedistributordivisi',
    ]
    return df.drop(unused_columns, axis=1)

In [18]:
X = preprocess_inputs(data)

In [19]:
# Number of distinct values per remaining column (a missing value counts as a value).
{column: X[column].nunique(dropna=False) for column in X.columns}

{'barcode': 13319,
 'hari': 31,
 'bulan': 2,
 'tahun': 1,
 'kuartal': 1,
 'satuan': 49,
 'qty': 1222}

In [20]:
X

Unnamed: 0,barcode,hari,bulan,tahun,kuartal,satuan,qty
0,4902430396028,1,1,2020,1,BTL,2.0
1,8998866606158,1,1,2020,1,PCS,10.0
2,8997015390122,1,1,2020,1,PCS,1.0
3,011747234191,1,1,2020,1,PCS,2.0
4,089686441581,1,1,2020,1,PCS,2.0
...,...,...,...,...,...,...,...
349298,8999999401238,1,1,2020,1,PCS,1.0
349299,8999999390198,1,1,2020,1,PCS,1.0
349300,8999777010638,1,1,2020,1,PCS,1.0
349301,8992304009143,1,1,2020,1,PCS,1.0


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

In [None]:
import torch
from torch import nn, optim
from jcopdl.callback import Callback, set_config

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
tes = pd.read_csv("data/fakta_penjualan_barangg_202112081901.csv")
tes

In [None]:
# Join the day, month and year columns into a 'd/m/yyyy' text column (no zero padding,
# since the parts are plain integers). It is converted to datetime in the following cell.
tes['Date'] = tes['hari'].map(str) + '/' + tes['bulan'].map(str) + '/' + tes['tahun'].map(str)
tes.head()

In [None]:
tes['Date'] = pd.to_datetime(tes['Date'],format='%d/%m/%Y')

In [None]:
# NOTE(review): qty is float64 in the raw file and astype(int) discards any fractional
# part, so non-whole quantities (if present) are silently altered — confirm this is intended.
tes["qty"] = tes["qty"].astype(int)

In [None]:
# Total quantity per (Date, barcode) pair as a flat frame whose value column is 'stok'.
stok = (
    tes.groupby(['Date', 'barcode'], as_index=False)['qty']
    .sum()
    .rename(columns={'qty': 'stok'})
)
stok

In [None]:
stok.dtypes

In [None]:
# Monthly totals of the aggregated quantity. `on` must name the datetime column exactly
# ('Date', capitalised), and the column being summed is the numeric 'stok'.
stok.resample(rule='M', on='Date')['stok'].sum()

In [None]:
tes.dtypes

In [None]:
tes.shape

In [None]:
# Correlate the numeric columns only: the frame also holds text and datetime columns,
# which DataFrame.corr rejects in pandas >= 2.0 instead of silently skipping them.
tes.select_dtypes(include="number").corr()

In [None]:
stok

# Import Data

In [None]:
df = pd.read_csv("data/daily_min_temp.csv", index_col="Date", parse_dates=["Date"])
df.head()

In [None]:
df.Temp.plot(figsize=(15, 5), color="b")

In [None]:
# shuffle=False keeps the rows in their original order, so the final 20% of rows
# become the test split and everything before them the training split.
ts_train, ts_test = train_test_split(df, test_size=0.2, shuffle=False)
ts_train.shape, ts_test.shape

# Dataset & Dataloader

In [None]:
from jcopdl.utils.dataloader import TimeSeriesDataset
from torch.utils.data import DataLoader

In [None]:
# Sequence length handed to TimeSeriesDataset and the batch size shared by both loaders.
seq_len = 14
bs = 32

# Neither loader sets shuffle, so batches are drawn in dataset order.
# Only the training loader uses worker processes (num_workers=4).
train_set = TimeSeriesDataset(ts_train, "Temp", seq_len)
trainloader = DataLoader(train_set, batch_size=bs, num_workers=4)

test_set = TimeSeriesDataset(ts_test, "Temp", seq_len)
testloader = DataLoader(test_set, batch_size=bs)

# Architecture & Config

In [None]:
class GRU(nn.Module):
    """Stacked GRU followed by a linear layer applied to every time step."""

    def __init__(self, input_size, output_size, hidden_size, num_layers, dropout):
        """Build the recurrent stack and the output projection.

        Args:
            input_size: Number of features per time step.
            output_size: Number of values produced per time step.
            hidden_size: Width of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            dropout: Dropout probability forwarded to ``nn.GRU``.
        """
        super().__init__()
        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.rnn = nn.GRU(input_size, hidden_size, num_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden=None):
        """Run the sequence through the GRU and project every step.

        Args:
            x: Input tensor of shape (batch, seq, input_size).
            hidden: Initial hidden state of shape (num_layers, batch, hidden_size).
                May be omitted or ``None``, in which case ``nn.GRU`` uses its
                default initial state.

        Returns:
            A tuple ``(output, hidden)``: ``output`` has shape
            (batch, seq, output_size) and ``hidden`` is the final hidden state
            returned by the GRU.
        """
        x, hidden = self.rnn(x, hidden)
        x = self.fc(x)
        return x, hidden

In [None]:
# Hyperparameters: the model-related entries are read back when the GRU is constructed,
# and the whole config object is also handed to the Callback.
# input_size and seq_len are taken from the training dataset rather than hard-coded.
config = set_config({
    "input_size": train_set.n_feature,
    "seq_len": train_set.n_seq,
    "output_size": 1,
    "hidden_size": 64,
    "num_layers": 2,
    "dropout": 0,
})

# Training Preparation -> MCOC (Model, Criterion, Optimizer, Callback)

In [None]:
# The four objects the training loop works with: model, loss, optimizer and callback.
model = GRU(config.input_size, config.output_size, config.hidden_size, config.num_layers, config.dropout).to(device)
# Mean squared error averaged over all elements of the prediction.
criterion = nn.MSELoss(reduction="mean")
optimizer = optim.AdamW(model.parameters(), lr=0.001)
# NOTE(review): outdir is presumably where the Callback writes checkpoints — confirm against jcopdl.
callback = Callback(model, config, outdir="model/gru")

In [None]:
from tqdm.auto import tqdm

def loop_fn(mode, dataset, dataloader, model, criterion, optimizer, device):
    """Run one pass over ``dataloader`` and return the average loss.

    With ``mode == "train"`` the model is put in training mode and the
    optimizer is stepped after every batch. With ``mode == "test"`` the model
    is put in evaluation mode and the loss is only measured. Each batch loss
    is weighted by its batch size and the total is divided by ``len(dataset)``.
    """
    training = mode == "train"
    if training:
        model.train()
    elif mode == "test":
        model.eval()

    weighted_loss_sum = 0
    progress = tqdm(dataloader, desc=mode.title())
    for inputs, expected in progress:
        inputs = inputs.to(device)
        expected = expected.to(device)
        # No initial hidden state is supplied; the returned one is not needed here.
        predicted, _ = model(inputs, None)
        batch_loss = criterion(predicted, expected)

        if training:
            batch_loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        weighted_loss_sum += batch_loss.item() * inputs.shape[0]
    return weighted_loss_sum / len(dataset)

In [None]:
# Repeat train + evaluate passes until the callback's early-stopping check returns True.
while True:
    train_cost = loop_fn("train", train_set, trainloader, model, criterion, optimizer, device)
    # Gradient tracking is switched off for the whole evaluation pass.
    with torch.no_grad():
        test_cost = loop_fn("test", test_set, testloader, model, criterion, optimizer, device)
    
    # Logging
    callback.log(train_cost, test_cost)

    # Checkpoint
    callback.save_checkpoint()
        
    # Runtime Plotting
    callback.cost_runtime_plotting()
    
    # Early Stopping (monitored on the test cost); the cost curve is plotted once before leaving the loop
    if callback.early_stopping(model, monitor="test_cost"):
        callback.plot_cost()
        break

# Sanity Check

In [None]:
from luwiji.rnn import illustration

In [None]:
illustration.forecast

In [None]:
from utils import data4pred, pred4pred

In [None]:
# Datasets rebuilt with a sequence length of 1 for the forecasting checks.
# DataLoader is called without batch_size, so its default of one sample per batch applies.
train_forecast_set = TimeSeriesDataset(ts_train, "Temp", 1)
trainforecastloader = DataLoader(train_forecast_set)
test_forecast_set = TimeSeriesDataset(ts_test, "Temp", 1)
testforecastloader = DataLoader(test_forecast_set)

In [None]:
# data4pred (imported from the local utils module) is called once per split:
# train on the upper panel, test on the lower panel.
# NOTE(review): presumably it predicts each step from the true previous values — confirm in utils.
plt.figure(figsize=(15, 10))

plt.subplot(211)
data4pred(model, train_forecast_set, trainforecastloader, device)
plt.title("Train")

plt.subplot(212)
data4pred(model, test_forecast_set, testforecastloader, device)
plt.title("Test")

In [None]:
# pred4pred (imported from the local utils module) is called on the test split only.
# NOTE(review): presumably it feeds the model's own predictions back in as inputs — confirm in utils.
plt.figure(figsize=(15, 5))
pred4pred(model, test_forecast_set, testforecastloader, device)
plt.title("Test")