In [None]:
import sys
sys.path.insert(0, '..//src')
sys.path.insert(0, '..//src//data')
sys.path.insert(0, '..//src//features')

import global_func as gf
import read_data
import data_preprocessing

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 

# Data Collection

In [None]:
# One-time ingestion, kept commented out: read the raw CSV and hand the result
# to gf.create_pkl (presumably a pickle cache — confirm in global_func).
#df_raw = read_data.load_csv(csv_name=r'..\data\raw\dataset_Q122.csv')
#gf.create_pkl(obj=df_raw, pkl=r'..\data\processed\df_raw.pkl')

# Load the cached raw frame; df_raw.pkl must already exist on disk.
# NOTE(review): the backslash-separated relative path only resolves on Windows.
df_raw = gf.load_pkl(pkl=r'..\data\processed\df_raw.pkl')
# Last expression of the cell: preview of the first rows.
df_raw.head()

# Data Definition & Validation

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would append a stray "None" line).
df_raw.info()
print()
# Count of missing values per column.
print(df_raw.isna().sum())

## Resample data time-series: Interval = hari kerja (business day, `B`)

In [None]:
# Preprocessing helper instance; the enriching cell further down reuses it.
DataPrep = data_preprocessing.DataPreprocessing()
# Resample the raw frame with interval 'B', a 'median' aggregate and 'ffill'
# gap filling; the exact semantics live in DataPreprocessing.resampling.
# NOTE(review): 'B' is presumably the pandas business-day alias — confirm.
df_proc = DataPrep.resampling(dataframe=df_raw, interval='B', resampling_method='median', fillna_method='ffill')
# Last expression of the cell: preview of the final rows.
df_proc.tail()

## Feature enriching

In [None]:
# Pass the resampled frame through DataPreprocessing.enriching (presumably adds
# derived feature columns — confirm in data_preprocessing).
# NOTE(review): df_proc is overwritten with its own enriched version, so
# re-running this cell alone feeds already-enriched data back in — confirm
# enriching() is safe to apply twice.
df_proc = DataPrep.enriching(dataframe=df_proc)
df_proc.tail()

# Data Splitting

In [None]:
# One-time split, kept commented out: cut df_proc into train/valid/test using
# the two end dates below and persist each part as df_*_01.pkl. The EDA section
# reads those pickles instead of recomputing the split.
#df_train, df_valid, df_test = DataPrep.splitting(dataframe=df_proc, train_end='2020-12-31', valid_end='2021-07-31')
#gf.create_pkl(obj=df_train, pkl=r'..\data\processed\df_train_01.pkl')
#gf.create_pkl(obj=df_valid, pkl=r'..\data\processed\df_valid_01.pkl')
#gf.create_pkl(obj=df_test, pkl=r'..\data\processed\df_test_01.pkl')

# EDA

In [None]:
# Reload the stage-01 splits from disk.
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_01.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_01.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_01.pkl')

# Tag each frame with an ad-hoc `name` attribute; plotting_target() and
# plotting_tren() read it for their legend labels. It is a plain Python
# attribute, not pandas metadata, so it has to be set again after every reload.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

## Explorasi Target

In [None]:
def plotting_target(frames=None, col='lq45'):
    """Plot one column of every data split on a single shared axis.

    Parameters
    ----------
    frames : iterable of pandas.DataFrame, optional
        Frames to overlay. When omitted, the notebook globals ``df_train``,
        ``df_valid`` and ``df_test`` are used (looked up at call time).
    col : str, default 'lq45'
        Column drawn for every frame.

    Notes
    -----
    Each line is labelled with the frame's ad-hoc ``name`` attribute; a frame
    without one falls back to ``frame_<position>`` instead of raising.
    """
    if frames is None:
        frames = [df_train, df_valid, df_test]

    fig, ax = plt.subplots(figsize=(18, 5))
    for pos, frame in enumerate(frames):
        ax.plot(frame.index, frame[col], label=getattr(frame, 'name', f'frame_{pos}'))

    # A title lets the figure stand alone, like the other plots in this notebook.
    ax.set_title(f'Target {str(col).upper()}')
    ax.legend(loc='upper left')
    fig.tight_layout()
    plt.show()

# Draw the target overlay for the three splits loaded in the EDA section.
plotting_target()

## Explorasi Tren

In [None]:
def plotting_tren(df, col='lq45'):
    """Overlay a series with its monthly, quarterly and yearly median resamples.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain ``col`` and be accepted by
        ``DataPrep.resampling``.
    col : str, default 'lq45'
        Column to draw.

    Notes
    -----
    The source line is labelled with ``df.name`` when that ad-hoc attribute
    exists and with ``'df'`` otherwise, so a frame freshly loaded from disk can
    be plotted without tagging it first.
    """
    # (legend label, resample interval) for each coarser view of the series.
    # NOTE(review): if DataPrep.resampling forwards the interval to pandas
    # resample(), 'M'/'Q'/'A' are deprecated aliases from pandas 2.2 on
    # ('ME'/'QE'/'YE') — confirm before upgrading pandas.
    views = [('df_monthly', 'M'), ('df_quarterly', 'Q'), ('df_yearly', 'A')]

    series = [(getattr(df, 'name', 'df'), df)]
    for label, interval in views:
        resampled = DataPrep.resampling(
            dataframe=df,
            interval=interval,
            resampling_method='median',
            fillna_method='ffill',
        )
        series.append((label, resampled))

    fig, ax = plt.subplots(figsize=(18, 5))
    for label, frame in series:
        ax.plot(frame.index, frame[col], label=label)

    ax.set_title(f'Tren {str(col).upper()}')
    ax.legend(loc='upper left')
    fig.tight_layout()
    plt.show()

# Trend view of the training split only.
plotting_tren(df=df_train)

# Dekomposisi Seasonal

In [None]:
# Re-create the helper and reload the stage-01 splits (same files as the EDA
# section). Presumably the reloaded frames are new objects, in which case the
# ad-hoc `name` attribute assigned during EDA is not present on them.
DataPrep = data_preprocessing.DataPreprocessing()
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_01.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_01.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_01.pkl')

In [None]:
# monthly_seasonal_feature returns a pair; only the first element is used here
# (presumably a seasonal-decomposition result — confirm), the second is dropped.
result_mul, _ = DataPrep.monthly_seasonal_feature(dataframe_train=df_train)

# Plot the first 36 entries of the seasonal component.
# NOTE(review): 36 presumably means three years of monthly values — confirm.
result_mul.seasonal[:36].to_frame().plot(subplots=True,figsize =(18,5))
plt.title('Seasonal tren Bulanan LQ45')
plt.show()

## Feature enriching: col seasonal

In [None]:
# One-time step, kept commented out: take the second return value of
# monthly_seasonal_feature (computed from the training split only), pass it to
# enriching_seasonal for each split, and persist the results as df_*_02.pkl.
#_, df_seasonal = DataPrep.monthly_seasonal_feature(dataframe_train=df_train)
#df_train,_ = DataPrep.enriching_seasonal(dataframe=df_train, df_seasonal=df_seasonal)
#df_valid,_ = DataPrep.enriching_seasonal(dataframe=df_valid, df_seasonal=df_seasonal)
#df_test,_ = DataPrep.enriching_seasonal(dataframe=df_test, df_seasonal=df_seasonal)
#gf.create_pkl(obj=df_train, pkl=r'..\data\processed\df_train_02.pkl')
#gf.create_pkl(obj=df_valid, pkl=r'..\data\processed\df_valid_02.pkl')
#gf.create_pkl(obj=df_test, pkl=r'..\data\processed\df_test_02.pkl')

# Cek Stasioneritas & Transformasi Lag-Differencing

In [None]:
# Fresh helper instance plus the stage-02 splits; the df_*_02.pkl files must
# already exist on disk.
DataPrep = data_preprocessing.DataPreprocessing()
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_02.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_02.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_02.pkl')

# Re-attach the ad-hoc labels to the newly loaded frames.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

## Augmented Dickey-Fuller Test
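The ADF test's null hypothesis is that the series has a unit root (i.e. it is non-stationary). The time series is considered stationary when the null is rejected: the p-value is low (< 0.05) and the test statistic is lower (more negative) than the critical values at the 1%, 5% and 10% levels of significance.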

In [None]:
# Run the project's ADF-based check on the training split only.
# show_detail=True presumably prints per-column test output — confirm in
# DataPreprocessing.stationary_checker_with_adfuller.
non_stationary_cols = DataPrep.stationary_checker_with_adfuller(dataframe=df_train, show_detail=True)
# Last expression of the cell: display whatever the checker returned.
non_stationary_cols

In [None]:
# One-time step, kept commented out: apply stationary_transform to each split
# using the column list stored in list_non_stationary_cols.pkl, then persist
# the results as df_*_03.pkl.
# NOTE(review): nothing visible in this notebook writes
# list_non_stationary_cols.pkl — presumably one of the DataPreprocessing
# methods persists it; confirm before re-enabling.
#DataPrep = data_preprocessing.DataPreprocessing()
#pkl = r'..\models\pkl\list_non_stationary_cols.pkl'
#df_train = DataPrep.stationary_transform(dataframe=df_train, non_stationary_cols_pkl=pkl)
#df_valid = DataPrep.stationary_transform(dataframe=df_valid, non_stationary_cols_pkl=pkl)
#df_test = DataPrep.stationary_transform(dataframe=df_test, non_stationary_cols_pkl=pkl)
#gf.create_pkl(obj=df_train, pkl=r'..\data\processed\df_train_03.pkl')
#gf.create_pkl(obj=df_valid, pkl=r'..\data\processed\df_valid_03.pkl')
#gf.create_pkl(obj=df_test, pkl=r'..\data\processed\df_test_03.pkl')


# Load the stage-03 splits from disk.
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_03.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_03.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_03.pkl')

# Re-attach the ad-hoc labels to the newly loaded frames.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

In [None]:
def plotting_stasioner(df, col, window=30):
    """Plot a column together with its rolling mean and rolling std deviation.

    A visual stationarity check: for a stationary series both rolling curves
    should stay roughly flat over time.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series; its index is used as the x axis.
    col : str
        Column to inspect.
    window : int, default 30
        Number of rows in the trailing rolling window (``center=False``). The
        first ``window - 1`` points of both rolling curves are NaN.
    """
    # Compute the statistics first so a bad column name fails before an empty
    # figure is created.
    rolling = df[col].rolling(window=window, center=False)
    rol_mean = rolling.mean()
    rol_std = rolling.std()

    fig, ax = plt.subplots(figsize=(18, 5))
    ax.plot(df.index, df[col], label=col)
    ax.plot(rol_mean.index, rol_mean, label='rol_mean')
    ax.plot(rol_std.index, rol_std, label='rol_std')

    ax.set_title(f'Stasioneritas {str(col).upper()}')
    ax.legend(loc='upper left')
    fig.tight_layout()
    plt.show()

# Same stationarity view for every split, in train -> valid -> test order.
for split_frame in (df_train, df_valid, df_test):
    plotting_stasioner(df=split_frame, col='lq45')

# Cek Outlier & Imputasi

In [None]:
# Fresh helper instance plus the stage-03 splits (the same files the
# stationarity section loads).
DataPrep = data_preprocessing.DataPreprocessing()
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_03.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_03.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_03.pkl')

# Re-attach the ad-hoc labels to the newly loaded frames.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

## Plotting: Histogram

In [None]:
# Delegates to the project helper on the training split only (presumably one
# histogram per column — confirm in global_func).
gf.plotting_hist_all(dataframe=df_train)

## Plotting: Boxplot

In [None]:
# Delegates to the project helper on the training split only (presumably one
# boxplot per column — confirm in global_func).
gf.plotting_box_all(dataframe=df_train)

## Imputasi

In [None]:
# One-time step, kept commented out: derive the outlier replacement values from
# the training split (lo_perc=10.0 / hi_perc=90.0, presumably percentile
# bounds — confirm) and store them under ..\models\pkl\df_imputasi.pkl.
#df_value_for_outlier = DataPrep.get_value_for_outlier(train_dataframe=df_train, lo_perc=10.0, hi_perc=90.0)
#gf.create_pkl(obj=df_value_for_outlier, pkl=r'..\models\pkl\df_imputasi.pkl')
#df_value_for_outlier

In [None]:
# One-time outlier treatment, kept disabled. It is written as # comments rather
# than a bare ''' string so the cell produces no output and the Windows paths
# (\d, \p) are not parsed as escape sequences of a non-raw literal.
# The replacement values come from the training split only and are then applied
# to all three splits, which are persisted as df_*_04.pkl.
#DataPrep = data_preprocessing.DataPreprocessing()
#df_value_for_outlier = DataPrep.get_value_for_outlier(train_dataframe=df_train, lo_perc=10.0, hi_perc=90.0)
#df_train = DataPrep.outlier_treatment_batch(df_train, df_value_for_outlier=df_value_for_outlier)
#df_valid = DataPrep.outlier_treatment_batch(df_valid, df_value_for_outlier=df_value_for_outlier)
#df_test = DataPrep.outlier_treatment_batch(df_test, df_value_for_outlier=df_value_for_outlier)
#gf.create_pkl(obj=df_train, pkl=r'..\data\processed\df_train_04.pkl')
#gf.create_pkl(obj=df_valid, pkl=r'..\data\processed\df_valid_04.pkl')
#gf.create_pkl(obj=df_test, pkl=r'..\data\processed\df_test_04.pkl')

# Standardizing

In [None]:
# Fresh helper instance plus the stage-04 splits. The df_*_04.pkl files are
# only written by the disabled outlier-treatment cell, so they must already
# exist on disk.
DataPrep = data_preprocessing.DataPreprocessing()
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_04.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_04.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_04.pkl')

# Re-attach the ad-hoc labels to the newly loaded frames.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

In [None]:
# Line plots of the stage-04 training split, i.e. before standardizing
# (presumably one line per column — confirm in global_func).
gf.plotting_line_all(dataframe=df_train)

In [None]:
# One-time standardizing step, kept disabled. It is written as # comments rather
# than a bare ''' string so the cell produces no output and the Windows paths
# (\d, \p) are not parsed as escape sequences of a non-raw literal.
# The scaler is fitted on the training split only and then applied to all three
# splits, which are persisted as df_*_05.pkl.
#DataPrep = data_preprocessing.DataPreprocessing()
#scaler = DataPrep.std_scaler_fitting(train_dataframe=df_train)
#df_train = DataPrep.std_scaler_transform(dataframe=df_train, scaler=scaler)
#df_valid = DataPrep.std_scaler_transform(dataframe=df_valid, scaler=scaler)
#df_test = DataPrep.std_scaler_transform(dataframe=df_test, scaler=scaler)
#gf.create_pkl(obj=df_train, pkl=r'..\data\processed\df_train_05.pkl')
#gf.create_pkl(obj=df_valid, pkl=r'..\data\processed\df_valid_05.pkl')
#gf.create_pkl(obj=df_test, pkl=r'..\data\processed\df_test_05.pkl')

In [None]:
# Fresh helper instance plus the stage-05 splits; the df_*_05.pkl files are only
# written by the disabled standardizing cell, so they must already exist on disk.
DataPrep = data_preprocessing.DataPreprocessing()
df_train = gf.load_pkl(pkl=r'..\data\processed\df_train_05.pkl')
df_valid = gf.load_pkl(pkl=r'..\data\processed\df_valid_05.pkl')
df_test = gf.load_pkl(pkl=r'..\data\processed\df_test_05.pkl')

# Re-attach the ad-hoc labels to the newly loaded frames.
df_train.name = 'df_train'
df_valid.name = 'df_valid'
df_test.name = 'df_test'

# Same line plots as before scaling, now on the stage-05 training split, for a
# before/after comparison.
gf.plotting_line_all(dataframe=df_train)