# Introduction

This notebook demonstrates the preprocessing of raw shallot price data to prepare it for time series forecasting using AutoML tools.

In [30]:
from google.cloud import storage

import pandas as pd
import numpy as np
import csv
import os
import io

In [31]:
# Helper functions

def download_csv_from_gcs(bucket, file_name,
                          date_columns=None, col_names=None):
    """Read a CSV object from a bucket and return it as a DataFrame.

    Args:
        bucket: Object exposing ``blob(name)`` whose result provides
            ``download_as_text()`` (presumably a google.cloud.storage
            Bucket -- confirm against the caller).
        file_name: Name of the object to read inside ``bucket``.
        date_columns: Forwarded to ``pd.read_csv`` as ``parse_dates``.
        col_names: Forwarded to ``pd.read_csv`` as ``usecols``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` from the downloaded text.
    """
    # The whole object is pulled into memory as text before parsing.
    csv_text = bucket.blob(file_name).download_as_text()

    with io.StringIO(csv_text) as text_stream:
        return pd.read_csv(text_stream,
                           usecols=col_names,
                           parse_dates=date_columns)

def upload_csv_to_gcs(df, bucket, file_path, sep=','):
    """Serialize a DataFrame to CSV in memory and upload it to a bucket.

    Every field is quoted (``csv.QUOTE_ALL``) and the index is not written.

    Args:
        df: DataFrame to serialize.
        bucket: Object exposing ``blob(path)`` and a ``name`` attribute
            (presumably a google.cloud.storage Bucket -- confirm against
            the caller).
        file_path: Destination object name inside ``bucket``.
        sep: Field delimiter forwarded to ``DataFrame.to_csv``.

    Returns:
        None. A confirmation line is printed after the upload call returns.
    """
    csv_buffer = io.StringIO()
    # Forward `sep`: it used to be accepted but silently ignored, so callers
    # asking for another delimiter still got commas.
    # `encoding` is intentionally not passed: the target is a text buffer, so
    # no byte encoding takes place at this step.
    df.to_csv(csv_buffer, sep=sep, index=False, quoting=csv.QUOTE_ALL)

    blob = bucket.blob(file_path)
    blob.upload_from_string(csv_buffer.getvalue(), content_type='text/csv')

    print(f"Successfully uploaded: '{blob.name}' to '{bucket.name}'")

In [25]:
# Project configuration: GCP identifiers and data locations.
# These are names and addresses only; no keys or tokens are stored here.

PROJECT_ID = "machine-learning-toy-project"
LOCATION = "us-west1"
SERVICE_ACCOUNT = "ml-project-user@machine-learning-toy-project.iam.gserviceaccount.com"

# Bucket used for both the raw download and the processed upload.
BUCKET_NAME = "commodity_prices_automl"

# Object name of the raw price table inside the bucket.
DATA_RAW_CSV = "shallot_raw.csv"

In [26]:
# Create a GCS client and fetch the bucket named by BUCKET_NAME.
# NOTE(review): storage.Client() is built without arguments, so PROJECT_ID is
# not passed here -- presumably project and credentials are resolved from the
# environment; confirm.
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

In [27]:
# Download the raw price table from GCS into a DataFrame.
# date_columns is left at its default, so no date parsing is requested here.
prices_df = download_csv_from_gcs(bucket, DATA_RAW_CSV)

In [28]:
# Inspect column names, dtypes and non-null counts of the raw frame.
prices_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2435 entries, 0 to 2434
Data columns (total 39 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   date             2435 non-null   object
 1   surabayakota     2435 non-null   int64 
 2   malangkota       2435 non-null   int64 
 3   kedirikota       2435 non-null   int64 
 4   jemberkab        2435 non-null   int64 
 5   bangkalankab     2435 non-null   int64 
 6   banyuwangikab    2435 non-null   int64 
 7   blitarkab        2435 non-null   int64 
 8   bojonegorokab    2435 non-null   int64 
 9   bondowosokab     2435 non-null   int64 
 10  gresikkab        2435 non-null   int64 
 11  jombangkab       2435 non-null   int64 
 12  kedirikab        2435 non-null   int64 
 13  lamongankab      2435 non-null   int64 
 14  lumajangkab      2435 non-null   int64 
 15  madiunkab        2435 non-null   int64 
 16  magetankab       2435 non-null   int64 
 17  malangkab        2435 non-null   

In [29]:
# Parse the "date" column into timestamps and promote it to the index.
prices_df = (
    prices_df
    .assign(date=pd.to_datetime(prices_df["date"]))
    .set_index("date")
)

In [32]:
# The analysis is restricted to two East Java regions, Mojokerto and Surabaya.
# A single mapping (raw column -> short name) drives both selection and rename.
region_names = {
    "mojokertokab": "mojokerto",
    "surabayakota": "surabaya",
}
selected_regions = list(region_names)
prices_df = prices_df[selected_regions].rename(columns=region_names)

## Dataset Overview

In [33]:
# Preview the first rows of the two selected regions.
prices_df.head()

Unnamed: 0_level_0,mojokerto,surabaya
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-01-01,14750,20000
2018-01-02,14750,19800
2018-01-03,14750,20600
2018-01-04,14750,20600
2018-01-05,14750,20600


In [34]:
# Summary statistics per column, transposed so that each region is one row.
prices_df.agg(["min", "max", "median", "mean", "std"]).T

Unnamed: 0,min,max,median,mean,std
mojokerto,12000.0,68000.0,24250.0,25070.30308,7070.966527
surabaya,14600.0,64400.0,28833.0,29515.717043,7477.329673


In [35]:
# Count missing values in each column.
prices_df.isna().sum()

Unnamed: 0,0
mojokerto,0
surabaya,0


As the null counts above show, neither column has any missing values.

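Next, we verify that the date range is complete, with no dates skipped. The data is reindexed onto a full daily calendar between the first and last dates, so any skipped date would show up as a row of missing values.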

In [36]:
# Earliest and latest dates present in the index.
prices_df.index.min(), prices_df.index.max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2024-08-31 00:00:00'))

In [37]:
# First and last calendar day the dataset is expected to cover.
ds_start, ds_end = "2018-01-01", "2024-08-31"

# Align the frame to a gap-free daily calendar; a date absent from the data
# becomes a row of NaN instead of being silently skipped.
complete_date = pd.date_range(ds_start, ds_end, freq="D", name="date")
prices_df = prices_df.reindex(index=complete_date)

In [38]:
# Re-count missing values after reindexing; a non-zero count here would mean
# the daily calendar contains dates that the data did not have.
prices_df.isna().sum()

Unnamed: 0,0
mojokerto,0
surabaya,0


## Splitting Data into Train, Validation, and Test Sets

In [39]:
# Inclusive last day of each split: rows up to train_end are TRAIN, rows after
# that up to val_end are VALIDATE, and the rest up to test_end are TEST.
train_end = "2024-01-31"
val_end = "2024-04-30"
test_end = ds_end  # the test split runs to the end of the dataset range

In [40]:
# Tag every day with the split it belongs to; each boundary date is the
# inclusive upper bound of its split.
dates = prices_df.index
split_masks = {
    "TRAIN": dates <= train_end,
    "VALIDATE": (dates > train_end) & (dates <= val_end),
    "TEST": (dates > val_end) & (dates <= test_end),
}
for split_label, split_mask in split_masks.items():
    prices_df.loc[split_mask, "data_split"] = split_label

In [43]:
# Reshape from wide to long: one row per (date, region), with the region name
# in "regions" and its value in "price".
indexed_prices = prices_df.reset_index()
df = indexed_prices.melt(
    id_vars=["date", "data_split"],
    value_vars=["mojokerto", "surabaya"],
    var_name="regions",
    value_name="price",
)

In [44]:
# Show the first two rows of every (data_split, regions) group to spot-check
# where each split starts.
df.groupby(["data_split", "regions"]).head(2)

Unnamed: 0,date,data_split,regions,price
0,2018-01-01,TRAIN,mojokerto,14750
1,2018-01-02,TRAIN,mojokerto,14750
2222,2024-02-01,VALIDATE,mojokerto,22962
2223,2024-02-02,VALIDATE,mojokerto,22962
2312,2024-05-01,TEST,mojokerto,35450
2313,2024-05-02,TEST,mojokerto,35450
2435,2018-01-01,TRAIN,surabaya,20000
2436,2018-01-02,TRAIN,surabaya,19800
4657,2024-02-01,VALIDATE,surabaya,26833
4658,2024-02-02,VALIDATE,surabaya,26833


In [45]:
# Write the long-format dataset to the bucket as "shallot_dataset.csv".
upload_csv_to_gcs(df, bucket, "shallot_dataset.csv")

Successfully uploaded: 'shallot_dataset.csv' to 'commodity_prices_automl'
