In [0]:
# imports
import os
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline

In [0]:
# Root directory under which every data layer used by this notebook lives.
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"
# Landing zone: where the raw CSVs are placed after download.
VOLUME_TARGET_DIR = f"{VOLUME_ROOT_PATH}/raw"
# Bronze layer: raw data.
VOLUME_BRONZE_DIR = f"{VOLUME_ROOT_PATH}/bronze"
# Silver layer: where prepared data is written.
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"
# Gold layer: where final data is written.
VOLUME_GOLD_DIR = f"{VOLUME_ROOT_PATH}/gold"

# Make sure every layer directory exists. `exist_ok=True` already turns the call
# into a no-op for directories that are present, so no separate existence check
# is needed. If one of the paths exists but is NOT a directory, this raises
# FileExistsError instead of silently carrying on with an unusable path.
for path in [VOLUME_TARGET_DIR, VOLUME_BRONZE_DIR, VOLUME_SILVER_DIR, VOLUME_GOLD_DIR]:
  os.makedirs(path, exist_ok=True)

In [0]:
# Load the raw CSVs from the landing directory (VOLUME_TARGET_DIR).

# Logical dataset name -> CSV file name inside VOLUME_TARGET_DIR.
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}


def _read_raw_csv(dataset: str) -> "SparkDataFrame":
  """Read one raw CSV from VOLUME_TARGET_DIR into a Spark DataFrame.

  Args:
    dataset: Key of `filenames` identifying which CSV to load.

  Returns:
    The CSV contents, read with the first row as header and with column
    types inferred by Spark.

  Raises:
    KeyError: If `dataset` is not a key of `filenames`.
  """
  # Index (rather than .get) so an unknown or mistyped name fails right here
  # instead of silently producing a ".../None" path.
  csv_path = f"{VOLUME_TARGET_DIR}/{filenames[dataset]}"
  return spark.read.csv(csv_path, header=True, inferSchema=True)


holidays_events_df = _read_raw_csv('holidays_events')
# oil_df = _read_raw_csv('oil')  # oil data is currently not loaded
stores_df = _read_raw_csv('stores')
transactions_df = _read_raw_csv('transactions')
train_df = _read_raw_csv('train')

test_df = _read_raw_csv('test')

In [0]:
# Persist every loaded DataFrame, unmodified, into the bronze directory as parquet.
# Keys are the folder names used under VOLUME_BRONZE_DIR.
bronze_tables = {
    'holidays': holidays_events_df,
    'stores': stores_df,
    'transactions': transactions_df,
    'train': train_df,
    'test': test_df,
}

for name, df in bronze_tables.items():
  bronze_path = f"{VOLUME_BRONZE_DIR}/{name}"
  # recursively remove anything already stored under this name, then write
  dbutils.fs.rm(bronze_path, True)
  df.write.mode("overwrite").parquet(bronze_path)