In [0]:
# imports
import os
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from pyspark.sql import Window
from pyspark.sql import functions as F
from pyspark.sql import DataFrame as SparkDataFrame
from pyspark.ml.feature import FeatureHasher

In [0]:
VOLUME_ROOT_PATH = "/Volumes/cscie103_catalog/final_project/data"

# place where prepared data is written
VOLUME_SILVER_DIR = f"{VOLUME_ROOT_PATH}/silver"

# ensure all paths exist
# exist_ok=True already makes this a no-op for directories that are present,
# so no separate os.path.exists() check (and no check-then-create race) is needed
for path in [VOLUME_SILVER_DIR]:
  os.makedirs(path, exist_ok=True)

In [0]:
# logical dataset name -> directory name of its Delta table under VOLUME_SILVER_DIR
silver_filenames = {
    'holidays': 'holidays',
    'stores': 'stores',
    'train': 'train',
    'transactions': 'transactions',
    'test': 'test'
}

# read the datasets from the Silver directory as Delta tables
# (the mapping is indexed directly so a misspelled dataset name raises KeyError
# instead of silently producing a ".../None" path)
holidays_df = spark.read.format("delta").load(f"{VOLUME_SILVER_DIR}/{silver_filenames['holidays']}")
stores_df = spark.read.format("delta").load(f"{VOLUME_SILVER_DIR}/{silver_filenames['stores']}")
train_df = spark.read.format("delta").load(f"{VOLUME_SILVER_DIR}/{silver_filenames['train']}")
transactions_df = spark.read.format("delta").load(f"{VOLUME_SILVER_DIR}/{silver_filenames['transactions']}")
# test_df = spark.read.format("delta").load(f"{VOLUME_SILVER_DIR}/{silver_filenames['test']}")

In [0]:
def smart_na_drop(df):
    """
    Remove every row that has a null in any column and report how many were removed.

    The row count is taken before and after dropna(); the difference is printed
    as "dropped N rows". Returns the DataFrame produced by dropna().
    """
    rows_before = df.count()
    cleaned = df.dropna()
    rows_dropped = rows_before - cleaned.count()
    print(f"dropped {rows_dropped} rows")
    return cleaned

In [0]:
# print the schema of each loaded dataset, in the order they were read
for source_df in (holidays_df, stores_df, train_df, transactions_df):
    source_df.printSchema()
# test_df.printSchema()

In [0]:
# merge train & transactions by date, store_nbr
# (left join: every train row is kept; rows without a transactions record get null)
training_df = train_df.join(transactions_df, on=['date', 'store_nbr'], how='left')

# for rows with no sales the transactions value can be missing (null after the join);
# fill ONLY those missing values with 0. A non-null transactions value is kept as is,
# so it is no longer overwritten with 0 merely because this row's sales are 0
# (which would also encode the sales target inside the transactions feature).
training_df = training_df.withColumn(
    'transactions',
    F.when(
        F.col('transactions').isNull() & (F.col('sales') == 0), 0
    ).otherwise(F.col('transactions'))
)
# rows still holding nulls are dropped here; expected to drop 3248 rows, for these rows
# there were no transactions recorded despite sales present
training_df = smart_na_drop(training_df)

In [0]:
# attach store attributes to each training row (left join on store_nbr),
# then discard any row left with a null; expected to drop 0 rows
training_df = smart_na_drop(
    training_df.join(stores_df, on='store_nbr', how='left')
)

In [0]:
# attach holiday information to each training row (left join on date)
# NOTE(review): nothing is imputed here - any row left with a null after the join is
# dropped by smart_na_drop. Expected to drop 0 rows, which presumably means holidays_df
# has an entry for every date - TODO confirm.
training_df = smart_na_drop(
    training_df.join(holidays_df, on='date', how='left')
)

In [0]:
# remove the 'family' and 'id' columns from the training set
training_df = training_df.drop(*['family', 'id'])

In [0]:
# inspect the merged training set: print its schema, then render the DataFrame
training_df.printSchema()
display(training_df)

In [0]:
# hash the 'store_nbr' column with FeatureHasher into a new vector column 'hash_storeNbr'
# categoricalCols is set explicitly: by default FeatureHasher treats a numeric column as a
# real-valued feature (the column name picks the index, the raw number becomes the value)
# instead of as a category. Listing the column makes each store_nbr be hashed as a
# categorical value; for a string column this setting changes nothing.
# NOTE(review): the variable is named holidays_hasher although it hashes store_nbr;
# the name is kept in case other cells refer to it.
holidays_hasher = FeatureHasher(
    inputCols=['store_nbr'],
    outputCol='hash_storeNbr',
    numFeatures=1024,
    categoricalCols=['store_nbr']
)
training_df = holidays_hasher.transform(training_df)

# drop 'store_nbr' column (replaced by its hashed representation)
training_df = training_df.drop('store_nbr')

display(training_df)

In [0]:
# write Silver tier as Delta table (overwrites whatever is already stored at silver_path)
# WARN: will take approx. 2 minutes
silver_path = f"{VOLUME_SILVER_DIR}/training"
training_df.write.format("delta").mode("overwrite").save(silver_path)