This notebook is a sandbox for exploratory data analysis (EDA) in preparation for the ERD and end-to-end (E2E) workflow specifications.
It will:
1. Create the catalog, schema & volume if needed (or the local data directory when not running on Databricks).
2. Fetch data from [Kaggle competition](https://www.kaggle.com/competitions/store-sales-time-series-forecasting/overview).
3. Create a table for each CSV file.

... (wip)

In [1]:
import yaml
from pathlib import Path

# Project settings live in a YAML file one directory above the notebook's
# working directory; safe_load keeps parsing restricted to plain YAML data.
CONFIG_FILE = "../config.yaml"

with open(CONFIG_FILE, "r") as config_stream:
    config = yaml.safe_load(config_stream)

# Echo the Databricks target (catalog, schema, volume) as the cell output.
databricks_cfg = config["databricks"]
(databricks_cfg["catalog"], databricks_cfg["schema"], databricks_cfg["volume"])

('portfolio_catalog', 'databricks_pipeline', 'data')

In [None]:
def running_on_databricks():
    """Return True when this notebook is executing on Databricks compute.

    Detection relies on the ``DATABRICKS_RUNTIME_VERSION`` environment
    variable, which Databricks sets on its runtimes. The previous approach
    (trying to import ``pyspark.dbutils``) can also succeed on a local machine,
    e.g. when Databricks Connect is installed, and left an unused import.

    Returns:
        bool: True if the environment variable is present and non-empty,
        False otherwise.
    """
    # Local import: this cell runs before the notebook's shared import cell.
    import os

    return bool(os.environ.get("DATABRICKS_RUNTIME_VERSION"))


IS_DATABRICKS = running_on_databricks()
print(IS_DATABRICKS)

False


In [3]:
# Resolve the root location of the raw data files for the current platform.
# NOTE(review): BASE_PATH is a pathlib.Path locally but a plain str on
# Databricks; downstream cells wrap it in Path(...) before using it.
if not IS_DATABRICKS:
    BASE_PATH = Path(config["local"]["base_path"])
else:
    CATALOG, SCHEMA, VOLUME = (
        config["databricks"][setting] for setting in ("catalog", "schema", "volume")
    )

    # The volume is addressed as /Volumes/<catalog>/<schema>/<volume>.
    BASE_PATH = f"/Volumes/{CATALOG}/{SCHEMA}/{VOLUME}"

BASE_PATH

PosixPath('data')

In [4]:
# Make sure the storage target exists before any data is written to it.
if not IS_DATABRICKS:
    # Local run: the data directory is an ordinary folder.
    BASE_PATH.mkdir(parents=True, exist_ok=True)
else:
    # Databricks run: create catalog -> schema -> volume, in that order,
    # since each level is created inside the previous one.
    ddl_statements = (
        f"CREATE CATALOG IF NOT EXISTS {CATALOG}",
        f"CREATE SCHEMA IF NOT EXISTS {CATALOG}.{SCHEMA}",
        f"CREATE VOLUME IF NOT EXISTS {CATALOG}.{SCHEMA}.{VOLUME}",
    )
    for ddl in ddl_statements:
        spark.sql(ddl)


In [5]:
%pip install kaggle
# Installs the Kaggle API client into the running kernel's environment; a later
# cell imports `kaggle.api.kaggle_api_extended` from it.
# NOTE(review): the package is unpinned — consider pinning a version for reproducible runs.

Note: you may need to restart the kernel to use updated packages.


In [6]:
# all imports here
import os
import json
from pathlib import Path
import subprocess
import zipfile
import requests

In [7]:
# Ensure the Databricks secret scope that holds the Kaggle credentials exists.
# Only the scope is created here; the secrets themselves are not written by
# this cell.
if IS_DATABRICKS:
    from databricks.sdk import WorkspaceClient

    w = WorkspaceClient()
    SCOPE = "kaggle"

    try:
        w.secrets.create_scope(scope=SCOPE)
        print(f"✓ Created scope '{SCOPE}'")
    except Exception as e:
        # Best-effort idempotency: an "already exists" failure means the scope
        # is in place, anything else is a real error and is re-raised.
        if "already exists" in str(e).lower():
            print(f"Scope '{SCOPE}' already exists")
        else:
            raise

else:
    print("Running locally — Databricks secrets not created")


Running locally — Databricks secrets not created


In [8]:
def get_kaggle_credentials():
    """Fetch the Kaggle username and API token for the current platform.

    On Databricks both values are read from the ``kaggle`` secret scope
    (keys ``kaggle-username`` and ``kaggle-api-token``) through ``dbutils``,
    which is expected to be provided by the notebook environment. Locally they
    are read from the ``KAGGLE_USERNAME`` and ``KAGGLE_API_TOKEN`` environment
    variables.

    Returns:
        tuple: ``(username, token)``.

    Raises:
        RuntimeError: when running locally and either environment variable is
            unset or empty.
    """
    if not IS_DATABRICKS:
        env_username = os.environ.get("KAGGLE_USERNAME")
        env_token = os.environ.get("KAGGLE_API_TOKEN")
        if env_username and env_token:
            return env_username, env_token
        raise RuntimeError(
            "Missing Kaggle credentials locally. "
            "Set KAGGLE_USERNAME and KAGGLE_API_TOKEN as environment variables "
            "or use a .env file."
        )

    secret_username = dbutils.secrets.get(scope="kaggle", key="kaggle-username")
    secret_token = dbutils.secrets.get(scope="kaggle", key="kaggle-api-token")
    return secret_username, secret_token

In [9]:
import json
import os
from pathlib import Path

# Write the Kaggle credentials to ~/.kaggle/kaggle.json.
# NOTE(review): this overwrites any kaggle.json that already exists there.
kaggle_username, kaggle_token = get_kaggle_credentials()

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

kaggle_config_path = os.path.join(kaggle_dir, "kaggle.json")

# Create the file with owner-only permissions from the start, so the token is
# never on disk with wider permissions (previously it was written with the
# default mode and only tightened afterwards).
kaggle_config_fd = os.open(
    kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600
)
with os.fdopen(kaggle_config_fd, "w") as f:
    json.dump({"username": kaggle_username, "key": kaggle_token}, f)

# The mode passed to os.open only applies to newly created files; enforce 600
# explicitly in case the file already existed with looser permissions.
os.chmod(kaggle_config_path, 0o600)

print(f"✓ Created Kaggle config at: {kaggle_config_path}")


✓ Created Kaggle config at: /Users/daniel/.kaggle/kaggle.json


In [10]:
from pathlib import Path
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile

# --- Paths ---
# Competition files are stored directly in the base data directory.
VOLUME_TARGET_DIR = Path(BASE_PATH)
VOLUME_TARGET_DIR.mkdir(parents=True, exist_ok=True)

# CSV files expected from the competition archive.
filenames = [
    "train.csv",
    "test.csv",
    "stores.csv",
    "holidays_events.csv",
    "oil.csv",
    "transactions.csv",
    "sample_submission.csv"
]

# Skip the download when every expected file is already present.
all_exist = all((VOLUME_TARGET_DIR / name).exists() for name in filenames)

if all_exist:
    print("ℹ️ All CSV files already exist. Skipping download.")
else:
    print("📥 Downloading Kaggle competition files...")

    # Authenticate Kaggle API (credentials must be set up beforehand)
    api = KaggleApi()
    api.authenticate()

    # Download all files as a single ZIP named after the competition
    competition_name = "store-sales-time-series-forecasting"
    zip_path = VOLUME_TARGET_DIR / f"{competition_name}.zip"
    api.competition_download_files(competition=competition_name, path=str(VOLUME_TARGET_DIR), quiet=False)

    # Extract ZIP
    if zip_path.exists():
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(VOLUME_TARGET_DIR)
        zip_path.unlink()  # remove ZIP after extraction
        print(f"✓ Downloaded and extracted files to {VOLUME_TARGET_DIR}")
    else:
        print(f"⚠️ ZIP file not found at {zip_path}")

    # The download is best-effort: report anything still missing instead of
    # assuming the archive contained every expected file.
    missing_files = [name for name in filenames if not (VOLUME_TARGET_DIR / name).exists()]
    if missing_files:
        print(f"⚠️ Still missing after download: {', '.join(missing_files)}")

# CSVs that exist at this point can be read into Spark or Pandas; missing ones
# have been reported above.


ℹ️ All CSV files already exist. Skipping download.


In [11]:
from pathlib import Path
from pyspark.sql import SparkSession

if IS_DATABRICKS:
    # Reuse the session that already exists in the Databricks environment.
    spark = SparkSession.builder.getOrCreate()
else:
    # Local Spark session using all available cores
    spark = SparkSession.builder.appName("local-training").master("local[*]").getOrCreate()

# Ensure VOLUME_TARGET_DIR is a Path (BASE_PATH is a plain string on Databricks)
VOLUME_TARGET_DIR = Path(BASE_PATH)

# Logical dataset name -> CSV file name
filenames = {
    'holidays_events': 'holidays_events.csv',
    'oil': 'oil.csv',
    'sample_submission': 'sample_submission.csv',
    'stores': 'stores.csv',
    'test': 'test.csv',
    'train': 'train.csv',
    'transactions': 'transactions.csv'
}

# Read every available CSV into a Spark DataFrame keyed by dataset name.
# Missing files are reported and skipped rather than raising.
dataframes = {}
for key, fname in filenames.items():
    file_path = VOLUME_TARGET_DIR / fname
    if file_path.exists():
        df = spark.read.csv(str(file_path), header=True, inferSchema=True)
        dataframes[key] = df
        # df.count() runs a Spark job over the file to obtain the row count.
        print(f"✓ Loaded '{fname}' as Spark DataFrame with {df.count()} rows and {len(df.columns)} columns")
    else:
        print(f"⚠️ File not found: {file_path}")

# Convenience handles; each is None when the corresponding file was not found.
holidays_events_df = dataframes.get('holidays_events')
oil_df = dataframes.get('oil')
stores_df = dataframes.get('stores')
transactions_df = dataframes.get('transactions')
train_df = dataframes.get('train')


25/12/22 01:52:54 WARN Utils: Your hostname, daniels-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.88.117 instead (on interface en0)
25/12/22 01:52:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/22 01:52:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


✓ Loaded 'holidays_events.csv' as Spark DataFrame with 350 rows and 6 columns
✓ Loaded 'oil.csv' as Spark DataFrame with 1218 rows and 2 columns
✓ Loaded 'sample_submission.csv' as Spark DataFrame with 28512 rows and 2 columns
✓ Loaded 'stores.csv' as Spark DataFrame with 54 rows and 5 columns
✓ Loaded 'test.csv' as Spark DataFrame with 28512 rows and 5 columns


                                                                                

✓ Loaded 'train.csv' as Spark DataFrame with 3000888 rows and 6 columns
✓ Loaded 'transactions.csv' as Spark DataFrame with 83488 rows and 3 columns
