In [0]:
%fs ls /Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/

In [0]:
%fs ls /Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/

In [0]:
import logging
import os
from datetime import datetime

# Run logging setup: every record goes both to the notebook output and to a
# timestamped file under ./logs, so each run leaves a trace on disk.

# Create logs directory if it doesn't exist (relative to the working directory)
os.makedirs('logs', exist_ok=True)

# The file name carries the start time at minute resolution; two runs started
# within the same minute therefore share one log file.
now = datetime.now()
log_filename = f"logs/run_{now.strftime('%Y%m%d_%H%M')}.log"

# Configure logging.
# force=True removes and closes any handlers already attached to the root
# logger before installing ours. Without it, basicConfig() is a silent no-op
# whenever the root logger is already configured (by the notebook host, or by
# an earlier run of this cell), and the file named above is never written.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_filename),
    ],
    force=True,
)

# Log start of run
logging.info("Run started.")
logging.info("Cluster/runtime info: AWS Serverless interactive cluster (terminated by inactivity)")

# Input paths inside the GitHub-connected Databricks repo
logging.info("menu file path: /Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/menu_items.csv")
logging.info("orders file path: /Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/order_details.csv")
logging.info("Log file: %s", log_filename)


In [0]:
import os, random, numpy as np
import subprocess
import hashlib
import json

# Fix random seeds so stochastic steps repeat from run to run.
# NOTE: hash randomization is decided when the interpreter starts, so setting
# PYTHONHASHSEED here does not change the running kernel; it only reaches
# child processes started afterwards, which inherit this environment.
os.environ['PYTHONHASHSEED'] = '0'
random.seed(0)
np.random.seed(0)
logging.info("Random seeds set to 0")

# Capture environment.
# pip is run through the interpreter executing this notebook so the snapshot
# describes this kernel's packages; a bare `pip` resolved by a shell may
# belong to a different Python installation.
import sys  # needed only here; the notebook does not import it elsewhere

try:
    freeze_result = subprocess.run(
        [sys.executable, "-m", "pip", "freeze"],
        capture_output=True,
        text=True,
        check=True,
    )
except (OSError, subprocess.CalledProcessError) as exc:
    # Do not claim success (or overwrite an earlier snapshot) when pip fails.
    logging.warning("Could not capture environment with pip freeze: %s", exc)
else:
    with open("requirements.txt", "w") as requirements_file:
        requirements_file.write(freeze_result.stdout)
    logging.info("Saved environment packages to requirements.txt")

# Compute SHA-256 hashes for input CSVs
def compute_sha256(file_path):
    """Return the hexadecimal SHA-256 digest of the file at ``file_path``.

    The file is opened in binary mode and fed to the hash in 4096-byte reads,
    so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        while True:
            chunk = stream.read(4096)
            if not chunk:  # read() returns b"" once the end of the file is reached
                break
            digest.update(chunk)
    return digest.hexdigest()

# Input CSVs whose contents are fingerprinted for this run.
data_files = [
    "/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/menu_items.csv",
    "/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/order_details.csv"
]

# Map each existing path to its SHA-256 digest; a missing path is reported
# with a warning and left out of the mapping.
hashes = {}
for csv_path in data_files:
    if not os.path.exists(csv_path):
        logging.warning(f"File not found: {csv_path}. Skipping hash computation.")
        continue
    hashes[csv_path] = compute_sha256(csv_path)

# Write the mapping next to the notebook as indented JSON.
with open("data_hashes.json", "w") as hashes_file:
    hashes_file.write(json.dumps(hashes, indent=2))

logging.info(f"Data hashes saved to data_hashes.json: {hashes}")


In [0]:
import pandas as pd

# Read both input CSVs from the repo's data_samples folder into DataFrames.
menu_csv_path = '/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/menu_items.csv'
orders_csv_path = '/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/data_samples/order_details.csv'
menu = pd.read_csv(menu_csv_path)
orders = pd.read_csv(orders_csv_path)

# Report the (rows, columns) shape of each table first ...
for caption, frame in (('menu_items shape:', menu), ('order_details shape:', orders)):
    print(caption, frame.shape)

# ... then preview the first rows of each, in the same order.
for frame in (menu, orders):
    display(frame.head())

In [0]:
# Clean basic issues.
# errors='coerce' turns unparseable dates into NaT instead of raising.
orders['order_date'] = pd.to_datetime(orders['order_date'], errors='coerce')
orders['order_time'] = orders['order_time'].str.strip()
# Nullable integer dtype: missing item ids stay <NA> rather than forcing floats.
# The join below requires this column, so it is converted unconditionally
# (a missing column raises KeyError either way).
orders['item_id'] = orders['item_id'].astype('Int64')

menu['item_name'] = menu['item_name'].str.strip()
menu['category'] = menu['category'].str.strip()

# Join on menu_items.menu_item_id = order_details.item_id
# how='left' keeps every order row, including rows with no matching menu item.
# validate='many_to_one' makes pandas raise MergeError if menu_item_id is
# duplicated in `menu`, instead of silently multiplying order rows.
etl_df = orders.merge(
    menu,
    left_on='item_id',
    right_on='menu_item_id',
    how='left',
    validate='many_to_one',
)

# Create tidy table with useful columns
etl_df = etl_df[['order_id', 'order_date', 'order_time', 'item_name', 'category', 'price']]
display(etl_df.head())

# Write the outputs into the repo's etl_output folder, creating it when absent.
import os
output_dir = '/Workspace/Users/gsc314@ensign.edu/csai382_lab_2_4_-GustavoC-/etl_output'
os.makedirs(output_dir, exist_ok=True)

# File name -> frame to export. The joined table is deliberately limited to
# its first 5 rows; the menu and orders tables are written in full.
exports = {
    'menu_items_loaded.csv': menu,
    'order_details_loaded.csv': orders,
    'etl_df_cleaned_joined_top5.csv': etl_df.head(5),
}
for file_name, frame in exports.items():
    frame.to_csv(f'{output_dir}/{file_name}', index=False)

print('Saved menu_items, order_details, and TOP 5 rows of cleaned/joined etl_df to etl_output directory.')


### Ethical Reflection

When working with data and AI, it is crucial to consider the ethical implications of our practices. Sensitive information, such as personal customer details or passwords, should never be written to log files, as logs may be accessible to others and could pose privacy risks. Ensuring reproducibility by capturing random seeds, environment details, and input file hashes supports accountability and fairness—allowing others to verify analyses and confirm consistent results. This is especially important when models or decisions impact individuals, as reproducible workflows help reduce errors and bias. By following these practices, we foster trust and transparency in data-driven projects, upholding ethical standards and protecting those affected by our work.