In [9]:
import os, sys
import json
from pyspark.sql import functions as F
from pyspark.sql.types import *
import pandas as pd
from sqlalchemy import create_engine, text
from pathlib import Path

# Project directories are resolved relative to the directory the notebook is
# launched from, so nothing here depends on a user-specific absolute path.
PROJECT_ROOT = Path(".").resolve()
WAREHOUSE_DIR = PROJECT_ROOT / "warehouse"
DATA_DIR = PROJECT_ROOT / "data"

# Create both directories up front; exist_ok keeps the cell safe to re-run.
WAREHOUSE_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR.mkdir(parents=True, exist_ok=True)

print("DATA_DIR:", DATA_DIR)
print("WAREHOUSE_DIR:", WAREHOUSE_DIR)


# Respect a HADOOP_HOME that is already set in the environment and only fall
# back to the local default install location when it is missing.
os.environ.setdefault("HADOOP_HOME", r"C:\hadoop-3.3.6")
HADOOP_BIN = os.path.join(os.environ["HADOOP_HOME"], "bin")

# Prepend the Hadoop bin directory only if it is not already on PATH, so that
# re-running this cell does not keep growing PATH with duplicate entries.
_current_path = os.environ.get("PATH", "")
if HADOOP_BIN not in _current_path.split(os.pathsep):
    os.environ["PATH"] = HADOOP_BIN + os.pathsep + _current_path


# Point PySpark at the same Python interpreter that is running this kernel.
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

print("HADOOP_HOME:", os.environ["HADOOP_HOME"])
print("Python:", sys.executable)

DATA_DIR: C:\Users\evasb_eqbyhhj\Documents\DS-2002-Capstone\notebooks\data
WAREHOUSE_DIR: C:\Users\evasb_eqbyhhj\Documents\DS-2002-Capstone\notebooks\warehouse
HADOOP_HOME: C:\hadoop-3.3.6
Python: c:\Users\evasb_eqbyhhj\anaconda3\envs\ds2002spark\python.exe


In [2]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Base builder: application name and a local master using all available cores.
builder = SparkSession.builder.appName("restaurant-lakehouse").master("local[*]")

# Point PySpark at the Python interpreter that is running this kernel.
builder = builder.config("spark.pyspark.python", sys.executable)
builder = builder.config("spark.pyspark.driver.python", sys.executable)

# Register the Delta Lake SQL extension and catalog implementation.
builder = builder.config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
builder = builder.config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

# Let the delta helper finish configuring the builder, then create (or reuse) the session.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Keep console output limited to warnings and above.
spark.sparkContext.setLogLevel("WARN")


In [4]:
# Connection settings for the source MySQL database.
# Values are read from environment variables so that no credential is stored
# in the notebook; non-secret settings fall back to the local defaults.
# Set MYSQL_PASSWORD before starting the kernel - with it unset the password is
# an empty string and the server is expected to reject the login.
# NOTE(review): a password was previously committed in this cell; it should be
# rotated on the database server.
mysql_args = {
    "uid": os.environ.get("MYSQL_USER", "root"),
    "pwd": os.environ.get("MYSQL_PASSWORD", ""),
    "hostname": os.environ.get("MYSQL_HOST", "localhost"),
    "port": int(os.environ.get("MYSQL_PORT", "3306")),
    "dbname": os.environ.get("MYSQL_DATABASE", "restaurant_oltp"),
}

In [5]:

def get_sql_dataframe(sql_query: str, **args) -> pd.DataFrame:
    """
    Run a SQL query against a MySQL database through SQLAlchemy and return
    the result as a Pandas DataFrame.

    Parameters
    ----------
    sql_query : str
        SQL text to execute. It is wrapped in ``text()`` before being handed
        to ``pd.read_sql``.
    **args
        Connection settings. The keys ``uid``, ``pwd``, ``hostname``,
        ``port`` and ``dbname`` are all required.

    Returns
    -------
    pd.DataFrame
        Whatever ``pd.read_sql`` returns for the query.

    Raises
    ------
    KeyError
        If one of the required connection keys is missing from ``args``.
    """
    from urllib.parse import quote

    # Percent-encode the credentials so characters such as '@', ':' or '/'
    # cannot be mistaken for URL delimiters in the connection string.
    user = quote(str(args["uid"]), safe="")
    password = quote(str(args["pwd"]), safe="")
    conn_str = (
        f"mysql+pymysql://{user}:{password}@"
        f"{args['hostname']}:{args['port']}/{args['dbname']}"
    )
    engine = create_engine(conn_str, pool_recycle=3600)

    try:
        with engine.connect() as conn:
            return pd.read_sql(text(sql_query), conn)
    finally:
        # A new engine is built on every call, so release its connection pool
        # once the query is done instead of leaving it to accumulate.
        engine.dispose()


In [6]:
# Extract every row of the customers table into a pandas DataFrame.
customers_pdf = get_sql_dataframe(sql_query="SELECT * FROM customers", **mysql_args)

# Report (rows, columns), then preview the first five records.
print(customers_pdf.shape)
customers_pdf.head(n=5)

(5, 9)


Unnamed: 0,customer_id,customer_code,first_name,last_name,email,phone,loyalty_tier,signup_date,birthdate
0,1,CUST-001,Emma,Johnson,emma.j@example.com,555-1111,Gold,2023-01-10,1998-04-12
1,2,CUST-002,Liam,Smith,liam.s@example.com,555-2222,Silver,2023-03-15,1995-09-22
2,3,CUST-003,Olivia,Brown,olivia.b@example.com,555-3333,Bronze,2024-01-02,2000-07-18
3,4,CUST-004,Noah,Davis,noah.d@example.com,555-4444,Gold,2022-11-08,1992-02-05
4,5,CUST-005,Ava,Wilson,ava.w@example.com,555-5555,Silver,2023-06-21,1999-11-30


In [7]:
# Move the pandas extract into Spark, then adjust the columns one step at a time.
customers_sdf = spark.createDataFrame(customers_pdf)

# Cast the key to an integer column.
customers_sdf = customers_sdf.withColumn("customer_id", F.col("customer_id").cast("int"))

# Convert both date-like columns to Spark dates.
customers_sdf = customers_sdf.withColumn("signup_date", F.to_date(F.col("signup_date")))
customers_sdf = customers_sdf.withColumn("birthdate", F.to_date(F.col("birthdate")))

# Add a created_at column holding the current timestamp.
customers_sdf = customers_sdf.withColumn("created_at", F.current_timestamp())

customers_sdf.show(n=5, truncate=False)


+-----------+-------------+----------+---------+--------------------+--------+------------+-----------+----------+--------------------------+
|customer_id|customer_code|first_name|last_name|email               |phone   |loyalty_tier|signup_date|birthdate |created_at                |
+-----------+-------------+----------+---------+--------------------+--------+------------+-----------+----------+--------------------------+
|1          |CUST-001     |Emma      |Johnson  |emma.j@example.com  |555-1111|Gold        |2023-01-10 |1998-04-12|2025-12-14 09:08:29.984037|
|2          |CUST-002     |Liam      |Smith    |liam.s@example.com  |555-2222|Silver      |2023-03-15 |1995-09-22|2025-12-14 09:08:29.984037|
|3          |CUST-003     |Olivia    |Brown    |olivia.b@example.com|555-3333|Bronze      |2024-01-02 |2000-07-18|2025-12-14 09:08:29.984037|
|4          |CUST-004     |Noah      |Davis    |noah.d@example.com  |555-4444|Gold        |2022-11-08 |1992-02-05|2025-12-14 09:08:29.984037|
|5    

In [8]:
# Location of the customer dimension table inside the warehouse directory.
dim_customer_path = str(WAREHOUSE_DIR.joinpath("dim_customer"))

# Write the customer DataFrame in Delta format, overwriting any existing data at that path.
customers_sdf.write.save(dim_customer_path, format="delta", mode="overwrite")

# Read the table back from disk and preview it to confirm the write.
spark.read.load(dim_customer_path, format="delta").show(n=5, truncate=False)

+-----------+-------------+----------+---------+--------------------+--------+------------+-----------+----------+-------------------------+
|customer_id|customer_code|first_name|last_name|email               |phone   |loyalty_tier|signup_date|birthdate |created_at               |
+-----------+-------------+----------+---------+--------------------+--------+------------+-----------+----------+-------------------------+
|3          |CUST-003     |Olivia    |Brown    |olivia.b@example.com|555-3333|Bronze      |2024-01-02 |2000-07-18|2025-12-14 09:08:56.76229|
|2          |CUST-002     |Liam      |Smith    |liam.s@example.com  |555-2222|Silver      |2023-03-15 |1995-09-22|2025-12-14 09:08:56.76229|
|1          |CUST-001     |Emma      |Johnson  |emma.j@example.com  |555-1111|Gold        |2023-01-10 |1998-04-12|2025-12-14 09:08:56.76229|
|5          |CUST-005     |Ava       |Wilson   |ava.w@example.com   |555-5555|Silver      |2023-06-21 |1999-11-30|2025-12-14 09:08:56.76229|
|4          |

WindowsPath('C:/Users/evasb_eqbyhhj/Documents/DS-2002-Capstone/notebooks/data/employees.csv')