In [None]:
import os
from pathlib import Path
from faker import Faker
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.window import Window


In [None]:
# --- Runtime configuration --------------------------------------------------
# Each os.getenv() call falls back to its default when the variable is unset,
# so the notebook runs unchanged without any environment setup.
SPARK_APP_NAME = os.getenv('SPARK_APP_NAME', 'spark-comprehensive-tutorial')

# Root folder of the tutorial. Read from the environment (like the app name
# and the master URL) so the notebook is not tied to one absolute path.
BASE_DIR = Path(os.getenv('TUTORIAL_BASE_DIR', '/workspace/notebooks'))
OUTPUT_DIR = BASE_DIR / 'tutorial_output' / 'spark_only'
RAW_DIR = OUTPUT_DIR / 'raw'
CURATED_DIR = OUTPUT_DIR / 'curated'
# NOTE: the directories are intentionally not created here; only the paths
# are defined in this cell.

# --- Spark session ----------------------------------------------------------
# getOrCreate() hands back an already running session if there is one,
# otherwise it builds a new one from the settings below.
spark = (
    SparkSession.builder
    .appName(SPARK_APP_NAME)
    .master(os.getenv('SPARK_MASTER', 'local[*]'))
    # Number of partitions used for shuffles (joins, aggregations).
    .config('spark.sql.shuffle.partitions', '8')
    # Session time zone is pinned to UTC.
    .config('spark.sql.session.timeZone', 'UTC')
    .getOrCreate()
)

# Only log warnings and errors from here on.
spark.sparkContext.setLogLevel('WARN')
print('Spark Version:', spark.version)
print('Output Dir:', OUTPUT_DIR)


## 1) Rohdaten erzeugen (Customers, Products, Orders, Order Items)

In [None]:
# --- Deterministic synthetic source data ------------------------------------
# Every row below is derived from its index with modular arithmetic, so the
# generated tables are identical on every run.

# Table sizes. Loop bounds AND the foreign keys further down are derived from
# these constants, so changing a size keeps every generated reference valid
# (no order pointing at a missing customer, no order without items).
N_CUSTOMERS = 200
N_PRODUCTS = 100
N_ORDERS = 1200
MAX_LINES_PER_ORDER = 4

# Seeded Faker instance. It is not used in this cell; it is kept because
# later cells may rely on it -- TODO confirm.
fake = Faker('en_US')
Faker.seed(42)

countries = ['DE', 'US', 'IN', 'JP', 'SE']
segments = ['SMB', 'Enterprise', 'Consumer']
categories = ['Hardware', 'Software', 'Accessories']

# Customers 1..N_CUSTOMERS; country and segment cycle through their lists.
customers = [
    {
        'customer_id': i,
        'country': countries[i % len(countries)],
        'segment': segments[i % len(segments)],
    }
    for i in range(1, N_CUSTOMERS + 1)
]

# Products 1..N_PRODUCTS; unit_price cycles through 30 values, 3.5 .. 105.0.
products = [
    {
        'product_id': i,
        'product_name': f'product_{i:03d}',
        'category': categories[i % len(categories)],
        'unit_price': float((i % 30 + 1) * 3.5),
    }
    for i in range(1, N_PRODUCTS + 1)
]

# Orders 1..N_ORDERS. Every fifth order is CANCELLED, the rest are PAID.
orders = []
for i in range(1, N_ORDERS + 1):
    day = (i % 28) + 1  # day of month, 1..28
    orders.append({
        'order_id': i,
        # Always within 1..N_CUSTOMERS, i.e. an existing customer_id.
        'customer_id': (i % N_CUSTOMERS) + 1,
        'status': 'PAID' if i % 5 != 0 else 'CANCELLED',
        # Built as a string here; converted with to_timestamp() below.
        'order_ts': f'2026-01-{day:02d} {(i % 24):02d}:{(i % 60):02d}:00',
    })

# Order items: each order gets 1..MAX_LINES_PER_ORDER lines.
items = []
for order_id in range(1, N_ORDERS + 1):
    line_count = (order_id % MAX_LINES_PER_ORDER) + 1
    for line in range(1, line_count + 1):
        # Always within 1..N_PRODUCTS, i.e. an existing product_id.
        product_id = ((order_id * line) % N_PRODUCTS) + 1
        quantity = (line % 5) + 1
        items.append({
            'order_id': order_id,
            'line_id': line,
            'product_id': product_id,
            'quantity': quantity,
        })

# No schema is passed, so Spark infers it from the dict rows.
customers_df = spark.createDataFrame(customers)
products_df = spark.createDataFrame(products)
orders_df = spark.createDataFrame(orders).withColumn('order_ts', F.to_timestamp('order_ts'))
items_df = spark.createDataFrame(items)

# Quick look at the first rows of each table.
customers_df.show(5, truncate=False)
orders_df.show(5, truncate=False)
items_df.show(5, truncate=False)
products_df.show(5, truncate=False)
