# S3 и Apache Iceberg
Работа с S3-хранилищем и Iceberg-таблицами

In [None]:
import os
from spark_config import get_spark_session

# Build the Spark session through the project helper, overriding the warehouse
# location of the "iceberg" catalog so it points at the S3 bucket path below.
# NOTE(review): the S3 endpoint/credentials and the catalog implementation are
# presumably configured inside get_spark_session — confirm.
iceberg_extra_configs = {
    "spark.sql.catalog.iceberg.warehouse": "s3a://warehouse/iceberg",
}
spark = get_spark_session(
    app_name="S3IcebergDemo",
    extra_configs=iceberg_extra_configs,
)

In [None]:
# Create sample data
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from datetime import datetime

# Demo orders as (order_id, product, amount, order_date) tuples.
data = [
    ("order_1", "product_A", 100, datetime(2024, 1, 15)),
    ("order_2", "product_B", 250, datetime(2024, 1, 15)),
    ("order_3", "product_A", 75, datetime(2024, 1, 16)),
    ("order_4", "product_C", 300, datetime(2024, 1, 16)),
]

# Explicit schema, one field per tuple position; every column is declared
# non-nullable (third StructField argument is False).
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("order_id", StringType()),
        ("product", StringType()),
        ("amount", IntegerType()),
        ("order_date", TimestampType()),
    )
])

# Materialise the rows as a DataFrame and print them.
df = spark.createDataFrame(data, schema)
df.show()

In [None]:
# Persist the demo orders to S3 in Parquet format; "overwrite" replaces
# whatever is already stored at the target path.
df.write.parquet("s3a://warehouse/raw/orders", mode="overwrite")

In [None]:
# Load the Parquet files from the same S3 path and print them to check the
# round trip.
df_from_s3 = spark.read.format("parquet").load("s3a://warehouse/raw/orders")
df_from_s3.show()

In [None]:
# Create (or replace) the Iceberg table iceberg.db.orders from the demo
# orders, partitioned by product.
from pyspark.sql.functions import col

# Make sure the target namespace exists before writing: some catalog
# implementations reject table creation in a namespace that was never
# created. IF NOT EXISTS keeps this cell safe to re-run.
spark.sql("CREATE NAMESPACE IF NOT EXISTS iceberg.db")

# partitionedBy() is documented to take Column expressions (columns or
# partition transforms), so pass col("product") rather than a bare string.
(
    df.writeTo("iceberg.db.orders")
    .partitionedBy(col("product"))
    .createOrReplace()
)

In [None]:
# Read every row back from the Iceberg table and print the result.
iceberg_orders_df = spark.sql("SELECT * FROM iceberg.db.orders")
iceberg_orders_df.show()

In [None]:
# Inspect the table's "history" metadata table (input for time travel).
orders_history_df = spark.sql("SELECT * FROM iceberg.db.orders.history")
orders_history_df.show()

In [None]:
# One additional order, built with the same schema as the initial batch.
new_data = [("order_5", "product_A", 150, datetime(2024, 1, 17))]
new_df = spark.createDataFrame(new_data, schema)

# Append the new row to the existing Iceberg table (existing rows are kept).
orders_table_writer = new_df.writeTo("iceberg.db.orders")
orders_table_writer.append()

In [None]:
# List the table's "snapshots" metadata table; truncate=False prints the
# full column values instead of shortening long ones.
orders_snapshots_df = spark.sql("SELECT * FROM iceberg.db.orders.snapshots")
orders_snapshots_df.show(truncate=False)

In [None]:
# Shut down the Spark session now that the demo is finished.
spark.stop()