# Libraries


In [1]:
from delta import configure_spark_with_delta_pip
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, substring, input_file_name, current_date, year


from pyspark.sql.types import (
    IntegerType,
    LongType,
    StructField,
    StructType,
    DateType,
    DoubleType,
    StringType,
    TimestampType,
)

# Spark Session


In [2]:
# Configure a Delta Lake-aware session: register Delta's SQL extension and
# route the default `spark_catalog` through Delta's catalog implementation.
builder = (
    SparkSession.builder
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config(
        "spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    )
)

# Let the delta-spark helper finish configuring the builder, then reuse an
# already-running session if there is one or start a new one.
spark = configure_spark_with_delta_pip(builder).getOrCreate()

# Loading Data into Spark


In [6]:
# Explicit schema for the organizations CSV. With header=True the file's
# header row is consumed and the column names below are used instead.
#
# Fields are left nullable (the StructField default) on purpose: Spark relaxes
# a user-supplied schema to nullable when reading file-based sources such as
# CSV, and in the default PERMISSIVE mode a value that fails to parse becomes
# NULL. Declaring nullable=False would advertise a NOT NULL guarantee that the
# resulting DataFrame does not actually carry. If a column must be non-null,
# validate it explicitly after loading.
my_user_schema = StructType(
    [
        StructField("index", IntegerType()),
        StructField("organization_id", StringType()),
        StructField("name", StringType()),
        StructField("website", StringType()),
        StructField("country", StringType()),
        StructField("description", StringType()),
        StructField("founded", IntegerType()),
        StructField("industry", StringType()),
        StructField("employee_no", IntegerType()),
    ]
)

# NOTE(review): despite the `user` naming, this reads the organizations
# dataset — names kept as-is because later cells may reference them.
users_df = (
    spark.read.option("header", True)
    .schema(my_user_schema)
    .csv("./datasets/organizations-2000000.csv")
)


# Explicit schema for the semicolon-delimited click log. With header=True the
# file's header row is consumed and the column names below are used instead.
#
# Fields are left nullable (the StructField default) on purpose: Spark relaxes
# a user-supplied schema to nullable when reading file-based sources such as
# CSV, and in the default PERMISSIVE mode a value that fails to parse becomes
# NULL. Declaring nullable=False would advertise a NOT NULL guarantee that the
# resulting DataFrame does not actually carry. If a column must be non-null,
# validate it explicitly after loading.
click_data_schema = StructType(
    [
        StructField("session_id", IntegerType()),
        StructField("IPID", IntegerType()),
        StructField("timestamp", TimestampType()),
        StructField("VHOST", StringType()),
        StructField("URL_FILE", StringType()),
        StructField("PAGE_NAME", StringType()),
        StructField("REF_URL_category", StringType()),
        StructField("page_load_error", IntegerType()),
        StructField("page_action_detail", StringType()),
        StructField("tip", StringType()),
        StructField("service_detail", StringType()),
        StructField("xps_info", StringType()),
        StructField("page_action_detail_EN", StringType()),
        StructField("service_detail_EN", StringType()),
        StructField("tip_EN", StringType()),
    ]
)

# The source file uses ';' as the field separator rather than the default ','.
click_data_df = (
    spark.read.option("header", True)
    .option("delimiter", ";")
    .schema(click_data_schema)
    .csv("./datasets/BPI2016_Clicks_NOT_Logged_In.csv")
)

In [17]:
# Quick sanity check that timestamps parsed: earliest timestamp among the 100
# rows kept by limit(). This is NOT the minimum over the whole dataset.
(
    click_data_df
    .limit(100)
    .agg({"timestamp": "min"})
    .show(truncate=False)
)

+-----------------------+
|min(timestamp)         |
+-----------------------+
|2015-09-08 09:28:54.777|
+-----------------------+



In [None]:
{}