## User registrations and app-loaded exploration Notebook

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkFiles

# Create the SparkSession used by every cell below, or reuse one that is
# already running (getOrCreate). The application name is what appears in the
# Spark UI and logs, so it describes this notebook rather than the unrelated
# "wikipedia pageview" label it previously carried.
spark = (
    SparkSession.builder
    .appName("User registrations and app-loaded exploration")
    .getOrCreate()
)

Let's take a quick look at the data and let Spark infer the schema from the input.

In [2]:
# Development input: read the JSON data at `data_input` without supplying a
# schema, so Spark infers one from the records.
data_input = "./data-input"
df = spark.read.format("json").load(data_input)
# Print the first 10 rows without truncating long cell values.
df.show(n=10, truncate=False)

+---------------+--------+-------+-----------+----------+-------------------+------------------------+
|browser_version|campaign|channel|device_type|event     |initiator_id       |timestamp               |
+---------------+--------+-------+-----------+----------+-------------------+------------------------+
|null           |null    |null   |null       |registered|3074457347135400447|2020-01-08T06:21:14.000Z|
|79.0           |null    |null   |desktop    |app_loaded|3074457345816644047|2020-01-08T06:24:42.000Z|
|               |null    |null   |tablet-app |app_loaded|3074457346184244610|2020-01-08T06:25:10.000Z|
|79.0           |null    |null   |desktop    |app_loaded|3074457347135385819|2020-01-08T06:25:11.000Z|
|78.0           |null    |null   |desktop    |app_loaded|3074457346246864126|2020-01-08T06:27:23.000Z|
|76.0           |null    |null   |desktop    |app_loaded|3074457346612629694|2020-01-08T17:54:39.000Z|
|79.0           |null    |null   |desktop    |app_loaded|3074457347100151

In [3]:
# Display the schema Spark inferred for the JSON input read above.
df.schema

StructType([StructField('browser_version', StringType(), True), StructField('campaign', StringType(), True), StructField('channel', StringType(), True), StructField('device_type', StringType(), True), StructField('event', StringType(), True), StructField('initiator_id', LongType(), True), StructField('timestamp', StringType(), True)])

Define the schema explicitly, based on the exploration above:
`|browser_version|campaign|channel|device_type|event|initiator_id|timestamp|`


In [4]:
'''
StructType([StructField('browser_version', StringType(), True), StructField('campaign', StringType(), True), StructField('channel', StringType(), True), StructField('device_type', StringType(), True), StructField('event', StringType(), True), StructField('initiator_id', LongType(), True), StructField('timestamp', StringType(), True)])
'''
from pyspark.sql.types import (
    LongType,
    StringType,
    StructType,
    TimestampType,
)
# Explicit schema for the input events. Every column is nullable; the column
# names match the inferred schema, except that `timestamp` is declared as a
# TimestampType instead of the StringType Spark inferred.
input_data_schema = (
    StructType()
    .add("browser_version", StringType(), nullable=True)
    .add("campaign", StringType(), nullable=True)
    .add("channel", StringType(), nullable=True)
    .add("device_type", StringType(), nullable=True)
    .add("event", StringType(), nullable=True)
    .add("initiator_id", LongType(), nullable=True)
    .add("timestamp", TimestampType(), nullable=True)
)


Read the data again, this time applying the explicit schema

In [5]:
# Re-read the same input, this time applying `input_data_schema` so that
# `timestamp` is parsed into a timestamp column. No `inferSchema` option is
# set: supplying an explicit schema already disables inference, and
# `inferSchema` is an option of the CSV reader, not the JSON one.
df = spark.read.schema(input_data_schema).json(data_input)

# Preview the first 5 rows to confirm the typed columns.
df.show(5)

+---------------+--------+-------+-----------+----------+-------------------+-------------------+
|browser_version|campaign|channel|device_type|     event|       initiator_id|          timestamp|
+---------------+--------+-------+-----------+----------+-------------------+-------------------+
|           null|    null|   null|       null|registered|3074457347135400447|2020-01-08 06:21:14|
|           79.0|    null|   null|    desktop|app_loaded|3074457345816644047|2020-01-08 06:24:42|
|               |    null|   null| tablet-app|app_loaded|3074457346184244610|2020-01-08 06:25:10|
|           79.0|    null|   null|    desktop|app_loaded|3074457347135385819|2020-01-08 06:25:11|
|           78.0|    null|   null|    desktop|app_loaded|3074457346246864126|2020-01-08 06:27:23|
+---------------+--------+-------+-----------+----------+-------------------+-------------------+
only showing top 5 rows



Split the dataframe into two sub-dataframes: user_registration and app_loaded

In [12]:
# Split the events by type. Each sub-dataframe keeps only the rows of its own
# event and the columns relevant to it; without the filter both frames would
# contain every event row and differ only in their last column.

# Registration events: who registered, when, and through which channel.
user_registration_df = (
    df.filter(df["event"] == "registered")
    .select("event", "timestamp", "initiator_id", "channel")
)

# App-load events: who loaded the app, when, and on which device type.
app_loaded_df = (
    df.filter(df["event"] == "app_loaded")
    .select("event", "timestamp", "initiator_id", "device_type")
)

Transform the data, then write the dataframes partitioned by day in Parquet format

In [13]:
from pyspark.sql.functions import (
    col,
    date_format
)

def _add_day_and_rename_time(events_df):
    """Append a day column and rename `timestamp` to `time`.

    Adds `derived_tstamp_day`, the `timestamp` column formatted as
    "yyyy-MM-dd", then renames `timestamp` to `time`. Returns the new
    dataframe; the input dataframe is not modified.
    """
    with_day = events_df.withColumn(
        "derived_tstamp_day", date_format(col("timestamp"), "yyyy-MM-dd")
    )
    return with_day.withColumnRenamed("timestamp", "time")


# NOTE: both names are rebound to their transformed versions, which no longer
# have a `timestamp` column, so re-run the split cell before re-running this one.
user_registration_df = _add_day_and_rename_time(user_registration_df)
app_loaded_df = _add_day_and_rename_time(app_loaded_df)

Load (write) the data in Parquet format

In [14]:
write_path = "data-output/user_registration"

# Collapse to a single partition, then write snappy-compressed Parquet
# partitioned by `derived_tstamp_day`, replacing any existing output at the path.
(
    user_registration_df.repartition(1)
    .write.format("parquet")
    .option("compression", "snappy")
    .mode("overwrite")
    .partitionBy("derived_tstamp_day")
    .save(write_path)
)

In [15]:
write_path = "data-output/app_loaded"

# Collapse to a single partition, then write snappy-compressed Parquet
# partitioned by `derived_tstamp_day`, replacing any existing output at the path.
(
    app_loaded_df.repartition(1)
    .write.format("parquet")
    .option("compression", "snappy")
    .mode("overwrite")
    .partitionBy("derived_tstamp_day")
    .save(write_path)
)