In [7]:
from datetime import date
from pyspark.sql.types import *
from pyspark.sql.functions import lit
import shutil
import os
import subprocess

In [8]:
# Notebook configuration.
#
# skip_reload: set to True to keep the existing database and Delta files,
# e.g. when you only want to alter some of the churn label prediction logic
# without rerunning the whole notebook.
skip_reload = False

# Use a personalized database name here if you wish to avoid interfering with
# other users who might be running this accelerator in the same workspace.
database_name = 'kkbox_churn'

# Root folder for the raw CSV files and the Delta ("silver") output.
# expanduser('~') is used instead of os.getenv('HOME') so an unset HOME does
# not silently produce the literal path "None/databricks/kkbox_churn".
data_dir = f"{os.path.expanduser('~')}/databricks/kkbox_churn"

In [None]:
# Stop any Spark session left over from a previous run so that the next cell
# can build a fresh one. On a freshly started kernel there is no `spark`
# variable yet, so the cleanup is skipped instead of raising NameError.
if 'spark' in globals():
  spark.stop()  # Properly stop Spark
  del spark     # Delete the variable

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session used by the rest of the notebook.
# The Delta Lake package is pulled in through spark.jars.packages; the SQL
# extension and catalog settings are what enable Delta DDL such as
# "CREATE TABLE ... USING DELTA", which the load cells below rely on.
# NOTE(review): the Delta artifact version must match the installed Spark /
# Scala version - confirm against the Delta Lake compatibility matrix.
spark = (
  SparkSession
    .builder
    .appName("ChurnCluster")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "56g")
    .config("spark.driver.memory", "56g")
    .getOrCreate()
  )

# expose the session's app name and master URL as environment variables
os.environ["SPARK_APP_NAME"] = spark.conf.get("spark.app.name")
os.environ["SPARK_MASTER"] = spark.conf.get("spark.master")

print("Spark Version:", spark.version)

In [None]:
# Prepare the database and (unless skip_reload is set) rebuild the members
# table: read the members CSV, persist it as Delta files under data_dir and
# register a table on top of those files.
if skip_reload:
  # create database to house SQL tables
  _ = spark.sql(f'CREATE DATABASE IF NOT EXISTS {database_name}')
  _ = spark.sql(f'USE {database_name}')
else:
  # delete the old database if needed
  _ = spark.sql(f'DROP DATABASE IF EXISTS {database_name} CASCADE')
  _ = spark.sql(f'CREATE DATABASE {database_name}')
  _ = spark.sql(f'USE {database_name}')

  # drop any old delta lake files that might have been created
  folder_path = f'{data_dir}/silver/members'
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # members dataset schema
  member_schema = StructType([
    StructField('msno', StringType()),
    StructField('city', IntegerType()),
    StructField('bd', IntegerType()),
    StructField('gender', StringType()),
    StructField('registered_via', IntegerType()),
    StructField('registration_init_time', DateType())
    ])

  # read data from csv
  members = (
    spark
      .read
      .csv(
        f'{data_dir}/members/members_v3.csv',
        schema=member_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  (
    members
      .write
      .format('delta')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so the table always points at the files written above
  _ = spark.sql(f'''
      CREATE TABLE members
      USING DELTA
      LOCATION '{folder_path}'
      ''')


In [None]:
# Preview the first rows of the members table. The database comes from the
# database_name setting so a personalized database is queried correctly.
result = spark.sql(f"SELECT * FROM {database_name}.members LIMIT 10")
result.show()

In [None]:
# Rebuild the transactions table (skipped when skip_reload is set): read the
# transactions CSV, persist it as Delta files partitioned by transaction_date
# and register a table on top of those files.
if not skip_reload:

  # drop any old delta lake files that might have been created
  folder_path = f'{data_dir}/silver/transactions'
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # transaction dataset schema
  transaction_schema = StructType([
    StructField('msno', StringType()),
    StructField('payment_method_id', IntegerType()),
    StructField('payment_plan_days', IntegerType()),
    StructField('plan_list_price', IntegerType()),
    StructField('actual_amount_paid', IntegerType()),
    StructField('is_auto_renew', IntegerType()),
    StructField('transaction_date', DateType()),
    StructField('membership_expire_date', DateType()),
    StructField('is_cancel', IntegerType())
    ])

  # read data from csv
  transactions = (
    spark
      .read
      .csv(
        f'{data_dir}/transactions/transactions.csv',
        schema=transaction_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  (
    transactions
      .write
      .format('delta')
      .partitionBy('transaction_date')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so the table always points at the files written above
  _ = spark.sql(f'''
      CREATE TABLE transactions
      USING DELTA
      LOCATION '{folder_path}'
      ''')

In [None]:
# Preview the first rows of the transactions table. The database comes from
# the database_name setting so a personalized database is queried correctly.
result = spark.sql(f"SELECT * FROM {database_name}.transactions LIMIT 10")
result.show()

In [None]:
# Rebuild the user_logs table (skipped when skip_reload is set): read the
# user logs CSV, persist it as Delta files partitioned by date and register a
# table on top of those files.
if not skip_reload:
  # drop any old delta lake files that might have been created
  folder_path = f'{data_dir}/silver/user_logs'
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # user logs dataset schema
  user_logs_schema = StructType([
    StructField('msno', StringType()),
    StructField('date', DateType()),
    StructField('num_25', IntegerType()),
    StructField('num_50', IntegerType()),
    StructField('num_75', IntegerType()),
    StructField('num_985', IntegerType()),
    StructField('num_100', IntegerType()),
    StructField('num_uniq', IntegerType()),
    StructField('total_secs', FloatType())
    ])

  # read data from csv
  user_logs = (
    spark
      .read
      .csv(
        f'{data_dir}/user_logs/user_logs.csv',
        schema=user_logs_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  (
    user_logs
      .write
      .format('delta')
      .partitionBy('date')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so the table always points at the files written above
  _ = spark.sql(f'''
    CREATE TABLE IF NOT EXISTS user_logs
    USING DELTA
    LOCATION '{folder_path}'
    ''')

In [None]:
# Delete the training label files, if they exist, before they are re-created.
# This is done in Python rather than a %%sh cell: data_dir is a Python
# variable, so a shell "$data_dir" would expand to an empty string (unless it
# is exported to the environment elsewhere) and target "/silver/train".
train_path = f'{data_dir}/silver/train'
if os.path.exists(train_path):
  shutil.rmtree(train_path)

In [None]:
# Remove the training-label table from the current database, if present,
# so that it can be created again from scratch.
drop_train_sql = 'DROP TABLE IF EXISTS train'
_ = spark.sql(drop_train_sql)

In [3]:
%%sh -e

# Start a Spark standalone master and one worker as detached, auto-removed
# containers attached to the Docker network "spark-net".
# The two commented commands below look like one-time setup (network creation
# and a custom image build) - presumably the network must exist before this
# cell is run; TODO confirm.
# docker network create spark-net
# docker build -t dini-spark:3.5.4 .

# Master container: publishes ports 8080, 7077 and 4040 on the host.
# NOTE: the trailing backslash on the last command line continues onto the
# commented-out "--packages" line, so that option is NOT passed to the command.
docker run -d --rm --network spark-net --name spark-master \
    -p 8080:8080 -p 7077:7077 -p 4040:4040 \
    bitnami/spark spark-class org.apache.spark.deploy.master.Master \
    # --packages io.delta:delta-core_2.13:2.4.0

# Worker container: registers with the master at spark://spark-master:7077.
# As above, the commented-out "--packages" line after the backslash is ignored.
docker run -d --rm --network spark-net --name spark-worker \
    --env SPARK_MODE=worker \
    --env SPARK_MASTER_URL=spark://spark-master:7077 \
    bitnami/spark spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 \
    # --packages io.delta:delta-core_2.13:2.4.0

# Alternative (disabled): the same master/worker pair using the apache/spark image.
# docker run -d --rm --network spark-net --name spark-master \
#     -p 8080:8080 -p 7077:7077 -p 4040:4040 \
#     apache/spark spark-class org.apache.spark.deploy.master.Master \
#     --packages io.delta:delta-core_2.12:2.3.0

# docker run -d --rm --network spark-net --name spark-worker \
#     --env SPARK_MODE=worker \
#     --env SPARK_MASTER_URL=spark://spark-master:7077 \
#     apache/spark spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077 \
#     --packages io.delta:delta-core_2.12:2.3.0


e3885f44e867279da8f2bccd336017454ceececd5e32b0053ce7bb68babc0327
ecc0bdeac27a5c6e65c348ec477db82f6e5fc6afdad31600e8b1ef4381e2f67f


In [None]:
%%sh -e

# Run scripts/generate_training_labels.scala inside a throw-away Spark
# container using spark-shell in local mode.
#   - the kkbox_churn data folder under $HOME is mounted at /opt/spark/work/kkbox_churn
#     ($HOME replaces a hardcoded user home so the cell works for any user)
#   - the current working directory is mounted at /opt/bitnami/spark/work,
#     which is where the script path passed to -i is resolved
#   - "local[*]" is quoted so the shell never treats it as a glob pattern
# NOTE(review): the image is untagged, so the Spark/Scala version it ships is
# not pinned; the Delta artifact given to --packages must match it. The output
# recorded for this cell shows a DeltaDataSource ServiceConfigurationError from
# an earlier package choice - confirm the pairing before relying on this cell.
docker run --rm --network spark-net \
    -v "$HOME/databricks/kkbox_churn:/opt/spark/work/kkbox_churn" \
    -v "$PWD:/opt/bitnami/spark/work" \
    bitnami/spark spark-shell --master "local[*]" \
    --packages io.delta:delta-core_2.12:1.2.1 \
    -i /opt/bitnami/spark/work/scripts/generate_training_labels.scala

[38;5;6mspark [38;5;5m08:08:31.81 [0m[38;5;2mINFO [0m ==> 
[38;5;6mspark [38;5;5m08:08:31.81 [0m[38;5;2mINFO [0m ==> [1mWelcome to the Bitnami spark container[0m
[38;5;6mspark [38;5;5m08:08:31.81 [0m[38;5;2mINFO [0m ==> Subscribe to project updates by watching [1mhttps://github.com/bitnami/containers[0m
[38;5;6mspark [38;5;5m08:08:31.82 [0m[38;5;2mINFO [0m ==> Did you know there are enterprise versions of the Bitnami catalog? For enhanced secure software supply chain features, unlimited pulls from Docker, LTS support, or application customization, see Bitnami Premium or Tanzu Application Catalog. See https://www.arrow.com/globalecs/na/vendors/bitnami/ for more information.
[38;5;6mspark [38;5;5m08:08:31.82 [0m[38;5;2mINFO [0m ==> 



:: loading settings :: url = jar:file:/opt/bitnami/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /opt/bitnami/spark/.ivy2/cache
The jars for the packages stored in: /opt/bitnami/spark/.ivy2/jars
io.delta#delta-core_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f539d592-96ef-4162-8b73-d9ea1b728e95;1.0
	confs: [default]
	found io.delta#delta-core_2.13;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
downloading https://repo1.maven.org/maven2/io/delta/delta-core_2.13/2.4.0/delta-core_2.13-2.4.0.jar ...
	[SUCCESSFUL ] io.delta#delta-core_2.13;2.4.0!delta-core_2.13.jar (1559ms)
downloading https://repo1.maven.org/maven2/io/delta/delta-storage/2.4.0/delta-storage-2.4.0.jar ...
	[SUCCESSFUL ] io.delta#delta-storage;2.4.0!delta-storage.jar (473ms)
downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.9.3/antlr4-runtime-4.9.3.jar ...
	[SUCCESSFUL ] org.antlr#antlr4-runtime;4.9.3!antlr4-runtime.jar (411ms)
:: resolution report :: resolve 46

Spark context Web UI available at http://949af931d6cc:4040
Spark context available as 'sc' (master = local[*], app id = local-1739606923278).
Spark session available as 'spark'.
java.util.ServiceConfigurationError: org.apache.spark.sql.sources.DataSourceRegister: org.apache.spark.sql.delta.sources.DeltaDataSource Unable to get public no-arg constructor
  at java.base/java.util.ServiceLoader.fail(Unknown Source)
  at java.base/java.util.ServiceLoader.getConstructor(Unknown Source)
  at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNextService(Unknown Source)
  at java.base/java.util.ServiceLoader$LazyClassPathLookupIterator.hasNext(Unknown Source)
  at java.base/java.util.ServiceLoader$2.hasNext(Unknown Source)
  at java.base/java.util.ServiceLoader$3.hasNext(Unknown Source)
  at scala.collection.convert.Wrappers$JIteratorWrapper.hasNext(Wrappers.scala:45)
  at scala.collection.Iterator.foreach(Iterator.scala:943)
  at scala.collection.Iterator.foreach$(Iterator.scala

In [None]:
%%sh -e
# Print the version of the Scala installation found on the host PATH.
# (The cell magic has to be the first line of the cell to be recognised.)
scala --version