In [7]:
from datetime import date
from pyspark.sql.types import *
from pyspark.sql.functions import lit
import shutil
import os
import subprocess

In [20]:
# Notebook-wide configuration: everything a reader might want to tune lives here.

# Set to True to keep the already-loaded tables, e.g. when you only want to
# alter some of the churn label prediction logic without rerunning the whole
# notebook. False drops and rebuilds the database from the CSV files.
skip_reload = False

# Use a personalized database name here if you wish to avoid interfering with
# other users who might be running this accelerator in the same workspace.
database_name = 'kkbox_churn'

# Root folder holding the raw CSV inputs and the generated delta lake files.
# expanduser('~') resolves the home directory even when the HOME environment
# variable is unset, where os.getenv('HOME') would return None and produce the
# bogus path "None/databricks/kkbox_churn".
data_dir = f"{os.path.expanduser('~')}/databricks/kkbox_churn"

In [None]:
# Release any SparkSession left over from an earlier run so the next cell can
# build a fresh one with its own settings. On a freshly started kernel no
# `spark` variable exists yet, so this has to be a no-op instead of raising
# NameError (otherwise Restart & Run All stops here).
if 'spark' in globals():
  spark.stop()  # Properly stop Spark
  del spark     # Delete the variable

In [None]:
from pyspark.sql import SparkSession

# Build the SparkSession used by the rest of the notebook.
# - spark.jars.packages pulls in the Delta Lake runtime.
# - spark.sql.extensions / spark.sql.catalog.spark_catalog enable Delta's SQL
#   support; they mirror the --conf flags passed to spark-shell in the label
#   generation cell and are what the Delta docs require for statements such as
#   CREATE TABLE ... USING DELTA.
# - executor/driver memory are sized for a large single machine; lower them if
#   the host has less RAM.
spark = (
    SparkSession.builder
    .appName("ChurnCluster")
    .config("spark.jars.packages", "io.delta:delta-core_2.12:1.2.1")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.executor.memory", "56g")
    .config("spark.driver.memory", "56g")
    .getOrCreate()
)

# expose the session's app name and master URL to child processes (e.g. %%sh cells)
os.environ["SPARK_APP_NAME"] = spark.conf.get("spark.app.name")
os.environ["SPARK_MASTER"] = spark.conf.get("spark.master")

print("Spark Version:", spark.version)

In [None]:
# Prepare the database and (unless skip_reload is set) load the members CSV
# into a delta lake table named `members`.
if skip_reload:
  # keep whatever is already loaded: just make sure the database exists and is selected
  _ = spark.sql(f'CREATE DATABASE IF NOT EXISTS {database_name}')
  _ = spark.sql(f'USE {database_name}')
else:
  # full reload: delete the old database if needed and start from an empty one
  _ = spark.sql(f'DROP DATABASE IF EXISTS {database_name} CASCADE')
  _ = spark.sql(f'CREATE DATABASE {database_name}')
  _ = spark.sql(f'USE {database_name}')

  # single location used for cleanup, the delta write and the table definition
  folder_path = f'{data_dir}/silver/members'

  # drop any old delta lake files that might have been created
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # members dataset schema
  member_schema = StructType([
    StructField('msno', StringType()),
    StructField('city', IntegerType()),
    StructField('bd', IntegerType()),
    StructField('gender', StringType()),
    StructField('registered_via', IntegerType()),
    StructField('registration_init_time', DateType())
    ])

  # read data from csv (dates are stored as yyyyMMdd)
  members = (
    spark
      .read
      .csv(
        f'{data_dir}/members/members_v3.csv',
        schema=member_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  (
    members
      .write
      .format('delta')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so it always matches where the files were just written
  # (it used to be a hard-coded /home/<user>/... path)
  _ = spark.sql(f'''
      CREATE TABLE members 
      USING DELTA 
      LOCATION '{folder_path}'
      ''')


In [None]:
# Preview a few rows to confirm the members table is queryable. The database
# is taken from the config cell so a personalized database_name keeps working.
result = spark.sql(f"SELECT * FROM {database_name}.members LIMIT 10")
result.show()

In [None]:
# Load the transactions CSV into a delta lake table named `transactions`,
# partitioned by transaction_date. Skipped entirely when skip_reload is set.
if not skip_reload:

  # single location used for cleanup, the delta write and the table definition
  folder_path = f'{data_dir}/silver/transactions'

  # drop any old delta lake files that might have been created
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # transaction dataset schema
  transaction_schema = StructType([
    StructField('msno', StringType()),
    StructField('payment_method_id', IntegerType()),
    StructField('payment_plan_days', IntegerType()),
    StructField('plan_list_price', IntegerType()),
    StructField('actual_amount_paid', IntegerType()),
    StructField('is_auto_renew', IntegerType()),
    StructField('transaction_date', DateType()),
    StructField('membership_expire_date', DateType()),
    StructField('is_cancel', IntegerType())
    ])

  # read data from csv (dates are stored as yyyyMMdd)
  transactions = (
    spark
      .read
      .csv(
        f'{data_dir}/transactions/transactions.csv',
        schema=transaction_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  ( transactions
      .write
      .format('delta')
      .partitionBy('transaction_date')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so it always matches where the files were just written
  # (it used to be a hard-coded /home/<user>/... path)
  _ = spark.sql(f'''
      CREATE TABLE transactions
      USING DELTA 
      LOCATION '{folder_path}'
      ''')

In [None]:
# Preview a few rows to confirm the transactions table is queryable. The
# database is taken from the config cell so a personalized database_name keeps working.
result = spark.sql(f"SELECT * FROM {database_name}.transactions LIMIT 10")
result.show()

In [None]:
# Load the user activity logs CSV into a delta lake table named `user_logs`,
# partitioned by date. Skipped entirely when skip_reload is set.
if not skip_reload:
  # single location used for cleanup, the delta write and the table definition
  folder_path = f'{data_dir}/silver/user_logs'

  # drop any old delta lake files that might have been created
  if os.path.exists(folder_path):
    shutil.rmtree(folder_path)

  # user logs dataset schema
  user_logs_schema = StructType([
    StructField('msno', StringType()),
    StructField('date', DateType()),
    StructField('num_25', IntegerType()),
    StructField('num_50', IntegerType()),
    StructField('num_75', IntegerType()),
    StructField('num_985', IntegerType()),
    StructField('num_100', IntegerType()),
    StructField('num_uniq', IntegerType()),
    StructField('total_secs', FloatType())
    ])

  # read data from csv (dates are stored as yyyyMMdd)
  user_logs = (
    spark
      .read
      .csv(
        f'{data_dir}/user_logs/user_logs.csv',
        schema=user_logs_schema,
        header=True,
        dateFormat='yyyyMMdd'
        )
      )

  # persist in delta lake format
  ( user_logs
      .write
      .format('delta')
      .partitionBy('date')
      .mode('overwrite')
      .save(folder_path)
    )

  # create table object to make delta lake queryable; the location is derived
  # from data_dir so it always matches where the files were just written
  # (it used to be a hard-coded /home/<user>/... path)
  _ = spark.sql(f'''
    CREATE TABLE IF NOT EXISTS user_logs
    USING DELTA 
    LOCATION '{folder_path}'
    ''')

In [None]:
# Clear out a previously created training-labels table (if any) in the current
# database before the labels are regenerated.
drop_train_statement = 'DROP TABLE IF EXISTS train'
_ = spark.sql(drop_train_statement)

In [None]:
%%sh -e
# Start a small standalone Spark cluster (one master, one worker) in Docker on
# a dedicated network. Safe to re-run: the network is only created when it is
# missing and containers left over from an earlier run are removed first.

# `docker network create` fails when the network already exists, which would
# abort the cell under `-e`; create it only when absent.
docker network inspect spark-net >/dev/null 2>&1 || docker network create spark-net

# Container names must be unique, so remove any previous instances.
docker rm -f spark-master spark-worker >/dev/null 2>&1 || true

# The image tag is pinned to the same version the label-generation cell uses,
# so a re-run does not silently pick up a different Spark release.
docker run -d --rm --network spark-net --name spark-master \
    -p 8080:8080 -p 7077:7077 -p 4040:4040 \
    bitnami/spark:3.4.1 spark-class org.apache.spark.deploy.master.Master

# The worker registers with the master through the container name on spark-net.
docker run -d --rm --network spark-net --name spark-worker \
    --env SPARK_MODE=worker \
    --env SPARK_MASTER_URL=spark://spark-master:7077 \
    bitnami/spark:3.4.1 spark-class org.apache.spark.deploy.worker.Worker spark://spark-master:7077


In [None]:
%%sh -e
# Generate the training labels by running scripts/generate_training_labels.scala
# through spark-shell inside a Spark container. The data folder and the current
# working directory are bind-mounted into the container.

# Same location as the notebook's `data_dir` ($HOME/databricks/kkbox_churn)
# instead of one particular user's home. `:?` aborts the cell when HOME is
# unset, so the `rm -rf` below can never run against a path rooted at `/`.
kkbox_churn_dir="${HOME:?HOME is not set}/databricks/kkbox_churn"

# presumably needed so the container's user can write into the mounted folder
sudo chmod 777 "$kkbox_churn_dir"
# remove labels left over from a previous run
sudo rm -rf "$kkbox_churn_dir/silver/train"

# 'local[*]' is quoted so the shell never treats it as a filename glob
docker run --rm --network host  \
    -v "$kkbox_churn_dir:/opt/spark/work/kkbox_churn" \
    -v "$PWD:/opt/bitnami/spark/work" \
    bitnami/spark:3.4.1 spark-shell --master 'local[*]' \
    --executor-memory 48G \
    --driver-memory 16G \
    --packages io.delta:delta-core_2.12:2.4.0 \
    --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \
    --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \
    -i /opt/bitnami/spark/work/scripts/generate_training_labels.scala

# hand the generated files back to whoever runs the notebook (was a hard-coded user)
sudo chown -R "$(id -un):$(id -gn)" "$kkbox_churn_dir/silver/train"
sudo chmod -R 777 "$kkbox_churn_dir/silver/train"

In [None]:
%%sh -e
# Sanity check: print the Scala version available on the host.
# `%%sh` has to be the very first line of the cell; with a comment above it
# IPython does not treat it as a cell magic and the cell fails.
#
# Disabled commands kept for reference. Caution: `data_dir` is a Python
# variable and is not exported to this shell, so "$data_dir" would expand to
# an empty string here.
# rm -rf "$data_dir/silver/train"
scala --version
# scala "scripts/generate_training_labels.scala"