In [None]:
from datetime import datetime, timedelta, date

import pyspark
import pyspark.sql.functions as f
import pyspark.sql.types as t
from IPython.core.interactiveshell import InteractiveShell
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window


In [None]:
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
%%time
# Spark configuration for the scoring master-table build, run on YARN with the
# driver living in this notebook kernel (client deploy mode).
conf = (
    SparkConf()
    # "yarn-client" has been deprecated as a master URL since Spark 2.0; the
    # supported spelling is master "yarn" plus an explicit deploy mode.
    .setMaster("yarn")
    .set("spark.submit.deployMode", "client")
    .setAppName('scoring-master-table')
    .set("spark.driver.maxResultSize", "10g")
    .set("spark.driver.memory", "16g")
    # Driver/executor overhead is set under both the plain and the
    # "spark.yarn.*" keys below -- presumably to cover several Spark versions.
    .set("spark.driver.memoryOverhead", "4096")
    .set("spark.dynamicAllocation.enabled", "true")
    .set("spark.dynamicAllocation.initialExecutors", "1")
    .set("spark.dynamicAllocation.maxExecutors", "75")
    .set("spark.dynamicAllocation.minExecutors", "1")
    .set("spark.executor.cores", "4")
    .set("spark.executor.memory", "16g")
    .set("spark.hadoop.fs.permissions.umask-mode", "002")
    .set("spark.kryoserializer.buffer.max", "512m")
    .set("spark.shuffle.service.enabled", "true")
    .set("spark.sql.broadcastTimeout", "1000")
    .set("spark.sql.hive.convertMetastoreParquet", "false")
    .set("spark.sql.parquet.compression.codec", "snappy")
    .set("spark.sql.shuffle.partitions", "1000")
    .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .set("spark.yarn.driver.memoryOverhead", "4096")
    .set("spark.yarn.executor.memoryOverhead", "4096")
    .set("spark.yarn.maxAppAttempts", "2")
    .set("spark.yarn.queue", "root.hue_dmp_prod")
    # NOTE(review): this is a YARN NodeManager property rather than a spark.*
    # key -- confirm it has any effect when set on the application's SparkConf.
    .set("yarn.nodemanager.vmem-check-enabled", "false")
)

# Reuse a SparkContext that is already running in this kernel, otherwise start
# one with the configuration above, then wrap it in a SparkSession.
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Master Table Weekstart

### <font color='red'>Note: Check the weekstart (computed in the next cell) before running the rest of the notebook</font>

In [None]:
# Week to score, as a "YYYY-MM-DD" string: the Monday of the week containing
# the date three days before today. It is derived automatically; assign a
# literal date string to `weekstart` instead to build the table for another week.
#
# date.today() is read exactly once so that a run straddling midnight cannot
# mix two different days in the same calculation.
_reference_day = date.today() - timedelta(days=3)
# date.weekday() is 0 for Monday, so subtracting it lands on that week's Monday.
weekstart = (
    _reference_day - timedelta(days=_reference_day.weekday())
).strftime("%Y-%m-%d")


# Features Path

In [None]:
# HDFS path templates for every input and for the output master table.
# Each template ends in a "{weekstart}" placeholder that is filled in with
# str.format(weekstart=...) at read/write time.
_landing_root = "hdfs:///data/landing/gx_pnt"
_score_root = _landing_root + "/mck_dmp_score"
_features_root = _score_root + "/04_features"
_week_partition = "/weekstart={weekstart}"

# Primary layer: the weekly scaffold.
scaffold_path = _score_root + "/02_primary/scaffold_weekly.parquet" + _week_partition

# Feature layer.
internet_app_usage_path = (
    _features_root + "/internet_apps_usage/fea_internet_apps_usage.parquet" + _week_partition
)
revenue_path = _features_root + "/revenue/fea_revenue.parquet" + _week_partition
recharge_1_path = (
    _features_root + "/recharge/fea_rech_full_topup_behav.parquet" + _week_partition
)
recharge_2_path = (
    _features_root + "/recharge/fea_rech_full_chg_pck_prchse.parquet" + _week_partition
)
recharge_3_path = (
    _features_root
    + "/recharge/fea_topup_behav_full_acc_bal_neg_or_zero.parquet"
    + _week_partition
)
handset_path = _features_root + "/handset/fea_device.parquet" + _week_partition
voice_calls_path = (
    _features_root + "/voice_calls/fea_inc_out_uniq_bnums.parquet" + _week_partition
)
internet_usage_path = (
    _features_root + "/internet_usage/fea_internet_usage.parquet" + _week_partition
)
# The customer profile is read from the separate "mck_dmp_pipeline" tree.
cust_prof_path = (
    _landing_root
    + "/mck_dmp_pipeline/01_aggregation/customer_profile/customer_profile_weekly.parquet"
    + _week_partition
)
sms_1_path = _features_root + "/sms_2/fea_text_messaging.parquet" + _week_partition
sms_2_path = (
    _features_root + "/sms_2/fea_commercial_text_messaging.parquet" + _week_partition
)

# Model-input layer: destination of the joined master table.
master_path = _score_root + "/05_model_input/scoring_master.parquet" + _week_partition


# Features DF

In [None]:
def _read_week(path_template):
    """Read the parquet data at `path_template` with its weekstart placeholder filled in.

    The template's "{weekstart}" field is substituted with the notebook-level
    `weekstart` value, and the resulting path is loaded through the notebook's
    `spark` session.
    """
    return spark.read.parquet(path_template.format(weekstart=weekstart))


# One DataFrame per input, all taken from the same weekstart.
scaffold_df = _read_week(scaffold_path)
internet_app_usage_df = _read_week(internet_app_usage_path)
revenue_df = _read_week(revenue_path)
recharge_1_df = _read_week(recharge_1_path)
recharge_2_df = _read_week(recharge_2_path)
recharge_3_df = _read_week(recharge_3_path)
handset_df = _read_week(handset_path)
voice_calls_df = _read_week(voice_calls_path)
internet_usage_df = _read_week(internet_usage_path)
cust_prof_df = _read_week(cust_prof_path)
sms_1_df = _read_week(sms_1_path)
sms_2_df = _read_week(sms_2_path)


# Master DF

In [None]:
# Feature tables to attach to the scaffold, in join order.
_feature_frames = [
    handset_df,
    internet_app_usage_df,
    internet_usage_df,
    recharge_1_df,
    recharge_2_df,
    recharge_3_df,
    revenue_df,
    sms_1_df,
    sms_2_df,
    voice_calls_df,
    cust_prof_df,
]

# Start from the scaffold and left-join each feature table on "msisdn", so
# every scaffold row is kept whether or not a feature table has a match.
master_df = scaffold_df
for _feature_df in _feature_frames:
    master_df = master_df.join(_feature_df, ["msisdn"], how="left")


In [None]:
# The write deliberately sets no "overwrite" mode, to prevent the accidental
# removal of old master files. To replace an existing week on purpose, add
# .mode("overwrite") between `.write` and `.parquet(...)`.
_master_output_path = master_path.format(weekstart=weekstart)
master_df.write.parquet(_master_output_path)
