In [None]:
from datetime import datetime, timedelta, date

import pyspark
import pyspark.sql.functions as f
import pyspark.sql.functions as F
import pyspark.sql.types as t
from IPython.core.interactiveshell import InteractiveShell
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window


In [None]:
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
%%time
conf = (SparkConf()
        .setMaster('yarn-client')
        .setAppName('master-split-training')
        .set("spark.driver.maxResultSize", "10g")
        .set("spark.driver.memory", "16g")
        .set("spark.driver.memoryOverhead", "4096")
        .set("spark.dynamicAllocation.enabled", "true")
        .set("spark.dynamicAllocation.initialExecutors", "1")
        .set("spark.dynamicAllocation.maxExecutors", "75")
        .set("spark.dynamicAllocation.minExecutors", "1")
        .set("spark.executor.cores", "4")
        .set("spark.executor.memory", "16g")
        .set("spark.hadoop.fs.permissions.umask-mode", "002")
        .set("spark.kryoserializer.buffer.max", "512m")
        .set("spark.shuffle.service.enabled", "true")
        .set("spark.sql.broadcastTimeout", "1000")
        .set("spark.sql.hive.convertMetastoreParquet", "false")
        .set("spark.sql.parquet.compression.codec", "snappy")
        .set("spark.sql.shuffle.partitions", "1000")
        .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
        .set("spark.yarn.driver.memoryOverhead", "4096")
        .set("spark.yarn.executor.memoryOverhead", "4096")
        .set("spark.yarn.maxAppAttempts", "2")
        .set("spark.yarn.queue", "root.hue_dmp_prod")
        .set("yarn.nodemanager.vmem-check-enabled", "false")
        )
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Create Partner MSISDNs File

In [None]:
# Partner lookup table; the cells below read its `partner` array column and
# its `msisdn` column.
partner_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns.parquet"
partner_df = spark.read.parquet(partner_msisdn_path)


In [None]:
# Partner labels that mark a row as "internal". A row is kept when its
# `partner` array contains at least one of them.
internal_partner_labels = [
    "internal",
    "internal-random-sample",
    "internal_cmp0",
    "internal_cmp1",
    "internal_cmp10",
    "internal_cmp2",
    "internal_cmp3",
    "internal_cmp4",
    "internal_cmp5",
    "internal_cmp6",
    "internal_cmp7",
    "internal_cmp8",
    "internal_cmp9",
    "internal_hvc_comboLCG",
    "internal_hvc_comboTARGET",
    "internal_ucg_false",
    "internal_ucg_true",
]

# OR the per-label membership tests together, left to right.
label_matches = [
    f.array_contains(partner_df.partner, label)
    for label in internal_partner_labels
]
is_internal_partner = label_matches[0]
for label_match in label_matches[1:]:
    is_internal_partner = is_internal_partner | label_match

internal_msisdn = partner_df.filter(is_internal_partner)


In [None]:
# Persist the MSISDN column of the internal partners as a single-partition
# parquet dataset, replacing any previous copy at this path.
internal_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/internal_partner_msisdns.parquet"
(internal_msisdn
 .select("msisdn")
 .repartition(1)
 .write.mode("overwrite")
 .parquet(internal_msisdn_path))  # use the path variable instead of repeating the literal


In [None]:
kred_test_msisdn = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo_test.parquet"
kred_lookalike_msisdn = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo_lookalike.parquet"
kred_msisdn = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo.parquet"

kred_test = partner_df.filter(f.array_contains(partner_df.partner, "kredivo-test")).select("msisdn")
kred_lookalike = partner_df.filter(f.array_contains(partner_df.partner, "kredivo-lookalike")).select("msisdn")
kred = partner_df.filter(f.array_contains(partner_df.partner, "kredivo")).select("msisdn")


In [None]:
kred_test.repartition(1).write.mode("overwrite").parquet(kred_test_msisdn)
kred_lookalike.repartition(1).write.mode("overwrite").parquet(kred_lookalike_msisdn)
kred.repartition(1).write.mode("overwrite").parquet(kred_msisdn)


# Master Table Date

### <font color='red'>Note: master_date defaults to today's date (YYYYMMDD). Update it before running the split if the master table was built on a different date.</font>

In [None]:
# Date stamp (YYYYMMDD) of the master table to split. Defaults to today;
# replace it with the master table's date before running if that differs.
master_date = format(date.today(), "%Y%m%d")


# Master DF

In [None]:
# Load the full master table snapshot for `master_date`.
master_path_template = "hdfs:///data/landing/gx_pnt/mck_dmp_training/05_model_input/master_{master_date}.parquet"
master_path = master_path_template.format(master_date=master_date)
master_df = spark.read.parquet(master_path)


# Internal Master Table

In [None]:
# Read the internal MSISDN list back from HDFS. From here on `internal_msisdn`
# refers to this stored copy.
internal_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/internal_partner_msisdns.parquet"
internal_msisdn = spark.read.format("parquet").load(internal_msisdn_path)


In [None]:
# Destination of the internal split for this master date.
internal_master_path = (
    "hdfs:///data/landing/gx_pnt/mck_dmp_training/05_model_input/master/internal/{master_date}.parquet"
    .format(master_date=master_date)
)

# Keep only master rows whose msisdn is in the internal list; the list is
# sent to the executors with a broadcast hint.
internal_msisdn_broadcast = f.broadcast(internal_msisdn)
internal_master_df = master_df.join(internal_msisdn_broadcast, on="msisdn")


In [None]:
# No save mode is set on purpose: "overwrite" was removed from this write so
# that old master files are not removed by accident.
internal_master_writer = (
    internal_master_df
    .repartition(50)
    .write
    .partitionBy("weekstart")
)
internal_master_writer.parquet(internal_master_path)

# To deliberately replace an existing output, use this instead:
# internal_master_df.repartition(50).write.mode("overwrite").partitionBy("weekstart").parquet(internal_master_path)


# External Master Table

In [None]:
# Locations of the three Kredivo MSISDN lists on HDFS.
kred_test_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo_test.parquet"
kred_lookalike_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo_lookalike.parquet"
kred_msisdn_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns_kredivo.parquet"

# Load each list as a DataFrame, in the order test, lookalike, main.
kred_test_msisdn, kred_lookalike_msisdn, kred_msisdn = map(
    spark.read.parquet,
    (kred_test_msisdn_path, kred_lookalike_msisdn_path, kred_msisdn_path),
)


In [None]:
# Destinations of the three Kredivo splits, one directory per cohort, each
# stamped with the master date.
kredivo_master_path, kredivo_lookalike_master_path, kredivo_test_master_path = (
    path_template.format(master_date=master_date)
    for path_template in (
        "hdfs:///data/landing/gx_pnt/mck_dmp_training/05_model_input/master/kredivo/{master_date}.parquet",
        "hdfs:///data/landing/gx_pnt/mck_dmp_training/05_model_input/master/kredivo_lookalike/{master_date}.parquet",
        "hdfs:///data/landing/gx_pnt/mck_dmp_training/05_model_input/master/kredivo_test/{master_date}.parquet",
    )
)


## Filter Data for Kredivo

In [None]:
# Kredivo splits only use master rows where `fea_los` is populated.
has_fea_los = f.col("fea_los").isNotNull()
kred_data = master_df.where(has_fea_los)
# Mark the filtered frame for caching; it feeds the three Kredivo joins below.
kred_data.cache()


In [None]:
# Restrict the filtered master data to each Kredivo MSISDN list. Each list is
# joined on `msisdn` with a broadcast hint.
kredivo_master_df = kred_data.join(f.broadcast(kred_msisdn), on="msisdn")
kredivo_lookalike_master_df = kred_data.join(f.broadcast(kred_lookalike_msisdn), on="msisdn")
kredivo_test_master_df = kred_data.join(f.broadcast(kred_test_msisdn), on="msisdn")


In [None]:
# Mode "overwrite" is removed from the write statements to prevent the
# accidental removal of old master files.
# Each split is written through `DataFrame.write`; calling `.parquet(...)`
# directly on a DataFrame raises AttributeError.

kredivo_master_df.repartition(5).write.parquet(kredivo_master_path)
kredivo_lookalike_master_df.repartition(5).write.parquet(kredivo_lookalike_master_path)
kredivo_test_master_df.repartition(5).write.parquet(kredivo_test_master_path)

# To deliberately replace existing outputs, use these instead:
# kredivo_master_df.repartition(5).write.mode("overwrite").parquet(kredivo_master_path)
# kredivo_lookalike_master_df.repartition(5).write.mode("overwrite").parquet(kredivo_lookalike_master_path)
# kredivo_test_master_df.repartition(5).write.mode("overwrite").parquet(kredivo_test_master_path)
