In [None]:
from datetime import datetime, timedelta

import pyspark
import pyspark.sql.functions as f
import pyspark.sql.types as t
from IPython.core.interactiveshell import InteractiveShell
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.window import Window

In [None]:
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
%%time
conf = (SparkConf()
        .setMaster('yarn-client')
        .setAppName('all-msisdns')
        .set("spark.driver.maxResultSize", "10g")
        .set("spark.driver.memory", "16g")
        .set("spark.driver.memoryOverhead", "4096")
        .set("spark.dynamicAllocation.enabled", "true")
        .set("spark.dynamicAllocation.initialExecutors", "1")
        .set("spark.dynamicAllocation.maxExecutors", "50")
        .set("spark.dynamicAllocation.minExecutors", "15")
        .set("spark.executor.cores", "4")
        .set("spark.executor.memory", "16g")
        .set("spark.hadoop.fs.permissions.umask-mode", "002")
        .set("spark.kryoserializer.buffer.max", "512m")
        .set("spark.shuffle.service.enabled", "true")
        .set("spark.sql.broadcastTimeout", "1000")
        .set("spark.sql.hive.convertMetastoreParquet", "false")
        .set("spark.sql.parquet.compression.codec", "snappy")
        .set("spark.sql.shuffle.partitions", "1000")
        .set("spark.sql.sources.partitionOverwriteMode", "dynamic")
        .set("spark.yarn.driver.memoryOverhead", "4096")
        .set("spark.yarn.executor.memoryOverhead", "4096")
        .set("spark.yarn.maxAppAttempts", "2")
        .set("spark.yarn.queue", "root.hue_dmp_prod")
        .set("yarn.nodemanager.vmem-check-enabled", "false")
        )
sc = SparkContext.getOrCreate(conf=conf)
spark = SparkSession(sc)

# Kredivo MSISDNs

In [None]:
# Hive tables holding the three Kredivo MSISDN populations:
# original, lookalike and test.
kredivo = 'mck.kr_badflags_sample_20200106'
kredivo_lookalike = "mck.kr_random_lookalike_20200121"
kredivo_test = "mck.kr_ts_sample_20200113"


def _tag_msisdns(table_name, label):
    """Read `table_name`, keep only `msisdn`, and add a constant `category` column equal to `label`."""
    return spark.read.table(table_name).select(
        "msisdn",
        f.lit(label).alias("category"),
    )


# One (msisdn, category) frame per population, each with its own fixed label.
df_kredivo = _tag_msisdns(kredivo, "kredivo")
df_kredivo_lookalike = _tag_msisdns(kredivo_lookalike, "kredivo-lookalike")
df_kredivo_test = _tag_msisdns(kredivo_test, "kredivo-test")


# Internal MSISDNs

In [None]:
# Sources of the internal MSISDN populations: Hive tables (1, 2, 5) and
# parquet paths on HDFS (3, 4).
internal_1 = "mck.mck_int_random_sample"
internal_2 = "mck.int_mck_hvc"
internal_3 = "hdfs:///data/landing/gx_pnt/mck_dmp_int/02_intermediate/msisdns_new_campaigns.parquet"
internal_4 = "hdfs:///data/landing/gx_pnt/mck_dmp_int/02_intermediate/msisdns_old_campaign.parquet"
internal_5 = "mck.cb_prepaid_postpaid_201910_ucg_samples"

# Source 1: every row gets the same fixed label.
df_int_1 = spark.read.table(internal_1).select(
    "msisdn",
    f.lit("internal-random-sample").alias("category"),
)

# Source 2: the table already provides its own category column.
df_int_2 = spark.read.table(internal_2).select("msisdn", "category")

# Source 3: category is "internal_" followed by the row's cmp_code.
df_int_3 = spark.read.parquet(internal_3).select(
    "msisdn",
    f.concat(f.lit("internal_"), f.col("cmp_code")).alias("category"),
)

# Source 4: the parquet data already provides its own category column.
df_int_4 = spark.read.parquet(internal_4).select("msisdn", "category")

# Source 5: category is "internal_ucg_" followed by the row's ucg value.
df_int_5 = spark.read.table(internal_5).select(
    'msisdn',
    f.concat(f.lit("internal_ucg_"), f.col("ucg")).alias("category"),
)


# All Training MSISDNs

In [None]:
# Union of all the Training MSISDNs.
# Each input is a (msisdn, category) frame. unionByName aligns columns by
# name, so a source whose column order ever changes cannot silently put
# categories under `msisdn` (plain `union` matches columns by position only).
# Duplicate rows are kept, exactly as with `union`.
training_sources = [
    df_kredivo,
    df_kredivo_lookalike,
    df_kredivo_test,
    df_int_1,
    df_int_2,
    df_int_3,
    df_int_4,
    df_int_5,
]
df_msisdn = training_sources[0]
for source_df in training_sources[1:]:
    df_msisdn = df_msisdn.unionByName(source_df)

# Aggregating Categories for the MSISDNs: one row per MSISDN, with `category`
# holding the set of distinct categories that MSISDN appeared under.
df_partner_msisdn = df_msisdn.groupBy("msisdn").agg(
    f.collect_set(f.col("category")).alias("category")
)

In [None]:
# Destination of the aggregated (msisdn -> categories) dataset; any existing
# output at this path is overwritten.
partner_path = "hdfs:///data/landing/gx_pnt/mck_dmp_training/02_primary/partner_msisdns.parquet"
# A single output file is wanted. coalesce(1) does not add a shuffle, so it
# would also squeeze the preceding groupBy aggregation into one task.
# repartition(1) lets that aggregation run in parallel and only then moves
# the result to one partition for writing.
df_partner_msisdn.repartition(1).write.mode("overwrite").parquet(partner_path)

In [None]:
# Row count per category over the un-aggregated union; print up to 1000
# categories without truncating long values.
category_counts = df_msisdn.groupBy("category").count()
category_counts.show(n=1000, truncate=False)