In [0]:
# common python packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from functools import reduce
from datetime import datetime, timedelta

# spark packages
import pyspark.sql.functions as f
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.types import *

# internal 8451 packages, please search on 8451 github for reference
from effodata import ACDS, golden_rules, Sifter, Equality, Joiner
import kayday as kd
from toolbox.config import segmentation
import toolbox.config as config
# from toolbox.config import add_facebook_pandora_flags
from toolbox import config
from kpi_metrics import KPI, AliasMetric, CustomMetric, AliasGroupby, available_metrics, get_metrics
import seg
from seg.utils import DateType

# The Work

In [0]:
def create_df_groupby_propensity_time(segmentation:str ):
  """Build the propensity-level table for one segmentation across all of its files.

  Every file name returned by ``config.get_files(segmentation)`` is appended to
  ``config.get_directory(segmentation)``, read as a delta table and unioned
  with the others. Rows whose ``segment`` value is in the configured
  propensity list are kept, ``stratum_week`` is renamed to ``Time`` and a set
  of constant descriptive columns is appended.

  Despite the function name, no aggregation happens here: the rows are
  returned un-grouped.

  Relies on the notebook-level ``spark`` session and the ``config`` module.

  Args:
    segmentation: Key into ``config.audience_dict``; also written verbatim
      into the ``Backend_Name`` column.

  Returns:
    A Spark DataFrame holding the filtered rows plus the columns ``Time``,
    ``Backend_Name``, ``Propensity``, ``Propensity_type``, ``Segment``,
    ``Offsite_Eligibility_Count`` and ``Onsite_Eligibility_Count``.

  Raises:
    ValueError: if ``config.get_files`` lists no files for ``segmentation``.
  """
  root = config.get_directory(segmentation)
  audience = config.audience_dict[segmentation]

  # The key spelling ("compisition") is the one this notebook reads from config.
  propensity = audience["propensity_compisition"]
  frntend_name = audience["frontend_name"]
  segment_type = config.get_type(segmentation)

  # Collapse the propensity list into one label string for the output column.
  formatting_propensity = ''.join(propensity)

  file_names = list(config.get_files(segmentation))
  if not file_names:
    # Previously this case surfaced later as an AttributeError on None.filter.
    raise ValueError(f"No files found for segmentation {segmentation!r} under {root!r}")

  frames = [spark.read.format("delta").load(root + name) for name in file_names]

  # union() matches columns by position.
  # NOTE(review): assumes every file shares the same column order - confirm.
  one_segment_master_file = reduce(lambda left, right: left.union(right), frames)

  one_segment_master_file_propensity = one_segment_master_file.filter(f.col("segment").isin(propensity))

  group_by = (one_segment_master_file_propensity
              .withColumnRenamed("stratum_week", "Time")
              .withColumn("Backend_Name", f.lit(segmentation))
              .withColumn("Propensity", f.lit(formatting_propensity))
              .withColumn("Propensity_type", f.lit(segment_type))
              .withColumn("Segment", f.lit(frntend_name))
              # Placeholder columns filled with null literals.
              # NOTE(review): they carry no concrete data type; a cast may be
              # needed before writing them out - confirm against the sink.
              .withColumn("Offsite_Eligibility_Count", f.lit(None))
              .withColumn("Onsite_Eligibility_Count", f.lit(None))
  )

  return group_by

In [0]:
# NOTE(review): in the cells shown, `group_by` is only assigned as a local
# variable inside create_df_groupby_propensity_time. Unless it is bound at
# notebook level somewhere else, this cell raises NameError on a fresh kernel;
# assign the function's return value to `group_by` before displaying it.
display(group_by)

In [0]:
# NOTE(review): `one_segment_master_file_propensity` is only assigned as a
# local inside create_df_groupby_propensity_time in the cells shown, so this
# cell depends on hidden kernel state and fails with NameError after a restart
# unless the name is bound at notebook level elsewhere - confirm.
display(one_segment_master_file_propensity
        .sort('ehhn'))

In [0]:
# Distinct EHHN values from digital_load_offer_fact, restricted to rows whose
# KROGER_OFFER_ID starts with '8000000'.
digital_load_offers = spark.read.parquet(
    'abfss://landingzone@sa8451entlakegrnprd.dfs.core.windows.net/mart/comms/prd/fact/digital_load_offer_fact/'
)
offer_id_matches = f.col('KROGER_OFFER_ID').like('8000000%')
tdc_flag = digital_load_offers.filter(offer_id_matches).select('EHHN').distinct()


In [0]:
from flowcate.files import FilePath

# Location of the eligibility_fact data, wrapped in a FilePath so the latest
# file can be looked up.
egg_path = FilePath('abfss://landingzone@sa8451entlakegrnprd.dfs.core.windows.net/mart/comms/prd/fact/eligibility_fact')

# Result of FilePath.find_latest_file(); the next cell passes it to
# spark.read.parquet.
latest_eg_pat = egg_path.find_latest_file()


            

In [0]:
# Read the eligibility file located by find_latest_file() in the previous cell.
# All columns are kept; a narrower projection that was tried earlier is:
#   .select('EHHN','ROKU_FLAG','PINTEREST_ELIGIBLE_FLAG','PUSH_FLAG','PANDORA_FLAG','FACEBOOK_FLAG','EMAIL_ELIGIBLE_FLAG','SSE_ELIGIBLE_FLAG','TDC_ELIGIBLE_FLAG')
elig = spark.read.parquet(latest_eg_pat)

# Preview ordered by ehhn.
display(elig.sort('ehhn'))

## Write to blob

In [0]:
import toolbox.config as con
import pyspark.sql.functions as f
import pyspark.sql.types as t
import datetime as dt
 
segs = con.segmentations.all_segmentations

# Target layout of the combined table: seven nullable string columns.
# DataFrame.union matches columns by position, so `df` keeps THESE names
# (e.g. front_name, percentile_seg) and not the upper-case names selected
# from each per-segmentation frame below.
my_schema = t.StructType([
    t.StructField(column_name, t.StringType(), True)
    for column_name in (
        "ehhn",
        "segment",
        "segmentation",
        "front_name",
        "Propensity",
        "Segment_type",
        "percentile_seg",
    )
])


def _latest_propensity_rows(segmentation_name):
  """Return the propensity rows of the last listed file of one segmentation.

  Loads ``segment.directory + segment.files[-1]`` as a delta table, adds
  constant columns describing the segmentation, keeps rows whose ``segment``
  is in ``segment.propensities`` and projects the seven output columns.
  """
  segment = con.segmentation(segmentation_name)

  # NOTE(review): assumes the last entry of `segment.files` is the most
  # recent file - confirm the ordering guaranteed by toolbox.config.
  latest_path = segment.directory + segment.files[-1]

  return (spark.read.format("delta").load(latest_path)
          .withColumn("SEGMENTATION", f.lit(segmentation_name))
          .withColumn('FRONTEND_NAME', f.lit(segment.frontend_name))
          .withColumn("PROPENSITY", f.lit("".join(segment.propensities)))
          .withColumn("SEGMENT_TYPE", f.lit(segment.segment_type))
          .withColumn("PERCENTILE_SEGMENT", f.lit(segment.type))
          .filter(f.col("segment").isin(segment.propensities))
          .select("EHHN", "SEGMENT", "SEGMENTATION","FRONTEND_NAME","PROPENSITY", "SEGMENT_TYPE","PERCENTILE_SEGMENT"))


# Start from an empty frame carrying the schema, then append one segmentation
# at a time.
df = spark.createDataFrame([], schema=my_schema)
for segmentation_name in segs:
  df = df.union(_latest_propensity_rows(segmentation_name))

# Quick sanity check, left disabled:
#   df.groupBy("segmentation").count().show(10, truncate=False)

In [0]:
# Show the combined table ordered by ehhn. To narrow it to particular
# front-end names, add .filter(f.col('front_name').isin(...)) before the sort.
display(df.sort('ehhn'))

In [0]:
# One summary row per segmentation: the number of non-null ehhn rows plus the
# max() of each descriptive column.
# NOTE(review): count_hh counts rows, not distinct ehhn - confirm that is the
# intended household count.
display(
    df.groupBy('segmentation').agg(
        f.count('ehhn').alias("count_hh"),
        *[
            f.max(source_column).alias(output_name)
            for source_column, output_name in (
                ("front_name", "Frontend_name"),
                ("segment", "segment_extra"),
                ("Propensity", "Propensity"),
                ("Segment_type", "Segment_type"),
                ("percentile_seg", "percentil_seg"),
            )
        ]
    )
)

In [0]:
# Left-join the eligibility columns onto the combined table and order the
# result by ehhn, then segmentation.
# Author's observation: there are still duplicate ehhn values in this output.
# NOTE(review): `df` stacks one frame per segmentation, so an ehhn present in
# several segmentations would repeat here - confirm whether that explains it.
display(
    df.join(elig, on='ehhn', how='left').sort('ehhn', 'segmentation')
)

In [0]:
# Pull the attributes of the alphabetically first segmentation into short
# notebook-level names so the following cells can inspect them one by one.

# sorted() builds a new list, so the list obtained from toolbox.config is not
# reordered in place (list.sort() would mutate it if config hands out a shared
# object).
segs = sorted(con.segmentations.all_segmentations)
problem_segs = []

# NOTE(review): `seg` rebinds the name of the `seg` package imported in the
# first cell; after this cell runs, `seg` is a segmentation name, not the module.
seg = segs[0]
segment = con.segmentation(seg)

file = segment.files              # file names of the segmentation
prope = segment.propensities      # propensity values
dirr = segment.directory          # base directory the file names are appended to
fornt = segment.frontend_name     # front-end name
seggg = segment.segment_type      # segment type
tt = segment.type                 # value of segment.type

In [0]:
# Strip the final character from every file name of the selected segmentation.
# NOTE(review): presumably this removes a trailing path separator - confirm.
times = [file_name[:-1] for file_name in file]
times

In [0]:
# Last entry of `times` (the final file name minus its last character).
times[-1]

In [0]:
# segment.propensities of the selected segmentation.
prope

In [0]:
# segment.directory of the selected segmentation.
dirr

In [0]:
# segment.frontend_name of the selected segmentation.
fornt

In [0]:
# segment.segment_type of the selected segmentation.
seggg

In [0]:
# segment.type of the selected segmentation.
tt