In [0]:
# IMPORTING DATA ANALYSIS AND VISUALIZATION LIBRARIES
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', None)

# SPARK LIBRARIES 
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType
from pyspark.sql.types import IntegerType
from pyspark.sql.types import StringType
from pyspark.sql.types import DoubleType

# WARNINGS AND LOGISTICS 
#   Make sure we do not get line breaks when doing show on wide dataframes
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))
import warnings
warnings.filterwarnings("ignore")
#   disable informational messages from prophet
logging.getLogger('py4j').setLevel(logging.ERROR)



In [0]:
%run ./Snowflake_Connection

In [0]:
sfdt = SnowflakeDataTool()

In [0]:
gl_demand = spark.table('ltf_db.LTF_GOLDEN_LAYER')

In [0]:
# TWO FILES THAT WILL PROVIDE A LEVEL OF GROUP 

sel_zip = spark.table('ltf_db.sel_to_zip_code')


# pa_zip = spark.table('ltf_db.pa_to_zip_code')

In [0]:
grouping_features = sfdt.sql("select DELIVERY_EXECUTION.ACTUAL_GOODS_MOVEMENT_DATE as ACTUAL_DATE, \
                        DELIVERY_EXECUTION.SHIPTO_CUSTOMER_ID as CUSTOMER_ID, \
                        DELIVERY_EXECUTION.MATERIAL_ID as MATERIAL_ID, \
                        MATERIAL.BRAND_DESC as BRAND_DESC, \
                        MATERIAL.SWIRE_PACKAGE_CATEGORY_DESCRIPTION, \
                        Customer.SUPER_CHANNEL_DEFINITION, \
                        Customer.ADDRESS_ZIP_CODE as ZIP_CODE \
                    from \
                        db_bi_p_edw.general_use_bas.bas_fact_delivery_execution as DELIVERY_EXECUTION \
                    inner join \
                        DB_SWIRE_BI_P_EDW.TRANSFORMED.DIM_MDM_CUSTOMER_MASTER as CUSTOMER on (DELIVERY_EXECUTION.SHIPTO_CUSTOMER_ID = CUSTOMER.customer_number) \
                    inner join \
                        DB_SWIRE_BI_P_EDW.TRANSFORMED.DIM_MDM_MATERIAL_MASTER as MATERIAL on (DELIVERY_EXECUTION.MATERIAL_SK = MATERIAL.MATERIAL_SK) \
                    where \
                        DELIVERY_EXECUTION.ACTUAL_GOODS_MOVEMENT_DATE >= date('2019-01-01') \
                    and \
                        DELIVERY_EXECUTION.ACTUAL_GOODS_MOVEMENT_DATE < CURRENT_DATE() \
                    and \
                        DELIVERY_EXECUTION.ACTUAL_GOODS_MOVEMENT_DATE != date('9999-12-31') \
                    and \
                        MATERIAL.MATERIAL_TYPE in ('ZFER') \
                    and \
                        CUSTOMER.DIVISION = 'SA'")
grouping_features = grouping_features.orderBy('ACTUAL_DATE')
# display(grouping_features)

In [0]:
sel_zip_grouping_feat_channel = grouping_features.join(sel_zip, \
                                                   on=(grouping_features.ZIP_CODE == sel_zip.Zip), \
                                                   how= 'inner') \
                                            .select(grouping_features.ACTUAL_DATE, grouping_features.CUSTOMER_ID, grouping_features.MATERIAL_ID, (sel_zip.SEL_Cluster).alias('SEL_CLUSTER'), grouping_features.BRAND_DESC, grouping_features.SWIRE_PACKAGE_CATEGORY_DESCRIPTION, grouping_features.SUPER_CHANNEL_DEFINITION)

In [0]:
gl_sel_brand = gl_demand.join(sel_zip_grouping_feat_channel, \
                            on=((gl_demand.CUSTOMER_ID == sel_zip_grouping_feat_channel.CUSTOMER_ID) & (gl_demand.MATERIAL_ID == sel_zip_grouping_feat_channel.MATERIAL_ID)), \
                             how='inner') \
                                 .select(gl_demand.TS_DATE, gl_demand.CUSTOMER_ID, gl_demand.MATERIAL_ID, sel_zip_grouping_feat_channel.SUPER_CHANNEL_DEFINITION, sel_zip_grouping_feat_channel.SEL_CLUSTER, sel_zip_grouping_feat_channel.BRAND_DESC, gl_demand.ORDERED_WEEKLY_SUM)

display(gl_sel_brand)

TS_DATE,CUSTOMER_ID,MATERIAL_ID,SUPER_CHANNEL_DEFINITION,SEL_CLUSTER,BRAND_DESC,ORDERED_WEEKLY_SUM
2021-01-29,500246183,410069,ON PREMISE,SEL-4,COCA-COLA FUNCTIONAL,4.0
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083


In [0]:
data_for_training_notebook = gl_sel_brand.filter(year("TS_DATE") != 2023)

data_for_training_notebook = data_for_training_notebook.withColumn('y', col('y').cast('double'))

display(data_for_training_notebook)

TS_DATE,CUSTOMER_ID,MATERIAL_ID,SUPER_CHANNEL_DEFINITION,SEL_CLUSTER,BRAND_DESC,ORDERED_WEEKLY_SUM
2021-01-29,500246183,410069,ON PREMISE,SEL-4,COCA-COLA FUNCTIONAL,4.0
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083
2022-05-13,500264326,103029,FULL SERVICE,SEL-4,SPRITE,0.083


In [0]:
data_for_training_notebook.write.format("delta").mode('overwrite').saveAsTable('ltf_db.modeling_01_sc_sel_brand_grouping')
