# Overall Information

In [65]:
# Title: Automatização da lógica Corredor de Descontos
# Description: Captura dos dados da tabela de Gestão de Preços e Condições - HANA e estruturação das Conditions 
#            para cálculo do Corredor de Desconto (ZD01 e ZD Tático[ZD01 + ZD13 + ZD06])
#            e dispersão (ZD02, ZD03, ZD04, ZD05, ZD06 e ZD13), com armazenamento dos resultados no AWS.
# Frequency: Monthly
# Time Period: (Mês Atual - 1)
# Structure: Modelo estruturado em sequencia de comandos.
# Version: V1
# Created on: 12-Jun-2019
# Last Update: 12-Jun-2019
# @author: Leandro Antunes

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Importing Libraries

In [34]:
import findspark                                              #Import library to Search for Spark Installation  

findspark.init()                                              #Search Spark Installation

import pyspark                                                #Only run after findspark.init()

from pyspark.sql import SparkSession                          #Import of Spark Session
from pyspark import SparkContext as spark                     #Import the Regular Spark Contex 
from pyspark.sql import SQLContext                            #Import the SQL Spark Contex 
from pyspark.sql.functions import *                           #Import the SQL Spark Functions
from pyspark.sql.types import *                               #Import the SQL Spark Variables Types
spark = SparkSession.builder.getOrCreate()                    #Creating Spark Session
import os

sc = spark.sparkContext                                       #Initialize Spark Session

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-------------------

# Creating Variables

## Buckets

In [None]:
# Bucket name is taken from the MANUAL_INPUT_BUCKET environment variable;
# a missing variable raises KeyError here.
SPG_MANUAL_INPUT_BUCKET = os.environ['MANUAL_INPUT_BUCKET']

In [None]:
# Bucket name is taken from the INTEGRATION_INPUT_BUCKET environment variable;
# a missing variable raises KeyError here.
SPG_INTEGRATION_INPUT_BUCKET = os.environ['INTEGRATION_INPUT_BUCKET']

In [None]:
# Bucket name is taken from the QUERY_BUCKET environment variable;
# a missing variable raises KeyError here.
SPG_QUERY_BUCKET = os.environ['QUERY_BUCKET']

In [None]:
# Bucket name is taken from the GERDAU_BUCKET environment variable;
# a missing variable raises KeyError here.
GERDAU_BUCKET = os.environ['GERDAU_BUCKET']

In [None]:
# Bucket name is taken from the OUTPUT_BUCKET environment variable;
# a missing variable raises KeyError here.
SPG_OUTPUT_BUCKET = os.environ['OUTPUT_BUCKET']

In [None]:
# Bucket name is taken from the INPUT_BUCKET_FROM_OUTPUT environment variable
# (note the variable name differs from the constant); a missing variable raises KeyError here.
SPG_INPUT_BUCKET = os.environ['INPUT_BUCKET_FROM_OUTPUT']

## Input Paths

In [None]:
# Object key of the HANA parquet dataset. Nothing is read here: the key is
# appended to SPG_INTEGRATION_INPUT_BUCKET when df_hana is loaded.
SPG_INTEGRATION_INPUT_BUCKET_HANA = "SPG_FACTS/SPG_HANA/SPG_HANA.parquet"

In [None]:
# Object key of the SPG regions parquet dataset; appended to
# SPG_INTEGRATION_INPUT_BUCKET when df_region is loaded.
SPG_INTEGRATION_INPUT_BUCKET_REGIONS = "SPG_DIMENSIONS/SPG_REGIOES/SPG_RG_REGIOES/SPG_RG_REGIOES.parquet"

In [None]:
# Object key of the price-collect parquet dataset. Nothing is read here: the key
# is appended to SPG_INPUT_BUCKET when df_competitors is loaded.
# NOTE(review): the folder is spelled "SGP_" while the file is "SPG_" - confirm this matches the real object key.
SPG_INPUT_BUCKET_COLLECT = "SGP_PRICE_COLLECT/SPG_PC_COLETA.parquet"

## Output Paths

In [None]:
# Object key for the processed HANA result.
# presumably joined onto SPG_OUTPUT_BUCKET when the result is written - the write is not visible in this part of the notebook.
SPG_OUTPUT_BUCKET_HANA = "SGP_PRICE_COLLECT/SPG_PC_HANA/tb_SPG_PC_HANA.parquet"

----------------

# Creating Defined Functions

In [35]:
def remove_some_accents(col_name):
    """Replace upper-case accented letters in a string column with their plain letter.

    Only the upper-case characters listed in ``accent_pairs`` are mapped;
    lower-case accented letters pass through unchanged, so callers should
    apply ``upper`` first when the text may contain them.

    Args:
        col_name: Column (or column name) holding the text to clean.

    Returns:
        A Column expression with every listed accented character substituted.
    """
    accent_pairs = [('Á', 'A'), ('Ã', 'A'), ('À', 'A'), ('Â', 'A'), ('Ä', 'A'),
                    ('É', 'E'), ('È', 'E'), ('Ê', 'E'), ('Ë', 'E'),
                    ('Í', 'I'), ('Ì', 'I'), ('Î', 'I'), ('Ï', 'I'),
                    ('Ó', 'O'), ('Õ', 'O'), ('Ò', 'O'), ('Ô', 'O'), ('Ö', 'O'),
                    ('Ú', 'U'), ('Ù', 'U'), ('Û', 'U'), ('Ü', 'U'),
                    ('Ç', 'C')]
    # Both strings are derived from the same pair list so they stay position-aligned.
    accented = "".join(src for src, _ in accent_pairs)
    plain = "".join(dst for _, dst in accent_pairs)
    # A single character-for-character translate pass replaces what used to be
    # one nested regexp_replace call per accented character.
    return translate(col_name, accented, plain)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [36]:
# Replacing hyphens by /
def remove_some_hifen(col_name):
    """Return the column with every hyphen replaced by a forward slash.

    Despite the name, nothing is removed: each "-" becomes "/".
    ``replace_some_hifen`` is the variant that deletes hyphens.

    Args:
        col_name: Column (or column name) holding the text to process.

    Returns:
        A Column expression with "-" substituted by "/".
    """
    removed_chars = ("-",)
    # Prefix each character with a backslash so the regex engine matches it literally.
    regexp = "|".join("\\" + ch for ch in removed_chars)
    return regexp_replace(col_name, regexp, "/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [37]:
# Replacing dots by /
def remove_some_dots(col_name):
    """Return the column with every dot replaced by a forward slash.

    Despite the name, nothing is removed: each "." becomes "/".

    Args:
        col_name: Column (or column name) holding the text to process.

    Returns:
        A Column expression with "." substituted by "/".
    """
    removed_chars = (".",)
    # The backslash is essential here: an unescaped "." would match any character.
    regexp = "|".join("\\" + ch for ch in removed_chars)
    return regexp_replace(col_name, regexp, "/")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [38]:
# Removing hyphens
def replace_some_hifen(col_name):
    """Return the column with every hyphen deleted.

    Despite the name, the hyphen is replaced by the empty string, i.e. removed.
    ``remove_some_hifen`` is the variant that turns hyphens into "/".

    Args:
        col_name: Column (or column name) holding the text to process.

    Returns:
        A Column expression with all "-" characters stripped.
    """
    removed_chars = ("-",)
    # Prefix each character with a backslash so the regex engine matches it literally.
    regexp = "|".join("\\" + ch for ch in removed_chars)
    return regexp_replace(col_name, regexp, "")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [39]:
# Removing spaces
def replace_some_space(col_name):
    """Return the column with every space character deleted.

    Only the plain space " " is targeted; tabs and other whitespace are kept.

    Args:
        col_name: Column (or column name) holding the text to process.

    Returns:
        A Column expression with all " " characters stripped.
    """
    removed_chars = (" ",)
    # Prefix each character with a backslash so the regex engine matches it literally.
    regexp = "|".join("\\" + ch for ch in removed_chars)
    return regexp_replace(col_name, regexp, "")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [40]:
# Removing apostrophes
def replace_some_apostrophe(col_name):
    """Return the column with every straight apostrophe (') deleted.

    Args:
        col_name: Column (or column name) holding the text to process.

    Returns:
        A Column expression with all "'" characters stripped.
    """
    removed_chars = ("'",)
    # Prefix each character with a backslash so the regex engine matches it literally.
    regexp = "|".join("\\" + ch for ch in removed_chars)
    return regexp_replace(col_name, regexp, "")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------

# Importing Tables

In [41]:
# Load the HANA billing parquet from the integration input bucket (s3a)
df_hana = spark.read.parquet(
    "s3a://" + SPG_INTEGRATION_INPUT_BUCKET + "/" + SPG_INTEGRATION_INPUT_BUCKET_HANA
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [42]:
# Spread the HANA rows over 144 partitions and mark them for in-memory caching.
# repartition() is what sets the partition count; DataFrameWriter.partitionBy()
# takes column names and only has an effect on a writer that is actually saved.
df_hana = df_hana.repartition(144)
# Left as the last expression so the cell still displays the DataFrame schema.
df_hana.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[ABCCustomerClassification: string, BilDocNumber: string, BilDocItem: string, BILDOCDATE: string, BillingType: string, BillingTypeText: string, BusinessDivision: string, BusinessDivisionText: string, ShipToCity: string, SoldToCity: string, CompanyCode: string, CompanyCodeText: string, Conv33: string, SalesDoc_CorpGroup: string, CorporateGroupName: string, ShipToCountry: string, SoldToCountry: string, CustomerGroup: string, CustomerGroupName: string, CustomerGroup3: string, CustomerGroup3BRName: string, DateMonth: string, DateQuarter: string, DateWeek: string, DateYearMonth: string, DateYear: string, DistributionChannel: string, DistributionChannelName: string, MaterialDivision: string, MaterialDivisionName: string, Division: string, DivisionName: string, DocumentCurrency: string, EndMarket: string, GDD: string, GPD: string, GPDName: string, GPM: string, GPP: string, SoldToGrupoNISEBR: string, SalesDoc_InsideRep: string, SalesOrderInsideRepName: string, ShipToInsideRep: string,

In [43]:
# Load the SPG regions parquet from the integration input bucket (s3a)
df_region = spark.read.parquet(
    "s3a://" + SPG_INTEGRATION_INPUT_BUCKET + "/" + SPG_INTEGRATION_INPUT_BUCKET_REGIONS
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [44]:
# Spread the region rows over 144 partitions and mark them for in-memory caching.
# repartition() is what sets the partition count; DataFrameWriter.partitionBy()
# takes column names and only has an effect on a writer that is actually saved.
df_region = df_region.repartition(144)
# Left as the last expression so the cell still displays the DataFrame schema.
df_region.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[TIMESTAMP: string, Cod_Vendedor: string, Nome_vendedor: string, COD_CLIENTE: string, NOME_CLIENTE: string, COD_PRODUCT: string, DESC_PRODUCT: string, GPM: string, FABRICANTE: string, preco_com_impostos: double, PRAZO_DE_PAGAMENTO: string, Tipo_de_Preco: string, UNIDADE_DE_MEDIDA: string, CIDADE: string, ESTADO: string, OVAjustada: string, MATERIAL: string, Preco21diasKG: double, MesAno: string, DiasCorridos: double, DiasFaltantes: double, ChaveVendedorMetas: string, Aprovado: string, Filial: string, Regiao: string, Median_Avg_PrecoPraticadoKG: double, Count_Qtd_Fatura: string, precoSemana: double, DESC_GPD: string, StatusAtual: string, Tipo_de_Preco_Ajustes: string, Unidade_de_analise: string, StdDev_PrecoConcorrente: double]

In [45]:
# Load the competitor price-collect parquet from the input bucket (s3a)
df_competitors = spark.read.parquet(
    "s3a://" + SPG_INPUT_BUCKET + "/" + SPG_INPUT_BUCKET_COLLECT
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [46]:
# Spread the competitor rows over 144 partitions and mark them for in-memory caching.
# repartition() is what sets the partition count; DataFrameWriter.partitionBy()
# takes column names and only has an effect on a writer that is actually saved.
df_competitors = df_competitors.repartition(144)
# Left as the last expression so the cell still displays the DataFrame schema.
df_competitors.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[IBGE_CITY_ID: bigint, IBGE_MICRO_ID: bigint, IBGE_UF_ID: bigint, IBGE_UF_NAME: string, IBGE_REGION_ID: bigint, IBGE_REGION_NAME: string, IBGE_REGION_ACRONYMS: string, IBGE_UF_ACRONYMS: string, IBGE_MESO_ID: bigint, IBGE_MESO_NAME: string, IBGE_MICRO_NAME: string, IBGE_CITY_NAME: string, SAP_COD: string, SAP_NAME: string, COUNTRY: string, CITY_ZF: int, RG_COD_IBGE: string, RG_REG_SAP: string, IBGE_SAP_CITY_ID: string, IBGE_SAP_CITY_NAME: string, BRANCH: string]

----------------

# Preparing Tables

## Region

In [47]:
# Derive ISSUING_CITY from IBGE_SAP_CITY_NAME by applying, in order:
# hyphen removal, space removal, apostrophe removal, upper-casing, accent stripping.
_city_expr = df_region["IBGE_SAP_CITY_NAME"]
for _clean_step in (replace_some_hifen,
                    replace_some_space,
                    replace_some_apostrophe,
                    upper,
                    remove_some_accents):
    _city_expr = _clean_step(_city_expr)
df_region = df_region.withColumn("ISSUING_CITY", _city_expr)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [48]:
# Keep only the two join keys (city, state acronym renamed to ISSUING_STATE) and the branch.
_region_columns = [
    df_region["ISSUING_CITY"],
    df_region["IBGE_UF_ACRONYMS"].alias("ISSUING_STATE"),
    df_region["BRANCH"],
]
df_region = df_region.select(_region_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [49]:
# Spread the trimmed region table over 144 partitions and mark it for in-memory caching.
# repartition() is what sets the partition count; DataFrameWriter.partitionBy()
# takes column names and only has an effect on a writer that is actually saved.
df_region = df_region.repartition(144)
# Left as the last expression so the cell still displays the DataFrame schema.
df_region.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[ISSUING_CITY: string, ISSUING_STATE: string, BRANCH: string]

-------------

## Hana

In [50]:
# Keep rows whose SalesOrg contains any of the codes below (substring match via LIKE)
# and whose BillingType contains ZF2B.
_sales_org_patterns = ['%BRIN%', '%BRIO%', '%BRDI%', '%BRDO%',
                       '%BRCC%', '%BRCO%', '%BRCG%', '%BRGO%']
_sales_org_match = df_hana.SalesOrg.like(_sales_org_patterns[0])
for _pattern in _sales_org_patterns[1:]:
    _sales_org_match = _sales_org_match | df_hana.SalesOrg.like(_pattern)
_billing_type_match = df_hana.BillingType.like('%ZF2B%')
df_hana = df_hana.filter(_sales_org_match).filter(_billing_type_match)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [51]:
# Keep only the GPDs selected by the business (substring match on GPDName).
# Because every pattern is a "contains" match, a short entry such as '%BR-ATC%'
# also accepts every longer name that includes it (e.g. 'BR-ATC CLARO IND').
_gpd_patterns = [
    '%BR-PERFIS ESTRUTURAIS%',
    '%BR-CA50%',
    '%BR-CA60%',
    '%BR-RECOZIDO%',
    '%BR-TEL TUBO%',
    '%BR-TELA P/ CONCRETO%',
    '%BR-CORTE E DOBR%',
    '%BR-CONSTRUCAO CIVIL%',
    '%BR-CA25%',
    '%BR-PERFIS COMERCIAIS%',
    '%BR-BTG%',
    '%BR-MALHA POP%',
    '%BR-TELA COLUNA%',
    '%BR-TRELIÇA%',
    '%BR-B.TREFILADA%',
    '%BR-OVALADO%',
    '%BR-GALVANIZADO%',
    '%BR-BARRAS COMERCIAIS%',
    '%BR-CAIXOTARIA%',
    '%BR-ARTEFATOS%',
    '%BR-FARPADO%',
    '%BR-ELETRODO%',
    '%BR-SOLDA - MIG%',
    '%BR-CANTON A572%',
    '%BR-GGS%',
    '%BR-ARMADO%',
    '%BR-CORDOALHA%',
    '%BR-ESTACA PRANCHA%',
    '%BR-ARAME PREGO%',
    '%BR-CABEÇA DUPLA%',
    '%BR-CORDOALHA AGRO%',
    '%BR-GRAMPO%',
    '%BR-COBREADOS%',
    '%BR-CHAPA LQ%',
    '%BR-UDC%',
    '%BR-CHAPA ZN%',
    '%BR-TELHA AZ%',
    '%BR-TUBO ZN%',
    '%BR-MARCENARIA%',
    '%BR-PREGOES%',
    '%BR-TELHEIRO%',
    '%BR-COLUNA%',
    '%BR-ESTRIBO%',
    '%BR-ACESSORIOS%',
    '%BR-CHAPA LCG%',
    '%BR-CHAPA LF%',
    '%BR-TUBO LF%',
    '%BR-CHAPA LQ PISO%',
    '%BR-BOBININHA%',
    '%BR-ESPECIAIS%',
    '%BR-BOBINA LQ%',
    '%BR-FITA LQ%',
    '%BR-BOBINA AZ%',
    '%BR-AÇOS ESPECIAIS%',
    '%BR-PARAFUSOS%',
    '%BR-CIMENTO%',
    '%BR-TUBO LQ%',
    '%BR-TELHA ZN%',
    '%BR-BTC GLV CP%',
    '%BR-BOBINA ZN%',
    '%BR-S-BARRA LAMINADA MÉDIA%',
    '%BR-S-BARRA LAMINADA PESADA%',
    '%BR-FITA LF%',
    '%BR-FITA AZ%',
    '%BR-S-BARRA LAMINADA LEVE%',
    '%BR-ALAMBRADO%',
    '%BR-SAPATA%',
    '%BR-MOURÃO%',
    '%BR-ATC CLARO IND%',
    '%BR-POLIDO%',
    '%BR-PERFIL BENEFICIADO%',
    '%BR-BOBINA LF%',
    '%BR-S-BARRA CTT%',
    '%BR-CHAPA AZ%',
    '%BR-FITA ZN%',
    '%BR-LDA%',
    '%BR-PIATINA CLARA%',
    '%BR-S-BARRA DESCASCADA%',
    '%BR-MESH%',
    '%BR-S-BARRA FORJADA FINA%',
    '%BR-S-BARRA RETIFICADA%',
    '%BR-S-BARRA TREFILADA%',
    '%BR-PLACA%',
    '%BR-COIL%',
    '%BR-S-FERRAMENTA%',
    '%BR-S-FIO MÁQUINA STT%',
    '%BR-ATC CLARO MOL COL%',
    '%BR-ATC GLV CP%',
    '%BR-B.CHATA LONGARINA%',
    '%BR-BT FORJARIA BLOCOS%',
    '%BR-TARUGO%',
    '%BR-DRAWING%',
    '%BR-ATC CLARO ENF ALG%',
    '%BR-PIATINA GLV%',
    '%BR-PERFIL DORMENTE%',
    '%BR-ATC%',
    '%BR-CHQ%',
    '%BR-ALMA DE ELETRODO%',
    '%BR-FIO MAQUINA%',
    '%BR-CHQ BORO%',
    '%BR-PERFIL GUIA ELEV%',
]
# OR the individual LIKE tests together, in list order.
_gpd_match = df_hana.GPDName.like(_gpd_patterns[0])
for _pattern in _gpd_patterns[1:]:
    _gpd_match = _gpd_match | df_hana.GPDName.like(_pattern)
df_hana = df_hana.filter(_gpd_match)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [52]:
# Keep rows whose GPM contains one of the groups below, plus rows with no GPM at all.
# Note: in a LIKE pattern "_" matches any single character, so e.g.
# '%PERFIS_ESTRUTURAIS%' also accepts "PERFIS ESTRUTURAIS".
_gpm_patterns = [
    '%PERFIS_ESTRUTURAIS%',
    '%VERGALHAO%',
    '%AMPLIADOS%',
    '%CORTE_DOBRA%',
    '%PREGOS%',
    '%B&P%',
    '%ARAMES_AGRO%',
    '%ARAMES_IND%',
    '%PLANOS_LQ%',
    '%PLANOS_REVESTIDOS%',
    '%PLANOS_LF%',
    '%PLANOS_LCG%',
    '%FIO_MAQUINA%',
    '%BT_FORJARIA%',
    '%PLACAS%',
    '%TARUGO%',
]
_gpm_match = df_hana.GPM.like(_gpm_patterns[0])
for _pattern in _gpm_patterns[1:]:
    _gpm_match = _gpm_match | df_hana.GPM.like(_pattern)
# Rows with a null GPM are deliberately let through as well.
_gpm_match = _gpm_match | df_hana.GPM.isNull()
df_hana = df_hana.filter(_gpm_match)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [53]:
# Project the HANA table down to the columns used below, renaming each one
# (source name -> working name), then drop fully identical rows.
_hana_columns = [
    ("SalesOrg", "SALES_ORG_COD"),
    ("SalesGroupName", "SALES_GROUP_DESC"),
    ("SoldToCity", "ISSUING_CITY"),
    ("SoldToState", "ISSUING_STATE"),
    ("BILDOCDATE", "BILLING_DATE"),
    ("SalesOrderDate", "ORDER_DATE"),
    ("Material", "MATERIAL_COD"),
    ("MA_BillingQuantity_P", "QUANTITY_TON"),
    ("DP_CM_PRECOPRATICADO_D", "PRACTICED_PRICE"),
    ("DA_CM_CD_ZD05_D", "ZD05"),
    ("AA_CM_CD_ZEAF_D", "ZEAF"),
    ("MA_CM_CD_BX41_D", "BX41"),
]
_hana_projection = [df_hana[_source].alias(_target) for _source, _target in _hana_columns]
df_hana = df_hana.select(_hana_projection).dropDuplicates()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [54]:
# Fold the alternate sales-org codes into their base code
# (BRCO -> BRCC, BRGO -> BRCG, BRIO -> BRIN, BRDO -> BRDI); any other value is kept as is.
# The first matching rule wins, in the order written.
_sales_org = df_hana.SALES_ORG_COD
_sales_org_folded = (
    when(_sales_org.like("%BRCO%"), "BRCC")
    .when(_sales_org.like("%BRGO%"), "BRCG")
    .when(_sales_org.like("%BRIO%"), "BRIN")
    .when(_sales_org.like("%BRDO%"), "BRDI")
    .otherwise(_sales_org)
)
df_hana = df_hana.withColumn("SALES_ORG_COD", _sales_org_folded)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [55]:
# The measure columns arrive as strings; cast each of them to float in place.
_numeric_columns = ["QUANTITY_TON", "PRACTICED_PRICE", "ZD05", "ZEAF", "BX41"]
for col_name in _numeric_columns:
    _as_float = df_hana[col_name].cast('float')
    df_hana = df_hana.withColumn(col_name, _as_float)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [56]:
# Keep rows whose quantity is zero or positive; negative and null quantities are discarded.
# NOTE(review): zero is kept, yet QUANTITY_TON is used as a divisor when
# PRACTICED_PRICE is rebuilt further down - confirm whether `> 0` was intended.
_non_negative_quantity = col("QUANTITY_TON") >= 0
df_hana = df_hana.filter(_non_negative_quantity)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [57]:
# Rebuild PRACTICED_PRICE as a per-unit value:
#   (PRACTICED_PRICE + |ZD05| - ZEAF + BX41) / QUANTITY_TON / 1000
# `abs` is pyspark.sql.functions.abs (brought in by the wildcard import), not the builtin.
# NOTE(review): this used to be wrapped in when(SALES_ORG_COD like '%BRCG%')/otherwise
# with the very same expression in both branches, so the condition had no effect and
# was removed. If BRCG was meant to use a different formula, reinstate it here.
_adjusted_total = col("PRACTICED_PRICE") + abs(col("ZD05")) - col("ZEAF") + col("BX41")
_price_per_unit = (_adjusted_total / col("QUANTITY_TON")) / 1000
df_hana = df_hana.withColumn("PRACTICED_PRICE", _price_per_unit)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [58]:
# Label each row with a price type derived from its sales-org code:
# codes containing "CC" and "CG" get their own label, everything else the third one.
_org_code = df_hana["SALES_ORG_COD"]
_type_price = (
    when(_org_code.like("%CC%"), "Usina-Mercado")
    .when(_org_code.like("%CG%"), "Distribuição-Mercado")
    .otherwise("Usina-Distribuição")
)
df_hana = df_hana.withColumn("TYPE_PRICE", _type_price)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [59]:
# Rewrite ORDER_DATE as dash-separated year-month-day text so that the date
# functions applied to it later (weekofyear / year) can parse it.
# assumes ORDER_DATE starts with yyyyMMdd - TODO confirm against the HANA extract
# The day is two characters (positions 7-8); a length of 3 would also pull in
# whatever character follows the day and yield an unparseable date.
df_hana = df_hana.withColumn(
    'ORDER_DATE',
    concat(
        substring('ORDER_DATE', 1, 4),   # year
        lit("-"),
        substring('ORDER_DATE', 5, 2),   # month
        lit("-"),
        substring('ORDER_DATE', 7, 2),   # day
    ),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [60]:
# Normalize ISSUING_CITY with the same steps used for the region table, in order:
# hyphen removal, space removal, apostrophe removal, upper-casing, accent stripping.
_city_expr = df_hana["ISSUING_CITY"]
for _clean_step in (replace_some_hifen,
                    replace_some_space,
                    replace_some_apostrophe,
                    upper,
                    remove_some_accents):
    _city_expr = _clean_step(_city_expr)
df_hana = df_hana.withColumn("ISSUING_CITY", _city_expr)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [61]:
# Attach BRANCH to each billing row by matching normalized city and state.
# Inner join: billing rows with no matching region entry are dropped.
# NOTE(review): df_region is not de-duplicated on (city, state) in the cells shown,
# so a key that appears more than once there would multiply billing rows - verify.
_region_join_keys = ["ISSUING_CITY", "ISSUING_STATE"]
df_hana = df_hana.join(df_region, _region_join_keys, 'inner')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [62]:
# Spread the prepared HANA table over 144 partitions and mark it for in-memory caching.
# repartition() is what sets the partition count; DataFrameWriter.partitionBy()
# takes column names and only has an effect on a writer that is actually saved.
df_hana = df_hana.repartition(144)
# Left as the last expression so the cell still displays the DataFrame schema.
df_hana.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[ISSUING_CITY: string, ISSUING_STATE: string, SALES_ORG_COD: string, SALES_GROUP_DESC: string, BILLING_DATE: string, ORDER_DATE: string, MATERIAL_COD: string, QUANTITY_TON: float, PRACTICED_PRICE: double, ZD05: float, ZEAF: float, BX41: float, TYPE_PRICE: string, BRANCH: string]

-----------------

## Competitors

In [64]:
# Build KEY_DATE by concatenating, with no separator: adjusted sales org, product
# code, branch, price type (spaces removed) and timestamp.
# concat() returns null when any of its parts is null.
_competitor_key_parts = [
    col("ADJ_SALES_ORG"),
    col("COD_PRODUCT"),
    col("BRANCH"),
    replace_some_space("PRICE_TYPE"),
    col("TIMESTAMP"),
]
df_competitors = df_competitors.withColumn("KEY_DATE", concat(*_competitor_key_parts))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
# Rename the competitor columns (current name -> new name), applied in list order.
# withColumnRenamed is a no-op for a name that is not present in the DataFrame.
_competitor_renames = [
    ("CITY", "CIDADE"),
    ("COD_SALES_AGENT", "Cod_Vendedor"),
    ("SALES_AGENT", "Nome_vendedor"),
    ("STATE", "ESTADO"),
    ("COD_CLIENT", "COD_CLIENTE"),
    ("CLIENT_NAME", "NOME_CLIENTE"),
    ("UNIT_MEASURE", "UNIDADE_DE_MEDIDA"),
    ("PAYMENT_TERM", "PRAZO_DE_PAGAMENTO"),
    ("PRICE_WITH_TAXES", "PRECO_COM_IMPOSTOS"),
    ("COMPETITOR", "FABRICANTE"),
    ("CITY_STATE", "CIDADEESTADO"),
    ("DESC_PRODUCT", "DESC_MATERIAL"),
    ("ADJ_SALES_ORG", "OVajustada"),
    ("KEY_SELLERS_MARKS", "ChaveVendedorMetas"),
    ("KEY_SELLERS", "ChaveVendedor"),
    ("REGION", "Regiao"),
    ("PRACTICED_PRICE_KG_MEDIAN_AVG", "Median_Avg_PrecoPraticadoKG"),
    ("BRANCH", "Filial"),
    ("BRANCH_CAPACITY", "CapacidadeFilial"),
    ("PRE_APPROVED", "Pre_Aprovado"),
    ("COMPETITOR_PRICE_STD_DEVIATION", "StdDev_PrecoConcorrente"),
    ("WEEK_PRICE", "precoSemana"),
    ("CURRENT_STATUS", "StatusAtual"),
    ("APPROVED", "Aprovado"),
    ("ADJ_PRICE_TYPE", "Tipo_de_Preco_Ajustes"),
    ("ANALYSIS_UNIT", "Unidade_de_analise"),
    ("BILLING_KEY", "ChaveFatura"),
    ("PRICE_21_DAYS_KG", "Preco21diasKG"),
    ("COUTN_BILLING_QTT", "Count_Qtd_Fatura"),
    ("MONTH_YEAR", "MesAno"),
    ("PASS_DAYS", "DiasCorridos"),
    ("REMAINING_DAYS", "DiasFaltantes"),
    ("PRICE_TYPE", "TIPO_DE_PRECO"),
]
for _current_name, _new_name in _competitor_renames:
    df_competitors = df_competitors.withColumnRenamed(_current_name, _new_name)

-------------

# Processing Tables

In [324]:
# Build the lookup keys and helper columns on the HANA table.
# Every key starts with the same four parts: characters 3-4 of the sales-org code,
# the material code cast to int, the branch and the price type.
_key_prefix = [
    substring("SALES_ORG_COD", 3, 2),
    col("MATERIAL_COD").cast('int'),
    col("BRANCH"),
    col("TYPE_PRICE"),
]
# Daily keys end with the date; weekly keys end with year followed by week number.
_key_date_bill = concat(*(_key_prefix + [col("BILLING_DATE")]))
_key_date_order = concat(*(_key_prefix + [col("ORDER_DATE")]))
_key_week_bill = concat(*(_key_prefix + [col("YEAR_NUMBER_BILL"), col("WEEK_NUMBER_BILL")]))
_key_week_order = concat(*(_key_prefix + [col("YEAR_NUMBER_ORDER"), col("WEEK_NUMBER_ORDER")]))

# Column order matters: each key is added only after the columns it reads exist.
_derived_columns = [
    ("BILLING_DATE", substring("BILLING_DATE", 1, 10)),   # keep the first 10 characters only
    ("KEY_DATE_BILL", _key_date_bill),
    ("KEY_DATE_ORDER", _key_date_order),
    ("WEEK_NUMBER_BILL", weekofyear("BILLING_DATE")),
    ("YEAR_NUMBER_BILL", year("BILLING_DATE")),
    ("KEY_WEEK_BILL", _key_week_bill),
    ("WEEK_NUMBER_ORDER", weekofyear("ORDER_DATE")),
    ("YEAR_NUMBER_ORDER", year("ORDER_DATE")),
    ("KEY_WEEK_ORDER", _key_week_order),
    # Two working copies of the price, aggregated later as a count and as a mean.
    ("PRACTICED_PRICE_DATE_COUNT", col("PRACTICED_PRICE")),
    ("PRACTICED_PRICE_DATE_AVG", col("PRACTICED_PRICE")),
]
for _column_name, _column_expr in _derived_columns:
    df_hana = df_hana.withColumn(_column_name, _column_expr)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

---------------

## Creating a Grouped Data And Uniting With the Main Data Frame

In [325]:
# Choose the date/week key per row: sales organisation "BRDI" is keyed by
# order date, every other sales organisation by billing date.
is_di_sales_org = col('SALES_ORG_COD') == "BRDI"

df_hana = (
    df_hana
    .withColumn('KEY_DATE',
                when(is_di_sales_org, col('KEY_DATE_ORDER'))
                .otherwise(col('KEY_DATE_BILL')))
    .withColumn('KEY_WEEK',
                when(is_di_sales_org, col('KEY_WEEK_ORDER'))
                .otherwise(col('KEY_WEEK_BILL')))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [326]:
# Aggregate per KEY_DATE: mean of PRACTICED_PRICE_DATE_AVG and count of
# PRACTICED_PRICE_DATE_COUNT. The key is renamed to KEY_DATE_drop so the
# grouped frame does not share a column name with df_hana's KEY_DATE.
daily_aggregations = {
    'PRACTICED_PRICE_DATE_AVG': 'mean',
    'PRACTICED_PRICE_DATE_COUNT': 'count',
}
df_hana_grouped_day = (
    df_hana
    .groupby(['KEY_DATE'])
    .agg(daily_aggregations)
    .withColumnRenamed("KEY_DATE", "KEY_DATE_drop")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [327]:
# Attach the per-day aggregates to every Hana row (inner join on the date
# key), then discard the raw helper columns, the temporary join key and
# TYPE_PRICE.
same_key_date = df_hana.KEY_DATE == df_hana_grouped_day.KEY_DATE_drop
df_hana = (
    df_hana
    .join(df_hana_grouped_day, same_key_date, how='inner')
    .drop("PRACTICED_PRICE_DATE_COUNT",
          "PRACTICED_PRICE_DATE_AVG",
          "KEY_DATE_drop",
          "TYPE_PRICE")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [328]:
# Give the aggregate columns generated by the per-day grouping plain names.
daily_renames = [
    ("avg(PRACTICED_PRICE_DATE_AVG)", "PRACTICED_PRICE_DATE_AVG"),
    ("count(PRACTICED_PRICE_DATE_COUNT)", "PRACTICED_PRICE_DATE_COUNT"),
]
for generated_name, final_name in daily_renames:
    df_hana = df_hana.withColumnRenamed(generated_name, final_name)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [329]:
# Aggregate per KEY_WEEK: mean of PRACTICED_PRICE_DATE_AVG over all rows that
# share the week key, exposed as PRACTICED_PRICE_WEEK_AVG. The key is renamed
# to KEY_WEEK_drop so it does not clash with df_hana's KEY_WEEK in the join.
weekly_aggregations = {'PRACTICED_PRICE_DATE_AVG': 'mean'}
df_hana_grouped_week = (
    df_hana
    .groupby(['KEY_WEEK'])
    .agg(weekly_aggregations)
    .withColumnRenamed("avg(PRACTICED_PRICE_DATE_AVG)", 'PRACTICED_PRICE_WEEK_AVG')
    .withColumnRenamed("KEY_WEEK", "KEY_WEEK_drop")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [330]:
# Attach the per-week average to every Hana row (inner join on the week key)
# and discard the temporary join key afterwards.
same_key_week = df_hana.KEY_WEEK == df_hana_grouped_week.KEY_WEEK_drop
df_hana = (
    df_hana
    .join(df_hana_grouped_week, same_key_week, how='inner')
    .drop("KEY_WEEK_drop")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [331]:
# Drop the columns that are no longer needed once the keys and the
# day/week aggregates have been computed.
unneeded_columns = [
    "SALES_ORG_COD",
    "SALES_GROUP_DESC",
    "ISSUING_CITY",
    "MATERIAL_COD",
    "QUANTITY_TON",
    "PRACTICED_PRICE",
    "ZD05",
    "ZEAF",
    "BX41",
    "BRANCH",
    "YEAR_NUMBER",
]
df_hana = df_hana.drop(*unneeded_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [332]:
# Keep a single copy of rows that are identical across all remaining columns.
df_hana = df_hana.distinct()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [333]:
# Spread df_hana over a fixed number of partitions and keep it in memory;
# it is used again below by the anti join and the union.
# Note: DataFrameWriter.partitionBy() takes column names for the output
# layout and does not set a partition count, so only repartition() is used.
HANA_NUM_PARTITIONS = 144
df_hana = df_hana.repartition(HANA_NUM_PARTITIONS)
df_hana.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[ISSUING_STATE: string, BILLING_DATE: string, ORDER_DATE: string, KEY_DATE_BILL: string, KEY_DATE_ORDER: string, WEEK_NUMBER_BILL: int, YEAR_NUMBER_BILL: int, KEY_WEEK_BILL: string, WEEK_NUMBER_ORDER: int, YEAR_NUMBER_ORDER: int, KEY_WEEK_ORDER: string, KEY_DATE: string, KEY_WEEK: string, PRACTICED_PRICE_DATE_AVG: double, PRACTICED_PRICE_DATE_COUNT: bigint, PRACTICED_PRICE_WEEK_AVG: double]

----------------

# Inserting New Rows

In [334]:
# Build df_out from the competitor rows whose KEY_DATE has no match in
# df_hana (left anti join keeps only the left-side rows without a match).
df_out = df_competitors.join(df_hana, on=["KEY_DATE"], how='left_anti')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [335]:
# Rebuild, for the competitor-only rows, the columns that df_hana uses:
# BILLING_DATE, KEY_DATE and KEY_WEEK (plus the week/year helper columns).
# NOTE(review): KEY_WEEK below has no price-type component, whereas the Hana
# week keys (KEY_WEEK_BILL / KEY_WEEK_ORDER) include TYPE_PRICE -- confirm
# that this difference is intended.
df_out = (
    df_out
    .withColumn("BILLING_DATE", col("TIMESTAMP"))
    .withColumn("KEY_DATE", concat(col("OVAjustada"),
                                   col("COD_PRODUCT"),
                                   col("Filial"),
                                   replace_some_space("TIPO_DE_PRECO"),
                                   col("TIMESTAMP")))
    .withColumn("WEEK_NUMBER", weekofyear("BILLING_DATE"))
    .withColumn("YEAR_NUMBER", year("BILLING_DATE"))
    .withColumn("KEY_WEEK", concat(col("OVAjustada"),
                                   col("COD_PRODUCT"),
                                   col("Filial"),
                                   col("YEAR_NUMBER"),
                                   col("WEEK_NUMBER")))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [336]:
# Keep only the billing date and the two keys.
df_out_columns = ["BILLING_DATE", "KEY_DATE", "KEY_WEEK"]
df_out = df_out.select(*df_out_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [337]:
# Add the price-metric columns with the placeholder "0"; these rows come from
# the anti join and therefore have no Hana prices.
# NOTE(review): lit("0") is a string literal, while df_hana holds these
# columns as double/bigint -- confirm the column type after the union is
# the one intended.
for placeholder_column in ("PRACTICED_PRICE_DATE_AVG",
                           "PRACTICED_PRICE_DATE_COUNT",
                           "PRACTICED_PRICE_WEEK_AVG"):
    df_out = df_out.withColumn(placeholder_column, lit("0"))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [338]:
# Remove the intermediate key/date helper columns and ISSUING_STATE from
# df_hana before it is united with df_out.
helper_columns = [
    'KEY_DATE_BILL',
    'KEY_DATE_ORDER',
    'KEY_WEEK_ORDER',
    'KEY_WEEK_BILL',
    'ORDER_DATE',
    'WEEK_NUMBER_BILL',
    'YEAR_NUMBER_BILL',
    'WEEK_NUMBER_ORDER',
    'YEAR_NUMBER_ORDER',
    'ISSUING_STATE',
]
df_hana = df_hana.drop(*helper_columns)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [339]:
# Append the df_out rows to df_hana, matching columns by name rather than by
# position.
df_hana=df_hana.unionByName(df_out)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [340]:
# Replace missing price metrics with the placeholder "0".
# NOTE(review): the replacement is the string "0", not a number -- confirm
# the resulting column type is the one intended for the output table.
for metric_column in ("PRACTICED_PRICE_DATE_AVG",
                      "PRACTICED_PRICE_DATE_COUNT",
                      "PRACTICED_PRICE_WEEK_AVG"):
    # coalesce() returns the first non-null argument, i.e. the existing value
    # when present and "0" otherwise.
    df_hana = df_hana.withColumn(metric_column,
                                 coalesce(col(metric_column), lit("0")))

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

# Uploading Table


In [342]:
# Write the final table as Parquet to the output bucket, overwriting whatever
# already exists at that path.
hana_output_path = "s3a://" + SPG_OUTPUT_BUCKET + "/" + SPG_OUTPUT_BUCKET_HANA
df_hana.write.mode("overwrite").parquet(hana_output_path)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…