# Importing Libraries

In [108]:
#imports Libs

import os
import pandas as pd
import boto3
import time
from botocore.client import ClientError

import pyarrow.parquet as pq
import s3fs
import calendar

import findspark                                              #Import library to Search for Spark Installation  

findspark.init()                                              #Search Spark Installation

import pyspark                                                #Only run after findspark.init()

from pyspark.sql import SparkSession                          #Import of Spark Session
from pyspark import SparkContext as spark                     #Import the Regular Spark Contex 
from pyspark.sql import SQLContext                            #Import the SQL Spark Contex
from pyspark.sql.functions import *
from pyspark.sql.types import *
# NOTE: this rebinds `spark`; from here on it is the SparkSession, not the
# SparkContext class that the import above aliased to the same name.
spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext                                       # SparkContext owned by the session

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------

# Creating Variables

## Database

In [None]:
# Read from the SPG_DATABASE environment variable (KeyError if unset); passed
# as the database when the Athena view query is submitted further down.
ATHENA_SPG = os.environ['SPG_DATABASE']

In [None]:
# Read from the GERDAU_BW_DATABASE environment variable (KeyError if unset).
# NOTE(review): not referenced anywhere else in the visible notebook.
ATHENA_BW = os.environ['GERDAU_BW_DATABASE']

## Buckets

In [None]:
# Bucket name (MANUAL_INPUT_BUCKET env var) from which the PV and PEX CSV files are read.
SPG_MANUAL_INPUT_BUCKET = os.environ['MANUAL_INPUT_BUCKET']

In [None]:
# Bucket name (INTEGRATION_INPUT_BUCKET env var) from which the products parquet is read.
SPG_INTEGRATION_INPUT_BUCKET = os.environ['INTEGRATION_INPUT_BUCKET']

In [None]:
# Bucket name (QUERY_BUCKET env var) used for the Athena output location and
# polled for the query result file.
SPG_QUERY_BUCKET = os.environ['QUERY_BUCKET']

In [None]:
# Read from the GERDAU_BUCKET environment variable (KeyError if unset).
# NOTE(review): not referenced anywhere else in the visible notebook.
GERDAU_BUCKET = os.environ['GERDAU_BUCKET']

In [None]:
# Bucket name (OUTPUT_BUCKET env var) that every parquet write in this notebook targets.
SPG_OUTPUT_BUCKET = os.environ['OUTPUT_BUCKET']

In [None]:
# Bucket name (INPUT_BUCKET_FROM_OUTPUT env var) from which the staged temp parquet is read back.
# NOTE(review): presumably the same physical bucket as SPG_OUTPUT_BUCKET — confirm.
SPG_INPUT_BUCKET = os.environ['INPUT_BUCKET_FROM_OUTPUT']

## Input Paths

In [None]:
# Object key of the temporary PV CSV file in the SandBox Data Lake
# (joined with SPG_MANUAL_INPUT_BUCKET when the file is loaded).
SPG_MANUAL_INPUT_BUCKET_PV = "SPG_GLOBAL/INPUT/tb_spg_support_pv_all_next.csv"

In [None]:
# Object key of the temporary PEX CSV file in the SandBox Data Lake
# (joined with SPG_MANUAL_INPUT_BUCKET when the file is loaded).
SPG_MANUAL_INPUT_BUCKET_PEX = "SPG_GLOBAL/SUPPORT/tb_spg_support_pex.csv"

In [None]:
# Key of the staged temp parquet when it is read back; same value as SPG_OUTPUT_BUCKET_TEMP.
SPG_INPUT_BUCKET_TEMP = "SPG_TEMP/tb_SPG_TEMP.parquet"

In [None]:
# SPG Products: key of the products parquet inside SPG_INTEGRATION_INPUT_BUCKET.
SPG_INTEGRATION_INPUT_BUCKET_PRODUCTS = "SPG_DIMENSIONS/SPG_PRODUTOS/SPG_PD_PRODUTOS.parquet"

In [None]:
# Query View: SQL text submitted to Athena to export the PV/PEX view.
# NOTE(review): the table is qualified with the literal schema `db_bw`, while
# the query is later submitted with ATHENA_SPG as the database — confirm intent.

QUERY_PEX_PV = "SELECT * FROM db_bw.tb_global_pv_pex_parquet"

## Output Paths

In [None]:
# Key under SPG_OUTPUT_BUCKET where the PV result is staged before the final write.
SPG_OUTPUT_BUCKET_TEMP = "SPG_TEMP/tb_SPG_TEMP.parquet"

In [None]:
# Key under SPG_OUTPUT_BUCKET of the final PV support parquet.
SPG_OUTPUT_BUCKET_PV = "SPG_GLOBAL/SUPPORT/TB_SPG_SUPPORT_PV.parquet"

In [None]:
# Key under SPG_OUTPUT_BUCKET of the final PEX support parquet.
SPG_OUTPUT_BUCKET_PEX = "SPG_GLOBAL/SUPPORT/TB_SPG_SUPPORT_PEX.parquet"

## Boto3 Variables

In [None]:
#S3 Configuration
# FIXME(review): SPG_QUERY_BUCKET_ATHENA is not assigned anywhere in the visible
# notebook, so this raises NameError on a fresh kernel — confirm where it is defined.
# NOTE(review): S3_ATHENA_INPUT is not referenced elsewhere in the visible notebook.
S3_ATHENA_INPUT =  's3://'+SPG_QUERY_BUCKET+'/'+SPG_QUERY_BUCKET_ATHENA

In [None]:
# S3 URI handed to Athena as the query output location.
# FIXME(review): SPG_QUERY_BUCKET_ATHENA is not assigned anywhere in the visible
# notebook, so this raises NameError on a fresh kernel — confirm where it is defined.
S3_ATHENA_OUTPUT = 's3://'+SPG_QUERY_BUCKET+'/'+SPG_QUERY_BUCKET_ATHENA

In [None]:
# AWS region (AWS_REGION env var) passed to both boto3 constructors below.
region_name = os.environ['AWS_REGION']

In [None]:
# Access key id (AWS_ACCESS_KEY env var) passed to both boto3 constructors below.
aws_access_key_id = os.environ['AWS_ACCESS_KEY']

In [None]:
# Secret key (AWS_SECRET_KEY env var) passed to both boto3 constructors below.
aws_secret_access_key = os.environ['AWS_SECRET_KEY']

-------------

# Creating Defined Functions

In [None]:
# Run Query

def run_query(query, database, s3_output):
    """Start an Athena query execution and return the raw API response.

    Relies on the module-level boto3 Athena ``client``.

    Args:
        query: SQL text sent as ``QueryString``.
        database: Name placed under ``Database`` in ``QueryExecutionContext``.
        s3_output: Location sent as ``OutputLocation`` in ``ResultConfiguration``.

    Returns:
        Whatever ``client.start_query_execution`` returns; callers read
        ``QueryExecutionId`` from it.
    """
    execution_context = {'Database': database}
    result_configuration = {'OutputLocation': s3_output}
    return client.start_query_execution(
        QueryString=query,
        QueryExecutionContext=execution_context,
        ResultConfiguration=result_configuration,
    )

In [None]:
def get_aws_path(query,database,s3_output):
    """Submit ``query`` through ``run_query`` and return its execution id.

    Args:
        query: SQL text to run.
        database: Database name for the query execution context.
        s3_output: Output location handed to ``run_query``.

    Returns:
        The ``QueryExecutionId`` value of the response. The caller appends
        ".csv" to it to form the key of the result file.
    """
    response = run_query(query, database, s3_output)
    return response['QueryExecutionId']

In [None]:
# Waiting up to 300 seconds for the object to appear in S3

def wait_athena_load(Bucket, Key):
    """Poll S3 once per second until ``Key`` exists in ``Bucket`` or 300 s elapse.

    Uses the module-level boto3 S3 resource ``s3``.

    Args:
        Bucket: Name of the bucket to look in.
        Key: Object key to wait for.

    Returns:
        True as soon as ``head_object`` succeeds, False if the object was still
        not visible after the timeout. (Previously both outcomes returned None,
        so a timeout could not be told apart from success.)
    """
    time_to_wait = 300  # maximum number of one-second polling attempts

    for _ in range(time_to_wait):
        try:
            s3.meta.client.head_object(Bucket=Bucket,Key=Key)
        except ClientError:
            # Any ClientError is treated as "not there yet", as before.
            time.sleep(1)
        else:
            return True
    return False

In [145]:
# Replacing Characters

def normalizing_characters(col_name):
    """Return a column expression in which every ',' of ``col_name`` becomes '.'.

    ``col_name`` is handed to ``regexp_replace`` unchanged, so it may be
    whatever that function accepts as its first argument.
    """
    return regexp_replace(col_name, ',', '.')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [146]:
# Replacing Characters

def normalizing_characters_2(col_name):
    """Return a column expression in which every space of ``col_name`` is removed.

    ``col_name`` is handed to ``regexp_replace`` unchanged, so it may be
    whatever that function accepts as its first argument.
    """
    return regexp_replace(col_name, ' ', '')

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-----------

# Configuring Boto3 Connection

In [141]:
# Athena client configuration: credentials and region come from the
# environment-backed variables defined earlier; used by run_query.

client = boto3.client(
    'athena',
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [142]:
# S3 resource configuration: same credentials and region as the Athena client;
# used by wait_athena_load to probe for the query result object.

s3 = boto3.resource(
    's3',
    region_name=region_name,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

------------------

# Importing Tables 

In [None]:
# Load the temporary PV CSV from the SandBox Data Lake: header row,
# ';' separator, ISO-8859-1 encoding.
df_pv_futuro = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", ";")
    .option("encoding", "ISO-8859-1")
    .load("s3a://{}/{}".format(SPG_MANUAL_INPUT_BUCKET, SPG_MANUAL_INPUT_BUCKET_PV))
)

In [None]:
# Spread the rows over 144 partitions and mark the frame for in-memory caching.
# The former `df_pv_futuro.write.partitionBy(144)` line was removed: it only
# built a DataFrameWriter that was never saved, and 144 is not a column name.
df_pv_futuro = df_pv_futuro.repartition(144)
df_pv_futuro.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Load the temporary PEX CSV from the SandBox Data Lake: header row,
# ';' separator, ISO-8859-1 encoding.
df_pex = (
    spark.read.format("csv")
    .option("header", "true")
    .option("sep", ";")
    .option("encoding", "ISO-8859-1")
    .load("s3a://{}/{}".format(SPG_MANUAL_INPUT_BUCKET, SPG_MANUAL_INPUT_BUCKET_PEX))
)

In [None]:
# Spread the rows over 144 partitions and mark the frame for in-memory caching.
# The former `df_pex.write.partitionBy(144)` line was removed: it only built a
# DataFrameWriter that was never saved, and 144 is not a column name.
df_pex = df_pex.repartition(144)
df_pex.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# SPG Products: read the products parquet from the integration input bucket.
df_products = spark.read.parquet(
    "s3a://{}/{}".format(SPG_INTEGRATION_INPUT_BUCKET, SPG_INTEGRATION_INPUT_BUCKET_PRODUCTS)
)

In [None]:
# Spread the rows over 144 partitions and mark the frame for in-memory caching.
# The former `df_products.write.partitionBy(144)` line was removed: it only
# built a DataFrameWriter that was never saved, and 144 is not a column name.
df_products = df_products.repartition(144)
df_products.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Import CSV from View: submit the view query to Athena, then block until the
# "<execution id>.csv" result object shows up in the query bucket.
# NOTE(review): SPG_QUERY_BUCKET_ATHENA is not assigned anywhere in the visible
# notebook — confirm where it is defined.

athena_response = get_aws_path(QUERY_PEX_PV, ATHENA_SPG, S3_ATHENA_OUTPUT)

wait_athena_load(
    SPG_QUERY_BUCKET,
    "{}/{}.csv".format(SPG_QUERY_BUCKET_ATHENA, athena_response),
)

In [None]:
# Import CSV from View
# `view_path` was never assigned in the notebook. It is derived here from the
# same bucket and key that wait_athena_load polls for the Athena result,
# using the s3a:// scheme like every other Spark read in this notebook.

view_path = "s3a://"+SPG_QUERY_BUCKET+"/"+SPG_QUERY_BUCKET_ATHENA+"/"+athena_response+".csv"
df_view = spark.read.csv(view_path, header = 'true')

In [None]:
# Spread the rows over 144 partitions and mark the frame for in-memory caching.
# The former `df_view.write.partitionBy(144)` line was removed: it only built a
# DataFrameWriter that was never saved, and 144 is not a column name.
df_view = df_view.repartition(144)
df_view.persist(pyspark.StorageLevel.MEMORY_ONLY)

----------------

# Preparing SPG PV

In [151]:
# Normalise VOLUME: ',' is turned into '.' first, then spaces are stripped.
df_pv_futuro = df_pv_futuro.withColumn(
    "VOLUME",
    normalizing_characters_2(
        normalizing_characters(col("VOLUME"))
    ),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [152]:
# Tag every row with yesterday's year and month as a "yyyyMM" string
# (e.g. "202403"). date_format zero-pads the month, replacing the manual
# concat / when(month >= "10") construction and its trailing line continuation.
df_pv_futuro = df_pv_futuro.withColumn(
    "CURRENT_DATE",
    date_format(date_add(current_timestamp(), -1), "yyyyMM"),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [153]:
# Keep only rows whose MONTH_YEAR sorts strictly after the CURRENT_DATE tag,
# then drop the helper column.
# NOTE(review): presumably MONTH_YEAR uses the same yyyyMM layout — confirm.
df_pv_futuro = (
    df_pv_futuro
    .filter(col("MONTH_YEAR") > col("CURRENT_DATE"))
    .drop("CURRENT_DATE")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [154]:
# SPG Support PV: read the existing PV support parquet.
# NOTE(review): the bucket name is hard-coded here while the other reads and
# writes build their paths from the bucket variables; the key equals
# SPG_OUTPUT_BUCKET_PV — confirm whether this should use those variables.
df_pv = spark.read.parquet("s3a://smartpricinggerdaubucket/SPG_GLOBAL/SUPPORT/TB_SPG_SUPPORT_PV.parquet")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [155]:
# Tag every row with yesterday's year and month as a "yyyyMM" string
# (e.g. "202403"). date_format zero-pads the month, replacing the manual
# concat / when(month >= "10") construction and its trailing line continuation.
df_pv = df_pv.withColumn(
    "CURRENT_DATE",
    date_format(date_add(current_timestamp(), -1), "yyyyMM"),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [156]:
# Keep only rows whose MONTH_YEAR sorts strictly before the CURRENT_DATE tag,
# then drop the helper column.
df_pv = (
    df_pv
    .filter(col("MONTH_YEAR") < col("CURRENT_DATE"))
    .drop("CURRENT_DATE")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [157]:
# Project df_pv down to the seven columns used from here on.
df_pv = df_pv.select(
    "MEASURE_UNIT",
    "MONTH_YEAR",
    "STATE",
    "GPD_COD",
    "SALES_ORG_COD",
    "SALES_OFFICE",
    "VOLUME",
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [158]:
# Normalise VOLUME: ',' is turned into '.' first, then spaces are stripped.
df_pv = df_pv.withColumn(
    "VOLUME",
    normalizing_characters_2(
        normalizing_characters(col("VOLUME"))
    ),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [159]:
# Append the future PV rows (columns matched by name) and remove exact duplicates.
df_pv = (
    df_pv
    .unionByName(df_pv_futuro)
    .dropDuplicates()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

--------------

# Preparing Products

In [161]:
# Keep the four product columns, require both descriptions to be present,
# and keep one row per (COD_GPD, SALES_ORG_COD) pair.
df_products = (
    df_products
    .select("COD_GPD", "DESC_GPD", "DESC_GPM", "SALES_ORG_COD")
    .filter(df_products.DESC_GPD.isNotNull())
    .filter(df_products.DESC_GPM.isNotNull())
    .dropDuplicates(["COD_GPD", "SALES_ORG_COD"])
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [162]:
# Choosing specific GPDs (business call): keep a product when its DESC_GPD
# matches at least one of the LIKE patterns below. The patterns are OR-ed
# together in list order.
_gpd_like_patterns = [
    '%BR-PERFIS ESTRUTURAIS%',
    '%BR-CA50%',
    '%BR-CA60%',
    '%BR-RECOZIDO%',
    '%BR-TEL TUBO%',
    '%BR-TELA P/ CONCRETO%',
    '%BR-CORTE E DOBR%',
    '%BR-CONSTRUCAO CIVIL%',
    '%BR-CA25%',
    '%BR-PERFIS COMERCIAIS%',
    '%BR-BTG%',
    '%BR-MALHA POP%',
    '%BR-TELA COLUNA%',
    '%BR-TRELIÇA%',
    '%BR-B.TREFILADA%',
    '%BR-OVALADO%',
    '%BR-GALVANIZADO%',
    '%BR-BARRAS COMERCIAIS%',
    '%BR-CAIXOTARIA%',
    '%BR-ARTEFATOS%',
    '%BR-FARPADO%',
    '%BR-ELETRODO%',
    '%BR-SOLDA - MIG%',
    '%BR-CANTON A572%',
    '%BR-GGS%',
    '%BR-ARMADO%',
    '%BR-CORDOALHA%',
    '%BR-ESTACA PRANCHA%',
    '%BR-ARAME PREGO%',
    '%BR-CABEÇA DUPLA%',
    '%BR-CORDOALHA AGRO%',
    '%BR-GRAMPO%',
    '%BR-COBREADOS%',
    '%BR-CHAPA LQ%',
    '%BR-UDC%',
    '%BR-CHAPA ZN%',
    '%BR-TELHA AZ%',
    '%BR-TUBO ZN%',
    '%BR-MARCENARIA%',
    '%BR-PREGOES%',
    '%BR-TELHEIRO%',
    '%BR-COLUNA%',
    '%BR-ESTRIBO%',
    '%BR-ACESSORIOS%',
    '%BR-CHAPA LCG%',
    '%BR-CHAPA LF%',
    '%BR-TUBO LF%',
    '%BR-CHAPA LQ PISO%',
    '%BR-BOBININHA%',
    '%BR-ESPECIAIS%',
    '%BR-BOBINA LQ%',
    '%BR-FITA LQ%',
    '%BR-BOBINA AZ%',
    '%BR-AÇOS ESPECIAIS%',
    '%BR-PARAFUSOS%',
    '%BR-CIMENTO%',
    '%BR-TUBO LQ%',
    '%BR-TELHA ZN%',
    '%BR-BTC GLV CP%',
    '%BR-BOBINA ZN%',
    '%BR-S-BARRA LAMINADA MÉDIA%',
    '%BR-S-BARRA LAMINADA PESADA%',
    '%BR-FITA LF%',
    '%BR-FITA AZ%',
    '%BR-S-BARRA LAMINADA LEVE%',
    '%BR-ALAMBRADO%',
    '%BR-SAPATA%',
    '%BR-MOURÃO%',
    '%BR-ATC CLARO IND%',
    '%BR-POLIDO%',
    '%BR-PERFIL BENEFICIADO%',
    '%BR-BOBINA LF%',
    '%BR-S-BARRA CTT%',
    '%BR-CHAPA AZ%',
    '%BR-FITA ZN%',
    '%BR-LDA%',
    '%BR-PIATINA CLARA%',
    '%BR-S-BARRA DESCASCADA%',
    '%BR-MESH%',
    '%BR-S-BARRA FORJADA FINA%',
    '%BR-S-BARRA RETIFICADA%',
    '%BR-S-BARRA TREFILADA%',
    '%BR-PLACA%',
    '%BR-COIL%',
    '%BR-S-FERRAMENTA%',
    '%BR-S-FIO MÁQUINA STT%',
    '%BR-ATC CLARO MOL COL%',
    '%BR-ATC GLV CP%',
    '%BR-B.CHATA LONGARINA%',
    '%BR-BT FORJARIA BLOCOS%',
    '%BR-TARUGO%',
    '%BR-DRAWING%',
    '%BR-ATC CLARO ENF ALG%',
    '%BR-PIATINA GLV%',
    '%BR-PERFIL DORMENTE%',
    '%BR-ATC%',
    '%BR-CHQ%',
    '%BR-ALMA DE ELETRODO%',
    '%BR-FIO MAQUINA%',
    '%BR-CHQ BORO%',
    '%BR-PERFIL GUIA ELEV%',
]

# Fold the patterns into a single left-to-right OR of LIKE clauses.
_gpd_condition = None
for _gpd_pattern in _gpd_like_patterns:
    _gpd_clause = df_products.DESC_GPD.like(_gpd_pattern)
    if _gpd_condition is None:
        _gpd_condition = _gpd_clause
    else:
        _gpd_condition = _gpd_condition | _gpd_clause

df_products = df_products.filter(_gpd_condition)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [163]:
# Filter the GPM within Smart Pricing scope: keep a product when DESC_GPM
# matches one of the LIKE patterns below or is null.
# NOTE(review): '_' is a single-character wildcard in LIKE, so for example
# '%PLANOS_LQ%' also matches "PLANOS LQ" — confirm this is intended.
_gpm_like_patterns = [
    '%PERFIS_ESTRUTURAIS%',
    '%VERGALHAO%',
    '%AMPLIADOS%',
    '%CORTE_DOBRA%',
    '%PREGOS%',
    '%B&P%',
    '%ARAMES_AGRO%',
    '%ARAMES_IND%',
    '%PLANOS_LQ%',
    '%PLANOS_REVESTIDOS%',
    '%PLANOS_LF%',
    '%PLANOS_LCG%',
    '%FIO_MAQUINA%',
    '%BT_FORJARIA%',
    '%PLACAS%',
    '%TARUGO%',
]

# Fold the patterns into a single left-to-right OR of LIKE clauses.
_gpm_condition = None
for _gpm_pattern in _gpm_like_patterns:
    _gpm_clause = df_products.DESC_GPM.like(_gpm_pattern)
    if _gpm_condition is None:
        _gpm_condition = _gpm_clause
    else:
        _gpm_condition = _gpm_condition | _gpm_clause

# NOTE(review): rows with a null DESC_GPM are filtered out in the earlier
# products cell, so the null clause presumably never matches — confirm.
_gpm_condition = _gpm_condition | df_products.DESC_GPM.isNull()

df_products = df_products.filter(_gpm_condition)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [164]:
# Rename COD_GPD to GPD_COD so the key matches the name used in the later joins.
df_products = df_products.withColumnRenamed(
    "COD_GPD",
    "GPD_COD",
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

---------------

# Preparing SPG PEX

In [166]:
# Normalise VOLUME and VALUE_BRL: ',' is turned into '.' first, then spaces are stripped.
df_pex = (
    df_pex
    .withColumn("VOLUME", normalizing_characters_2(normalizing_characters(col("VOLUME"))))
    .withColumn("VALUE_BRL", normalizing_characters_2(normalizing_characters(col("VALUE_BRL"))))
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [167]:
# Attach the product columns to PEX with a left join on (GPD_COD, SALES_ORG_COD);
# PEX rows without a matching product are kept.
df_pex = df_pex.join(
    df_products,
    on=["GPD_COD", "SALES_ORG_COD"],
    how="left",
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [168]:
# Mark the joined PEX frame for MEMORY_ONLY caching; the cell displays the returned DataFrame.
# NOTE(review): the frame is repartitioned and persisted again in later cells.
df_pex.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[GPD_COD: string, SALES_ORG_COD: string, VALUE_BRL: string, STATE: string, MONTH_YEAR: string, SALES_OFFICE: string, MEASURE_UNIT: string, VOLUME: string, DESC_GPD: string, DESC_GPM: string]

In [169]:
# Creating partition with GPD
# NOTE(review): this only builds a DataFrameWriter (see the cell output) and
# never saves it, so the statement has no effect on df_pex or on the later
# parquet write, which creates its own writer — consider removing it.

df_pex.write.partitionBy("GPD_COD")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.readwriter.DataFrameWriter object at 0x7f122c3bb290>

In [170]:
# Repartitioning by GPD: redistribute the rows by the GPD_COD column
# (no partition count is given).

df_pex = df_pex.repartition("GPD_COD")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [171]:
# Mark the repartitioned PEX frame for MEMORY_ONLY caching; the cell displays the returned DataFrame.
df_pex.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[GPD_COD: string, SALES_ORG_COD: string, VALUE_BRL: string, STATE: string, MONTH_YEAR: string, SALES_OFFICE: string, MEASURE_UNIT: string, VOLUME: string, DESC_GPD: string, DESC_GPM: string]

-----------

# Preparing PV Table

In [172]:
# Rename the source columns of the Athena view to the names used by the PV tables.
df_view = (
    df_view
    .withColumnRenamed("0unit", "MEASURE_UNIT")
    .withColumnRenamed("0calmonth", "MONTH_YEAR")
    .withColumnRenamed("gtc100208", "STATE")
    .withColumnRenamed("gtc100362", "GPD_COD")
    .withColumnRenamed("gtc100511", "SALES_ORG_COD")
    .withColumnRenamed("gtc100507", "SALES_OFFICE")
    .withColumnRenamed("gtk100312", "VOLUME")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [173]:
# Replace missing STATE values with the literal string 'NA'.
df_view = df_view.na.fill({'STATE': 'NA'})

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [174]:
# Tag every row with yesterday's year and month as a "yyyyMM" string
# (e.g. "202403"). date_format zero-pads the month, replacing the manual
# concat / when(month >= "10") construction and its trailing line continuation.
df_view = df_view.withColumn(
    "CURRENT_DATE",
    date_format(date_add(current_timestamp(), -1), "yyyyMM"),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [175]:
# Keep rows whose MONTH_YEAR sorts at or after the CURRENT_DATE tag,
# then drop the helper column.
df_view = (
    df_view
    .filter(col("MONTH_YEAR") >= col("CURRENT_DATE"))
    .drop("CURRENT_DATE")
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [176]:
# Project df_view down to the seven columns that are unioned into df_pv.
df_view = df_view.select(
    "MEASURE_UNIT",
    "MONTH_YEAR",
    "STATE",
    "GPD_COD",
    "SALES_ORG_COD",
    "SALES_OFFICE",
    "VOLUME",
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [177]:
# Normalise VOLUME: ',' is turned into '.' first, then spaces are stripped.
df_view = df_view.withColumn(
    "VOLUME",
    normalizing_characters_2(
        normalizing_characters(col("VOLUME"))
    ),
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [178]:
# Append the Athena view rows (columns matched by name) and remove exact duplicates.
df_pv = (
    df_pv
    .unionByName(df_view)
    .dropDuplicates()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [179]:
# Attach the product columns to PV with a left join on (GPD_COD, SALES_ORG_COD),
# then remove exact duplicate rows.
df_pv = (
    df_pv
    .join(df_products, on=["GPD_COD", "SALES_ORG_COD"], how="left")
    .dropDuplicates()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [180]:
# Creating partition with GPD
# NOTE(review): this only builds a DataFrameWriter (see the cell output) and
# never saves it, so the statement has no effect on df_pv or on the later
# parquet write, which creates its own writer — consider removing it.

df_pv.write.partitionBy("GPD_COD")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.readwriter.DataFrameWriter object at 0x7f122c3bb290>

In [181]:
# Repartitioning by GPD: redistribute the rows by the GPD_COD column
# (no partition count is given).

df_pv = df_pv.repartition("GPD_COD")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [182]:
# Mark the repartitioned PV frame for MEMORY_ONLY caching; the cell displays the returned DataFrame.
df_pv.persist(pyspark.StorageLevel.MEMORY_ONLY)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

DataFrame[GPD_COD: string, SALES_ORG_COD: string, MEASURE_UNIT: string, MONTH_YEAR: string, STATE: string, SALES_OFFICE: string, VOLUME: string, DESC_GPD: string, DESC_GPM: string]

---------------

# Uploading Tables

In [183]:
# Stage the PV result as a temporary parquet dataset, replacing any previous one.
df_pv.write.mode("overwrite").parquet(
    "s3a://{}/{}".format(SPG_OUTPUT_BUCKET, SPG_OUTPUT_BUCKET_TEMP)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [184]:
# Read the staged parquet back, remove exact duplicates and cache the result.
df_temp = (
    spark.read
    .parquet("s3a://{}/{}".format(SPG_INPUT_BUCKET, SPG_INPUT_BUCKET_TEMP))
    .dropDuplicates()
    .cache()
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [185]:
# Publish the PV support table from the staged copy, replacing the previous version.
# NOTE(review): presumably staged first because df_pv was itself read from the
# dataset being overwritten here — confirm.
df_temp.write.mode("overwrite").parquet(
    "s3a://{}/{}".format(SPG_OUTPUT_BUCKET, SPG_OUTPUT_BUCKET_PV)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [107]:
# Publish the PEX support table, replacing the previous version.
df_pex.write.mode("overwrite").parquet(
    "s3a://{}/{}".format(SPG_OUTPUT_BUCKET, SPG_OUTPUT_BUCKET_PEX)
)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…