# Importing Libraries

In [27]:
import findspark                                              # Library that searches for the Spark installation

findspark.init()                                              # Locate the Spark installation

import pyspark                                                # Only import after findspark.init() has run

from pyspark.sql import SparkSession                          # Spark session (DataFrame entry point)
from pyspark import SparkContext as spark                     # Regular Spark context class; NOTE(review): the alias `spark` is rebound to the session a few lines below
from pyspark.sql import SQLContext                            # SQL Spark context class
from pyspark.sql.window import Window                         # Window specifications for analytic functions
from pyspark.sql.functions import *                           # Star import: brings col, concat, regexp_replace, ... into the notebook namespace
from pyspark.sql import functions as F                        # Same functions, available under the explicit F. prefix
from pyspark.sql.types import *                               # Star import of the Spark SQL data types
spark = SparkSession.builder.getOrCreate()                    # From here on `spark` is the SparkSession (reuses an existing one if present)
import os                                                     # Used to read bucket names from environment variables

sc = spark.sparkContext                                       # SparkContext that belongs to the session

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

----------------

# Creating Variables

## Buckets

In [None]:
# Name of the bucket the input parquet data is read from; taken from the
# environment, so a missing GERDAU_BUCKET variable raises KeyError here.
GERDAU_BUCKET = os.environ['GERDAU_BUCKET']

In [None]:
# Name of the bucket the processed report is written to. Note that the value
# comes from the INTEGRATION_INPUT_BUCKET environment variable (KeyError if unset).
SPG_OUTPUT_BUCKET = os.environ['INTEGRATION_INPUT_BUCKET']

## Input Paths

In [None]:
# Profitability COPA BW: object key of one specific snappy parquet part file.
# NOTE(review): the part-file name is hard-coded; confirm it stays valid across extractions.
GERDAU_BUCKET_INPUT_BW_PARQUET = "global/co/co_pa_dem/part-00000-1eb8fa1b-2207-438d-ba71-42b166869afb-c000.snappy.parquet"

In [None]:
# Profitability COPA BW: key prefix (folder) of the table, read as a whole parquet dataset.
GERDAU_BUCKET_INPUT_BW = "global/co/co_pa_dem"

## Output Paths

In [None]:
# Object key, inside the Smart Pricing output bucket, under which the processed
# DataFrame is saved (nothing is uploaded by this assignment itself).
SPG_OUTPUT_BUCKET_BW = "SPG_FACTS/SPG_BW/SPG_COPA.parquet"

------------

# Creating Defined Functions

In [None]:
# Helper that swaps unwanted separator characters for "/"
def remove_some_chars(col_name):
    """Return a Column expression where every "." in ``col_name`` becomes "/".

    Args:
        col_name: Name of the string column (or a Column) to clean.

    Returns:
        The ``regexp_replace`` Column expression; nothing is evaluated until
        it is used in a DataFrame transformation.
    """
    # A real one-element tuple: ``(".")`` without the comma is just the string ".".
    removed_chars = (".",)
    # Prefix each character with a backslash so the regex engine matches it
    # literally. Built with "\\" because '\{' is an invalid escape sequence
    # that newer Python versions warn about.
    regexp = "|".join("\\" + char for char in removed_chars)
    return regexp_replace(col_name, regexp, "/")

----------

# Importing the Profitability Table

In [None]:
# Load the single snappy parquet part file from the Gerdau Analytics Data Lake
rawReportSNAPPY = spark.read.parquet(
    "s3://{}/{}".format(GERDAU_BUCKET, GERDAU_BUCKET_INPUT_BW_PARQUET)
)

In [None]:
# Redistribute the single-file frame over 144 partitions and mark it for
# in-memory caching.
# DataFrameWriter.partitionBy is intentionally not called here: it expects
# column names and only shapes a later save, so calling it on a writer that is
# then discarded has no effect on this DataFrame.
rawReportSNAPPY = rawReportSNAPPY.repartition(144)
rawReportSNAPPY.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Load every parquet file found under the COPA BW folder
rawReport = spark.read.parquet(
    "s3://{}/{}".format(GERDAU_BUCKET, GERDAU_BUCKET_INPUT_BW)
)

In [None]:
# Redistribute the folder-level frame over 144 partitions and mark it for
# in-memory caching.
# DataFrameWriter.partitionBy is intentionally not called here: it expects
# column names and only shapes a later save, so calling it on a writer that is
# then discarded has no effect on this DataFrame.
rawReport = rawReport.repartition(144)
rawReport.persist(pyspark.StorageLevel.MEMORY_ONLY)

--------------

# Preparing Table

In [None]:
# Remove the year/month/day columns (presumably partition columns that the
# single-file frame does not carry - TODO confirm), then append the rows of
# the single-file frame, matching columns by name.
rawReport = (
    rawReport
    .drop('year', 'month', 'day')
    .unionByName(rawReportSNAPPY)
)

In [None]:
# Make sure no year/month/day columns remain on the combined frame
# (DataFrame.drop silently ignores names that are not present).
rawReport = rawReport.drop('year', 'month', 'day')

In [None]:
# Remove rows that are identical across every column (no subset is given).
rawReport= rawReport.dropDuplicates()

In [None]:
# Redistribute the combined, de-duplicated frame over 144 partitions and mark
# it for in-memory caching.
# DataFrameWriter.partitionBy is intentionally not called here: it expects
# column names and only shapes a later save, so calling it on a writer that is
# then discarded has no effect on this DataFrame.
rawReport = rawReport.repartition(144)
rawReport.persist(pyspark.StorageLevel.MEMORY_ONLY)

# Processing Section 

In [None]:
# Upper-case every column name in one projection.
# A top-level `for col in ...` loop must be avoided here: it would rebind the
# notebook-level name `col`, which the star import from pyspark.sql.functions
# provides as a function, and later `col(...)` calls would then fail with
# "'str' object is not callable". A comprehension keeps its variable local.
rawReport = rawReport.toDF(*[name.upper() for name in rawReport.columns])

In [None]:
# Keep only rows whose currency matches 'BRL'
# (the LIKE pattern has no wildcards, so this is an exact match).
is_brl = rawReport['0CURRENCY'].like('BRL')
rawReport = rawReport.where(is_brl)

In [None]:
# Overwrite both date columns with versions whose "." separators are turned into "/"
rawReport = (
    rawReport
    .withColumn('DI_EXTRACT_TIME', remove_some_chars('DI_EXTRACT_TIME'))
    .withColumn('GTC100025', remove_some_chars('GTC100025'))
)

In [None]:
# Parse both date columns with the 'yyyy/MM/dd' pattern and re-render them as
# timestamp strings, then order rows by extract time, newest first.
rawReport = (
    rawReport
    .withColumn('DI_EXTRACT_TIME', from_unixtime(unix_timestamp('DI_EXTRACT_TIME', 'yyyy/MM/dd')))
    .withColumn('GTC100025', from_unixtime(unix_timestamp('GTC100025', 'yyyy/MM/dd')))
    .sort("DI_EXTRACT_TIME", ascending=False)
)

In [None]:
# Restrict the report to rows whose GTC100025 date is no older than 24 months
# counted back from the current date.
rawReport = rawReport.filter(
    rawReport['GTC100025'] >= add_months(current_date(), -24)
)

In [None]:
# Build the BW_KEY column by concatenating 8 fields, in this order:
# GTC100257 -> Sales Document Type
# GTC101667 -> Billing Document Number
# GTC101671 -> Billing Document Item
# GTC100137 -> Billing Document Type
# GTC100511 -> Sales Organization
# GTC100255 -> Sales Document Number
# GTC100504 -> Sales Document Item
# 0FISCPER  -> Period
#
# The functions are reached through the `F` alias instead of the bare
# star-imported names, so this cell keeps working even if a notebook-level
# variable called `col` has shadowed pyspark.sql.functions.col.
# NOTE(review): concat() yields NULL when any of the inputs is NULL, and there
# is no separator between the fields - confirm both are acceptable for the key.
rawReport = rawReport.withColumn(
    'BW_KEY',
    F.concat(
        F.col('GTC100257'),
        F.col('GTC101667'),
        F.col('GTC101671'),
        F.col('GTC100137'),
        F.col('GTC100511'),
        F.col('GTC100255'),
        F.col('GTC100504'),
        F.col('0FISCPER'),
    ),
)

In [None]:
# Keep exactly one row per BW_KEY: the one with the most recent DI_EXTRACT_TIME.
# dropDuplicates(['BW_KEY']) keeps an arbitrary row per key and does not honour
# an earlier sort, so the rows are ranked explicitly inside each key instead.
# Rows that tie on DI_EXTRACT_TIME are still picked arbitrarily among themselves.
# With one row per key the result has no duplicate rows, so no extra distinct()
# is needed.
ProcessedReport = (
    rawReport
    .withColumn(
        '_BW_KEY_RANK',
        F.row_number().over(
            Window.partitionBy('BW_KEY').orderBy(F.col('DI_EXTRACT_TIME').desc())
        ),
    )
    .filter(F.col('_BW_KEY_RANK') == 1)
    .drop('_BW_KEY_RANK')  # helper column must not reach the output file
)

# Upload Section

In [None]:
# NOTE(review): this configures a DataFrameWriter partitioned by GTC100362 but
# the writer is neither stored nor saved, so the statement writes nothing.
# Confirm whether the output was meant to be partitioned by this column.
ProcessedReport.write.partitionBy("GTC100362")

In [None]:
# Redistribute the rows so that equal GTC100362 values share a partition.
ProcessedReport = ProcessedReport.repartition("GTC100362")

In [None]:
# Mark the processed frame for in-memory caching (MEMORY_ONLY storage level).
ProcessedReport.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Save the processed DataFrame as parquet in the Smart Pricing bucket,
# replacing whatever is already stored at that location.
ProcessedReport.write.mode("overwrite").parquet(
    "s3a://{}/{}".format(SPG_OUTPUT_BUCKET, SPG_OUTPUT_BUCKET_BW)
)