# Base Lucro Bruto

Esta base prepara uma tabela .parquet (s3a://smartpricinggerdaubucket/SPG_LB/LB_FULL) 

O resultado final contém as colunas que compõem o Lucro Bruto e colunas auxiliares:

GROSS_PROFIT = NET_SALES + COGS_TOTAL + FREIGHT + SHIP_DEL_LOAD_C_C + PORT_EXPENSES + COGS_ADJUSTMENTS + COMISSION_TO_AGENTS

A tabela está sumarizada no nível: [DATA DA FATURA + DOCUMENTO DE VENDAS + ITEM DO DOCUMENTO DE VENDAS]

Todo código utiliza PySpark

## Imports & Configs

In [None]:
#imports Libs

import os
import pandas as pd
import boto3
import time
from botocore.client import ClientError

import pyarrow.parquet as pq
import s3fs

import findspark                                              #Import library to Search for Spark Installation  

findspark.init()                                              #Search Spark Installation

import pyspark                                                #Only run after findspark.init()

from pyspark.sql import SparkSession                          #Import of Spark Session
from pyspark import SparkContext as spark                     #Import the Regular Spark Contex 
from pyspark.sql import SQLContext                            #Import the SQL Spark Contex 
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import year, month, dayofmonth
# Create (or reuse) the Spark session. This rebinds the name `spark`, which the
# import section had bound to the SparkContext class.
spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext                                       # SparkContext that backs the session

--------------

# Creating Variables

## Database

In [None]:
# Athena database names, read from the environment (KeyError when unset).
# ATHENA_SPG is the database the client query is executed against.
ATHENA_SPG = os.environ['SPG_DATABASE']

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
ATHENA_HANA = os.environ['GERDAU_HANA_DATABASE']

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
ATHENA_SALES = os.environ['GERDAU_SALES_DATABASE']

## Buckets

In [None]:
# S3 bucket names, read from the environment (KeyError when unset).
# NOTE(review): not referenced anywhere else in the visible notebook.
SPG_MANUAL_INPUT_BUCKET = os.environ['MANUAL_INPUT_BUCKET']

In [None]:
# Bucket holding the BW CO-PA parquet that is loaded into df_bw.
SPG_INTEGRATION_INPUT_BUCKET = os.environ['INTEGRATION_INPUT_BUCKET']

In [None]:
# Bucket where Athena writes query results and from which they are read back.
SPG_QUERY_BUCKET = os.environ['QUERY_BUCKET']

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
GERDAU_BUCKET = os.environ['GERDAU_BUCKET']

In [None]:
# Bucket that receives the final gross-profit parquet.
SPG_OUTPUT_BUCKET = os.environ['OUTPUT_BUCKET']

In [None]:
# NOTE(review): not referenced anywhere else in the visible notebook.
SPG_INPUT_BUCKET = os.environ['INPUT_BUCKET_FROM_OUTPUT']

## Input Paths

In [None]:
# Athena query returning, for every (client, sales organisation) pair, the row
# with the latest registration_date together with its state.
# NOTE(review): the database name is hard-coded inside the SQL text.
QUERY_CLIENT = (
    "SELECT DISTINCT cast(O.cod_client as bigint) as cod_client,"
    "O.organization_sales_cod,O.registration_date,O.state "
    "FROM db_smart_pricing.tb_spg_cl_cliente AS O "
    "INNER JOIN ("
    "SELECT DISTINCT cod_client,organization_sales_cod,"
    "max(registration_date) AS max_date "
    "FROM db_smart_pricing.tb_spg_cl_cliente "
    "GROUP BY  cod_client, organization_sales_cod"
    ") AS S "
    "ON S.cod_client=O.cod_client "
    "AND S.organization_sales_cod=O.organization_sales_cod "
    "AND S.max_date=O.registration_date;"
)

In [None]:
# Object key, inside SPG_INTEGRATION_INPUT_BUCKET, of the BW CO-PA parquet loaded into df_bw.
SPG_INTEGRATION_INPUT_BUCKET_BW = "SPG_FACTS/SPG_BW/SPG_COPA.parquet"

## Output Paths

In [None]:
# Key prefix, inside SPG_OUTPUT_BUCKET, under which the gross-profit parquet is written.
SPG_OUTPUT_BUCKET_GP = "SPG_LB/LB_FULL"

## Boto3 Variables

In [None]:
# S3 locations used for Athena query results.
# NOTE(review): SPG_QUERY_BUCKET_ATHENA is not defined anywhere in the visible
# notebook, so these cells raise NameError unless it is set elsewhere.
# NOTE(review): S3_ATHENA_INPUT is not referenced again in the visible notebook.
S3_ATHENA_INPUT =  's3://'+SPG_QUERY_BUCKET+'/'+SPG_QUERY_BUCKET_ATHENA

In [None]:
# Output location handed to Athena when the client query is started.
S3_ATHENA_OUTPUT = 's3://'+SPG_QUERY_BUCKET+'/'+SPG_QUERY_BUCKET_ATHENA

In [None]:
# AWS region and credentials, read from the environment (KeyError when unset).
# They are passed to both boto3 handles (the Athena client and the S3 resource).
region_name = os.environ['AWS_REGION']

In [None]:
aws_access_key_id = os.environ['AWS_ACCESS_KEY']

In [None]:
aws_secret_access_key = os.environ['AWS_SECRET_KEY']

------------

# Creating Defined Functions

In [None]:
# Run Query

def run_query(query, database, s3_output):
    """Start an Athena query execution and return the raw API response.

    Uses the module-level ``client`` (the boto3 Athena client).

    Args:
        query: SQL text to submit.
        database: Database used as the query execution context.
        s3_output: S3 location where the query result is written.

    Returns:
        The response returned by ``client.start_query_execution``.
    """
    execution_context = {'Database': database}
    result_configuration = {'OutputLocation': s3_output}
    return client.start_query_execution(
        QueryString=query,
        QueryExecutionContext=execution_context,
        ResultConfiguration=result_configuration
    )

In [None]:
def get_aws_path(query,database,s3_output):
    """Submit ``query`` through ``run_query`` and return its execution id.

    The notebook uses the returned id as the base name of the result file
    ("<id>.csv") that it later looks for in the query bucket.

    Args:
        query: SQL text to submit.
        database: Database used as the query execution context.
        s3_output: S3 location where the query result is written.

    Returns:
        The ``QueryExecutionId`` string from the start-query response.
    """
    response = run_query(query, database, s3_output)
    return response['QueryExecutionId']

In [None]:
# Wait (300 seconds by default) until the result object is available in S3

def wait_athena_load(Bucket, Key, timeout=300, poll_interval=1):
    """Poll S3 until ``Key`` exists in ``Bucket`` or the timeout expires.

    Uses the module-level ``s3`` resource. Every ``ClientError`` raised by
    ``head_object`` is treated as "not there yet" and retried, as before.

    Args:
        Bucket: Name of the bucket to look in.
        Key: Object key to wait for.
        timeout: Maximum number of seconds to wait (wall clock).
        poll_interval: Seconds to sleep between two attempts.

    Returns:
        True when the object was found, False when the timeout expired.
        Earlier versions returned None in both cases, so callers that ignore
        the result keep working; new callers can detect a timeout.
    """
    # Wall-clock deadline, so slow requests count towards the timeout too.
    deadline = time.time() + timeout

    while True:
        try:
            s3.meta.client.head_object(Bucket=Bucket, Key=Key)
        except ClientError:
            if time.time() >= deadline:
                return False
            time.sleep(poll_interval)
        else:
            return True

-----------------

# Configuring Boto3

In [None]:
# Athena client, built with the credentials and region read from the environment

athena_client_options = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': region_name,
}
client = boto3.client('athena', **athena_client_options)

In [None]:
# S3 resource, built with the credentials and region read from the environment

s3_resource_options = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': region_name,
}
s3 = boto3.resource('s3', **s3_resource_options)

------------

# Importing Tables

In [None]:
# Full BW CO-PA base, read from the integration input bucket

bw_copa_path = "s3a://" + "/".join([SPG_INTEGRATION_INPUT_BUCKET, SPG_INTEGRATION_INPUT_BUCKET_BW])
df_bw = spark.read.parquet(bw_copa_path)

In [None]:
# Redistribute the rows by GTC100362 and mark the result for in-memory caching.
# The former `df_bw.write.partitionBy("GTC100362")` statement was removed: it
# only built a DataFrameWriter that was never saved, so it had no effect.
df_bw = df_bw.repartition("GTC100362")
df_bw.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Run the client query on Athena and wait for its CSV result to appear in S3

athena_response = get_aws_path(QUERY_CLIENT, ATHENA_SPG, S3_ATHENA_OUTPUT)

athena_result_key = SPG_QUERY_BUCKET_ATHENA + "/" + athena_response + ".csv"
wait_athena_load(SPG_QUERY_BUCKET, athena_result_key)

In [None]:
# Load the Athena CSV result (first row is the header) as the client lookup table

client_csv_path = "s3a://" + "/".join([SPG_QUERY_BUCKET, SPG_QUERY_BUCKET_ATHENA, athena_response + ".csv"])
df_client = spark.read.csv(client_csv_path, header = 'true')

-------------

# Preparing Table

# TESTE
## -----------------------------------------Rateio Custo de Expedição--------------------------------------------------

## 1. Buscamos na tabela "C-O-P-A" ("db_bw"."tb_global_co_pa_dem_parquet") todo o Volume de Entrega dos Centros de Lucro

In [None]:
# CO-PA rows used for the expedition-cost apportionment.
#
# Row filters on raw BW columns are applied BEFORE the projection, so they no
# longer depend on Spark resolving columns that the select has already dropped.

# Fragments of GTC100450 that disqualify a row.
copa_excluded_fragments = ["BR", "GP", "US", "SS", "MX", "EX", "CA"]
# Fragments of GTC100511 (sales organisation) that are accepted.
copa_accepted_fragments = ["BRCG", "BRGO", "BRCC", "BRCO", "BRIN", "BRIO",
                           "BREX", "BREZ", "BREO", "BRDI", "BRDO"]

# A row must contain NONE of the excluded fragments. The previous version
# chained the negated LIKEs with OR, which holds for practically every value
# and therefore filtered nothing.
copa_row_filter = (col('GTC100202') == '10')
for fragment in copa_excluded_fragments:
    copa_row_filter = copa_row_filter & ~col('GTC100450').like("%" + fragment + "%")

# ... and must contain at least one of the accepted fragments.
copa_accepted = None
for fragment in copa_accepted_fragments:
    fragment_match = col('GTC100511').like("%" + fragment + "%")
    copa_accepted = fragment_match if copa_accepted is None else (copa_accepted | fragment_match)

# The un-aliased trim(GTC100202) helper column is no longer selected: it was
# only needed for the filter above and was meant to be dropped afterwards.
df_copa = df_bw.filter(copa_row_filter & copa_accepted)\
    .select(col('0FISCPER').substr(1, 7).cast("int").alias("year_month"),
            trim(col('GTC100450')).substr(1, 8).cast("int").alias("profit_corporate"),
            trim(col('GTC100450')).substr(5, 10).cast("int").alias("profit_center"),
            trim(col('GTC100020')).alias("material_cod"),
            trim(col('GTK101129')).cast("double").alias("volume"))\
    .filter(col('year_month') >= 2019001)\
    .withColumn("profit_center", trim(col('profit_center')))\
    .filter(col('profit_center') != '')\
    .filter(~ col('profit_center').isNull())\
    .withColumn("profit_corporate", trim(col('profit_corporate')))\
    .filter(col('profit_corporate') != '')\
    .filter(~ col('profit_corporate').isNull())

In [None]:
# Replace missing volumes with zero and remove the helper column.
# The helper was selected without an alias, so Spark named it "trim(GTC100202)";
# both spellings are dropped (names that do not exist are ignored).
df_copa = df_copa.withColumn("volume", F.coalesce(col("volume"), F.lit(0.000000)))\
    .drop("GTC100202", "trim(GTC100202)")

In [None]:
# Redistribute the rows by year_month.
# The former `df_copa.write.partitionBy("year_month")` statement was removed:
# it only built a DataFrameWriter that was never saved, so it had no effect.
df_copa = df_copa.repartition("year_month")

## 1.2 Teste Unitário 1:
### Calcula volume do CL 240102 / year_month = 2019007, que deve possuir volume 964629.574

In [None]:
#testando profit_center = 240102 / year_month = 2019007
#df_teste = df_copa.filter(col('year_month') == 2019007).filter(col('profit_center') == 240102)\
#    .agg(sum(col("volume")).alias("volume_profit_center"))

In [None]:
#df_teste.printSchema()

In [None]:
#df_teste.show(100,truncate=False)

In [None]:
#Verifica se o valor do colume do profit center está correto
#def tdd1(volume):
#    if (volume > 964629.573) & (volume < 964629.575):
#        return "TDD1 - sem erros no calculo de volume do profit_center 240102 / year_month = 2019007"
#    else:
#        Log_erro_tdd1 = df_teste
#        Log_erro_tdd1.show()
#        return "ERRO TDD1 - erro no cálculo de volume do profit_center 240102 / year_month = 2019007" + " \n"

In [None]:
#x = df_teste.limit(1).select("volume_profit_center").collect()[0]

In [None]:
#print (x.volume_profit_center)
#tdd1(x.volume_profit_center)

## 1.3 Cálculo do Volume Mensal de Entrega POR Corporacao de Lucro (df_volume_profit_center)

# Agrupador 1

* Group:
    1. "Ano/Mês"(0FISCPER)
    2. "Corporação de Lucro" (GTC100450 - primeiros 8 caracteres)
   
* Sum
    1. "Volume"(GTK101129)

In [None]:
# Monthly volume per profit corporate
corporate_volume = F.sum("volume").alias("volume_profit_corporate")
df_volume_profit_corporate = df_copa.groupBy("year_month", "profit_corporate").agg(corporate_volume)

## 1.4 Cálculo de Volume Mensal de Entrega POR Sku POR Corporação de Lucro (df_volume_material_cod)

# Agrupador 2

* Group:
    1. "Ano/Mês"(0FISCPER)
    2. "Corporação de Lucro" (GTC100450 - primeiros 8 caracteres)
    3. "SKU" (GTC100020)
   
* Sum
    1. "Volume"(GTK101129)

In [None]:
# Monthly volume per profit corporate and material (SKU)
material_volume = F.sum("volume").alias("volume_material_cod")
volume_material_cod = df_copa.groupBy("year_month", "profit_corporate", "material_cod").agg(material_volume)

## 1.5 Join das informações de "Volume X Corporação de Lucro" (df_volume_profit_center) com "Volume x SKU" (df_volume_material_cod)

In [None]:
# Attach the corporate total to every (month, corporate, SKU) row.
# Both inputs are aggregations of df_copa, so their key columns share the same
# lineage. Joining by column NAME avoids comparing a column with itself and
# keeps a single copy of each key, which makes the manual drops unnecessary.
df_volume_profit_corporate_material_cod = df_volume_profit_corporate.join(
    volume_material_cod,
    on=["year_month", "profit_corporate"],
    how="inner"
).dropDuplicates()

In [None]:
# Redistribute the rows by year_month.
# The former `.write.partitionBy("year_month")` statement was removed: it only
# built a DataFrameWriter that was never saved, so it had no effect.
df_volume_profit_corporate_material_cod = df_volume_profit_corporate_material_cod.repartition("year_month")

## 1.6 Cálculo da Distribuição Mensal do Volume (KG) dos Skus POR Corporação de Lucro (apportionment_volume_material_cod)

In [None]:
# Share of each SKU in the monthly volume of its profit corporate.
# The share is 0 whenever the corporate total is not greater than zero.
sku_volume_share = col("volume_material_cod") / col("volume_profit_corporate")
has_corporate_volume = col("volume_profit_corporate") > 0
df_volume_profit_corporate_material_cod = df_volume_profit_corporate_material_cod.withColumn(
    "apportionment_volume_material_cod",
    when(has_corporate_volume, sku_volume_share).otherwise(0.000000))

## 2. Buscamos na Tabela "DRE" ("db_bw"."tb_global_dre_dpl00165_dem_parquet") o Custo Mensal de Expedição do Mercado Interno das Corporações de Lucro

### 2.1 Input

In [None]:
# 2.1 Input: parquet of the "DRE" table (tb_global_dre_dpl00165_dem_parquet)
# NOTE(review): bucket and scheme are hard-coded here, unlike the other reads.
dre_path = "s3://gerdau-analytics/global/sales/dre_dpl00165"
df_dre = spark.read.parquet(dre_path)

In [None]:
# Redistribute the rows by 0FISCPER.
# The former `df_dre.write.partitionBy("0FISCPER")` statement was removed: it
# only built a DataFrameWriter that was never saved, so it had no effect.
df_dre = df_dre.repartition("0FISCPER")

In [None]:
# DRE rows carrying the expedition cost.
#
# The filters on raw columns (GTC100202, GTC100316) run BEFORE the projection,
# so they no longer rely on Spark resolving columns the select has dropped.
# Two syntax errors were fixed: the `<>` operator (now `!=`) and a stray
# ` -- ` after a line-continuation backslash.
# The un-aliased trim(GTC100202) helper column is no longer selected.
df_dre = df_dre.filter((col('GTC100202') == '10') & (col('GTC100316') == '6250059900'))\
    .select(col('0FISCPER').substr(1, 7).cast("int").alias("year_month"),
            trim(col('GTC100450')).substr(1, 8).cast("int").alias("profit_corporate"),
            trim(col('GTC100450')).substr(5, 10).cast("int").alias("profit_center"),
            trim(col('GTC100450')).substr(1, 4).cast("int").alias("factory_center"),
            trim(col('GTC100450')).substr(5, 6).cast("int").alias("profit_center_class"),
            trim(col('GTK100676')).cast("double").alias("expedition_value"))\
    .withColumn("profit_center", trim(col('profit_center')))\
    .filter(col('profit_center') != '')\
    .filter(~ col('profit_center').isNull())\
    .withColumn("profit_corporate", trim(col('profit_corporate')))\
    .filter(col('profit_corporate') != '')\
    .filter(~ col('profit_corporate').isNull())\
    .filter(col('year_month') >= 2019001)

In [None]:
# Replace missing expedition values with zero and remove the helper column.
# The helper was selected without an alias, so Spark named it "trim(GTC100202)";
# both spellings are dropped (names that do not exist are ignored).
df_dre = df_dre.withColumn("expedition_value", F.coalesce(col("expedition_value"), F.lit(0.000000)))\
    .drop("GTC100202", "trim(GTC100202)")

In [None]:
# Redistribute the rows by year_month.
# The former `df_dre.write.partitionBy("year_month")` statement was removed: it
# only built a DataFrameWriter that was never saved, so it had no effect.
df_dre = df_dre.repartition("year_month")

### 2.2 Cálculo do Custo de Expedição (Relatório DRE) Mensal / POR Corporação de Lucro

# Agrupador 3

* Group:
    1. "Ano/Mês"(0FISCPER)
    2. "Corporação de Lucro" (GTC100450 - primeiros 8 caracteres)
   
* Sum
    1. "expedition_value_profit_corporate"(expedition_value)

In [None]:
# Monthly expedition cost per profit corporate
corporate_expedition = F.sum("expedition_value").alias("expedition_value_profit_corporate")
df_dre = df_dre.groupBy("year_month", "profit_corporate").agg(corporate_expedition)

## 3 JOIN Distribuicao Volume Profit Center (df_volume_profit_center_material_cod) x Custos de expedicao Profit Center (df_dre)

In [None]:
# Combine the monthly expedition cost of each profit corporate with the volume
# distribution of its SKUs. The join keys are kept once, taken from df_dre.
expedition_join_keys = ["year_month", "profit_corporate"]
df_exp = df_dre.join(df_volume_profit_corporate_material_cod,
                     on=expedition_join_keys,
                     how="inner")
df_exp = df_exp.dropDuplicates()

In [None]:
# Redistribute the rows by year_month and mark the result for in-memory caching.
# The former `df_exp.write.partitionBy("year_month")` statement was removed: it
# only built a DataFrameWriter that was never saved, so it had no effect.
df_exp = df_exp.repartition("year_month")
df_exp.persist(pyspark.StorageLevel.MEMORY_ONLY)

### 3.1 Cálculo do Custo de Expedição do SKU (expedition_value_material_cod) / POR CL / POR MES

In [None]:
# Expedition cost allocated to each SKU: corporate cost times the SKU's volume share
sku_expedition_value = col("expedition_value_profit_corporate") * col("apportionment_volume_material_cod")
df_exp = df_exp.withColumn("expedition_value_material_cod", sku_expedition_value)

### 3.2 Cálculo do Custo de Expedição SKU / POR CL / POR MES / POR KG

In [None]:
# Expedition cost per unit of volume: SKU cost divided by SKU volume
sku_expedition_per_kg = col("expedition_value_material_cod") / col("volume_material_cod")
df_exp = df_exp.withColumn("expedition_value_material_cod_per_kg", sku_expedition_per_kg)

### -----------------------------------------------------FIM RATEIO CUSTO EXPEDICAO----------------------------------------------------------

# Lucro Bruto

## Prepara Tabela

In [None]:
# Cast the measure columns to float (the old comment said int64, but the cast
# has always been 'float').
# NOTE(review): 'float' is single precision - confirm it is enough for monetary sums.

float_measure_columns = (
    'GTK101129', 'GTK101085', 'GTK101087', 'GTK101096', 'GTK101231', 'GTK101126',
    'GTK101238', 'GTK101124', 'GTK101099', 'GTK101236', 'GTK103147', 'GTK103148',
    'GTK103149', 'GTK101118', 'GTK101115', 'GTK101112', 'GTK101119', 'GTK101120',
    'GTK101121', 'GTK101125', 'GTK101086', 'GTK101111', 'GTK101113', 'GTK102652',
    'GTK101093', 'GTK101123', 'GTK101094', 'GTK101095', 'GTK101127', 'GTK101233',
    'GTK102651', 'GTK101088', 'GTK103150', 'GTK101091', 'GTK101226', 'GTK101103',
    'GTK101098', 'GTK101221', 'GTK101222', 'GTK101229', 'GTK101097', 'GTK101100',
    'GTK101092', 'GTK101116', 'GTK101117', 'GTK101114', 'GTK101223', 'GTK101090',
    'GTK101101', 'GTK101122', 'GTK101224', 'GTK101102', 'GTK101225', 'GTK101105',
    'GTK101227', 'GTK101104', 'GTK101106', 'GTK101228', 'GTK101107', 'GTK101108',
    'GTK101109', 'GTK101110',
)

for col_name in float_measure_columns:
    as_float = col(col_name).cast('float')
    df_bw = df_bw.withColumn(col_name, as_float)

## Calculo dos Campos

In [None]:
# CUSTOOPERACIONAL: plain sum of the columns listed below
custo_operacional_columns = [
    'gtk101088', 'gtk101092', 'gtk101093', 'gtk101094', 'gtk101095', 'gtk101096',
    'gtk101229', 'gtk101226', 'gtk101222', 'gtk101228', 'gtk101221', 'gtk101225',
    'gtk101224', 'gtk101223', 'gtk101227', 'gtk101231', 'gtk103150', 'gtk101091',
]
custo_operacional = col(custo_operacional_columns[0])
for component in custo_operacional_columns[1:]:
    custo_operacional = custo_operacional + col(component)
df_bw = df_bw.withColumn("CUSTOOPERACIONAL", custo_operacional)

In [None]:
# Y009_COGS_ADJUSTMENTS: every listed column enters with a negative sign
cogs_adjustments = -col('gtk101107')
for component in ['gtk101110', 'gtk101115', 'gtk101108', 'gtk101109']:
    cogs_adjustments = cogs_adjustments - col(component)
df_bw = df_bw.withColumn("Y009_COGS_ADJUSTMENTS", cogs_adjustments)

In [None]:
# Y007_PORT_EXPENSES: copy of gtk101087
port_expenses = F.col('gtk101087')
df_bw = df_bw.withColumn("Y007_PORT_EXPENSES", port_expenses)

In [None]:
# Y008_SHIP_DEL_LOAD_C_C: both columns enter with a negative sign
ship_del_load = -col('gtk101112')
ship_del_load = ship_del_load - col('gtk101114')
df_bw = df_bw.withColumn("Y008_SHIP_DEL_LOAD_C_C", ship_del_load)

In [None]:
# Y006_FREIGHT: signed combination of the freight columns (+1 adds, -1 subtracts)
freight = -col('gtk101090')
freight_terms = [
    (+1, 'gtk101086'),
    (-1, 'gtk101127'),
    (+1, 'gtk101233'),
    (+1, 'gtk102651'),
    (-1, 'gtk102652'),
]
for sign, component in freight_terms:
    if sign > 0:
        freight = freight + col(component)
    else:
        freight = freight - col(component)
df_bw = df_bw.withColumn("Y006_FREIGHT", freight)

In [None]:
# GTPM056: gtk103150 with the sign inverted
gtpm056 = -F.col('gtk103150')
df_bw = df_bw.withColumn("GTPM056", gtpm056)

In [None]:
# YXXX_COGS_TOTAL: sum of the columns listed below, with the sign inverted
cogs_total_columns = [
    'gtk101088', 'gtk101091', 'gtk101092', 'gtk101093',
    'gtk101094', 'gtk101095', 'gtk101096', 'gtk101103',
    'gtk101226', 'gtk101097', 'gtk101229', 'gtk101098',
    'gtk101221', 'gtk101099', 'gtk101222', 'gtk101100',
    'gtk101223', 'gtk101101', 'gtk101224', 'gtk101102',
    'gtk101225', 'gtk101104', 'gtk101105', 'gtk101227',
    'gtk101106', 'gtk101228', 'gtk101231',
]
cogs_total = col(cogs_total_columns[0])
for component in cogs_total_columns[1:]:
    cogs_total = cogs_total + col(component)
df_bw = df_bw.withColumn("YXXX_COGS_TOTAL", cogs_total * (-1))

In [None]:
# Y002_GROSS_SALES: plain sum of the columns listed below
gross_sales_columns = ['gtk101085', 'gtk101126', 'gtk101238', 'gtk101236',
                       'gtk103147', 'gtk103148', 'gtk103149']
gross_sales = col(gross_sales_columns[0])
for component in gross_sales_columns[1:]:
    gross_sales = gross_sales + col(component)
df_bw = df_bw.withColumn("Y002_GROSS_SALES", gross_sales)

In [None]:
# Y003_SALES_TAX_DISCOUNT: signed combination of the columns (+1 adds, -1 subtracts)
sales_tax_discount = -col('gtk101113')
sales_tax_discount_terms = [
    (-1, 'gtk101116'),
    (+1, 'gtk101117'),
    (+1, 'gtk101118'),
    (+1, 'gtk101119'),
    (+1, 'gtk101120'),
    (+1, 'gtk101121'),
    (+1, 'gtk101122'),
    (+1, 'gtk101123'),
    (-1, 'gtk101124'),
]
for sign, component in sales_tax_discount_terms:
    if sign > 0:
        sales_tax_discount = sales_tax_discount + col(component)
    else:
        sales_tax_discount = sales_tax_discount - col(component)
df_bw = df_bw.withColumn("Y003_SALES_TAX_DISCOUNT", sales_tax_discount)

In [None]:
# Y004_NET_SALES: tax/discount total plus gross sales
net_sales = F.col('Y003_SALES_TAX_DISCOUNT') + F.col('Y002_GROSS_SALES')
df_bw = df_bw.withColumn("Y004_NET_SALES", net_sales)

In [None]:
#df_bw.show(truncate=False)

# TESTE
## -----------------Atribuindo Rateio Custo de Expedição ao Cálculo do Lucro Bruto--------------------

In [None]:
# Left-join the per-SKU expedition cost onto the BW rows by period, profit
# corporate and material; how="left" keeps BW rows that have no match.
# After the drops below, the only df_exp columns left are volume_material_cod
# and expedition_value_material_cod_per_kg.
# NOTE(review): the final dropDuplicates() de-duplicates whole rows of the
# joined table, so fully identical BW rows collapse into one - confirm this is
# intended.
df_bw=df_bw.join(df_exp,
                 (col('0FISCPER').substr(1, 7).cast("int") == df_exp.year_month) &
                 (trim(col('GTC100450')).substr(1, 8).cast("int") == df_exp.profit_corporate) &
                 (trim(col('GTC100020')) == df_exp.material_cod),
                 how="left")\
.drop(df_exp.year_month)\
.drop(df_exp.profit_corporate)\
.drop(df_exp.expedition_value_profit_corporate)\
.drop(df_exp.volume_profit_corporate)\
.drop(df_exp.material_cod)\
.drop(df_exp.expedition_value_material_cod)\
.drop(df_exp.apportionment_volume_material_cod)\
.dropDuplicates()

In [None]:
# COMISSION_TO_AGENTS: row volume times the per-kg expedition cost, negated.
# Rows that found no expedition cost in the join get 0.
expedition_cost_per_kg = col("expedition_value_material_cod_per_kg")
allocated_expedition_cost = col("GTK101129") * expedition_cost_per_kg * -1
df_bw = df_bw.withColumn(
    "COMISSION_TO_AGENTS",
    when(expedition_cost_per_kg.isNull(), 0.00000).otherwise(allocated_expedition_cost))

## ---------------------------------FIM atribuição Rateio Custo de Expedição ------------------------------------

## Calculo do Lucro Bruto

In [None]:
# GROSS_PROFIT_BW: sum of the components computed in the previous cells
gross_profit_components = [
    'Y004_NET_SALES',
    'YXXX_COGS_TOTAL',
    'Y006_FREIGHT',
    'Y008_SHIP_DEL_LOAD_C_C',
    'Y007_PORT_EXPENSES',
    'Y009_COGS_ADJUSTMENTS',
    'COMISSION_TO_AGENTS',
]
gross_profit = col(gross_profit_components[0])
for component in gross_profit_components[1:]:
    gross_profit = gross_profit + col(component)
df_bw = df_bw.withColumn("GROSS_PROFIT_BW", gross_profit)

## Data Prep

In [None]:
# Keep only the columns needed for the gross-profit table, then give the BW
# technical names readable ones.
lb_source_columns = [
    "GTC100511", "GTC100020", "GTC101018", "GTC100025", "GTC100362", "GTK101129", "GTC100209",
    "GTC100255", "GTC100504", "CUSTOOPERACIONAL", "Y009_COGS_ADJUSTMENTS",
    "Y007_PORT_EXPENSES", "Y008_SHIP_DEL_LOAD_C_C", "Y006_FREIGHT", "GTPM056",
    "YXXX_COGS_TOTAL", "Y002_GROSS_SALES", "Y003_SALES_TAX_DISCOUNT", "Y004_NET_SALES",
    "COMISSION_TO_AGENTS", "GROSS_PROFIT_BW",
]
lb_renames = [
    ("GTC100025", 'Data'),
    ("GTC100362", 'GPD_cod'),
    ("GTK101129", 'Quantity_ton'),
    ("GTC100255", 'Sales_Number'),
    ("GTC100209", 'cod_client'),
    ("GTC100504", 'Sales_Number_Item'),
]

df_lb = df_bw.select(*lb_source_columns)
for technical_name, readable_name in lb_renames:
    df_lb = df_lb.withColumnRenamed(technical_name, readable_name)

In [None]:
# Divide the quantity by 1000 (the column is named Quantity_ton) and make the
# client code numeric for the client lookup join.
# "bigint" matches the cast used in QUERY_CLIENT; the former "int" cast returned
# NULL for codes above 2147483647. The dangling trailing backslash was removed.
df_lb = df_lb.withColumn("Quantity_ton", col('Quantity_ton')/1000)\
             .withColumn("cod_client", col('cod_client').cast("bigint"))

In [None]:
# Bring the client's state in, matching on sales organisation and client code.
# NOTE(review): if the lookup holds more than one row per key, the left join
# multiplies the matching fact rows - confirm the lookup is unique per key.
client_lookup = df_client.select(
    df_client.cod_client,
    df_client.organization_sales_cod.alias("GTC100511"),
    df_client.state)
df_lb = df_lb.join(client_lookup, on=["GTC100511", "cod_client"], how="left")

In [None]:
# Fall back to the client's state when GTC101018 is missing or blank.
# The former test, like(' '), only matched a value of exactly one space, so
# empty strings and longer runs of spaces were not treated as blank.
gtc101018_is_blank = col("GTC101018").isNull() | (trim(col("GTC101018")) == "")
df_lb = df_lb.withColumn("GTC101018",
                         when(gtc101018_is_blank, col("state")).otherwise(col("GTC101018")))

In [None]:
# The lookup columns are no longer needed once GTC101018 has been filled in
df_lb = df_lb.drop("cod_client").drop("state")

In [None]:
# (removed) `df_bw.write.partitionBy("GTC101018")` only built a DataFrameWriter
# that was never saved, so the statement had no effect.

In [None]:
# NOTE(review): df_bw is not referenced again after these two cells in the
# visible notebook (the following cells work on df_lb), so this repartition and
# the persist below do not appear to affect the exported result - confirm
# whether df_lb was the intended target.
df_bw = df_bw.repartition("GTC101018")

In [None]:
# Marks the repartitioned df_bw for in-memory caching.
df_bw.persist(pyspark.StorageLevel.MEMORY_ONLY)

In [None]:
# Prepare the date parts and the keys used by the final group-by

# Year and month of the document date
df_lb = df_lb \
            .withColumn('year', year(col('Data'))) \
            .withColumn('month', month(col('Data')))

# Normalise the document numbers by round-tripping them through a number,
# which removes leading zeros. "bigint" is used because the former "int" cast
# returned NULL for numbers above 2147483647, which in turn nulled both keys
# below (concat returns NULL when any input is NULL).
for number_column in ("Sales_Number", "Sales_Number_Item"):
    df_lb = df_lb.withColumn(number_column, col(number_column).cast("bigint").cast("string"))

# Keys: document + item, and year-month + document + item
df_lb = df_lb.withColumn("KEY_LB", (concat(col("Sales_Number"), lit("_"), col("Sales_Number_Item"))))
df_lb = df_lb.withColumn("KEY_LB_DATA", (concat(col("year"), lit("-"), col("month") ,lit("_"), col("Sales_Number"), lit("_"), col("Sales_Number_Item"))))

In [None]:
# Aggregate to one row per KEY_LB_DATA (year-month + sales document + item).
#
# Every aggregate is aliased to its final name directly. The previous version
# aggregated with a dict and then renamed Spark's auto-generated names
# ("sum(x)", "first(x)"); withColumnRenamed silently does nothing when the
# generated name differs, which would leave column names that cannot be
# written to parquet.

# (source column, output column) pairs that are summed
lb_summed_columns = [
    ('Quantity_ton', 'QUANTITY_TON'),
    ('COMISSION_TO_AGENTS', 'COMISSION_TO_AGENTS'),
    ('CUSTOOPERACIONAL', 'CUSTO_OPERACIONAL'),
    ('Y009_COGS_ADJUSTMENTS', 'COGS_ADJUSTMENTS'),
    ('Y007_PORT_EXPENSES', 'PORT_EXPENSES'),
    ('Y008_SHIP_DEL_LOAD_C_C', 'SHIP_DEL_LOAD_C_C'),
    ('Y006_FREIGHT', 'FREIGHT'),
    ('GTPM056', 'GTPM056'),
    ('YXXX_COGS_TOTAL', 'COGS_TOTAL'),
    ('Y002_GROSS_SALES', 'GROSS_SALES'),
    ('Y003_SALES_TAX_DISCOUNT', 'SALES_TAX_DISCOUNT'),
    ('Y004_NET_SALES', 'NET_SALES'),
    ('GROSS_PROFIT_BW', 'GROSS_PROFIT_BW'),
]

# (source column, output column) pairs for which the first value of the group is kept
# NOTE(review): 'Data' can differ between rows of the same year-month key, in
# which case the value kept by first() is arbitrary - confirm this is acceptable.
lb_first_columns = [
    ('Data', 'DATA'),
    ('GPD_cod', 'GPD_COD'),
    ('Sales_Number', 'SALES_NUMBER'),
    ('Sales_Number_Item', 'SALES_NUMBER_ITEM'),
    ('KEY_LB', 'KEY_LB'),
    ('GTC101018', 'GTC101018'),
    ('GTC100511', 'GTC100511'),
    ('GTC100020', 'GTC100020'),
]

lb_aggregations = [F.sum(source).alias(target) for source, target in lb_summed_columns]
lb_aggregations += [F.first(source).alias(target) for source, target in lb_first_columns]

df_lucro_bruto = df_lb.groupby('KEY_LB_DATA').agg(*lb_aggregations)

## Teste Unitário 3:
### Calcula Custo de Expedição (COMISSION_TO_AGENTS) 

In [None]:
#df_teste_fim = df_lucro_bruto.groupBy(year(df_lucro_bruto.DATA).alias("ano")
#                                    ,month(df_lucro_bruto.DATA).alias("mes")
#                                    ,df_lucro_bruto.GTC100511
#                                    ,df_lucro_bruto.GTC100020)\
#.agg(sum(col("COMISSION_TO_AGENTS")).alias("exped"))

## Export LB-BW 

In [None]:
# Write the gross-profit table as parquet, replacing any previous export
lb_output_path = "s3a://" + "/".join([SPG_OUTPUT_BUCKET, SPG_OUTPUT_BUCKET_GP])
df_lucro_bruto.write.parquet(lb_output_path, mode = "overwrite")