In [2]:
!pip install azure-datalake-store



In [63]:
from azure.datalake.store import core, lib, multithread


In [64]:
from pyspark.sql import SparkSession


In [65]:
spark = SparkSession.builder \
            .master("local[8]") \
            .appName("airflow_app") \
            .config('spark.executor.memory', '16g') \
            .config('spark.driver.memory', '16g') \
            .config('spark.sql.execution.pandas.respectSessionTimeZone', False) \
            .config("spark.driver.maxResultSize", "2048MB") \
            .config("spark.port.maxRetries", "100") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()


In [66]:
adlCreds = lib.auth(url_suffix='raizenprd01', resource='https://datalake.azure.net/')

To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code F7HL8QLDR to authenticate.


In [67]:
spark.conf.set("fs.adl.oauth2.access.token.provider.type", "RefreshToken")
spark.conf.set("fs.adl.oauth2.client.id", adlCreds.token['client'])
spark.conf.set("fs.adl.oauth2.refresh.token", adlCreds.token['refreshToken'])


In [114]:
import pandas as pd
from pyspark.sql.functions import *

In [115]:
df = spark.read.format('csv').option('sep','|').load('adl://raizenprd01.azuredatalakestore.net/ldt_dev/sandbox/previsao_demanda/01_raw/vendas_dia_complemento/mes')

In [116]:
from pyspark.sql.types import StringType,StructField, StructType  
schema = StructType([
        StructField('Empr', StringType(), True),
        StructField('_c1', StringType(), True),
        StructField('Cent', StringType(), True),
        StructField('Nota Fiscal', StringType(), True),
        StructField('Data Saida', StringType(), True),
        StructField('Data Criacao', StringType(), True),
        StructField('UF', StringType(), True),
        StructField('Faturamento', StringType(), True),
        StructField('Itm', StringType(), True),
        StructField('Vencimento', StringType(), True),
        StructField('TpOV', StringType(), True),
        StructField('Ordem Vendas', StringType(), True),
        StructField('IBM', StringType(), True),
        StructField('Razao Social', StringType(), True),
        StructField('Canal', StringType(), True),
        StructField('_c15', StringType(), True),
        StructField('Material', StringType(), True),
        StructField('Descricao', StringType(), True),
        StructField(' Quantidade', StringType(), True),
        StructField('UM', StringType(), True),
        StructField('Cond.Pgto', StringType(), True),
        StructField('Inco', StringType(), True),
        StructField('C.Exp', StringType(), True),
        StructField('Valor item', StringType(), True),
        StructField('Valor ZPFL', StringType(), True),
        StructField('Valor ZPFN', StringType(), True),
        StructField('Valor ZENC', StringType(), True),
        StructField('Imposto', StringType(), True),
        StructField('Valor ZDB1', StringType(), True),
        StructField('Target', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('UN', StringType(), True),
        StructField('Valor ZDDB', StringType(), True),
        StructField('Valor ZDIA', StringType(), True),
        StructField('Valor ZDFR', StringType(), True),
        StructField('Valor ICM3', StringType(), True),
        StructField('Valor ICS3', StringType(), True),
        StructField('Rep.Vendas|Acesso ZPFL', StringType(), True),
        StructField('Volume', StringType(), True),
        StructField('UN_', StringType(), True),
        StructField('Hora Fat', StringType(), True),
        StructField('Referência', StringType(), True),
    ])

In [131]:
df_dia = spark.read.format('csv').option('sep',';').schema(schema).load('adl://raizenprd01.azuredatalakestore.net/ldt_dev/sandbox/previsao_demanda/01_raw/vendas_dia_complemento/dia').where("Empr is not null and Empr <> 'Empr' ").select('Data Saida','Cent','Material','IBM','Volume','Canal')

In [132]:
df_dia = df_dia.withColumnRenamed('Canal','Segmento')
df_dia.show(5)

+----------+----+--------+-------+---------+--------+
|Data Saida|Cent|Material|    IBM|   Volume|Segmento|
+----------+----+--------+-------+---------+--------+
|09.04.2020|BARC|22149801| 121410|10.000,00|       V|
|09.04.2020|BARC|22149801|1032445| 5.000,00|       V|
|09.04.2020|BARC|22150801|1032445| 5.000,00|       V|
|09.04.2020|BARC|24314801|1032445| 5.000,00|       V|
|09.04.2020|BARC|22149801|1046858|10.000,00|       V|
+----------+----+--------+-------+---------+--------+
only showing top 5 rows



In [133]:
df_dia.withColumn('Volume', regexp_replace("Volume", '\.', ''))\
        .withColumn('Data Saida', regexp_replace("Data saida", '\.', '-'))\
        .withColumn('Volume', regexp_replace("Volume", ',', '.').cast("double")).show()

+----------+----+--------+-------+-------+--------+
|Data Saida|Cent|Material|    IBM| Volume|Segmento|
+----------+----+--------+-------+-------+--------+
|09-04-2020|BARC|22149801| 121410|10000.0|       V|
|09-04-2020|BARC|22149801|1032445| 5000.0|       V|
|09-04-2020|BARC|22150801|1032445| 5000.0|       V|
|09-04-2020|BARC|24314801|1032445| 5000.0|       V|
|09-04-2020|BARC|22149801|1046858|10000.0|       V|
|09-04-2020|BARC|24144801|1053586| 5000.0|       V|
|09-04-2020|BARC|24314801|1053586|40000.0|       V|
|09-04-2020|BARC|22149801|  33715| 5000.0|       V|
|09-04-2020|BARC|22150801|  33715| 5000.0|       V|
|09-04-2020|BARC|24144801|  33715| 5000.0|       V|
|09-04-2020|BARC|22149801| 120570|10000.0|       V|
|09-04-2020|BARC|22150801| 120570| 5000.0|       V|
|09-04-2020|BARC|22149801|1033260| 5000.0|       V|
|09-04-2020|BARC|24144801|1033260|25000.0|       V|
|09-04-2020|BARC|24314801|1033260|25000.0|       V|
|09-04-2020|BARC|22150801|1032436| 5000.0|       V|
|09-04-2020|

In [118]:
df_dia = df_dia.withColumnRenamed('Data Saida','data_saida').show(5)

+----------+----+--------+-------+---------+-----+
|data_saida|Cent|Material|    IBM|   Volume|Canal|
+----------+----+--------+-------+---------+-----+
|09.04.2020|BARC|22149801| 121410|10.000,00|    V|
|09.04.2020|BARC|22149801|1032445| 5.000,00|    V|
|09.04.2020|BARC|22150801|1032445| 5.000,00|    V|
|09.04.2020|BARC|24314801|1032445| 5.000,00|    V|
|09.04.2020|BARC|22149801|1046858|10.000,00|    V|
+----------+----+--------+-------+---------+-----+
only showing top 5 rows



In [141]:
df_dia.withColumn("Data Tratada", F.to_date(F.col("Data Saida"), "dd.MM.yyyy")).drop("Data Saida").show()

+----+--------+-------+---------+--------+------------+
|Cent|Material|    IBM|   Volume|Segmento|Data Tratada|
+----+--------+-------+---------+--------+------------+
|BARC|22149801| 121410|10.000,00|       V|  2020-04-09|
|BARC|22149801|1032445| 5.000,00|       V|  2020-04-09|
|BARC|22150801|1032445| 5.000,00|       V|  2020-04-09|
|BARC|24314801|1032445| 5.000,00|       V|  2020-04-09|
|BARC|22149801|1046858|10.000,00|       V|  2020-04-09|
|BARC|24144801|1053586| 5.000,00|       V|  2020-04-09|
|BARC|24314801|1053586|40.000,00|       V|  2020-04-09|
|BARC|22149801|  33715| 5.000,00|       V|  2020-04-09|
|BARC|22150801|  33715| 5.000,00|       V|  2020-04-09|
|BARC|24144801|  33715| 5.000,00|       V|  2020-04-09|
|BARC|22149801| 120570|10.000,00|       V|  2020-04-09|
|BARC|22150801| 120570| 5.000,00|       V|  2020-04-09|
|BARC|22149801|1033260| 5.000,00|       V|  2020-04-09|
|BARC|24144801|1033260|25.000,00|       V|  2020-04-09|
|BARC|24314801|1033260|25.000,00|       V|  2020

In [64]:
df_vendas_dia = spark.read.parquet('adl://raizenprd01.azuredatalakestore.net/ldt_dev/sandbox/previsao_demanda/03_primary/vendas_dia')

AttributeError: 'NoneType' object has no attribute 'withColumn'

In [24]:
df_vendas_dia.show(5)

+-------------------+----+--------+-------+-------+--------+
|               Date|Base|     SKU|    IBM| Volume|Segmento|
+-------------------+----+--------+-------+-------+--------+
|2018-05-23 00:00:00|APVE|15521801|1004717|  300.0|       A|
|2018-05-23 00:00:00|APVE|15521801|1010270| 1929.0|       A|
|2018-05-23 00:00:00|ARBR|15521801|1010270|  324.0|       A|
|2018-05-23 00:00:00|ARBR|15521801|1014172|19949.0|       A|
|2018-05-23 00:00:00|AREC|15509801|1011068| 1000.0|       A|
+-------------------+----+--------+-------+-------+--------+
only showing top 5 rows



In [25]:
import plotly.express as px
from pyspark.sql import functions as F

In [26]:
vendas_dia_group = df_vendas_dia.groupBy("Mes","Ano","Produto","Segmento").agg({"Volume": "sum"})

AnalysisException: "cannot resolve '`Mes`' given input columns: [Segmento, IBM, Volume, Base, Date, SKU];;\n'Aggregate ['Mes, 'Ano, 'Produto, Segmento#140], ['Mes, 'Ano, 'Produto, Segmento#140, sum(Volume#139) AS sum(Volume)#172]\n+- Relation[Date#135,Base#136,SKU#137,IBM#138,Volume#139,Segmento#140] parquet\n"

In [27]:
vendas_dia_group.show()

NameError: name 'vendas_dia_group' is not defined

In [45]:
v_dia_group_pd = vendas_dia_group.orderBy("Date").toPandas()

In [46]:
px.line(v_dia_group_pd, x="Date", y=["meio", "sum(Volume)"])

ValueError: All arguments should have the same length. The length of argument `y` is 2, whereas the length of previous arguments ['Date'] is 3225