In [1]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import StringType,StructField, StructType  
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
            .master("local[8]") \
            .appName("airflow_app") \
            .config('spark.executor.memory', '16g') \
            .config('spark.driver.memory', '16g') \
            .config('spark.sql.execution.pandas.respectSessionTimeZone', False) \
            .config("spark.driver.maxResultSize", "2048MB") \
            .config("spark.port.maxRetries", "100") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()

In [3]:
df = spark.read.format('csv').option('header', True).option('sep',';').load('dados/peso_vendas_calendario.csv')

In [62]:
df.show()

+--------+---+-----------+--------+-----------------+
|    Date| UF|    Produto|Segmento|             Peso|
+--------+---+-----------+--------+-----------------+
|1-Apr-12| AC|     Etanol|       V|                0|
|1-Apr-12| AC|   Gasolina|       V|                0|
|1-Apr-12| AL|Diesel S500|       C|                0|
|1-Apr-12| AL|Diesel S500|       V|0.000793939173022|
|1-Apr-12| AL|     Etanol|       V|0.001380148385852|
|1-Apr-12| AL|   Gasolina|       V|0.001043360657431|
|1-Apr-12| AM|     Etanol|       V|0.012257468537506|
|1-Apr-12| AM|   Gasolina|       V|0.005977460471064|
|1-Apr-12| BA|Diesel S500|       C|0.106229853387477|
|1-Apr-12| BA|Diesel S500|       V|0.165102927097304|
|1-Apr-12| BA|     Etanol|       V|0.145752385200027|
|1-Apr-12| BA|   Gasolina|       V|0.162125998225635|
|1-Apr-12| CE| Diesel S10|       C|0.030886005500449|
|1-Apr-12| CE| Diesel S10|       V|0.001777721194909|
|1-Apr-12| CE|     Etanol|       V|0.001916009334648|
|1-Apr-12| CE|   Gasolina|  

In [50]:
df.select(F.count(F.lit(1)).alias("contagem")).show()
# select count(1) from df
df.count()

+--------+
|contagem|
+--------+
|  464729|
+--------+



464729

In [49]:
df_varejo = df.where(((F.col("Segmento") == F.lit("V")) & F.col("Produto").isin({"Etanol", "Gasolina",})))

In [63]:
df.groupBy('Date','UF','Produto','Segmento').agg({'Peso': 'sum' })

DataFrame[Date: string, UF: string, Produto: string, Segmento: string, sum(Peso): double]

In [64]:
df.show()

+--------+---+-----------+--------+-----------------+
|    Date| UF|    Produto|Segmento|             Peso|
+--------+---+-----------+--------+-----------------+
|1-Apr-12| AC|     Etanol|       V|                0|
|1-Apr-12| AC|   Gasolina|       V|                0|
|1-Apr-12| AL|Diesel S500|       C|                0|
|1-Apr-12| AL|Diesel S500|       V|0.000793939173022|
|1-Apr-12| AL|     Etanol|       V|0.001380148385852|
|1-Apr-12| AL|   Gasolina|       V|0.001043360657431|
|1-Apr-12| AM|     Etanol|       V|0.012257468537506|
|1-Apr-12| AM|   Gasolina|       V|0.005977460471064|
|1-Apr-12| BA|Diesel S500|       C|0.106229853387477|
|1-Apr-12| BA|Diesel S500|       V|0.165102927097304|
|1-Apr-12| BA|     Etanol|       V|0.145752385200027|
|1-Apr-12| BA|   Gasolina|       V|0.162125998225635|
|1-Apr-12| CE| Diesel S10|       C|0.030886005500449|
|1-Apr-12| CE| Diesel S10|       V|0.001777721194909|
|1-Apr-12| CE|     Etanol|       V|0.001916009334648|
|1-Apr-12| CE|   Gasolina|  

In [20]:
df.withColumn("Date", F.to_date(F.col("Date"), "dd.MM.yyyy")) \
    .withColumn("Peso", F.col("Peso")).cast('double')

AttributeError: 'DataFrame' object has no attribute 'cast'

In [59]:
df_varejo = df.groupBy('Date', 'UF', 'Produto').pivot('Segmento').sum('Peso')

AnalysisException: '"Peso" is not a numeric column. Aggregation function can only be applied on a numeric column.;'

In [43]:
df.groupBy('Date', 'Produto', 'Segmento').pivot('UF').sum('Peso')

AnalysisException: 'Cannot resolve column name "Peso" among (Date, UF, Produto, Segmento, sum(Peso));'

In [31]:
df = df.where(((F.col("Segmento") == F.lit("V")) & F.col("Produto").isin({"Etanol", "Gasolina",})))

In [32]:
df.show(10)

+--------+---+--------+--------+-----------------+
|    Date| UF| Produto|Segmento|             Peso|
+--------+---+--------+--------+-----------------+
|1-Apr-12| AC|  Etanol|       V|                0|
|1-Apr-12| AC|Gasolina|       V|                0|
|1-Apr-12| AL|  Etanol|       V|0.001380148385852|
|1-Apr-12| AL|Gasolina|       V|0.001043360657431|
|1-Apr-12| AM|  Etanol|       V|0.012257468537506|
|1-Apr-12| AM|Gasolina|       V|0.005977460471064|
|1-Apr-12| BA|  Etanol|       V|0.145752385200027|
|1-Apr-12| BA|Gasolina|       V|0.162125998225635|
|1-Apr-12| CE|  Etanol|       V|0.001916009334648|
|1-Apr-12| CE|Gasolina|       V|0.002980799554151|
+--------+---+--------+--------+-----------------+
only showing top 10 rows

