## Importando bibliotecas

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.functions import to_date, col, year
from pyspark import SparkConf, SparkContext

## Criando a [Spark Session](http://127.0.0.1:4040/)

### [spark-master](http://127.0.0.1:5050)

#### • [spark-worker-a](http://127.0.0.1:5051)

#### • [spark-worker-b](http://127.0.0.1:5052)


spark
Configurações padrão utilizadas no _spark-defaults.conf_ em $SPARK_HOME/conf/ 

In [2]:
# Stop a session left over from a previous run of this cell so the builder
# below does not hand back the stale one.
if 'spark' in globals() or 'spark' in locals():
    spark.stop()

# Only the application name is set here; per the notebook text, the remaining
# settings come from spark-defaults.conf.
spark = (
    SparkSession.builder
    .appName("Teste de Leitura e Escrita com Minio")
    .getOrCreate()
)

spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/05/02 13:44:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/02 13:44:44 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


## Teste de leitura de arquivo CSV

[MinIO](http://127.0.0.1:9090/buckets)

- Leitura do arquivo CSV em um bucket do minio
- Criação de um DataFrame
- Correção de Schema
- Contagem de registros
- Exibir schema do DataFrame

In [3]:
path = "s3a://my-bucket/input/sample-data.csv"

# (column name, Spark type) pairs, in the order they are added to the schema.
# The two date columns are read as strings here.
csv_columns = [
    ("Row ID", IntegerType),
    ("Order ID", StringType),
    ("Order Date", StringType),
    ("Ship Date", StringType),
    ("Ship Mode", StringType),
    ("Customer ID", StringType),
    ("Customer Name", StringType),
    ("Segment", StringType),
    ("Country", StringType),
    ("City", StringType),
    ("State", StringType),
    ("Postal Code", StringType),
    ("Region", StringType),
    ("Product ID", StringType),
    ("Category", StringType),
    ("Sub-Category", StringType),
    ("Product Name", StringType),
    ("Sales", DoubleType),
    ("Quantity", IntegerType),
    ("Discount", DoubleType),
    ("Profit", DoubleType),
]

# Every field is declared nullable.
schema = StructType()
for column_name, column_type in csv_columns:
    schema.add(column_name, column_type(), True)

# Apply the explicit schema instead of inferring one; the first line of the
# file is treated as a header.
df = spark.read.csv(path, schema=schema, header=True)

row_count = df.count()
print(f'{row_count} registros lidos')

[Stage 0:>                                                          (0 + 1) / 1]

9994 registros lidos


                                                                                

## Exibindo o Schema do DataFrame

Verificando se os data types estão aderentes com o especificado

In [4]:
# Print the column names, types and nullability of the DataFrame read from the CSV.
df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: double (nullable = true)



## Exibindo parte do DataFrame

In [5]:
# Print a tabular preview of the DataFrame read from the CSV.
df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|   

                                                                                

In [6]:
# Keep at most 10 rows for the following examples.
# NOTE: `df` is rebound here, so the full dataset is no longer reachable
# through this name in later cells.
df = df.limit(10)

In [7]:
# Print the DataFrame after it was limited to 10 rows.
df.show()

+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|  Customer Name|  Segment|      Country|           City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+---------------+---------+-------------+---------------+----------+-----------+------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|    Claire Gute| Consumer|United States|      Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset 

## Realizando operações de agregação simples

Soma de 'Sales' agrupado por 'Region' e 'Category'

In [8]:
# Total 'Sales' per (Region, Category), sorted by the same keys.
# NOTE: `df` is rebound to the aggregate, which no longer has a 'Sales'
# column, so re-running this cell without re-creating `df` first will fail.
group_keys = ['Region', 'Category']
df = (
    df.groupBy(*group_keys)
    .sum('Sales')
    .orderBy(*group_keys)
    .withColumnRenamed('sum(Sales)', 'TotalSales')  # give the generated column a readable name
)
df.show()

+------+---------------+----------+
|Region|       Category|TotalSales|
+------+---------------+----------+
| South|      Furniture| 1951.4775|
| South|Office Supplies|    22.368|
|  West|      Furniture|     48.86|
|  West|Office Supplies|   155.304|
|  West|     Technology|   907.152|
+------+---------------+----------+



## Realizando operações com SQL

Recortar o DataFrame onde 'Region' = _'South'_

In [9]:
# Register the aggregated DataFrame as a temporary view so it can be queried with SQL.
df.createOrReplaceTempView("ALL_REGIONS_SALES")

# Keep only the rows whose Region is 'South'.
query = '''
SELECT *
FROM ALL_REGIONS_SALES
WHERE Region = 'South'
'''
queryResults = spark.sql(query)
queryResults.show()

+------+---------------+----------+
|Region|       Category|TotalSales|
+------+---------------+----------+
| South|      Furniture| 1951.4775|
| South|Office Supplies|    22.368|
+------+---------------+----------+



## Realizando escrita do DataFrame no Minio

Escrito em formato Parquet, particionado por 'Region' e de forma idempotente

In [10]:
# Write the aggregated DataFrame as snappy-compressed Parquet, one directory
# per 'Region'. 'overwrite' mode replaces whatever is already at the target path.
output_path = "s3a://my-bucket/output/exemplo_1/"
(
    df.write
    .format('parquet')
    .option("compression", "snappy")
    .partitionBy('Region')
    .mode('overwrite')
    .save(output_path)
)

                                                                                

## Lendo o arquivo Parquet escrito

Exibindo o schema e a tabela

In [11]:
# Read back the Parquet dataset written above and inspect its schema.
new_df = spark.read.format("parquet").load("s3a://my-bucket/output/exemplo_1/")
new_df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- TotalSales: double (nullable = true)
 |-- Region: string (nullable = true)



In [12]:
# Print the rows read back from the Parquet dataset.
new_df.show()

+---------------+----------+------+
|       Category|TotalSales|Region|
+---------------+----------+------+
|      Furniture|     48.86|  West|
|Office Supplies|   155.304|  West|
|     Technology|   907.152|  West|
|      Furniture| 1951.4775| South|
|Office Supplies|    22.368| South|
+---------------+----------+------+



                                                                                

In [13]:
# Reload the full CSV: `df` was replaced earlier by a 10-row sample and then
# by the Region/Category aggregate, so the original rows must be read again.
# `path` and `schema` are the ones defined in the first CSV-reading cell and
# are unchanged since then, so they are reused here rather than copied.
df = spark.read.csv(path = path, schema = schema, header = True)

In [14]:
# Print a preview of the freshly re-read DataFrame.
df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|   

In [15]:
# Parse 'Order Date' from its M/d/yyyy text form into a date, then derive a
# 'Year' column from the parsed value.
order_date = to_date(col("Order Date"), "M/d/yyyy")
df = (
    df.withColumn("Order Date", order_date)
    .withColumn("Year", year(col("Order Date")))
)
df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+----+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|Year|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+----+
|     1|CA-2016-152156|2016-11-08|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-

In [16]:
# Persist the DataFrame as the Parquet table `tb_sales_final`, partitioned by
# 'Year', replacing any existing table contents.
(
    df.write
    .mode('overwrite')
    .format('parquet')
    .partitionBy("Year")
    .saveAsTable("tb_sales_final")
)

Hive Session ID = c74f2d61-93f0-49d4-aa84-9a1500ef63e8


13:45:07.370 [Thread-3] ERROR org.apache.hadoop.hive.metastore.utils.FileUtils - Failed to delete s3a://warehouse/tb_sales_final


                                                                                

In [17]:
# List the table's partitions, then preview its rows.
for statement in (
    "SHOW PARTITIONS default.tb_sales_final",
    "SELECT * FROM default.tb_sales_final",
):
    spark.sql(statement).show()

+---------+
|partition|
+---------+
|Year=2014|
|Year=2015|
|Year=2016|
|Year=2017|
+---------+

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+-------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+-------+--------+--------+--------+----+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|         City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|  Sales|Quantity|Discount|  Profit|Year|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+-------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+-------+--------+--------+--------+----+
|    13|CA-2017-114412|2017-04-15| 4/20/2017|Standard Class|   AA-10480|      Andrew

In [18]:
# List the tables (and temporary views) visible in the `default` namespace.
spark.sql("SHOW TABLES IN default").show()

+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|append_test_table|      false|
|  default|          exemplo|      false|
|  default|   tb_sales_final|      false|
|  default|    tb_sales_test|      false|
|  default|    tbl_employees|      false|
|         |all_regions_sales|       true|
+---------+-----------------+-----------+



In [19]:
# Shut down the Spark session now that the notebook is finished.
spark.stop()