## Importando bibliotecas

In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, DateType
from pyspark.sql.functions import to_date, col, year
from pyspark import SparkConf, SparkContext

## Criando a [Spark Session](http://127.0.0.1:4040/)
Configurações padrão utilizadas no _spark-defaults.conf_ em $SPARK_HOME/conf/ 

In [2]:
if 'spark' in locals() or 'spark' in globals():
    spark.stop()
    
spark = SparkSession\
    .builder\
    .appName("Teste de Leitura e Escrita com Minio")\
    .getOrCreate()

spark

:: loading settings :: url = jar:file:/usr/local/lib/python3.10/dist-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-4687124c-3a4b-48bf-988e-5b244457b5d7;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.1.1 in central
	found io.delta#delta-storage;2.1.1 in central
	found org.antlr#antlr4-runtime;4.8 in central
	found org.codehaus.jackson#jackson-core-asl;1.9.13 in central
:: resolution report :: resolve 161ms :: artifacts dl 5ms
	:: modules in use:
	io.delta#delta-core_2.12;2.1.1 from central in [default]
	io.delta#delta-storage;2.1.1 from central in [default]
	org.antlr#antlr4-runtime;4.8 from central in [default]
	org.codehaus.jackson#jackson-core-asl;1.9.13 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|

23/02/24 00:36:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/24 00:36:34 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


## Teste de leitura de arquivo CSV

[MinIO](http://127.0.0.1:9090/buckets)

- Leitura do arquivo CSV em um bucket do minio
- Criação de um DataFrame
- Correção de Schema
- Contagem de registros
- Exibir schema do DataFrame

In [3]:
path = "s3a://my-bucket/input/sample-data.csv"

schema = StructType() \
        .add("Row ID", IntegerType(), True) \
        .add("Order ID", StringType(), True) \
        .add("Order Date", StringType(), True) \
        .add("Ship Date", StringType(), True) \
        .add("Ship Mode", StringType(), True) \
        .add("Customer ID", StringType(), True) \
        .add("Customer Name", StringType(), True) \
        .add("Segment", StringType(), True) \
        .add("Country", StringType(), True) \
        .add("City", StringType(), True) \
        .add("State", StringType(), True) \
        .add("Postal Code", StringType(), True) \
        .add("Region", StringType(), True) \
        .add("Product ID", StringType(), True) \
        .add("Category", StringType(), True) \
        .add("Sub-Category", StringType(), True) \
        .add("Product Name", StringType(), True) \
        .add("Sales", DoubleType(), True) \
        .add("Quantity", IntegerType(), True) \
        .add("Discount", DoubleType(), True) \
        .add("Profit", DoubleType(), True)

df = spark.read.csv(path = path, schema = schema, header = True)

print(f'{df.count()} registros lidos')

[Stage 0:>                                                          (0 + 1) / 1]

9994 registros lidos


                                                                                

# Exibindo o Schema do DataFrame

Verificando se os data types estão aderentes com o especificado

In [4]:
df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: string (nullable = true)
 |-- Ship Date: string (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: double (nullable = true)



## Exibindo parte do DataFrame

In [5]:
df.show()

[Stage 3:>                                                          (0 + 1) / 1]

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|   

                                                                                

## Realizando operações de agregação simples

Soma de 'Sales' agrupado por 'Region' e 'Category'

In [6]:
df = df.groupBy(['Region','Category'])\
    .sum('Sales')\
    .orderBy(['Region', 'Category'])\
    .withColumnRenamed('sum(Sales)', 'TotalSales')
df.show()

+-------+---------------+------------------+
| Region|       Category|        TotalSales|
+-------+---------------+------------------+
|Central|      Furniture|162783.14380000005|
|Central|Office Supplies|164616.19700000016|
|Central|     Technology| 170401.5319999999|
|   East|      Furniture|205540.34800000011|
|   East|Office Supplies|201781.62299999985|
|   East|     Technology|264872.08300000033|
|  South|      Furniture| 116273.1360000001|
|  South|Office Supplies|123979.92499999993|
|  South|     Technology|148730.52399999992|
|   West|      Furniture|248450.23350000026|
|   West|Office Supplies|213125.18300000002|
|   West|     Technology|251895.92799999993|
+-------+---------------+------------------+



## Realizando operações com SQL

Recortar o DataFrame onde 'Region' = _'South'_

In [17]:
df.createOrReplaceTempView("ALL_REGIONS_SALES")

query = \
'''
SELECT *
FROM ALL_REGIONS_SALES
WHERE Region = 'South'
'''
queryResults = spark.sql(sqlQuery = query)
queryResults.show()

Py4JJavaError: An error occurred while calling o125.showString.
: java.lang.IllegalStateException: Cannot call methods on a stopped SparkContext.
This stopped SparkContext was created at:

org.apache.spark.api.java.JavaSparkContext.<init>(JavaSparkContext.scala:58)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247)
py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
py4j.Gateway.invoke(Gateway.java:238)
py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80)
py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69)
py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
py4j.ClientServerConnection.run(ClientServerConnection.java:106)
java.base/java.lang.Thread.run(Thread.java:829)

The currently active SparkContext was created at:

(No active SparkContext.)
         
	at org.apache.spark.SparkContext.assertNotStopped(SparkContext.scala:120)
	at org.apache.spark.SparkContext.broadcast(SparkContext.scala:1536)
	at org.apache.spark.sql.execution.datasources.csv.CSVFileFormat.buildReader(CSVFileFormat.scala:102)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues(FileFormat.scala:137)
	at org.apache.spark.sql.execution.datasources.FileFormat.buildReaderWithPartitionValues$(FileFormat.scala:128)
	at org.apache.spark.sql.execution.datasources.TextBasedFileFormat.buildReaderWithPartitionValues(FileFormat.scala:248)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD$lzycompute(DataSourceScanExec.scala:465)
	at org.apache.spark.sql.execution.FileSourceScanExec.inputRDD(DataSourceScanExec.scala:454)
	at org.apache.spark.sql.execution.FileSourceScanExec.doExecute(DataSourceScanExec.scala:543)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:194)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:232)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:229)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:190)
	at org.apache.spark.sql.execution.InputAdapter.inputRDD(WholeStageCodegenExec.scala:527)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs(WholeStageCodegenExec.scala:455)
	at org.apache.spark.sql.execution.InputRDDCodegen.inputRDDs$(WholeStageCodegenExec.scala:454)
	at org.apache.spark.sql.execution.InputAdapter.inputRDDs(WholeStageCodegenExec.scala:498)
	at org.apache.spark.sql.execution.FilterExec.inputRDDs(basicPhysicalOperators.scala:238)
	at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:51)
	at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:751)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:194)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:232)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:229)
	at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:190)
	at org.apache.spark.sql.execution.SparkPlan.getByteArrayRdd(SparkPlan.scala:340)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:473)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:459)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:48)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:3868)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:3858)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:510)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:3856)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3856)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:2863)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3084)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:288)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:327)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)


## Realizando escrita do DataFrame no Minio

Escrito em formato Parquet, particionado por 'Region' e de forma idepotente

In [8]:
df.write.option("compression", "snappy")\
    .mode('overwrite')\
    .partitionBy('Region')\
    .format('parquet')\
    .save(path = "s3a://my-bucket/output/exemplo_1/")

                                                                                

## Lendo o arquivo Parquet escrito

Exibindo o schema e a tabela

In [9]:
new_df = spark.read.parquet("s3a://my-bucket/output/exemplo_1/")
new_df.printSchema()

root
 |-- Category: string (nullable = true)
 |-- TotalSales: double (nullable = true)
 |-- Region: string (nullable = true)



In [10]:
new_df.show()

+---------------+------------------+-------+
|       Category|        TotalSales| Region|
+---------------+------------------+-------+
|      Furniture|162783.14380000005|Central|
|Office Supplies|164616.19700000016|Central|
|     Technology| 170401.5319999999|Central|
|      Furniture|205540.34800000011|   East|
|Office Supplies|201781.62299999985|   East|
|     Technology|264872.08300000033|   East|
|      Furniture| 116273.1360000001|  South|
|Office Supplies|123979.92499999993|  South|
|     Technology|148730.52399999992|  South|
|      Furniture|248450.23350000026|   West|
|Office Supplies|213125.18300000002|   West|
|     Technology|251895.92799999993|   West|
+---------------+------------------+-------+



                                                                                

In [11]:
path = "s3a://my-bucket/input/sample-data.csv"

schema = StructType() \
        .add("Row ID", IntegerType(), True) \
        .add("Order ID", StringType(), True) \
        .add("Order Date", StringType(), True) \
        .add("Ship Date", StringType(), True) \
        .add("Ship Mode", StringType(), True) \
        .add("Customer ID", StringType(), True) \
        .add("Customer Name", StringType(), True) \
        .add("Segment", StringType(), True) \
        .add("Country", StringType(), True) \
        .add("City", StringType(), True) \
        .add("State", StringType(), True) \
        .add("Postal Code", StringType(), True) \
        .add("Region", StringType(), True) \
        .add("Product ID", StringType(), True) \
        .add("Category", StringType(), True) \
        .add("Sub-Category", StringType(), True) \
        .add("Product Name", StringType(), True) \
        .add("Sales", DoubleType(), True) \
        .add("Quantity", IntegerType(), True) \
        .add("Discount", DoubleType(), True) \
        .add("Profit", DoubleType(), True)

df = spark.read.csv(path = path, schema = schema, header = True)

In [12]:
df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|   

In [13]:
df = df.withColumn("Order Date", to_date(col("Order Date"),"M/d/yyyy"))
df = df.withColumn("Year", year(col("Order Date")))
df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+----+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|Year|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+----+
|     1|CA-2016-152156|2016-11-08|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-

In [14]:
df.write.format('parquet').partitionBy("Year").saveAsTable("tb_sales_final")

23/02/24 00:36:57 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/02/24 00:36:57 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


AnalysisException: Table `tb_sales_final` already exists.

In [15]:
spark.sql("Show tables").show()

23/02/24 00:37:11 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
+---------+-----------------+-----------+
|namespace|        tableName|isTemporary|
+---------+-----------------+-----------+
|  default|   tb_sales_final|      false|
|  default|    tb_sales_test|      false|
|         |all_regions_sales|      false|
+---------+-----------------+-----------+



In [16]:
spark.stop()