In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType
from pyspark import SparkConf, SparkContext

## Criando a [Spark Session](http://127.0.0.1:4040/)
As configurações padrão são definidas no arquivo _spark-defaults.conf_, em `$SPARK_HOME/conf/`.

In [21]:
# Stop the session left over from a previous run of this cell so that the
# builder below creates a new one with the options listed here. On a freshly
# started kernel `spark` does not exist yet; that NameError is expected and
# ignored so the cell survives "Restart Kernel -> Run All".
try:
    spark.stop()
except NameError:
    pass

# Secrets are taken from the environment when set; the fallbacks are the
# values this notebook originally hard-coded, so behaviour is unchanged when
# the variables are absent.
# NOTE(review): the fallbacks look like local development credentials -
# drop them before using this notebook against a shared environment.
spark = (
    SparkSession.builder
    .appName("Teste Metastore")
    # Warehouse location and catalog implementation.
    .config("spark.sql.warehouse.dir", "s3a://warehouse/")
    .config("spark.sql.catalogImplementation", "hive")
    # S3A filesystem settings (endpoint: http://minio:9000).
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY", "admin"))
    .config("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY", "admin123"))
    .config("spark.hadoop.fs.s3a.fast.upload", "true")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # JDBC connection to the PostgreSQL database backing the Hive metastore.
    .config("spark.hadoop.javax.jdo.option.ConnectionDriverName", "org.postgresql.Driver")
    .config("spark.hadoop.javax.jdo.option.ConnectionURL", "jdbc:postgresql://postgres:5432/metastore_db")
    .config("spark.hadoop.javax.jdo.option.ConnectionUserName", "hive")
    .config(
        "spark.hadoop.javax.jdo.option.ConnectionPassword",
        os.environ.get("HIVE_METASTORE_PASSWORD", "hive123"),
    )
    # Metastore schema handling: auto-creation on, version verification off.
    .config("spark.hadoop.datanucleus.schema.autoCreateAll", "true")
    .config("spark.hadoop.datanucleus.schema.autoCreateTables", "true")
    .config("spark.hadoop.datanucleus.fixedDatastore", "false")
    .config("spark.hadoop.hive.metastore.schema.verification", "false")
    .config("spark.hadoop.hive.metastore.schema.verification.record.version", "false")
    # Hive metastore client version and jars.
    .config("spark.sql.hive.metastore.version", "2.3.9")
    # NOTE(review): `spark.sql.hive.metastore.uris` does not appear to be a
    # documented Spark key (the usual one is `spark.hadoop.hive.metastore.uris`),
    # so this line may have no effect. Left unchanged because switching keys
    # would change how the metastore is reached - confirm the intent.
    .config("spark.sql.hive.metastore.uris", "thrift://0.0.0.0:9083")
    .config("spark.sql.hive.metastore.jars", "builtin")
    # The original cell also called
    #   .config("spark.sql.hive.hiveserver2.jdbc.url=jdbc:postgresql://...")
    # with "key=value" packed into the key argument, which only registered a
    # junk key without a usable value. It is dropped here: the metastore
    # database URL is already supplied by javax.jdo.option.ConnectionURL above.
    .enableHiveSupport()
    .getOrCreate()
)

# Only show ERROR-level messages from Spark in the cell outputs.
spark.sparkContext.setLogLevel("ERROR")
spark

In [23]:
# List the databases visible to this Spark session.
(
    spark
    .sql('show databases')
    .show()
)

+---------+
|namespace|
+---------+
|  default|
|    delta|
+---------+



In [32]:
# Location of the input CSV.
path = "s3a://my-bucket/input/sample-data.csv"

# Explicit schema for the CSV so Spark does not have to infer types.
# Every column is nullable, which is the default of `StructType.add`.
# The two date columns are read as plain strings.
schema = (
    StructType()
    .add("Row ID", IntegerType())
    .add("Order ID", StringType())
    .add("Order Date", StringType())
    .add("Ship Date", StringType())
    .add("Ship Mode", StringType())
    .add("Customer ID", StringType())
    .add("Customer Name", StringType())
    .add("Segment", StringType())
    .add("Country", StringType())
    .add("City", StringType())
    .add("State", StringType())
    .add("Postal Code", StringType())
    .add("Region", StringType())
    .add("Product ID", StringType())
    .add("Category", StringType())
    .add("Sub-Category", StringType())
    .add("Product Name", StringType())
    .add("Sales", DoubleType())
    .add("Quantity", IntegerType())
    .add("Discount", DoubleType())
    .add("Profit", DoubleType())
)

# Read the file with the schema above; the first line of the file is a header.
df = (
    spark.read
    .schema(schema)
    .option("header", True)
    .csv(path)
)
df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|Quantity|Discount|  Profit|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+--------+--------+--------+
|     1|CA-2016-152156| 11/8/2016|11/11/2016|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|   

In [33]:
# Save the DataFrame as table `tb_sales_test`: Parquet format, snappy
# compression, partitioned by `Region`, overwriting any existing table.
(
    df.write
    .format('parquet')
    .option("compression", "snappy")
    .partitionBy('Region')
    .mode('overwrite')
    .saveAsTable('tb_sales_test')
)

23/02/23 23:46:05 ERROR FileUtils: Failed to delete s3a://warehouse/tb_sales_test


                                                                                

In [35]:
# List the tables in the current database.
(
    spark
    .sql("SHOW TABLES")
    .show()
)

+---------+--------------+-----------+
|namespace|     tableName|isTemporary|
+---------+--------------+-----------+
|  default|tb_sales_final|      false|
|  default| tb_sales_test|      false|
+---------+--------------+-----------+



In [36]:
# Ask the catalog whether `tb_sales_test` exists; the boolean result is the
# cell output.
spark.catalog.tableExists(
    tableName="tb_sales_test",
)

True

In [37]:
# List the partitions of `default.tb_sales_test`.
(
    spark
    .sql("SHOW PARTITIONS default.tb_sales_test")
    .show()
)

+--------------+
|     partition|
+--------------+
|Region=Central|
|   Region=East|
|  Region=South|
|   Region=West|
+--------------+



In [38]:
# Preview ten rows of the saved table.
(
    spark
    .sql("SELECT * FROM default.tb_sales_test LIMIT 10")
    .show()
)

+------+--------------+----------+---------+--------------+-----------+---------------+--------+-------------+------------+------------+-----------+---------------+---------------+------------+--------------------+-------+--------+--------+----------+------+
|Row ID|      Order ID|Order Date|Ship Date|     Ship Mode|Customer ID|  Customer Name| Segment|      Country|        City|       State|Postal Code|     Product ID|       Category|Sub-Category|        Product Name|  Sales|Quantity|Discount|    Profit|Region|
+------+--------------+----------+---------+--------------+-----------+---------------+--------+-------------+------------+------------+-----------+---------------+---------------+------------+--------------------+-------+--------+--------+----------+------+
|    24|US-2017-156909| 7/16/2017|7/18/2017|  Second Class|   SF-20065|Sandra Flanagan|Consumer|United States|Philadelphia|Pennsylvania|      19140|FUR-CH-10002774|      Furniture|      Chairs|Global Deluxe Sta...| 71.372| 

In [39]:
# Create the `delta` database if it is not there yet (safe to re-run) ...
spark.sql(
    "CREATE DATABASE IF NOT EXISTS delta"
)

# ... and list the databases to confirm it is present.
(
    spark
    .sql("SHOW DATABASES")
    .show()
)

+---------+
|namespace|
+---------+
|  default|
|    delta|
+---------+



In [40]:
# Replace every space in the column names with an underscore. Other
# characters (for example the hyphen in "Sub-Category") are left as they are.
new_cols = ['_'.join(column_name.split(' ')) for column_name in df.columns]

# Re-bind `df` to the same data under the new column names.
df = df.toDF(*new_cols)

In [41]:
# Print the DataFrame schema to check the renamed columns and their types.
df.printSchema()

root
 |-- Row_ID: integer (nullable = true)
 |-- Order_ID: string (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Ship_Date: string (nullable = true)
 |-- Ship_Mode: string (nullable = true)
 |-- Customer_ID: string (nullable = true)
 |-- Customer_Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal_Code: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product_ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product_Name: string (nullable = true)
 |-- Sales: double (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Profit: double (nullable = true)



In [42]:
# Save the DataFrame as table `delta.superstore_table` in Delta format,
# partitioned by `Region`, overwriting any existing table.
# NOTE(review): the recorded run of this cell failed with
# "Failed to find data source: delta" (ClassNotFoundException:
# delta.DefaultSource), i.e. the Delta Lake data source was not available to
# the Spark session. It has to be on the session's classpath before this
# write can succeed.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Region")
    .saveAsTable("delta.superstore_table")
)

Py4JJavaError: An error occurred while calling o476.saveAsTable.
: java.lang.ClassNotFoundException: 
Failed to find data source: delta. Please find packages at
https://spark.apache.org/third-party-projects.html
       
	at org.apache.spark.sql.errors.QueryExecutionErrors$.failedToFindDataSourceError(QueryExecutionErrors.scala:587)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:675)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSourceV2(DataSource.scala:725)
	at org.apache.spark.sql.DataFrameWriter.lookupV2Provider(DataFrameWriter.scala:864)
	at org.apache.spark.sql.DataFrameWriter.saveAsTable(DataFrameWriter.scala:562)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:829)
Caused by: java.lang.ClassNotFoundException: delta.DefaultSource
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:476)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:589)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:522)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$5(DataSource.scala:661)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$lookupDataSource$4(DataSource.scala:661)
	at scala.util.Failure.orElse(Try.scala:224)
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:661)
	... 15 more


In [43]:
# List the tables registered in the `delta` database; the resulting list is
# the cell output.
spark.catalog.listTables(
    dbName='delta',
)

[]

In [44]:
# Read `delta.superstore_table` back and show its first five rows.
# NOTE(review): in the recorded run this raised "Table or view not found",
# which is consistent with the Delta write earlier in the notebook having
# failed - the table was never created.
(
    spark.read
    .table("delta.superstore_table")
    .show(5)
)

AnalysisException: Table or view not found: delta.superstore_table;
'UnresolvedRelation [delta, superstore_table], [], false


In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()