In [1]:
from pyspark.sql import SparkSession

# Build (or reuse) a local, Hive-enabled Spark session. The MongoDB Spark
# connector is requested through spark.jars.packages, and the input and
# output URIs both point at the same MongoDB server.
spark = (
    SparkSession.builder
    .appName("myapp")
    .master("local")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://172.17.0.2:27017")
    .config("spark.mongodb.output.uri", "mongodb://172.17.0.2:27017")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0",
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [2]:
# Echo the effective settings this walkthrough relies on: application
# identity, executor memory, and where/how the SQL catalog is stored.
conf = spark.sparkContext.getConf()
for key in (
    "spark.app.name",
    "spark.master",
    "spark.executor.memory",
    "spark.sql.warehouse.dir",
    "spark.sql.catalogImplementation",
):
    # The label's trailing space plus print's default separator produce
    # the two-space gap after "=" seen in the output.
    print(f"# {key} = ", conf.get(key))

# spark.app.name =  myapp
# spark.master =  local
# spark.executor.memory =  1g
# spark.sql.warehouse.dir =  file:/home/jovyan/HiveMetastore/spark-warehouse
# spark.sql.catalogImplementation =  hive


In [3]:
# Load the "products" collection of the "test" database through the
# MongoDB connector ("mongo" data source) into a DataFrame.
df = (
    spark.read
    .format("mongo")
    .options(database="test", collection="products")
    .load()
)

In [4]:
# Persist the DataFrame as table products_new, replacing the table if it
# already exists (overwrite mode).
df.write.saveAsTable("products_new", mode="overwrite")

In [5]:
# Print the table's extended description: up to 100 rows, with each cell
# truncated at 100 characters.
spark.sql("DESCRIBE EXTENDED products_new").show(n=100, truncate=100)

+----------------------------+--------------------------------------------------------------+-------+
|                    col_name|                                                     data_type|comment|
+----------------------------+--------------------------------------------------------------+-------+
|                   ListPrice|                                                        double|   null|
|                    MakeFlag|                                                           int|   null|
|                   ModelName|                                                        string|   null|
|                   ProductID|                                                           int|   null|
|                 ProductName|                                                        string|   null|
|               ProductNumber|                                                        string|   null|
|                StandardCost|                                                    

In [6]:
# List metastore_db/, the Derby database directory named in derby.log.
%ls -l metastore_db/

total 28
-rw-r--r-- 1 jovyan users    4 May 15 06:08 dbex.lck
-rw-r--r-- 1 jovyan users   38 May 15 06:08 db.lck
drwxr-sr-x 2 jovyan users 4096 May 15 06:08 log/
-rw-r--r-- 1 jovyan users  608 May 15 06:08 README_DO_NOT_TOUCH_FILES.txt
drwxr-sr-x 2 jovyan users 4096 May 15 06:08 seg0/
-rw-r--r-- 1 jovyan users  918 May 15 06:08 service.properties
drwxr-sr-x 2 jovyan users 4096 May 15 06:08 tmp/


In [7]:
# List the warehouse directory (spark.sql.warehouse.dir) to see which
# table directories exist.
%ls -l spark-warehouse/

total 4
drwxr-sr-x 2 jovyan users 4096 May 15 06:08 products_new/


In [8]:
# List the data files written for the products_new table.
%ls -l spark-warehouse/products_new

total 16
-rw-r--r-- 1 jovyan users 13395 May 15 06:08 part-00000-00968bde-5527-4ec5-9548-f0099af56786-c000.snappy.parquet
-rw-r--r-- 1 jovyan users     0 May 15 06:08 _SUCCESS


In [9]:
# Print the Derby log written in the working directory.
%cat derby.log

----------------------------------------------------------------
Mon May 15 06:08:37 UTC 2023:
Booting Derby version The Apache Software Foundation - Apache Derby - 10.14.2.0 - (1828579): instance a816c00e-0188-1e06-6cab-000003ef5c00 
on database directory /home/jovyan/HiveMetastore/metastore_db with class loader jdk.internal.loader.ClassLoaders$AppClassLoader@5ffd2b27 
Loaded from file:/usr/local/spark-3.2.1-bin-hadoop3.2/jars/derby-10.14.2.0.jar
java.vendor=Ubuntu
java.runtime.version=11.0.13+8-Ubuntu-0ubuntu1.20.04
user.dir=/home/jovyan/HiveMetastore
os.name=Linux
os.arch=amd64
os.version=5.4.0-139-generic
derby.system.home=null
Database Class Loader started - derby.database.classpath=''


In [10]:
# Query the saved table by name and keep only the rows whose StandardCost
# exceeds 2000.
(
    spark
    .sql("SELECT * FROM products_new WHERE StandardCost > 2000")
    .show()
)

+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+
|ListPrice|MakeFlag|ModelName|ProductID|     ProductName|ProductNumber|StandardCost|SubCategoryID|                 _id|
+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+
|  3578.27|       1| Road-150|      749|Road-150 Red, 62|   BK-R93R-62|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      750|Road-150 Red, 44|   BK-R93R-44|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      751|Road-150 Red, 48|   BK-R93R-48|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      752|Road-150 Red, 52|   BK-R93R-52|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      753|Road-150 Red, 56|   BK-R93R-56|   2171.2942|            2|{6461c642c7456de4...|
+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+

In [11]:
# Drop products_new from the catalog. The statement returns an empty
# DataFrame, which is what the cell displays.
spark.sql("DROP TABLE products_new")

DataFrame[]

In [12]:
# Check what is left in the warehouse directory after the DROP TABLE.
%ls -l spark-warehouse/

total 0


In [13]:
# Write the DataFrame out again as table products_new (overwrite mode), so
# the table exists before the session is stopped.
df.write.saveAsTable("products_new", mode="overwrite")

In [14]:
# List the data files of the re-created products_new table.
%ls -l spark-warehouse/products_new

total 16
-rw-r--r-- 1 jovyan users 13395 May 15 06:09 part-00000-33e0f524-2e49-4af2-afe7-035cf5823065-c000.snappy.parquet
-rw-r--r-- 1 jovyan users     0 May 15 06:09 _SUCCESS


In [15]:
# Shut the current session down; a new one is built in the next cell.
spark.stop()

In [16]:
from pyspark.sql import SparkSession

# Build a fresh Hive-enabled local session after the previous one was
# stopped, using the same application settings as the first session.
# NOTE(review): the MongoDB host here is 172.17.0.3, whereas the first
# session in this notebook uses 172.17.0.2 -- confirm which address is
# intended before reading from or writing to MongoDB with this session.
spark = (
    SparkSession.builder
    .appName("myapp")
    .master("local")
    .config("spark.executor.memory", "1g")
    .config("spark.mongodb.input.uri", "mongodb://172.17.0.3:27017")
    .config("spark.mongodb.output.uri", "mongodb://172.17.0.3:27017")
    .config(
        "spark.jars.packages",
        "org.mongodb.spark:mongo-spark-connector_2.12:3.0.0",
    )
    .enableHiveSupport()
    .getOrCreate()
)

In [17]:
# Run the same filter from the newly created session; the table is looked
# up by name in the catalog rather than taken from the in-memory df.
(
    spark
    .sql("SELECT * FROM products_new WHERE StandardCost > 2000")
    .show()
)

+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+
|ListPrice|MakeFlag|ModelName|ProductID|     ProductName|ProductNumber|StandardCost|SubCategoryID|                 _id|
+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+
|  3578.27|       1| Road-150|      749|Road-150 Red, 62|   BK-R93R-62|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      750|Road-150 Red, 44|   BK-R93R-44|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      751|Road-150 Red, 48|   BK-R93R-48|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      752|Road-150 Red, 52|   BK-R93R-52|   2171.2942|            2|{6461c642c7456de4...|
|  3578.27|       1| Road-150|      753|Road-150 Red, 56|   BK-R93R-56|   2171.2942|            2|{6461c642c7456de4...|
+---------+--------+---------+---------+----------------+-------------+------------+-------------+--------------------+