In [1]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("myapp") \
        .master("local") \
        .config("spark.executor.memory", "1g") \
        .config("spark.mongodb.input.uri","mongodb://172.17.0.2:27017") \
        .config("spark.mongodb.output.uri","mongodb://172.17.0.2:27017") \
        .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.0") \
        .enableHiveSupport() \
        .getOrCreate()

In [2]:
conf = spark.sparkContext.getConf()
print("# spark.app.name = ", conf.get("spark.app.name"))
print("# spark.master = ", conf.get("spark.master"))
print("# spark.executor.memory = ", conf.get("spark.executor.memory"))
print("# spark.sql.warehouse.dir = ", conf.get("spark.sql.warehouse.dir"))
print("# spark.sql.catalogImplementation = ", conf.get("spark.sql.catalogImplementation"))

# spark.app.name =  myapp
# spark.master =  local
# spark.executor.memory =  1g
# spark.sql.warehouse.dir =  file:/home/jovyan/HiveMetastore/spark-warehouse
# spark.sql.catalogImplementation =  hive


In [3]:
df = spark.read.format("mongo") \
               .option("database","test") \
               .option("collection","products") \
               .load()

In [4]:
df.write.mode("overwrite").save("products_new")

In [5]:
%ls -l products_new

total 16
-rw-r--r-- 1 jovyan users 13479 May 21 07:17 part-00000-ff18c91b-5426-43ee-8533-e31672446445-c000.snappy.parquet
-rw-r--r-- 1 jovyan users     0 May 21 07:17 _SUCCESS


In [6]:
spark.sql("CREATE EXTERNAL TABLE external_products USING parquet LOCATION '/home/jovyan/HiveMetastore/products_new'")

DataFrame[]

In [7]:
spark.sql("DESCRIBE EXTENDED external_products").show(100,100)

+----------------------------+--------------------------------------------------------------+-------+
|                    col_name|                                                     data_type|comment|
+----------------------------+--------------------------------------------------------------+-------+
|                   ListPrice|                                                        double|   null|
|                    MakeFlag|                                                           int|   null|
|                   ModelName|                                                        string|   null|
|                   ProductID|                                                           int|   null|
|                 ProductName|                                                        string|   null|
|               ProductNumber|                                                        string|   null|
|                StandardCost|                                                    

In [8]:
%ls -l metastore_db/

total 28
-rw-r--r-- 1 jovyan users    4 May 21 07:18 dbex.lck
-rw-r--r-- 1 jovyan users   38 May 21 07:18 db.lck
drwxr-sr-x 2 jovyan users 4096 May 21 07:18 [0m[01;34mlog[0m/
-rw-r--r-- 1 jovyan users  608 May 21 07:18 README_DO_NOT_TOUCH_FILES.txt
drwxr-sr-x 2 jovyan users 4096 May 21 07:18 [01;34mseg0[0m/
-rw-r--r-- 1 jovyan users  918 May 21 07:18 service.properties
drwxr-sr-x 2 jovyan users 4096 May 21 07:18 [01;34mtmp[0m/


In [9]:
spark.sql("SELECT * FROM external_products").show()

+---------+--------+--------------------+---------+--------------------+-------------+------------+-------------+--------------------+
|ListPrice|MakeFlag|           ModelName|ProductID|         ProductName|ProductNumber|StandardCost|SubCategoryID|                 _id|
+---------+--------+--------------------+---------+--------------------+-------------+------------+-------------+--------------------+
|   1431.5|       1|       HL Road Frame|      680|HL Road Frame - B...|   FR-R92B-58|     1059.31|           14|{6469c39906e5b054...|
|   1431.5|       1|       HL Road Frame|      706|HL Road Frame - R...|   FR-R92R-58|     1059.31|           14|{6469c39906e5b054...|
|    34.99|       0|           Sport-100|      707|Sport-100 Helmet,...|    HL-U509-R|     13.0863|           31|{6469c39906e5b054...|
|    34.99|       0|           Sport-100|      708|Sport-100 Helmet,...|      HL-U509|     13.0863|           31|{6469c39906e5b054...|
|      9.5|       0| Mountain Bike Socks|      709|Moun

In [10]:
# spark.sql("CREATE TABLE ModelNames (ModelName string, ListPrice INT)")
spark.sql("CREATE TABLE ModelNames (ModelName string, ListPrice INT) USING parquet")

DataFrame[]

In [11]:
spark.sql("DESCRIBE EXTENDED ModelNames").show(100,100)

+----------------------------+--------------------------------------------------------------+-------+
|                    col_name|                                                     data_type|comment|
+----------------------------+--------------------------------------------------------------+-------+
|                   ModelName|                                                        string|   null|
|                   ListPrice|                                                           int|   null|
|                            |                                                              |       |
|# Detailed Table Information|                                                              |       |
|                    Database|                                                       default|       |
|                       Table|                                                    modelnames|       |
|                       Owner|                                                    

In [12]:
%ls -l spark-warehouse

total 4
drwxr-sr-x 2 jovyan users 4096 May 21 07:18 [0m[01;34mmodelnames[0m/


In [13]:
%ls -l spark-warehouse/modelnames

total 0


In [14]:
spark.sql("INSERT OVERWRITE TABLE ModelNames SELECT ModelName, ListPrice FROM external_products")

DataFrame[]

In [15]:
%ls -l spark-warehouse/modelnames

total 4
-rw-r--r-- 1 jovyan users 2861 May 21 07:18 part-00000-446546b8-c1cf-4b60-b8e6-dda58c47898c-c000.snappy.parquet
-rw-r--r-- 1 jovyan users    0 May 21 07:18 _SUCCESS


In [16]:
spark.sql("SELECT * FROM ModelNames").show()

+--------------------+---------+
|           ModelName|ListPrice|
+--------------------+---------+
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|           Sport-100|       34|
|           Sport-100|       34|
| Mountain Bike Socks|        9|
| Mountain Bike Socks|        9|
|           Sport-100|       34|
|         Cycling Cap|        8|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       LL Road Frame|      337|
|       LL Road Frame|      337|
|       LL Road Frame|      337|
+--------------------+---------+
only showing top 20 rows



In [17]:
spark.sql("DROP TABLE external_products")

DataFrame[]

In [18]:
spark.sql("SELECT * FROM ModelNames").show()

+--------------------+---------+
|           ModelName|ListPrice|
+--------------------+---------+
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|           Sport-100|       34|
|           Sport-100|       34|
| Mountain Bike Socks|        9|
| Mountain Bike Socks|        9|
|           Sport-100|       34|
|         Cycling Cap|        8|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|Long-Sleeve Logo ...|       49|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       HL Road Frame|     1431|
|       LL Road Frame|      337|
|       LL Road Frame|      337|
|       LL Road Frame|      337|
+--------------------+---------+
only showing top 20 rows



In [19]:
%ls -l spark-warehouse/modelnames

total 4
-rw-r--r-- 1 jovyan users 2861 May 21 07:18 part-00000-446546b8-c1cf-4b60-b8e6-dda58c47898c-c000.snappy.parquet
-rw-r--r-- 1 jovyan users    0 May 21 07:18 _SUCCESS


In [20]:
spark.sql("DROP TABLE ModelNames")

DataFrame[]

In [21]:
%ls -l spark-warehouse

total 0
