In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a Hive-enabled session. Master, application name and the
# metastore thrift URI are all supplied in code, in the same order as before.
spark = (
    SparkSession.builder
    .config("spark.master", "local")
    .appName("interfacing spark sql to hive metastore with no configuration file")
    .config("hive.metastore.uris", "thrift://10.0.0.46:9083")
    .enableHiveSupport()
    .getOrCreate()
)


In [3]:
# Keep a handle on the session's underlying SparkContext for context-level calls.
sc=spark.sparkContext

In [4]:
# Set the context's log level to ERROR (presumably to keep cell output readable).
sc.setLogLevel("ERROR")

In [5]:
# Create the Hive-format `src` table; IF NOT EXISTS makes this a no-op on re-run.
spark.sql(
    "CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"
)

DataFrame[]

In [6]:
# Load the sample key/value file into `src`.
# OVERWRITE replaces the table contents instead of appending, so re-running
# this cell (or the whole notebook) against the persistent metastore does not
# pile up duplicate copies of the file and skew every later count/join.
spark.sql(
    "LOAD DATA LOCAL INPATH 'D:/spark/examples/src/main/resources/kv1.txt' "
    "OVERWRITE INTO TABLE src"
)

DataFrame[]

In [7]:
# Peek at the first three rows of `src` without truncating cell values.
spark.sql("SELECT * FROM src").show(n=3, truncate=False)

+---+-------+
|key|value  |
+---+-------+
|238|val_238|
|86 |val_86 |
|311|val_311|
+---+-------+
only showing top 3 rows



In [8]:
# Row count of `src`, computed by the SQL engine and rendered as a one-row table.
(
    spark
    .sql("SELECT COUNT(*) FROM src")
    .show()
)

+--------+
|count(1)|
+--------+
|     500|
+--------+



In [9]:
# Rows of `src` whose key is below 10, ordered by key.
sqlDF = spark.sql(
    "SELECT key, value FROM src WHERE key < 10 ORDER BY key"
)

In [10]:
def _format_row(row):
    """Render one row as 'Key: <key>, Value: <value>'."""
    return "Key: %d, Value: %s" % (row.key, row.value)


# Drop to the RDD API and turn every row into its display string.
stringsDS = sqlDF.rdd.map(_format_row)

In [11]:
# Inspect the Python type of the mapped result; as the cell's last expression it is echoed as output.
type(stringsDS)

pyspark.rdd.PipelinedRDD

In [12]:
# Bring the formatted strings back to the driver, then print one per line.
collected_lines = stringsDS.collect()
for line in collected_lines:
    print(line)

Key: 0, Value: val_0
Key: 0, Value: val_0
Key: 0, Value: val_0
Key: 2, Value: val_2
Key: 4, Value: val_4
Key: 5, Value: val_5
Key: 5, Value: val_5
Key: 5, Value: val_5
Key: 8, Value: val_8
Key: 9, Value: val_9


In [13]:
from pyspark.sql import Row

# Row factory with the field names "key" and "value"; called positionally as Record(key, value) further down.
Record = Row("key", "value")



In [14]:
# Build 100 in-memory records (keys 1..100, values "val_<key>") and wrap them in a DataFrame.
local_records = [Record(i, "val_" + str(i)) for i in range(1, 101)]
recordsDF = spark.createDataFrame(local_records)

In [15]:
# Register recordsDF under the name "records" so SQL text can reference it.
recordsDF.createOrReplaceTempView("records")

In [16]:
# Join the temporary `records` view against the `src` table on key and show the default first page.
(
    spark
    .sql("SELECT * FROM records r JOIN src s ON r.key = s.key")
    .show()
)

+---+------+---+------+
|key| value|key| value|
+---+------+---+------+
|  2| val_2|  2| val_2|
|  4| val_4|  4| val_4|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  5| val_5|  5| val_5|
|  8| val_8|  8| val_8|
|  9| val_9|  9| val_9|
| 10|val_10| 10|val_10|
| 11|val_11| 11|val_11|
| 12|val_12| 12|val_12|
| 12|val_12| 12|val_12|
| 15|val_15| 15|val_15|
| 15|val_15| 15|val_15|
| 17|val_17| 17|val_17|
| 18|val_18| 18|val_18|
| 18|val_18| 18|val_18|
| 19|val_19| 19|val_19|
| 20|val_20| 20|val_20|
| 24|val_24| 24|val_24|
| 24|val_24| 24|val_24|
+---+------+---+------+
only showing top 20 rows



In [17]:
# Declare a Parquet-backed managed table; skipped when it already exists.
spark.sql(
    "CREATE TABLE IF NOT EXISTS hive_records(key int, value string) STORED AS PARQUET"
)

DataFrame[]

In [18]:
# Load the `src` table as a DataFrame; it is the source for the saveAsTable writes below.
df = spark.table("src")

In [19]:
# Replace the contents of `hive_records` with the rows of `df`.
(
    df.write
    .mode("Overwrite")
    .saveAsTable("hive_records")
)

In [20]:
# Issue REFRESH TABLE for `hive_records` right after the overwrite above,
# before it is queried in the next cell.
spark.sql(
    "REFRESH TABLE hive_records"
)

DataFrame[]

In [21]:
# Read the freshly written table back through SQL and show the default first page.
(
    spark
    .sql("SELECT * FROM hive_records")
    .show()
)

+---+-------+
|key|  value|
+---+-------+
|238|val_238|
| 86| val_86|
|311|val_311|
| 27| val_27|
|165|val_165|
|409|val_409|
|255|val_255|
|278|val_278|
| 98| val_98|
|484|val_484|
|265|val_265|
|193|val_193|
|401|val_401|
|150|val_150|
|273|val_273|
|224|val_224|
|369|val_369|
| 66| val_66|
|128|val_128|
|213|val_213|
+---+-------+
only showing top 20 rows



In [31]:
# Directory that the parquet files for the external-table example are written to.
dataDir = "/tmp/parquet_data"

In [32]:
# Write ids 0..9 as parquet files under dataDir.
# The default save mode raises when the target path already exists, which made
# this cell fail on every run after the first; overwrite keeps it re-runnable.
spark.range(10).write.mode("overwrite").parquet(dataDir)

In [33]:
# Expose the parquet files as an external Hive table.
# LOCATION is built from dataDir rather than a second copy of the literal, so
# the table cannot silently point somewhere other than where the files were
# written if the directory is ever changed.
spark.sql(
    "CREATE EXTERNAL TABLE IF NOT EXISTS hive_bigints(id bigint) "
    "STORED AS PARQUET LOCATION '{}'".format(dataDir)
)

DataFrame[]

In [34]:
# Query the external table to confirm it sees the parquet rows.
(
    spark
    .sql("SELECT * FROM hive_bigints")
    .show()
)

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [35]:
# Turn on Hive dynamic partitioning in non-strict mode ahead of the partitioned write.
for option, setting in (
    ("hive.exec.dynamic.partition", "true"),
    ("hive.exec.dynamic.partition.mode", "nonstrict"),
):
    spark.conf.set(option, setting)

In [36]:
# Write `df` as a Hive-format table partitioned by key, replacing any previous contents.
(
    df.write
    .mode("Overwrite")
    .partitionBy("key")
    .format("hive")
    .saveAsTable("hive_part_tbl")
)

In [37]:
df=spark.sql("SELECT * FROM hive_part_tbl")

In [38]:
df.show()

+-------+---+
|  value|key|
+-------+---+
|val_170|170|
|val_368|368|
|val_307|307|
|val_307|307|
|val_224|224|
|val_224|224|
| val_90| 90|
| val_90| 90|
| val_90| 90|
|val_446|446|
|val_333|333|
|val_333|333|
|val_365|365|
|val_150|150|
|val_164|164|
|val_164|164|
|val_128|128|
|val_128|128|
|val_128|128|
|val_209|209|
+-------+---+
only showing top 20 rows



In [39]:
# Stop the SparkSession as the notebook's final step.
spark.stop()