# Spark SQL

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

conf = SparkConf()

conf.setAppName("Sample Spark SQL")
conf.set("spark.hadoop.fs.s3a.endpoint", "http://172.21.121.140:9000")
conf.set("spark.hadoop.fs.s3a.access.key", "chapolin")
conf.set("spark.hadoop.fs.s3a.secret.key", "mudar@123")
conf.set("spark.hadoop.fs.s3a.path.style.access", True)
conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") 
conf.set("hive.metastore.uris", "thrift://metastore:9083")

spark = SparkSession.builder.config(conf=conf).enableHiveSupport().getOrCreate()

## Add dataframe

In [4]:
data2 = [("James", "Smith", "M", 3000),
         ("Michael", "Rose", "M", 6000),
         ("Robert", "Willians", "M", 5500),
         ("Maria", "Anne", "F", 7000)
        ]

schema = StructType([
    StructField("firtsname", StringType(), True),
    StructField("lastname", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("salary", StringType(), True)
])

df = spark.createDataFrame(data=data2, schema=schema)
df.show()

+---------+--------+------+------+
|firtsname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|  Michael|    Rose|     M|  6000|
|   Robert|Willians|     M|  5500|
|    Maria|    Anne|     F|  7000|
+---------+--------+------+------+



In [5]:
df.createOrReplaceTempView("tb_sql")

In [15]:
query = spark.sql("""
select * from tb_sql where firtsname = 'James'
""")

In [16]:
query.write.format("delta").mode("append").save('s3a://bronze/tb_sql')

## SQL on Dataframe

In [21]:
df_sql = spark.read.format("delta").load('s3a://bronze/tb_sql') 

In [22]:
result_df_sql = spark.sql("SELECT * FROM tb_sql", df=df_sql)

In [23]:
result_df_sql.show()

+---------+--------+------+------+
|firtsname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|  3000|
|  Michael|    Rose|     M|  6000|
|   Robert|Willians|     M|  5500|
|    Maria|    Anne|     F|  7000|
+---------+--------+------+------+

