# Creating a session

In [1]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local").config(conf=SparkConf()).getOrCreate()

# Registering dataframe as sql table

In [2]:
# Registering dataframe as sql table

# loading the data and assigning the schema.

path_text_orders="file:///D://data-master/retail_db/orders"

orders_text=spark.read.format("text").load(path_text_orders)

orders_table=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
                                     "cast(split(value,',') [1] as date) order_date",
                                     "cast(split(value,',') [2] as int) order_id",
                                      "cast(split(value,',') [3] as string) order_status")


'''if you are using saprk 1.6 use the below command

sqlContext.registerDataFrameAsTable(dataframe, "table_name")
sqlContext.sqlContext.sql("select * from table_name")

'''

orders_table.createOrReplaceTempView("orders_table")


spark.sql("select * from orders_table").show()


+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
|                3|2013-07-25|   12111|       COMPLETE|
|                4|2013-07-25|    8827|         CLOSED|
|                5|2013-07-25|   11318|       COMPLETE|
|                6|2013-07-25|    7130|       COMPLETE|
|                7|2013-07-25|    4530|       COMPLETE|
|                8|2013-07-25|    2911|     PROCESSING|
|                9|2013-07-25|    5657|PENDING_PAYMENT|
|               10|2013-07-25|    5648|PENDING_PAYMENT|
|               11|2013-07-25|     918| PAYMENT_REVIEW|
|               12|2013-07-25|    1837|         CLOSED|
|               13|2013-07-25|    9149|PENDING_PAYMENT|
|               14|2013-07-25|    9842|     PROCESSING|
|               15|2013-07-25|    2568|       CO

# using sql transformer machine learning feature

In [6]:
#SQL TRANSFORMER STATEMENTS ONLY in 2.1 and above

path_text_orders="file:///D://data-master/retail_db/orders"

orders_text=spark.read.format("text").load(path_text_orders)

orders_table=orders_text.selectExpr("cast(split(value,',') [0] as int) order_customer_id",
                                            "cast(split(value,',') [1] as date) order_date",
                                            "cast(split(value,',') [2] as int) order_id",
                                            "cast(split(value,',') [3] as string) order_status")

from pyspark.ml.feature import SQLTransformer

'''movies is a dataframe. this shows how a (SQL) select statement can be applied on
the dataframe without creating the dataframe'''

sqlTrans = SQLTransformer(statement="SELECT * FROM __THIS__ where order_id >11000")

sqlTrans.transform(orders_table).show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                3|2013-07-25|   12111|       COMPLETE|
|                5|2013-07-25|   11318|       COMPLETE|
|               24|2013-07-25|   11441|         CLOSED|
|               38|2013-07-25|   11586|     PROCESSING|
|               40|2013-07-25|   12092|PENDING_PAYMENT|
|               48|2013-07-25|   12186|     PROCESSING|
|               51|2013-07-25|   12271|         CLOSED|
|               59|2013-07-25|   11644|PENDING_PAYMENT|
|               70|2013-07-25|   11809|PENDING_PAYMENT|
|               94|2013-07-25|   11589|     PROCESSING|
|               99|2013-07-25|   11542|PENDING_PAYMENT|
|              100|2013-07-25|   12131|     PROCESSING|
|              103|2013-07-25|   12256|     PROCESSING|
|              108|2013-07-26|   12149|     PROC