# Creating spark session

In [1]:
# Use this cell when running from a Jupyter notebook: it builds the
# SparkSession explicitly and binds it to `spark`.

import os
from pyspark import SparkConf
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

# Load the orders file as plain text. Each input line arrives as one row in a
# single string column, referenced below as `value`.
path_text_orders = "file:///D://data-master/retail_db/orders"
orders_text = spark.read.format("text").load(path_text_orders)

# Assign the schema: split every line on commas and cast each field.
# NOTE(review): in the usual retail_db layout field 0 is the order id and
# field 2 the customer id, so these two names look swapped -- TODO confirm.
orders_table = orders_text.selectExpr(
    "cast(split(value,',') [0] as int) order_customer_id",
    "cast(split(value,',') [1] as date) order_date",
    "cast(split(value,',') [2] as int) order_id",
    "cast(split(value,',') [3] as string) order_status",
)

# Preview the first two parsed rows.
orders_table.show(2)

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|                1|2013-07-25|   11599|         CLOSED|
|                2|2013-07-25|     256|PENDING_PAYMENT|
+-----------------+----------+--------+---------------+
only showing top 2 rows



# Launching in CLOUDERA vm

In [None]:
# Launch `pyspark` first and run this cell inside that shell: the code below
# expects `sc` and `sqlContext` to exist already and does not create them.

# Silence the WARN and INFO log output in Spark 1.6; only errors are printed.
sc.setLogLevel("ERROR")

# Load the data as plain text; each line is read into the `value` column.
path_text_orders = "/user/pruthviraj/sqoop_text/orders"
orders_text = sqlContext.read.format("text").load(path_text_orders)

# Assign the schema: split every line on commas and cast each field.
orders_table = orders_text.selectExpr(
    "cast(split(value,',') [0] as int) order_customer_id",
    "cast(split(value,',') [1] as date) order_date",
    "cast(split(value,',') [2] as int) order_id",
    "cast(split(value,',') [3] as string) order_status",
)

# Number of rows parsed from the file.
orders_table.count()

![title](https://pysparktutorials.files.wordpress.com/2018/05/12.jpg)

# Filtering data using multipart names (`DataFrame.column`)

## filtering data on single condition

In [19]:
# Single-condition filter: the predicate is built from a DataFrame attribute
# (orders_table.order_customer_id) and only rows where it holds are shown.
orders_table.filter(
    orders_table.order_customer_id > 10
).show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|               11|2013-07-25|     918| PAYMENT_REVIEW|
|               12|2013-07-25|    1837|         CLOSED|
|               13|2013-07-25|    9149|PENDING_PAYMENT|
|               14|2013-07-25|    9842|     PROCESSING|
|               15|2013-07-25|    2568|       COMPLETE|
|               16|2013-07-25|    7276|PENDING_PAYMENT|
|               17|2013-07-25|    2667|       COMPLETE|
|               18|2013-07-25|    1205|         CLOSED|
|               19|2013-07-25|    9488|PENDING_PAYMENT|
|               20|2013-07-25|    9198|     PROCESSING|
|               21|2013-07-25|    2711|        PENDING|
|               22|2013-07-25|     333|       COMPLETE|
|               23|2013-07-25|    4367|PENDING_PAYMENT|
|               24|2013-07-25|   11441|         CLOSED|
|               25|2013-07-25|    9503|         

## filtering data on multiple conditions

In [20]:
# Filter on two columns at once. The predicates are combined with `&`, and
# each comparison is wrapped in its own parentheses because `&` binds more
# tightly than `>` in Python.
orders_table.filter(
    (orders_table.order_customer_id > 50000)
    & (orders_table.order_id > 10000)
).show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|            50001|2014-06-02|   10731|PENDING_PAYMENT|
|            50006|2014-06-02|   11020|       COMPLETE|
|            50012|2014-06-02|   10696|        PENDING|
|            50013|2014-06-02|   10275|     PROCESSING|
|            50022|2014-06-02|   11413|     PROCESSING|
|            50023|2014-06-02|   11947|       COMPLETE|
|            50028|2014-06-02|   12139|         CLOSED|
|            50029|2014-06-02|   12215|PENDING_PAYMENT|
|            50030|2014-06-02|   11106|       COMPLETE|
|            50031|2014-06-02|   10987|     PROCESSING|
|            50035|2014-06-02|   10664|       COMPLETE|
|            50036|2014-06-02|   11696|         CLOSED|
|            50043|2014-06-02|   10434|         CLOSED|
|            50055|2014-06-02|   10101|        ON_HOLD|
|            50057|2014-06-02|   11309|PENDING_P

# Filtering data using the double-quote method (SQL expression strings)

In [21]:
# Single-condition filter written as a SQL expression: the whole predicate
# is passed to filter() as one double-quoted string.
orders_table.filter(
    "order_customer_id>10 "
).show()

+-----------------+----------+--------+---------------+
|order_customer_id|order_date|order_id|   order_status|
+-----------------+----------+--------+---------------+
|               11|2013-07-25|     918| PAYMENT_REVIEW|
|               12|2013-07-25|    1837|         CLOSED|
|               13|2013-07-25|    9149|PENDING_PAYMENT|
|               14|2013-07-25|    9842|     PROCESSING|
|               15|2013-07-25|    2568|       COMPLETE|
|               16|2013-07-25|    7276|PENDING_PAYMENT|
|               17|2013-07-25|    2667|       COMPLETE|
|               18|2013-07-25|    1205|         CLOSED|
|               19|2013-07-25|    9488|PENDING_PAYMENT|
|               20|2013-07-25|    9198|     PROCESSING|
|               21|2013-07-25|    2711|        PENDING|
|               22|2013-07-25|     333|       COMPLETE|
|               23|2013-07-25|    4367|PENDING_PAYMENT|
|               24|2013-07-25|   11441|         CLOSED|
|               25|2013-07-25|    9503|         

In [29]:
# Filtering on multiple columns using a double-quoted SQL expression.
# The rows are first grouped by order_status and aggregated into a row count
# ("count") and the largest order_id ("max"); the string predicate then
# filters on both aggregates and on the grouping column.

from pyspark.sql import functions as f

(
    orders_table
    .groupBy("order_status")
    .agg(
        f.count(orders_table.order_status).alias("count"),
        f.max(orders_table.order_id).alias("max"),
    )
    .filter("count >2000 and max>=12433 and order_status='COMPLETE'")
    .show()
)

+------------+-----+-----+
|order_status|count|  max|
+------------+-----+-----+
|    COMPLETE|22899|12434|
+------------+-----+-----+



## Filtering on null values

In [50]:
# Load the orders file as plain text.
path_text_orders = "file:///D://data-master/retail_db/orders"

# format() takes the data source name ("text"); the file location is passed
# to load(). Passing the path to format() makes the read fail.
orders_text = spark.read.format("text").load(path_text_orders)

# Assign the schema: split every line on commas and cast each field.
orders_table = orders_text.selectExpr(
    "cast(split(value,',') [0] as int) order_customer_id",
    "cast(split(value,',') [1] as date) order_date",
    "cast(split(value,',') [2] as int) order_id",
    "cast(split(value,',') [3] as string) order_status",
)

# Outer-join a 4-row slice with a 5-row slice of the same table so that the
# row present on only one side produces nulls to filter on.
# NOTE(review): a and b both derive from orders_table, so this is a self-join.
# The captured output shows the unmatched row as null in every column, which
# suggests the b.* references resolved to a's columns -- confirm on the Spark
# version in use, and consider aliasing both sides if that is not intended.
a = orders_table.limit(4)
b = orders_table.limit(5)
c = a.join(b, a.order_customer_id == b.order_customer_id, "outer").select(
    b.order_customer_id, a.order_date, b.order_id
)

# Full join result.
c.show()

# Rows where order_customer_id is missing.
c.filter("order_customer_id is null").show()

# Rows where order_customer_id is present.
c.filter("order_customer_id is not null").show()

+-----------------+----------+--------+
|order_customer_id|order_date|order_id|
+-----------------+----------+--------+
|                1|2013-07-25|   11599|
|                2|2013-07-25|     256|
|                3|2013-07-25|   12111|
|                4|2013-07-25|    8827|
|             null|      null|    null|
+-----------------+----------+--------+

+-----------------+----------+--------+
|order_customer_id|order_date|order_id|
+-----------------+----------+--------+
|             null|      null|    null|
+-----------------+----------+--------+

+-----------------+----------+--------+
|order_customer_id|order_date|order_id|
+-----------------+----------+--------+
|                1|2013-07-25|   11599|
|                2|2013-07-25|     256|
|                3|2013-07-25|   12111|
|                4|2013-07-25|    8827|
+-----------------+----------+--------+

