# Creating a Spark session

In [73]:
import os
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Obtain a session that runs against the "local" master and is configured
# from a default-constructed SparkConf; getOrCreate() hands back an existing
# session when there is one instead of building a second one.
spark = (
    SparkSession.builder
    .master("local")
    .config(conf=SparkConf())
    .getOrCreate()
)

# Loading the data and assigning the schema.

# Root folder of the retail_db text exports; each table lives in its own
# sub-folder underneath it.
RETAIL_DB_DIR = "file:///D://data-master/retail_db"


def load_retail_table(table_name, column_specs):
    """Read one retail_db table as raw text and project it into typed columns.

    Every input line is split on ',' and field ``i`` is cast to the type given
    by ``column_specs[i]``.

    Parameters
    ----------
    table_name : str
        Sub-folder of ``RETAIL_DB_DIR`` that holds the table's text files.
    column_specs : list of (str, str)
        Ordered ``(column_name, spark_sql_type)`` pairs, one per field.

    Returns
    -------
    pyspark.sql.DataFrame
        One column per entry of ``column_specs``, in the same order.

    Notes
    -----
    The split is a plain comma split, so a field that itself contains a comma
    shifts every later field of that line one position to the right.
    """
    projections = [
        "cast(split(value,',') [{0}] as {1}) {2}".format(position, sql_type, column_name)
        for position, (column_name, sql_type) in enumerate(column_specs)
    ]
    raw_lines = spark.read.format("text").load(RETAIL_DB_DIR + "/" + table_name)
    return raw_lines.selectExpr(*projections)


items = load_retail_table("order_items", [
    ("order_item_id", "int"),
    ("order_item_order_id", "int"),
    ("order_item_product_id", "int"),
    ("order_item_quantity", "int"),
    ("order_item_subtotal", "float"),
    ("order_item_product_price", "float"),
])

# The retail_db `orders` export is laid out as
#   order_id, order_date, order_customer_id, order_status
# The earlier projection named field 0 `order_customer_id` and field 2
# `order_id`, i.e. the two id columns were swapped.
orders = load_retail_table("orders", [
    ("order_id", "int"),
    ("order_date", "date"),
    ("order_customer_id", "int"),
    ("order_status", "string"),
])

products = load_retail_table("products", [
    ("product_id", "int"),
    ("product_category_id", "int"),
    ("product_name", "string"),
    ("product_description", "string"),
    ("product_price", "float"),
    ("product_image", "string"),
])

categories = load_retail_table("categories", [
    ("category_id", "int"),
    ("category_department_id", "int"),
    ("category_name", "string"),
])

# Preview the first rows of every table to sanity-check the projections.
items.show()
orders.show()
products.show()
categories.show()

+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|order_item_id|order_item_order_id|order_item_product_id|order_item_quantity|order_item_subtotal|order_item_product_price|
+-------------+-------------------+---------------------+-------------------+-------------------+------------------------+
|            1|                  1|                  957|                  1|             299.98|                  299.98|
|            2|                  2|                 1073|                  1|             199.99|                  199.99|
|            3|                  2|                  502|                  5|              250.0|                    50.0|
|            4|                  2|                  403|                  1|             129.99|                  129.99|
|            5|                  4|                  897|                  2|              49.98|                   24.99|
|            6| 

In [None]:
# Loading data in a Linux (cluster) environment.
#
# NOTE(review): `sqlContext` is not defined anywhere in this notebook;
# presumably it is the context object pre-created by the pyspark shell on the
# cluster -- confirm before running this cell from a fresh notebook kernel.
#
# Each table is read as raw text, every line is split on ',' and the fields
# are cast positionally. Path strings keep their own names so that `items`,
# `orders`, `products` and `categories` only ever refer to DataFrames.

order_items_path = "/user/pruthviraj/sqoop_text/order_items"
items = sqlContext.read.format("text").load(order_items_path).selectExpr(
    "cast(split(value,',') [0] as int) order_item_id",
    "cast(split(value,',') [1] as int) order_item_order_id",
    "cast(split(value,',') [2] as int) order_item_product_id",
    "cast(split(value,',') [3] as int) order_item_quantity",
    "cast(split(value,',') [4] as float) order_item_subtotal",
    "cast(split(value,',') [5] as float) order_item_product_price",
)

# The retail_db `orders` table is laid out as
#   order_id, order_date, order_customer_id, order_status
# The earlier projection named field 0 `order_customer_id` and field 2
# `order_id`, i.e. the two id columns were swapped.
orders_path = "/user/pruthviraj/sqoop_text/orders"
orders = sqlContext.read.format("text").load(orders_path).selectExpr(
    "cast(split(value,',') [0] as int) order_id",
    "cast(split(value,',') [1] as date) order_date",
    "cast(split(value,',') [2] as int) order_customer_id",
    "cast(split(value,',') [3] as string) order_status",
)

products_path = "/user/pruthviraj/sqoop_text/products"
products = sqlContext.read.format("text").load(products_path).selectExpr(
    "cast(split(value,',') [0] as int) product_id",
    "cast(split(value,',') [1] as int) product_category_id",
    "cast(split(value,',') [2] as string) product_name",
    "cast(split(value,',') [3] as string) product_description",
    "cast(split(value,',') [4] as float) product_price",
    "cast(split(value,',') [5] as string) product_image",
)

categories_path = "/user/pruthviraj/sqoop_text/categories"
categories = sqlContext.read.format("text").load(categories_path).selectExpr(
    "cast(split(value,',') [0] as int) category_id",
    "cast(split(value,',') [1] as int) category_department_id",
    "cast(split(value,',') [2] as string) category_name",
)

# Two rows per table are enough to confirm the schema was applied.
items.show(2)
orders.show(2)
products.show(2)
categories.show(2)

![title](https://pysparktutorials.files.wordpress.com/2018/05/sparkload.jpg)

![title](https://pysparktutorials.files.wordpress.com/2018/05/loadres.jpg)

# Ranking 

In [57]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# rank() over each category, most expensive product first.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.rank()
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("rank"),
    )
    .show()
)

+-------------------+--------------------+---------+----+
|      category_name|        product_name|max_price|rank|
+-------------------+--------------------+---------+----+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|   1|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|   2|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|   3|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|   4|
|   Camping & Hiking|Diamondback Women...|   299.98|   1|
|Fitness Accessories|Bowflex SelectTec...|   599.99|   1|
|Fitness Accessories|Under Armour Hust...|    34.99|   2|
|         Golf Shoes|LIJA Women's Butt...|    108.0|   1|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|   2|
|         Golf Shoes|LIJA Women's Argy...|     80.0|   3|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|   4|
|         Basketball| SOLE E25 Elliptical|   999.99|   1|
|         Basketball|Diamondback Girls...|   299.99|   2|
|         Basketball|Diamondback Boys'...|   299.99|   2|
|     Men's Fo

# dense rank

In [58]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# dense_rank() over each category, most expensive product first; the result
# column is named "dense_rank".
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.dense_rank()
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("dense_rank"),
    )
    .show()
)

+-------------------+--------------------+---------+----+
|      category_name|        product_name|max_price|rank|
+-------------------+--------------------+---------+----+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|   1|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|   2|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|   3|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|   4|
|   Camping & Hiking|Diamondback Women...|   299.98|   1|
|Fitness Accessories|Bowflex SelectTec...|   599.99|   1|
|Fitness Accessories|Under Armour Hust...|    34.99|   2|
|         Golf Shoes|LIJA Women's Butt...|    108.0|   1|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|   2|
|         Golf Shoes|LIJA Women's Argy...|     80.0|   3|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|   4|
|         Basketball| SOLE E25 Elliptical|   999.99|   1|
|         Basketball|Diamondback Girls...|   299.99|   2|
|         Basketball|Diamondback Boys'...|   299.99|   2|
|     Men's Fo

# percent rank

In [74]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# percent_rank() over each category, most expensive product first; in the
# saved output it runs from 0.0 on the first row of a category to 1.0 on the
# last one.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.percent_rank()
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("percent_rank"),
    )
    .show()
)

+-------------------+--------------------+---------+------------------+
|      category_name|        product_name|max_price|      percent_rank|
+-------------------+--------------------+---------+------------------+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|               0.0|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|0.3333333333333333|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|0.6666666666666666|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|               1.0|
|   Camping & Hiking|Diamondback Women...|   299.98|               0.0|
|Fitness Accessories|Bowflex SelectTec...|   599.99|               0.0|
|Fitness Accessories|Under Armour Hust...|    34.99|               1.0|
|         Golf Shoes|LIJA Women's Butt...|    108.0|               0.0|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|0.3333333333333333|
|         Golf Shoes|LIJA Women's Argy...|     80.0|0.6666666666666666|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|            

# ntile

In [62]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# ntile(2) over each category, most expensive product first: every row is
# assigned to bucket 1 or bucket 2. Thirty rows are displayed.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.ntile(2)
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("ntile"),
    )
    .show(30)
)

+-------------------+--------------------+---------+-----+
|      category_name|        product_name|max_price|ntile|
+-------------------+--------------------+---------+-----+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|    1|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|    1|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|    2|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|    2|
|   Camping & Hiking|Diamondback Women...|   299.98|    1|
|Fitness Accessories|Bowflex SelectTec...|   599.99|    1|
|Fitness Accessories|Under Armour Hust...|    34.99|    2|
|         Golf Shoes|LIJA Women's Butt...|    108.0|    1|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|    1|
|         Golf Shoes|LIJA Women's Argy...|     80.0|    2|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|    2|
|         Basketball| SOLE E25 Elliptical|   999.99|    1|
|         Basketball|Diamondback Girls...|   299.99|    1|
|         Basketball|Diamondback Boys'...|   299.99|    

# row number

In [76]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# row_number() over each category, most expensive product first.
# NOTE(review): the window orders by max_price only, so the numbering of
# products with equal max_price is presumably arbitrary -- confirm if a
# stable order matters.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.row_number()
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("row_number"),
    )
    .show()
)

+-------------------+--------------------+---------+----------+
|      category_name|        product_name|max_price|row_number|
+-------------------+--------------------+---------+----------+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|         1|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|         2|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|         3|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|         4|
|   Camping & Hiking|Diamondback Women...|   299.98|         1|
|Fitness Accessories|Bowflex SelectTec...|   599.99|         1|
|Fitness Accessories|Under Armour Hust...|    34.99|         2|
|         Golf Shoes|LIJA Women's Butt...|    108.0|         1|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|         2|
|         Golf Shoes|LIJA Women's Argy...|     80.0|         3|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|         4|
|         Basketball| SOLE E25 Elliptical|   999.99|         1|
|         Basketball|Diamondback Girls..

# lead

In [77]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# lead("max_price", 1): the max_price of the following row within the same
# category (rows ordered most expensive first); the last row of a category
# shows null in the saved output.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.lead("max_price", 1)
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("lead"),
    )
    .show()
)

+-------------------+--------------------+---------+------+
|      category_name|        product_name|max_price|  lead|
+-------------------+--------------------+---------+------+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|129.99|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|109.99|
|   Men's Golf Clubs|Merrell Men's All...|   109.99| 99.99|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|  null|
|   Camping & Hiking|Diamondback Women...|   299.98|  null|
|Fitness Accessories|Bowflex SelectTec...|   599.99| 34.99|
|Fitness Accessories|Under Armour Hust...|    34.99|  null|
|         Golf Shoes|LIJA Women's Butt...|    108.0| 100.0|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|  80.0|
|         Golf Shoes|LIJA Women's Argy...|     80.0|  65.0|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|  null|
|         Basketball| SOLE E25 Elliptical|   999.99|299.99|
|         Basketball|Diamondback Girls...|   299.99|299.99|
|         Basketball|Diamondback Boys'..

# lag

In [78]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# lag("max_price", 1): the max_price of the preceding row within the same
# category (rows ordered most expensive first); the first row of a category
# shows null in the saved output.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.lag("max_price", 1)
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("lag"),
    )
    .show()
)

+-------------------+--------------------+---------+------+
|      category_name|        product_name|max_price|   lag|
+-------------------+--------------------+---------+------+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|  null|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|134.99|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|129.99|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|109.99|
|   Camping & Hiking|Diamondback Women...|   299.98|  null|
|Fitness Accessories|Bowflex SelectTec...|   599.99|  null|
|Fitness Accessories|Under Armour Hust...|    34.99|599.99|
|         Golf Shoes|LIJA Women's Butt...|    108.0|  null|
|         Golf Shoes|LIJA Women's Mid-...|    100.0| 108.0|
|         Golf Shoes|LIJA Women's Argy...|     80.0| 100.0|
|         Golf Shoes|LIJA Women's Eyel...|     65.0|  80.0|
|         Basketball| SOLE E25 Elliptical|   999.99|  null|
|         Basketball|Diamondback Girls...|   299.99|999.99|
|         Basketball|Diamondback Boys'..

# cumulative distribution (cume_dist)

In [79]:
from pyspark.sql import functions as f
from pyspark.sql.window import Window as w

# Highest item price recorded for every (category, product) pair:
# order items -> products -> categories, then one max per group.
# Note: the saved output below is not in the category order this sort asks
# for, so do not rely on the sort for the final display order.
win_fun = (
    items
    .join(products, items.order_item_product_id == products.product_id)
    .join(categories, products.product_category_id == categories.category_id)
    .groupBy(categories.category_name, products.product_name)
    .agg(f.max(items.order_item_product_price).alias("max_price"))
    .sort(categories.category_name.asc(), f.desc("max_price"))
)

# cume_dist() over each category, most expensive product first; in the saved
# output the value grows row by row and reaches 1.0 on the last row of a
# category.
(
    win_fun
    .select(
        "category_name",
        "product_name",
        "max_price",
        f.cume_dist()
        .over(w.partitionBy("category_name").orderBy(f.desc("max_price")))
        .alias("cume_dist"),
    )
    .show()
)

+-------------------+--------------------+---------+-------------------+
|      category_name|        product_name|max_price|          cume_dist|
+-------------------+--------------------+---------+-------------------+
|   Men's Golf Clubs|Merrell Women's S...|   134.99|               0.25|
|   Men's Golf Clubs|Merrell Women's G...|   129.99|                0.5|
|   Men's Golf Clubs|Merrell Men's All...|   109.99|               0.75|
|   Men's Golf Clubs|Merrell Women's G...|    99.99|                1.0|
|   Camping & Hiking|Diamondback Women...|   299.98|                1.0|
|Fitness Accessories|Bowflex SelectTec...|   599.99|                0.5|
|Fitness Accessories|Under Armour Hust...|    34.99|                1.0|
|         Golf Shoes|LIJA Women's Butt...|    108.0|               0.25|
|         Golf Shoes|LIJA Women's Mid-...|    100.0|                0.5|
|         Golf Shoes|LIJA Women's Argy...|     80.0|               0.75|
|         Golf Shoes|LIJA Women's Eyel...|     65.0