# Initializing Spark 

In [1]:
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

import findspark
findspark.init()
findspark.find()
import pyspark

from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

conf = pyspark.SparkConf().setAppName('appName').setMaster('local')
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession(sc)
sqlcontext=SQLContext(sc)


# Knowledge  Questions

### Transactional systems, Analytic Systems and Data Warehouse
    
    See Lectures

# Aggregations using Apache Spark 

### Implementation From Lectures

In [112]:
retail_df=sqlcontext.read.csv("online-retail-dataset.csv",header=True,inferSchema=True)

In [113]:
retail_df.show(100)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [114]:
retail_df.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: integer (nullable = true)
 |-- Country: string (nullable = true)



In [115]:
retail_df.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country']

In [116]:
# Adding Dimension to the above dataset - Adding time
# Dropping the null data
retailDFNoNull=retail_df.na.drop()

In [117]:
retail_df.count()

541909

In [118]:
retailDFNoNull.count()

406829

In [156]:
retailDFNoNull.show(100)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|Date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|null|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|null|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|null|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|null|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|null|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|null|
|   536365|    21730|GLASS S

In [125]:
from datetime import *
from pyspark.sql.types import *
from pyspark.sql.functions import to_date

In [130]:
retailDFNoNull=retailDFNoNull.withColumn("Date",to_date(retailDFNoNull["InvoiceDate"],format="MM-d-yyyy H:mm"))

In [131]:
retailDFNoNull.dtypes

[('InvoiceNo', 'string'),
 ('StockCode', 'string'),
 ('Description', 'string'),
 ('Quantity', 'int'),
 ('InvoiceDate', 'string'),
 ('UnitPrice', 'double'),
 ('CustomerID', 'int'),
 ('Country', 'string'),
 ('Date', 'date')]

In [132]:
retailDFNoNull.show(1)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|Date|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|null|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+----+
only showing top 1 row



In [302]:
# adding region dimension to the retailDF
# Creating a user defined function to generalize country 
def generalize_country(country=str):
    my_list=["Austria","France","Germany","Italy","Spain","Belgium"]
    my_list1=["Canada","USA","Brazil"]
    x=0
    if country=="United Kingdom" :
        return "British Isles"
    elif country in my_list:
        for x in my_list:
            my_list==country
            return "EU"
    elif country in my_list1:
        for x in my_list1:
            my_list==country
            return "Amerika"
    else:
        return "Others"
    
    
generalize_countryUDF=udf(lambda x: generalize_country(x))        

In [303]:
generalize_country("United Kingdom")

'British Isles'

In [304]:
retailDFNoNull=retailDFNoNull.withColumn("Region",generalize_countryUDF(retailDFNoNull["Country"]))
retailDFNoNull.columns

['InvoiceNo',
 'StockCode',
 'Description',
 'Quantity',
 'InvoiceDate',
 'UnitPrice',
 'CustomerID',
 'Country',
 'Date',
 'Region']

In [305]:
#Selecting a column and displaying it with spark.sql/filtering data
retailDFNoNull.createOrReplaceTempView("retailData")

In [308]:
#selecting column and filtering it with spark.sql
spark.sql("""SELECT Country,Region FROM retailData WHERE Region=("Amerika")""").show()

+-------+-------+
|Country| Region|
+-------+-------+
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Canada|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
| Brazil|Amerika|
+-------+-------+
only showing top 20 rows



In [312]:
# Aggregating quantity over countries/regions
retailDFNoNull.select("Region","Quantity").groupBy("Region").agg({"Quantity":"sum"})\
                        .orderBy(retailDFNoNull["Region"]).show(100)




+-------------+-------------+
|       Region|sum(Quantity)|
+-------------+-------------+
|      Amerika|         4153|
|British Isles|      4008533|
|           EU|       290098|
|       Others|       604104|
+-------------+-------------+



In [340]:
# Aggregating quantity over countries
retailDFNoNull.select("Country","Quantity","Region").groupBy("Country","Region").agg({"Quantity":"sum"})\
                        .orderBy(retailDFNoNull["Country"]).show(100)


+--------------------+-------------+-------------+
|             Country|       Region|sum(Quantity)|
+--------------------+-------------+-------------+
|           Australia|       Others|        83653|
|             Austria|           EU|         4827|
|             Bahrain|       Others|          260|
|             Belgium|           EU|        23152|
|              Brazil|      Amerika|          356|
|              Canada|      Amerika|         2763|
|     Channel Islands|       Others|         9479|
|              Cyprus|       Others|         6317|
|      Czech Republic|       Others|          592|
|             Denmark|       Others|         8188|
|                EIRE|       Others|       136329|
|  European Community|       Others|          497|
|             Finland|       Others|        10666|
|              France|           EU|       109848|
|             Germany|           EU|       117448|
|              Greece|       Others|         1556|
|             Iceland|       Ot

In [315]:
# Aggregating with SQL queries / wrt Regions
retailDFNoNull.createOrReplaceTempView("retailData")

spark.sql("""SELECT Region,sum(Quantity) FROM retailData GROUP BY Region ORDER BY Region """).show()

+-------------+-------------+
|       Region|sum(Quantity)|
+-------------+-------------+
|      Amerika|         4153|
|British Isles|      4008533|
|           EU|       290098|
|       Others|       604104|
+-------------+-------------+



In [333]:
# Aggregating with SQL queries / wrt Countries
spark.sql("""SELECT Country,sum(Quantity) FROM retailData GROUP BY Country ORDER BY Country """).show(100)


+--------------------+-------------+
|             Country|sum(Quantity)|
+--------------------+-------------+
|           Australia|        83653|
|             Austria|         4827|
|             Bahrain|          260|
|             Belgium|        23152|
|              Brazil|          356|
|              Canada|         2763|
|     Channel Islands|         9479|
|              Cyprus|         6317|
|      Czech Republic|          592|
|             Denmark|         8188|
|                EIRE|       136329|
|  European Community|          497|
|             Finland|        10666|
|              France|       109848|
|             Germany|       117448|
|              Greece|         1556|
|             Iceland|         2458|
|              Israel|         3990|
|               Italy|         7999|
|               Japan|        25218|
|             Lebanon|          386|
|           Lithuania|          652|
|               Malta|          944|
|         Netherlands|       200128|
|

In [363]:
# Aggregating with Roll up/Cube
retailDFNoNull.cube("Region","Country").sum("Quantity").orderBy(col("Country")).show(100)


+-------------+--------------------+-------------+
|       Region|             Country|sum(Quantity)|
+-------------+--------------------+-------------+
|         null|                null|      4906888|
|       Others|                null|       604104|
|      Amerika|                null|         4153|
|           EU|                null|       290098|
|British Isles|                null|      4008533|
|         null|           Australia|        83653|
|       Others|           Australia|        83653|
|         null|             Austria|         4827|
|           EU|             Austria|         4827|
|       Others|             Bahrain|          260|
|         null|             Bahrain|          260|
|           EU|             Belgium|        23152|
|         null|             Belgium|        23152|
|      Amerika|              Brazil|          356|
|         null|              Brazil|          356|
|         null|              Canada|         2763|
|      Amerika|              Ca

In [373]:
#using pivot (Changes perspective)
retailDFNoNull.select("Country","Region","Quantity").groupBy("Country").pivot("Region").sum("Quantity").show()

+------------------+-------+-------------+------+------+
|           Country|Amerika|British Isles|    EU|Others|
+------------------+-------+-------------+------+------+
|            Sweden|   null|         null|  null| 35637|
|         Singapore|   null|         null|  null|  5234|
|           Germany|   null|         null|117448|  null|
|               RSA|   null|         null|  null|   352|
|            France|   null|         null|109848|  null|
|            Greece|   null|         null|  null|  1556|
|European Community|   null|         null|  null|   497|
|           Belgium|   null|         null| 23152|  null|
|           Finland|   null|         null|  null| 10666|
|             Malta|   null|         null|  null|   944|
|       Unspecified|   null|         null|  null|  1789|
|             Italy|   null|         null|  7999|  null|
|              EIRE|   null|         null|  null|136329|
|         Lithuania|   null|         null|  null|   652|
|            Norway|   null|   

In [378]:
#sampling data 
retailDFNoNull.select("Country","Region","Quantity").sample(fraction=0.5).show()

+--------------+-------------+--------+
|       Country|       Region|Quantity|
+--------------+-------------+--------+
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       8|
|United Kingdom|British Isles|       2|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       3|
|United Kingdom|British Isles|       4|
|United Kingdom|British Isles|       4|
|United Kingdom|British Isles|       6|
|United Kingdom|British Isles|       3|
|United Kingdom|British Isles|       3|
|United Kingdom|British Isles|       3|
|        France|           EU|      24|
|        France|           EU|      24|
|        France|           EU|      18|
|        France|           EU|      24|
|        France|           EU|      24|
+--------------+-------------+--------+
only showing top 20 rows



In [405]:
#computing statistics
retailDFNoNull.select("Quantity").summary().show()

+-------+-----------------+
|summary|         Quantity|
+-------+-----------------+
|  count|           406829|
|   mean|12.06130339774205|
| stddev|248.6933700188259|
|    min|           -80995|
|    25%|                2|
|    50%|                5|
|    75%|               12|
|    max|            80995|
+-------+-----------------+



# Practise Questions

Download the dataset from here: https://github.com/databricks/Spark-The-Definitive-Guide/blob/master/data/retail-data/by-day/2010-12-01.csv

### SQL Query 1

#### Use SQL query operation provided by SPARK in order to answer the following questions.

Use "spark.sql()" function to write the queries.

Note: the following questions are not automatically correct by the OLAT system, therefore, as far as the solutions result in the right answer they will be considered.

 

#### How many orders did customers perform at which hour?

In [622]:
#reading the data and initial processing

retailsdata=sqlcontext.read.csv("2010-12-01.json",header=True,inferSchema=True)

In [623]:
retailsdata.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)



In [624]:
retailsdata.show(2)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+
only showing top 2 rows



In [625]:
# Adding Date and Yearcolumns to the dataframe
retailsdata=retailsdata.withColumn('Date',to_date(retailsdata["InvoiceDate"],format="yyyy-MM-dd HH:mm:ss"))\
.withColumn("Year",year(retailsdata["InvoiceDate"]))\
.withColumn("InvoiceDate",to_timestamp(retailsdata["InvoiceDate"]))
                       

In [626]:
retailsdata.printSchema()

root
 |-- InvoiceNo: string (nullable = true)
 |-- StockCode: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- InvoiceDate: timestamp (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: double (nullable = true)
 |-- Country: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Year: integer (nullable = true)



In [627]:
retailsdata.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|Year|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   5363

In [628]:
#retailsdata=retailsdata.na.drop()

In [629]:
# USing spark Query
retailsdata.select("CustomerID","InvoiceDate","Quantity").groupBy("InvoiceDate").agg({"Quantity":"sum"})\
.orderBy("InvoiceDate").show()

+-------------------+-------------+
|        InvoiceDate|sum(Quantity)|
+-------------------+-------------+
|2010-12-01 08:26:00|           40|
|2010-12-01 08:28:00|           12|
|2010-12-01 08:34:00|           98|
|2010-12-01 08:35:00|            3|
|2010-12-01 08:45:00|          449|
|2010-12-01 09:00:00|           80|
|2010-12-01 09:01:00|           12|
|2010-12-01 09:02:00|           88|
|2010-12-01 09:09:00|           32|
|2010-12-01 09:32:00|          200|
|2010-12-01 09:34:00|           12|
|2010-12-01 09:37:00|          454|
|2010-12-01 09:41:00|          221|
|2010-12-01 09:45:00|          134|
|2010-12-01 09:49:00|           -1|
|2010-12-01 09:53:00|          190|
|2010-12-01 09:56:00|           53|
|2010-12-01 09:57:00|          236|
|2010-12-01 09:58:00|         1440|
|2010-12-01 09:59:00|          108|
+-------------------+-------------+
only showing top 20 rows



In [630]:
#using SQL query
#initiating
retailsdata.createOrReplaceTempView("retails")

In [631]:
#SQL query
spark.sql("""SELECT InvoiceDate,sum(Quantity) FROM retails Group by InvoiceDate Order by InvoiceDate ;""").show()

+-------------------+-------------+
|        InvoiceDate|sum(Quantity)|
+-------------------+-------------+
|2010-12-01 08:26:00|           40|
|2010-12-01 08:28:00|           12|
|2010-12-01 08:34:00|           98|
|2010-12-01 08:35:00|            3|
|2010-12-01 08:45:00|          449|
|2010-12-01 09:00:00|           80|
|2010-12-01 09:01:00|           12|
|2010-12-01 09:02:00|           88|
|2010-12-01 09:09:00|           32|
|2010-12-01 09:32:00|          200|
|2010-12-01 09:34:00|           12|
|2010-12-01 09:37:00|          454|
|2010-12-01 09:41:00|          221|
|2010-12-01 09:45:00|          134|
|2010-12-01 09:49:00|           -1|
|2010-12-01 09:53:00|          190|
|2010-12-01 09:56:00|           53|
|2010-12-01 09:57:00|          236|
|2010-12-01 09:58:00|         1440|
|2010-12-01 09:59:00|          108|
+-------------------+-------------+
only showing top 20 rows



###  SQL query 2

#### Use SQL query operation provided by SPARK in order to answer the following questions.

Use "spark.sql()" function to write the queries.

Note: the following questions are not automatically correct by the OLAT system, therefore, as far as the solutions result in the right answer they will be considered.

 

#### How frequently was each product bought in different countries?



In [632]:
# First using spark query
retailsdata.select("StockCode","Quantity","Country").show(5)

+---------+--------+--------------+
|StockCode|Quantity|       Country|
+---------+--------+--------------+
|   85123A|       6|United Kingdom|
|    71053|       6|United Kingdom|
|   84406B|       8|United Kingdom|
|   84029G|       6|United Kingdom|
|   84029E|       6|United Kingdom|
+---------+--------+--------------+
only showing top 5 rows



In [633]:
# How frequently was each product bought in different countries?
retailsdata.groupBy("StockCode","Country").agg({"Quantity":"count"}).orderBy("StockCode").show()

+---------+--------------+---------------+
|StockCode|       Country|count(Quantity)|
+---------+--------------+---------------+
|    10002|United Kingdom|              1|
|    10002|        France|              1|
|    10125|United Kingdom|              1|
|    10133|United Kingdom|              1|
|    10135|United Kingdom|              1|
|    11001|United Kingdom|              1|
|   15044B|United Kingdom|              1|
|  15056BL|United Kingdom|              3|
|   15056N|United Kingdom|              3|
|   15056P|United Kingdom|              1|
|    16014|United Kingdom|              1|
|    16016|United Kingdom|              1|
|   16168M|United Kingdom|              1|
|    16236|United Kingdom|              2|
|    16237|United Kingdom|              4|
|    16238|United Kingdom|              3|
|   16258A|United Kingdom|              1|
|    17003|United Kingdom|              2|
|   17011F|United Kingdom|              1|
|   17012A|United Kingdom|              2|
+---------+

In [634]:
#retailsdata.rollup("Country","StockCode").agg({"Quantity":"count"}).orderBy("StockCode").show()

In [635]:
#Using SQL query
retailsdata.createOrReplaceTempView("retails")

In [636]:
spark.sql("""SELECT StockCode,Country,count(Quantity) FROM retails Group by StockCode,Country Order by StockCode""").show()

+---------+--------------+---------------+
|StockCode|       Country|count(Quantity)|
+---------+--------------+---------------+
|    10002|United Kingdom|              1|
|    10002|        France|              1|
|    10125|United Kingdom|              1|
|    10133|United Kingdom|              1|
|    10135|United Kingdom|              1|
|    11001|United Kingdom|              1|
|   15044B|United Kingdom|              1|
|  15056BL|United Kingdom|              3|
|   15056N|United Kingdom|              3|
|   15056P|United Kingdom|              1|
|    16014|United Kingdom|              1|
|    16016|United Kingdom|              1|
|   16168M|United Kingdom|              1|
|    16236|United Kingdom|              2|
|    16237|United Kingdom|              4|
|    16238|United Kingdom|              3|
|   16258A|United Kingdom|              1|
|    17003|United Kingdom|              2|
|   17011F|United Kingdom|              1|
|   17012A|United Kingdom|              2|
+---------+

# Programming Questions

### Slice operation

#### Select from a datacube a slice that contains only items from the United States.

In [637]:
retailsdata.show(5)

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|Year|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
+-------

In [638]:
retailsdata.filter(retailsdata["Country"]=="United Kingdom").show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|Year|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|   536365|   85123A|WHITE HANGING HEA...|       6|2010-12-01 08:26:00|     2.55|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|    71053| WHITE METAL LANTERN|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84406B|CREAM CUPID HEART...|       8|2010-12-01 08:26:00|     2.75|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029G|KNITTED UNION FLA...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|2010-12-01 08:26:00|     3.39|   17850.0|United Kingdom|2010-12-01|2010|
|   5363

### Dice Operation

#### Select a sub-cube from a datacube that contains items from United states and United Kingdom with quantities 0:



In [684]:

dice = retailsdata.where(((col("Country") == "United States") | (col("Country") == "United Kingdom")) & (col("Quantity") == 1))

In [686]:
dice.show()

+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|InvoiceNo|StockCode|         Description|Quantity|        InvoiceDate|UnitPrice|CustomerID|       Country|      Date|Year|
+---------+---------+--------------------+--------+-------------------+---------+----------+--------------+----------+----+
|   536381|    71270|     PHOTO CLIP LINE|       1|2010-12-01 09:41:00|     1.25|   15311.0|United Kingdom|2010-12-01|2010|
|   536381|    22262|FELT EGG COSY CHI...|       1|2010-12-01 09:41:00|     0.85|   15311.0|United Kingdom|2010-12-01|2010|
|   536381|    22637|PIGGY BANK RETROS...|       1|2010-12-01 09:41:00|     2.55|   15311.0|United Kingdom|2010-12-01|2010|
|   536381|    21166|COOK WITH WINE ME...|       1|2010-12-01 09:41:00|     1.95|   15311.0|United Kingdom|2010-12-01|2010|
|   536381|   37444A|YELLOW BREAKFAST ...|       1|2010-12-01 09:41:00|     2.95|   15311.0|United Kingdom|2010-12-01|2010|
|   5363

In [683]:
#Roll up operation,cube

retailsdata.rollup("StockCode","Country").agg({"Quantity":"count"}).orderBy("StockCode").show()


+---------+--------------+---------------+
|StockCode|       Country|count(Quantity)|
+---------+--------------+---------------+
|     null|          null|           3108|
|    10002|          null|              2|
|    10002|United Kingdom|              1|
|    10002|        France|              1|
|    10125|United Kingdom|              1|
|    10125|          null|              1|
|    10133|United Kingdom|              1|
|    10133|          null|              1|
|    10135|          null|              1|
|    10135|United Kingdom|              1|
|    11001|United Kingdom|              1|
|    11001|          null|              1|
|   15044B|          null|              1|
|   15044B|United Kingdom|              1|
|  15056BL|United Kingdom|              3|
|  15056BL|          null|              3|
|   15056N|United Kingdom|              3|
|   15056N|          null|              3|
|   15056P|          null|              1|
|   15056P|United Kingdom|              1|
+---------+