# Ex1 - Filtering and Sorting Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [1]:
import pyspark
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one; otherwise create one named 'ex1'.
spark = (
    SparkSession.builder
    .appName('ex1')
    .getOrCreate()
)
# Bare expression so the notebook displays the session summary.
spark

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipo.

In [3]:
from pyspark import SparkFiles

In [4]:
# Remote location of the tab-separated Chipotle orders file.
url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

# Register the remote file with the Spark context so it is fetched for this job.
spark.sparkContext.addFile(url)

# Read the fetched copy (looked up by its file name) as a tab-delimited table,
# taking column names from the first row and letting Spark infer the types.
chipo = spark.read.csv(
    SparkFiles.get("chipotle.tsv"),
    sep='\t',
    inferSchema=True,
    header=True,
)

### Step 4. How many products cost more than $10.00?

In [8]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import FloatType, StringType

In [9]:
def dolariser(x):
    """Strip currency formatting from a raw price string.

    Removes surrounding whitespace and a leading dollar sign, so both
    ``"$2.39 "`` (as stored in the dataset, with a trailing space) and
    ``"$2.39"`` yield ``"2.39"``. Slicing off the last character
    unconditionally would corrupt any value without trailing padding.

    Args:
        x: Raw price value, e.g. ``"$2.39 "``; may be ``None`` for a null cell.

    Returns:
        The numeric part of the price as a string, or ``None`` when ``x`` is
        ``None`` so that null cells stay null instead of failing the job.
    """
    if x is None:
        return None
    # Trim padding first so the "$" is guaranteed to be the leading character.
    return str(x).strip().lstrip("$")

# Expose dolariser to Spark as a column function that returns strings.
udf_dolariser = udf(dolariser, StringType())

In [10]:
# Replace item_price with its cleaned-up text form (currency formatting removed).
chipo_df = chipo.withColumn(
    "item_price",
    udf_dolariser(chipo["item_price"]),
)

In [12]:
# Inspect the column names and types before item_price is cast to a number.
chipo_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



In [13]:
# Convert the cleaned price text into a numeric (float) column so it can be compared and sorted.
chipo_df = chipo_df.withColumn(
    "item_price",
    chipo_df.item_price.cast('float'),
)

In [14]:
# Confirm the cast took effect: item_price should now be reported as float.
chipo_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: float (nullable = true)



In [16]:
# Number of order lines whose price exceeds 10 dollars.
chipo_df.where("item_price>10").count()

1130

### Step 5. What is the price of each item? 
###### Print a data frame with only two columns: `item_name` and `item_price`

In [17]:
# Keep only the item name and its price, and preview the first five rows.
chipo_df.select(["item_name", "item_price"]).show(n=5)

+--------------------+----------+
|           item_name|item_price|
+--------------------+----------+
|Chips and Fresh T...|      2.39|
|                Izze|      3.39|
|    Nantucket Nectar|      3.39|
|Chips and Tomatil...|      2.39|
|        Chicken Bowl|     16.98|
+--------------------+----------+
only showing top 5 rows



### Step 6. Sort by the name of the item

In [21]:
# Order the rows alphabetically by item name and preview the first five.
chipo_df.sort("item_name", ascending=True).show(5)

+--------+--------+-----------------+------------------+----------+
|order_id|quantity|        item_name|choice_description|item_price|
+--------+--------+-----------------+------------------+----------+
|     154|       1|6 Pack Soft Drink|            [Coke]|      6.49|
|     298|       1|6 Pack Soft Drink|          [Nestea]|      6.49|
|     168|       1|6 Pack Soft Drink|       [Diet Coke]|      6.49|
|     129|       1|6 Pack Soft Drink|          [Sprite]|      6.49|
|     182|       1|6 Pack Soft Drink|       [Diet Coke]|      6.49|
+--------+--------+-----------------+------------------+----------+
only showing top 5 rows



### Step 7. What was the quantity of the most expensive item ordered?

In [23]:
# Put the highest-priced order line first, then show its item name and quantity.
(
    chipo_df
    .sort(chipo_df.item_price.desc())
    .select("item_name", "quantity")
    .show(1, truncate=False)
)

+----------------------------+--------+
|item_name                   |quantity|
+----------------------------+--------+
|Chips and Fresh Tomato Salsa|15      |
+----------------------------+--------+
only showing top 1 row



### Step 8. How many times was a Veggie Salad Bowl ordered?

In [28]:
# Count the order lines for this one item. Filtering before counting avoids
# aggregating every item first, and yields 0 (rather than an IndexError from
# indexing an empty list) when the item does not appear in the data.
chipo_df.where(chipo_df.item_name == "Veggie Salad Bowl").count()

18

### Step 9. How many times did someone order more than one Canned Soda?

In [45]:
# Order lines that are a Canned Soda ...
is_canned_soda = chipo_df["item_name"] == "Canned Soda"
# ... and that were ordered more than one at a time.
ordered_several = chipo_df["quantity"] > 1

chipo_df.filter(is_canned_soda & ordered_several).count()

20