# Ex2 - Getting and Knowing your Data

This time we are going to pull data directly from the internet.
Special thanks to: https://github.com/justmarkham for sharing the dataset and materials.

### Step 1. Import the necessary libraries

In [14]:
import pyspark

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("ex2").getOrCreate()
spark

### Step 2. Import the dataset from this [address](https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv). 

### Step 3. Assign it to a variable called chipotle.

In [2]:
from pyspark import SparkFiles

In [8]:
# chipo = spark.read.csv(path="https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv", 
#                        sep='/t')

url = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv"

spark.sparkContext.addFile(url)

chipotle = spark.read.csv(SparkFiles.get("chipotle.tsv"), header=True, inferSchema=True, sep='\t')

### Step 4. See the first 10 entries

In [9]:
chipotle.show(10)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|    $2.39 |
|       1|       1|                Izze|        [Clementine]|    $3.39 |
|       1|       1|    Nantucket Nectar|             [Apple]|    $3.39 |
|       1|       1|Chips and Tomatil...|                NULL|    $2.39 |
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|   $16.98 |
|       3|       1|        Chicken Bowl|[Fresh Tomato Sal...|   $10.98 |
|       3|       1|       Side of Chips|                NULL|    $1.69 |
|       4|       1|       Steak Burrito|[Tomatillo Red Ch...|   $11.75 |
|       4|       1|    Steak Soft Tacos|[Tomatillo Green ...|    $9.25 |
|       5|       1|       Steak Burrito|[Fresh Tomato Sal...|    $9.25 |
+--------+--------+--------------------+-----------

In [10]:
chipotle.take(2)

[Row(order_id=1, quantity=1, item_name='Chips and Fresh Tomato Salsa', choice_description='NULL', item_price='$2.39 '),
 Row(order_id=1, quantity=1, item_name='Izze', choice_description='[Clementine]', item_price='$3.39 ')]

### Step 5. What is the number of observations in the dataset?

In [12]:
# Solution 1
print(chipotle.count(), ",", len(chipotle.columns))

4622 , 5


In [15]:
# Solution 2
def sparkShape(dataFrame):
    return (dataFrame.count(), len(dataFrame.columns))

pyspark.sql.dataframe.DataFrame.shape = sparkShape

print(chipotle.shape())

(4622, 5)


### Step 6. What is the number of columns in the dataset?

In [16]:
len(chipotle.columns)

5

### Step 7. Print the name of all the columns.

In [17]:
chipotle.columns

['order_id', 'quantity', 'item_name', 'choice_description', 'item_price']

### Step 8. How is the dataset indexed?

In [18]:
# trying to find the answer

### Step 9. Which was the most-ordered item? 

In [28]:
most_ordered_item = chipotle.groupBy("item_name").sum()
most_ordered_item.orderBy("sum(quantity)",ascending=0).show(1)

#ascending = 0 => False
#ascending = 1 => True

+------------+-------------+-------------+
|   item_name|sum(order_id)|sum(quantity)|
+------------+-------------+-------------+
|Chicken Bowl|       713926|          761|
+------------+-------------+-------------+
only showing top 1 row



### Step 10. For the most-ordered item, how many items were ordered?

In [29]:
most_ordered_item = chipotle.groupBy("item_name").sum()
most_ordered_item.orderBy("sum(quantity)",ascending=0).show(1)

#ascending = 0 => False
#ascending = 1 => True

+------------+-------------+-------------+
|   item_name|sum(order_id)|sum(quantity)|
+------------+-------------+-------------+
|Chicken Bowl|       713926|          761|
+------------+-------------+-------------+
only showing top 1 row



### Step 11. What was the most ordered item in the choice_description column?

In [35]:
most_choice_description_item = chipotle.groupBy("choice_description").sum()
most_choice_description_item.orderBy("sum(quantity)",ascending=0).show(2)

#ascending = 0 => False
#ascending = 1 => True

#NULL values must be dropped

+------------------+-------------+-------------+
|choice_description|sum(order_id)|sum(quantity)|
+------------------+-------------+-------------+
|              NULL|      1178731|         1382|
|       [Diet Coke]|       123455|          159|
+------------------+-------------+-------------+
only showing top 2 rows



### Step 12. How many items were orderd in total?

In [38]:
total_quantity = chipotle.agg({"quantity":"sum"})
total_quantity.show()

+-------------+
|sum(quantity)|
+-------------+
|         4972|
+-------------+



### Step 13. Turn the item price into a float

#### Step 13.a. Check the item price type

In [49]:
chipotle.show(2)

+--------+--------+--------------------+------------------+----------+
|order_id|quantity|           item_name|choice_description|item_price|
+--------+--------+--------------------+------------------+----------+
|       1|       1|Chips and Fresh T...|              NULL|    $2.39 |
|       1|       1|                Izze|      [Clementine]|    $3.39 |
+--------+--------+--------------------+------------------+----------+
only showing top 2 rows



In [39]:
chipotle.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



#### Step 13.b. Create a lambda function and change the type of item price

In [50]:
from pyspark.sql.functions import regexp_replace

In [66]:
price = chipotle.withColumn("price",regexp_replace("item_price","$",""))
price.show(2)

#not working. Needs a check

+--------+--------+--------------------+------------------+----------+------+
|order_id|quantity|           item_name|choice_description|item_price| price|
+--------+--------+--------------------+------------------+----------+------+
|       1|       1|Chips and Fresh T...|              NULL|    $2.39 |$2.39 |
|       1|       1|                Izze|      [Clementine]|    $3.39 |$3.39 |
+--------+--------+--------------------+------------------+----------+------+
only showing top 2 rows



In [54]:
price.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



In [60]:
from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType, FloatType

In [64]:
def dolariser(x):
    return str(x[1:-1])

udf_dolariser = udf(lambda x:dolariser(x),StringType())

I have converted python UDF "dolariser" to PySpark UDF "udf_dolariser". Now I can use it in PySpark code. "StringType" is the return type of PySpark function.

In [70]:
price_df = chipotle.withColumn("item_price",udf_dolariser(col("item_price")))

In [71]:
price_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: string (nullable = true)



In [74]:
price_df = price_df.withColumn("item_price", price_df["item_price"].cast('float'))

In [73]:
price_df.show(5)

+--------+--------+--------------------+--------------------+----------+
|order_id|quantity|           item_name|  choice_description|item_price|
+--------+--------+--------------------+--------------------+----------+
|       1|       1|Chips and Fresh T...|                NULL|      2.39|
|       1|       1|                Izze|        [Clementine]|      3.39|
|       1|       1|    Nantucket Nectar|             [Apple]|      3.39|
|       1|       1|Chips and Tomatil...|                NULL|      2.39|
|       2|       2|        Chicken Bowl|[Tomatillo-Red Ch...|     16.98|
+--------+--------+--------------------+--------------------+----------+
only showing top 5 rows



#### Step 13.c. Check the item price type

In [75]:
price_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- item_name: string (nullable = true)
 |-- choice_description: string (nullable = true)
 |-- item_price: float (nullable = true)



### Step 14. How much was the revenue for the period in the dataset?

In [77]:
revenue_result = price_df.withColumn("revenue", price_df["item_price"]*price_df["quantity"])
total_revenue = revenue_result.agg({"revenue":"sum"})
total_revenue.show()

+----------------+
|    sum(revenue)|
+----------------+
|39237.0197327137|
+----------------+



In [81]:
print("Total Revenue: $",total_revenue.head()[0])

Total Revenue: $ 39237.0197327137


### Step 15. How many orders were made in the period?

In [84]:
from pyspark.sql.functions import countDistinct

In [87]:
total_orders = price_df.select(countDistinct("order_id"))

In [90]:
print("Total Orders: ",total_orders.head(1)[0][0])

Total Orders:  1834


### Step 16. What is the average revenue amount per order?

In [94]:
# Solution 1
avg_revenue = revenue_result.groupBy("order_id").sum()

In [96]:
avg_revenue.show(5)

+--------+-------------+-------------+------------------+------------------+
|order_id|sum(order_id)|sum(quantity)|   sum(item_price)|      sum(revenue)|
+--------+-------------+-------------+------------------+------------------+
|     148|          592|            4|  30.9399995803833|  30.9399995803833|
|     463|          926|            2|10.679999828338623|10.679999828338623|
|     471|         2355|            5|24.149999618530273|24.149999618530273|
|     496|         2480|            5|17.549999952316284|17.549999952316284|
|     833|         1666|            2|             12.75|             12.75|
+--------+-------------+-------------+------------------+------------------+
only showing top 5 rows



In [105]:
avg_revenue.describe().show()

+-------+-----------------+------------------+------------------+------------------+------------------+
|summary|         order_id|     sum(order_id)|     sum(quantity)|   sum(item_price)|      sum(revenue)|
+-------+-----------------+------------------+------------------+------------------+------------------+
|  count|             1834|              1834|              1834|              1834|              1834|
|   mean|            917.5|2336.8440567066523| 2.711014176663032| 18.81142843757617|21.394231042919138|
| stddev|529.5745147443056|1861.6693582520418|1.6776242381295472|11.652511679485094| 30.21689117895705|
|    min|                1|                 2|                 1|10.079999804496765|10.079999804496765|
|    max|             1834|             21298|                35|            205.25|1074.2399997711182|
+-------+-----------------+------------------+------------------+------------------+------------------+



In [109]:
avg_revenue.describe().select('sum(revenue)').head(2)[1]

Row(sum(revenue)='21.394231042919138')

In [110]:
print("Average revenue per order: $",avg_revenue.describe().select('sum(revenue)').head(2)[1][0])

Average revenue per order: $ 21.394231042919138


In [114]:
# Solution 2

#Suggestions invited! 


### Step 17. How many different items are sold?

In [119]:
total_items = price_df.select(countDistinct("item_name"))

In [121]:
print("Total Different Items sold: ",total_items.head(1)[0][0])

Total Different Items sold:  50
