# Starbucks Analysis

#### Import Pyspark

In [1]:
# Runs ahead of the pyspark imports in the next cell -- presumably so findspark can
# make the local Spark installation importable (confirm against the findspark docs).
import findspark
findspark.init()

In [4]:
from pyspark.sql import SparkSession, Window
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Single shared session for the whole notebook; getOrCreate() is used so the cell
# can be re-run without building the session by hand each time.
spark = (
    SparkSession.builder
    .appName("starbucks Data Analysis")
    .getOrCreate()
)

#### Schema creation

In [6]:

# Explicit schema applied when reading the Starbucks CSV.
# - "null" is the name given to the first (row-number) column; a later cell renames
#   it to "row_num".
# - "protien" is spelled this way on purpose here: the SQL cell further down refers
#   to the column by exactly this name.
starbucks_schema = (
    StructType()
    .add("null", IntegerType(), False)
    .add("item", StringType(), True)
    .add("calories", IntegerType(), True)
    .add("fat", IntegerType(), True)
    .add("carb", IntegerType(), True)
    .add("fiber", IntegerType(), True)
    .add("protien", IntegerType(), True)
    .add("type", StringType(), True)
)



#### Reading the file into a DataFrame

In [12]:
# Read the CSV using the explicit schema defined above; header=True tells the
# reader that the first line of the file is a header row.
#
# The path is a raw string: in a normal literal "\s" is not a valid escape
# sequence, which makes Python emit an invalid-escape warning (and is slated to
# become an error). The raw string yields the same path value without relying on
# that behaviour.
starbucks_df = (
    spark.read
    .option("header", True)
    .schema(starbucks_schema)
    .csv(r"F:\starbucks.csv")
)

#### Preview the data

In [13]:
# Print the first five rows as a quick check of the loaded data.
starbucks_df.show(n=5)

+----+--------------------+--------+---+----+-----+-------+------+
|null|                item|calories|fat|carb|fiber|protien|  type|
+----+--------------------+--------+---+----+-----+-------+------+
|   1|        8-Grain Roll|     350|  8|  67|    5|     10|bakery|
|   2|   Apple Bran Muffin|     350|  9|  64|    7|      6|bakery|
|   3|       Apple Fritter|     420| 20|  59|    0|      5|bakery|
|   4|     Banana Nut Loaf|     490| 19|  75|    4|      7|bakery|
|   5|Birthday Cake Min...|     130|  6|  17|    0|      0|bakery|
+----+--------------------+--------+---+----+-----+-------+------+
only showing top 5 rows



#### Renaming a column

In [17]:
# Give the first column a meaningful name; the original DataFrame is left as-is
# and the renamed copy is what the rest of the notebook works with.
starbucks_df_rename = starbucks_df.withColumnRenamed(existing="null", new="row_num")

In [18]:
# Print the first five rows to confirm the column is now called "row_num".
starbucks_df_rename.show(n=5)

+-------+--------------------+--------+---+----+-----+-------+------+
|row_num|                item|calories|fat|carb|fiber|protien|  type|
+-------+--------------------+--------+---+----+-----+-------+------+
|      1|        8-Grain Roll|     350|  8|  67|    5|     10|bakery|
|      2|   Apple Bran Muffin|     350|  9|  64|    7|      6|bakery|
|      3|       Apple Fritter|     420| 20|  59|    0|      5|bakery|
|      4|     Banana Nut Loaf|     490| 19|  75|    4|      7|bakery|
|      5|Birthday Cake Min...|     130|  6|  17|    0|      0|bakery|
+-------+--------------------+--------+---+----+-----+-------+------+
only showing top 5 rows



#### Top five Items with high Calories

In [39]:
# Sort by calories, highest first, and print the first five (item, calories) rows.
(
    starbucks_df_rename
    .select("item", "calories")
    .orderBy(col("calories").desc())
    .show(n=5)
)

+--------------------+--------+
|                item|calories|
+--------------------+--------+
|Sausage & Cheddar...|     500|
|     Banana Nut Loaf|     490|
|Iced Lemon Pound ...|     490|
|Zucchini Walnut M...|     490|
|Cranberry Orange ...|     490|
+--------------------+--------+
only showing top 5 rows



#### Top five items with the highest fat

In [40]:
# Sort by fat, highest first, and print the first five (item, fat) rows.
(
    starbucks_df_rename
    .select("item", "fat")
    .orderBy(col("fat").desc())
    .show(n=5)
)

+--------------------+---+
|                item|fat|
+--------------------+---+
|      Cheese & Fruit| 28|
|Zucchini Walnut M...| 28|
|Sausage & Cheddar...| 28|
| Egg Salad Sandwich | 27|
|     Salumi & Cheese| 26|
+--------------------+---+
only showing top 5 rows



#### Top five items with the lowest fat

In [109]:
# Sort by fat ascending and print the first five rows. Rows with no fat value
# are filtered out so that only items with a known fat figure are listed.
(
    starbucks_df_rename
    .select("item", "fat")
    .orderBy(col("fat").asc())
    .filter(col("fat").isNotNull())
    .show(n=5)
)

+--------------------+---+
|                item|fat|
+--------------------+---+
|  Deluxe Fruit Blend|  0|
|         Plain Bagel|  1|
|Everything with C...|  2|
|    Multigrain Bagel|  3|
|Marshmallow Dream...|  4|
+--------------------+---+
only showing top 5 rows



In [42]:
# Register the renamed DataFrame as the temp view "starbucks" so that the
# spark.sql(...) cells below can query it by that name.
starbucks_df_rename.createOrReplaceTempView("starbucks")

### Using SQL

#### Top five items with the highest protein

In [101]:
# Five highest-protein items, queried from the "starbucks" temp view.
# The column is referred to as "protien" because that is how it is spelled in
# the schema used to read the file.
spark.sql("""
select item as top_five_items,protien from starbucks
order by protien desc
limit 5
""").show()

+--------------------+-------+
|      top_five_items|protien|
+--------------------+-------+
|Turkey & Swiss Sa...|     34|
|Tarragon Chicken ...|     32|
|  Ham & Swiss Panini|     28|
|Chipotle Chicken ...|     26|
|Chicken Santa Fe ...|     26|
+--------------------+-------+



#### Total number of Items per Type

In [64]:
# Number of items in each type, largest group first, queried from the
# "starbucks" temp view.
spark.sql("""
select type,count(item) as total_items from starbucks
group by type
order by total_items desc """).show()

+-------------+-----------+
|         type|total_items|
+-------------+-----------+
|       bakery|         41|
|       petite|          9|
|   bistro box|          8|
|hot breakfast|          8|
|     sandwich|          7|
|      parfait|          3|
|        salad|          1|
+-------------+-----------+



#### Total number of Items per Type using the DataFrame API

In [73]:
# DataFrame-API counterpart of the SQL item count per type. No orderBy is
# applied here, unlike the SQL cell, so the rows are not sorted by count.
(
    starbucks_df_rename
    .groupBy("type")
    .agg(count(col("item")).alias("total_items"))
    .show()
)

+-------------+-----------+
|         type|total_items|
+-------------+-----------+
|     sandwich|          7|
|   bistro box|          8|
|      parfait|          3|
|       petite|          9|
|       bakery|         41|
|hot breakfast|          8|
|        salad|          1|
+-------------+-----------+

