# Creating a `dataframe`

In [6]:
orders_df = ( spark
          .read
          .csv('s3://fcc-spark-example/dataset/2023/orders.csv', header=True, inferSchema=True)
     )


                                                                                

In [7]:
orders_df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

In [8]:
orders_df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: timestamp (nullable = true)
 |-- order_customer_id: integer (nullable = true)
 |-- order_status: string (nullable = true)



In [9]:
orders_df.createOrReplaceTempView('orders')

In [10]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |   orders|       true|
+---------+---------+-----------+



### We have Temp View and Dataframe

- orders_df -> `Dataframe`
- orders -> `Temp View`

In [12]:
orders_df.show()

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
|       6|2013-07-25 00:00:00|             7130|       COMPLETE|
|       7|2013-07-25 00:00:00|             4530|       COMPLETE|
|       8|2013-07-25 00:00:00|             2911|     PROCESSING|
|       9|2013-07-25 00:00:00|             5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|             5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|              918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|             1837|         CLOSED|
|      13|2013-07-25 00:0

### 1. Top 10 customers who placed the most number of orders 

#### Using DF

In [19]:
(orders_df
     .groupBy('order_customer_id')
     .count()
     .sort('count', ascending=False)
     .show(10)
)

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             6316|   16|
|              569|   16|
|             5897|   16|
|            12431|   16|
|             5654|   15|
|            12284|   15|
|              221|   15|
|             5283|   15|
|             5624|   15|
|             4320|   15|
+-----------------+-----+
only showing top 10 rows



#### Using Spark SQL

In [28]:
spark.sql('SELECT \
              order_customer_id, COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_customer_id \
            ORDER BY count DESC \
            LIMIT 10').show()

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             5897|   16|
|            12431|   16|
|             6316|   16|
|              569|   16|
|             5624|   15|
|            12284|   15|
|             5283|   15|
|              221|   15|
|             5654|   15|
|             4320|   15|
+-----------------+-----+



### 2. Find the no. of orders under each order status 

#### Using DF

In [35]:
(orders_df
     .groupBy('order_status')
     .count()
     .sort('count', ascending=False)          # Here count is a transformation
     .show()
)                        

[Stage 46:>                                                         (0 + 1) / 1]

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8274|
|        PENDING| 7609|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



                                                                                

In [33]:
spark.sql('SELECT \
              order_status, COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_status \
            ORDER BY count DESC').show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8274|
|        PENDING| 7609|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



### 3. No. of active customers (at least 1 order they have placed)

#### Using DF

In [41]:
orders_df.select('order_customer_id').distinct().count()    # Here count is an action

12405

#### Using Spark SQL

In [42]:
spark.sql('SELECT \
              COUNT(DISTINCT(order_customer_id)) AS count \
              FROM orders').show()

+-----+
|count|
+-----+
|12404|
+-----+



### 4. Customers with most no. of `CLOSED` orders

#### Using DF

In [50]:
(orders_df 
     .filter('order_status = "CLOSED"')
     .groupBy('order_customer_id')
     .count()
     .sort('count', ascinding=False).show()
)

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             5308|    1|
|             6722|    1|
|             2721|    1|
|             3220|    1|
|              804|    1|
|             3796|    1|
|             6357|    1|
|            12249|    1|
|             1088|    1|
|             6154|    1|
|             2366|    1|
|             4000|    1|
|              540|    1|
|              879|    1|
|             6825|    1|
|             7333|    1|
|            12139|    1|
|             5217|    1|
|              496|    1|
|             9597|    1|
+-----------------+-----+
only showing top 20 rows



In [53]:
spark.sql('SELECT * FROM orders').count()

68881

In [54]:
spark.sql('SELECT DISTINCT order_customer_id FROM orders').count()

12405

In [60]:
(orders_df 
     .filter('order_status = "CLOSED"')
     .groupBy('order_customer_id')
     .count() 
     .orderBy('count', ascending=False) \
     .show()
)

[Stage 114:>                                                        (0 + 1) / 1]

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1833|    6|
|             1687|    5|
|             5493|    5|
|             1363|    5|
|             8974|    4|
|             2774|    4|
|             2236|    4|
|             4282|    4|
|             5582|    4|
|            12431|    4|
|             9740|    4|
|             7879|    4|
|             4573|    4|
|             9213|    4|
|             4588|    4|
|            10111|    4|
|             2768|    4|
|             7948|    4|
|             9804|    4|
|             1521|    4|
+-----------------+-----+
only showing top 20 rows



                                                                                

In [65]:
spark.sql('SELECT \
            order_customer_id, COUNT(order_id) AS count\
            FROM orders \
            WHERE order_status = "CLOSED" \
            GROUP BY order_customer_id \
            ORDER BY count DESC' \
         ).show()

[Stage 123:>                                                        (0 + 1) / 1]

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1833|    6|
|             1687|    5|
|             5493|    5|
|             1363|    5|
|             8974|    4|
|             2774|    4|
|             2236|    4|
|             4282|    4|
|             5582|    4|
|            12431|    4|
|             9740|    4|
|             7879|    4|
|             4573|    4|
|             9213|    4|
|             4588|    4|
|            10111|    4|
|             2768|    4|
|             7948|    4|
|             9804|    4|
|             1521|    4|
+-----------------+-----+
only showing top 20 rows



                                                                                

### Utility functions 
- printSchema()
- cache()
- createOrReplaceTempView()