# Creating a `dataframe`

In [1]:
orders_df = ( spark
          .read
          .csv('s3://fcc-spark-example/dataset/2023/orders.csv', header=True, inferSchema=True)
     )


                                                                                

# Creating a TempView

In [2]:
orders_df.createOrReplaceTempView('orders')

### So, we have `Table` and `Dataframe`

- orders_df -> `Dataframe`
- orders -> `Temp View`

In [3]:
# Using DF
orders_df.show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



In [4]:
# Using Spark SQL 
spark.sql('SELECT * FROM orders').show(5)

+--------+-------------------+-----------------+---------------+
|order_id|         order_date|order_customer_id|   order_status|
+--------+-------------------+-----------------+---------------+
|       1|2013-07-25 00:00:00|            11599|         CLOSED|
|       2|2013-07-25 00:00:00|              256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|            12111|       COMPLETE|
|       4|2013-07-25 00:00:00|             8827|         CLOSED|
|       5|2013-07-25 00:00:00|            11318|       COMPLETE|
+--------+-------------------+-----------------+---------------+
only showing top 5 rows



### 1. Top 10 customers who placed the most number of orders 

#### Using DF

In [5]:
(orders_df
     .groupBy('order_customer_id')
     .count()
     .sort('count', ascending=False)
     .show(10)
)

[Stage 4:>                                                          (0 + 1) / 1]

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             6316|   16|
|              569|   16|
|             5897|   16|
|            12431|   16|
|             5654|   15|
|            12284|   15|
|              221|   15|
|             5283|   15|
|             5624|   15|
|             4320|   15|
+-----------------+-----+
only showing top 10 rows



[Stage 6:>                                                          (0 + 1) / 1]                                                                                

In [6]:
(orders_df
     .groupBy('order_customer_id')
     .count()
     .sort(['count', 'order_customer_id'], ascending=[False, False])
     .show(10)
)

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|            12431|   16|
|             6316|   16|
|             5897|   16|
|              569|   16|
|            12284|   15|
|             5654|   15|
|             5624|   15|
|             5283|   15|
|             4320|   15|
|              221|   15|
+-----------------+-----+
only showing top 10 rows



#### Using Spark SQL

In [7]:
spark.sql('SELECT \
              order_customer_id, \
              COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_customer_id \
            ORDER BY count DESC \
            LIMIT 10').show()

[Stage 10:>                                                         (0 + 1) / 1]

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             5897|   16|
|            12431|   16|
|             6316|   16|
|              569|   16|
|             5624|   15|
|            12284|   15|
|             5283|   15|
|              221|   15|
|             5654|   15|
|             4320|   15|
+-----------------+-----+



                                                                                

In [8]:
spark.sql('SELECT \
              order_customer_id, \
              COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_customer_id \
            ORDER BY count DESC, order_customer_id DESC \
            LIMIT 10').show()

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|            12431|   16|
|             6316|   16|
|             5897|   16|
|              569|   16|
|            12284|   15|
|             5654|   15|
|             5624|   15|
|             5283|   15|
|             4320|   15|
|              221|   15|
+-----------------+-----+



### 2. Find the no. of orders under each order status 

#### Using DF

In [9]:
(orders_df
     .groupBy('order_status')
     .count()                                  # Here count is a transformation
     .sort('count', ascending=False)          
     .show()
)                        

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8274|
|        PENDING| 7609|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



In [10]:
spark.sql('SELECT \
              order_status, COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_status \
            ORDER BY count DESC').show()

+---------------+-----+
|   order_status|count|
+---------------+-----+
|       COMPLETE|22899|
|PENDING_PAYMENT|15030|
|     PROCESSING| 8274|
|        PENDING| 7609|
|         CLOSED| 7556|
|        ON_HOLD| 3798|
|SUSPECTED_FRAUD| 1558|
|       CANCELED| 1428|
| PAYMENT_REVIEW|  729|
+---------------+-----+



### 3. No. of active customers (at least 1 order they have placed)

#### Using DF

In [11]:
(
    orders_df
        .select('order_customer_id')
        .distinct()
        .count()    # Here count is an action
)

12405

#### Using Spark SQL

In [12]:
spark.sql('SELECT \
              COUNT(DISTINCT(order_customer_id)) AS count \
              FROM orders').show()

+-----+
|count|
+-----+
|12404|
+-----+



### 4. Customers with most no. of `CLOSED` orders

#### Using DF

In [13]:
(orders_df 
     .filter('order_status = "CLOSED"')
     .groupBy('order_customer_id')
     .count() 
     .orderBy('count', ascending=False) \
     .show(5)
)

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1833|    6|
|             5493|    5|
|             1363|    5|
|             1687|    5|
|             1521|    4|
+-----------------+-----+
only showing top 5 rows



In [14]:
spark.sql('SELECT \
            order_customer_id, COUNT(order_id) AS count\
            FROM orders \
            WHERE order_status = "CLOSED" \
            GROUP BY order_customer_id \
            ORDER BY count DESC' \
         ).show(5)

+-----------------+-----+
|order_customer_id|count|
+-----------------+-----+
|             1833|    6|
|             5493|    5|
|             1363|    5|
|             1687|    5|
|             1521|    4|
+-----------------+-----+
only showing top 5 rows



## Summary 

### Utility functions 
```python
printSchema()
cache()
createOrReplaceTempView()
```

### Transformations
```python
.groupBy
.groupBy.count() 
.orderBy
.filter
.distinct
.join
```
### Actions 
```python
.show
.head
.tail
.take
.collect
```
