# Creating a `DataFrame`

In [None]:
# Load the 2023 orders CSV from S3. The first row is used as the header and
# Spark infers the column types from the data.
orders_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('s3://fcc-spark-example/dataset/2023/orders.csv')
)


In [None]:
# Preview the first five rows of the freshly loaded DataFrame.
orders_df.show(n=5)

# Creating a TempView

In [None]:
# Expose the DataFrame to Spark SQL as a temporary view called `orders`
# (an existing view with that name is replaced).
orders_df.createOrReplaceTempView(name='orders')

### So, we have a `DataFrame` and a `Temp View`

- orders_df -> `DataFrame`
- orders -> `Temp View`

In [None]:
# Using the DataFrame API: print the first five rows.
orders_df.show(n=5)

In [None]:
# Using Spark SQL: query the `orders` view and print the first five rows.
all_orders = spark.sql('SELECT * FROM orders')
all_orders.show(5)

### 1. Top 10 customers who placed the most orders

#### Using DF

In [None]:
# Count orders per customer and print the ten customers with the most orders.
# groupBy(...).count() is a transformation: it returns a new DataFrame with a
# `count` column rather than a number.
orders_per_customer = orders_df.groupBy('order_customer_id').count()
orders_per_customer.orderBy('count', ascending=False).show(10)

In [None]:
# Same ranking as before, but customers with equal order counts are listed
# by ascending customer id.
customer_order_counts = orders_df.groupBy('order_customer_id').count()
customer_order_counts.orderBy(
    'count', 'order_customer_id', ascending=[False, True]
).show(10)

#### Using Spark SQL

In [None]:
# Top ten customers by number of orders, expressed as a Spark SQL query
# against the `orders` view.
top_customers_query = 'SELECT \
              order_customer_id, \
              COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_customer_id \
            ORDER BY count DESC \
            LIMIT 10'
spark.sql(top_customers_query).show()

In [None]:
# Top ten customers by number of orders; equal counts are ordered by
# ascending customer id.
top_customers_tiebreak_query = 'SELECT \
              order_customer_id, \
              COUNT(order_id) AS count  \
            FROM orders \
            GROUP BY order_customer_id \
            ORDER BY count DESC, order_customer_id ASC \
            LIMIT 10'
spark.sql(top_customers_tiebreak_query).show()

### 2. Find the number of orders for each order status

In [None]:
# Look at a few rows again before aggregating.
orders_df.show(n=5)

#### Using DF

In [None]:
# Number of orders per status, largest group first.
# Here count() is a transformation: applied to grouped data it yields a new
# DataFrame with a `count` column.
status_counts = orders_df.groupBy('order_status').count()
status_counts.sort('count', ascending=False).show()

In [None]:
# Using Spark SQL: number of orders per status, largest group first.
orders_per_status_query = 'SELECT \
              order_status, COUNT(order_id) AS count_col  \
            FROM orders \
            GROUP BY order_status \
            ORDER BY count_col DESC'
spark.sql(orders_per_status_query).show()

### 3. Number of active customers (customers who have placed at least one order)

#### Using DF

In [None]:
# Peek at three rows to recall the column names.
orders_df.show(n=3)

In [None]:
# Number of distinct customer ids that appear in the orders data.
# Here count() is an action: called on a DataFrame it runs the job and
# returns the row count, which the notebook displays as the cell result.
distinct_customers = orders_df.select('order_customer_id').distinct()
distinct_customers.count()

#### Using Spark SQL

In [None]:
# Number of distinct customer ids in the `orders` view, via Spark SQL.
active_customers_query = 'SELECT \
              COUNT(DISTINCT(order_customer_id)) AS unique_no_customers \
            FROM orders'
spark.sql(active_customers_query).show()

### 4. Customers with the most `CLOSED` orders

#### Using DF

In [None]:
# Five customers with the most CLOSED orders.
# The status value is written as a single-quoted SQL string literal: double
# quotes are parsed as identifiers (column names) when
# spark.sql.ansi.doubleQuotedIdentifiers is enabled, which would break the filter.
(orders_df
     .filter("order_status = 'CLOSED'")
     .groupBy('order_customer_id')
     .count()                               # transformation: returns a DataFrame
     .orderBy('count', ascending=False)
     .show(5)
)

In [None]:
# Using Spark SQL: five customers with the most CLOSED orders.
# The status value is a single-quoted literal (standard SQL); double quotes are
# parsed as identifiers when spark.sql.ansi.doubleQuotedIdentifiers is enabled.
spark.sql("SELECT \
            order_customer_id, COUNT(order_id) AS count \
            FROM orders \
            WHERE order_status = 'CLOSED' \
            GROUP BY order_customer_id \
            ORDER BY count DESC").show(5)

## Summary 

### Transformations
```python
.groupBy
.groupBy(...).count()   # count() on grouped data returns a DataFrame
.orderBy
.filter
.distinct
.join
```
### Actions
```python
.show
.head
.tail
.take
.collect
.count      # DataFrame.count(), e.g. after .distinct(), returns a number
```
