# Commonly purchased together

* Source: https://towardsdatascience.com/twenty-five-sql-practice-exercises-5fc791e24082
* Source: https://www.careercup.com/question?id=5759072822362112
        
Using the following two tables, write a query to return the names and purchase frequency of the top three pairs of products most often bought together. The names of both products should appear in one column.

In [2]:
%run Question.ipynb

 * postgresql://fknight:***@localhost/postgres
Done.
Done.
7 rows affected.
7 rows affected.
 * postgresql://fknight:***@localhost/postgres
Done.
Done.
5 rows affected.
5 rows affected.


In [4]:
%%sql

-- Show the product lookup table: one row per product id with its name.
SELECT
    p.id,
    p.name
FROM products AS p

 * postgresql://fknight:***@localhost/postgres
5 rows affected.


id,name
1,A
2,B
3,C
4,D
5,E


In [5]:
%%sql

-- Show the raw order lines: each row ties one product to an order and customer.
SELECT
    o.order_id,
    o.customer_id,
    o.product_id
FROM orders AS o

 * postgresql://fknight:***@localhost/postgres
7 rows affected.


order_id,customer_id,product_id
1,1,1
1,1,2
1,1,3
2,2,1
2,2,2
2,2,4
3,1,5


# Part A

Write a query that gives every pair of products that have been ordered together.

## Example answer

In [6]:
%%sql

-- Self-join the order lines on order_id to pair up products that share an
-- order. The strict "<" on product_id emits each pair in one orientation
-- only (smaller id first) and never pairs a product with itself.
SELECT
    lhs.order_id,
    lhs.product_id AS prod_1,
    rhs.product_id AS prod_2
FROM orders AS lhs
INNER JOIN orders AS rhs
    ON rhs.order_id = lhs.order_id
    AND rhs.product_id > lhs.product_id

 * postgresql://fknight:***@localhost/postgres
6 rows affected.


order_id,prod_1,prod_2
1,1,2
1,1,3
1,2,3
2,1,2
2,1,4
2,2,4


# Part B

Use the query from Part A to list the names of each pair of products in a single column.

```sql
WITH product_pairs AS (
    -- Part A: product ids that share an order, smaller id in prod_1.
    SELECT
        lhs.order_id,
        lhs.product_id AS prod_1,
        rhs.product_id AS prod_2
    FROM orders AS lhs
    INNER JOIN orders AS rhs
        ON rhs.order_id = lhs.order_id
        AND rhs.product_id > lhs.product_id
)
```

## Example answer

In [8]:
%%sql

-- Part A result: product ids that share an order, smaller id in prod_1.
WITH product_pairs AS (
    SELECT
        lhs.order_id,
        lhs.product_id AS prod_1,
        rhs.product_id AS prod_2
    FROM orders AS lhs
    INNER JOIN orders AS rhs
        ON rhs.order_id = lhs.order_id
        AND rhs.product_id > lhs.product_id
)

-- Look up both product names and glue them into one space-separated label.
SELECT concat(first_product.name, ' ', second_product.name) AS product_pair
FROM product_pairs AS pairs
INNER JOIN products AS first_product
    ON first_product.id = pairs.prod_1
INNER JOIN products AS second_product
    ON second_product.id = pairs.prod_2

 * postgresql://fknight:***@localhost/postgres
6 rows affected.


product_pair
A B
A B
A C
B C
A D
B D


# Part C

Using the CTEs from Parts A and B, solve the original problem.

```sql
WITH product_pairs AS (
    -- Part A: product ids that share an order, smaller id in prod_1.
    SELECT
        lhs.order_id,
        lhs.product_id AS prod_1,
        rhs.product_id AS prod_2
    FROM orders AS lhs
    INNER JOIN orders AS rhs
        ON rhs.order_id = lhs.order_id
        AND rhs.product_id > lhs.product_id
),

named_pairs AS (
    -- Part B: both product names combined into one space-separated label.
    SELECT concat(first_product.name, ' ', second_product.name) AS product_pair
    FROM product_pairs AS pairs
    INNER JOIN products AS first_product
        ON first_product.id = pairs.prod_1
    INNER JOIN products AS second_product
        ON second_product.id = pairs.prod_2
)
```

## Example answer

In [13]:
%%sql

-- Part A: product ids that share an order. The strict "<" on product_id
-- emits each pair in one orientation only and never pairs a product with
-- itself.
WITH product_pairs AS (
    SELECT
        o1.order_id,
        o1.product_id AS prod_1,
        o2.product_id AS prod_2
    FROM orders AS o1
    INNER JOIN orders AS o2
        ON o1.order_id = o2.order_id
        AND o1.product_id < o2.product_id
),

-- Part B: both product names combined into one space-separated label.
named_pairs AS (
    SELECT concat(p1.name, ' ', p2.name) AS product_pair
    FROM product_pairs AS pps
    INNER JOIN products AS p1
        ON pps.prod_1 = p1.id
    INNER JOIN products AS p2
        ON pps.prod_2 = p2.id
)

-- Count how often each label occurs and keep the three most frequent, as
-- the exercise asks. Pairs tied on frequency are ordered alphabetically so
-- the LIMIT cuts ties deterministically.
SELECT
    product_pair,
    count(*) AS purchase_freq
FROM named_pairs
GROUP BY product_pair
ORDER BY purchase_freq DESC, product_pair ASC
LIMIT 3;

 * postgresql://fknight:***@localhost/postgres
3 rows affected.


product_pair,purchase_freq
A B,2
A C,1
A D,1


## The full solution is given below

In [15]:
%%sql

-- Get product pairs from the same order by self-joining the orders table
-- on order ID and product ID < product ID (avoids pairing a product with
-- itself and avoids counting the same pair in both orientations).

WITH product_pairs AS (
    SELECT
        o1.product_id AS prod_1,
        o2.product_id AS prod_2
    FROM orders AS o1
    INNER JOIN orders AS o2
        ON o1.order_id = o2.order_id
        AND o1.product_id < o2.product_id
),

-- Join the products table twice to get both product names, then
-- concatenate them so each pair appears in one column.

named_pairs AS (
    SELECT concat(p1.name, ' ', p2.name) AS product_pair
    FROM product_pairs AS pp
    INNER JOIN products AS p1
        ON pp.prod_1 = p1.id
    INNER JOIN products AS p2
        ON pp.prod_2 = p2.id
)

-- Group by product pair and return the top 3 entries sorted by purchase
-- frequency. Ties on frequency are ordered alphabetically so the LIMIT
-- cuts them deterministically.

SELECT
    product_pair,
    count(*) AS purchase_freq
FROM named_pairs
GROUP BY product_pair
ORDER BY purchase_freq DESC, product_pair ASC
LIMIT 3;

 * postgresql://fknight:***@localhost/postgres
3 rows affected.


product_pair,purchase_freq
A B,2
A C,1
A D,1
