# Week 1 Exercise 2

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

import snowflake.connector

In [3]:
# Make the project's src/ directory importable. The location is relative to
# the notebook's working directory: one level up, then "src".
PROJ_ROOT = os.pardir
src_dir = os.path.join(PROJ_ROOT, "src")
# Guard the append so that re-running this cell does not keep adding duplicate
# entries to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
%aimport sql_utils
from sql_utils import show_sql_df

## About

Week 1 Exercise 2

## User Inputs

In [5]:
# Tables previewed in the "Tables" section (schema-qualified names).
tables = [
    'customers.customer_data',
    'customers.customer_address',
    'customers.customer_survey',
    'resources.recipe_tags',
    'chefs.recipe',
]

# CTE chain (t1 to t5) carried over from exercise 1. It ends with t5, the
# customers who are eligible to place an order from Virtual Kitchen. The string
# opens a WITH clause and deliberately has no final SELECT, so later cells can
# append further CTEs after a comma. It is a plain (non f-) string because it
# has no placeholders; none of the CTEs is recursive.
# NOTE(review): t2 uses GROUP BY ALL, which groups on the raw city_name and
# state_abbr as well as the cleaned columns, so rows whose raw values differ
# only by case/whitespace are not collapsed into one city — confirm that this
# is intended.
basequery = """
            -- 1. clean us_cities table columns by removing whitespaces,
            -- changing city to lowercase and keeping the state abbreviation
            -- in uppercase
            WITH t1 AS (
                SELECT city_id,
                       city_name,
                       state_abbr,
                       -- remove whitespace in city name
                       RTRIM(LTRIM(LOWER(city_name), ' '), ' ') AS city_name_cleaned,
                       -- remove whitespace in state abbreviation
                       RTRIM(LTRIM(UPPER(state_abbr), ' '), ' ') AS state_abbr_cleaned,
                       geo_location
                FROM resources.us_cities
            ),
            -- 2. from cleaned us_cities columns, get cities with the first city ID
            t2 AS (
                SELECT city_name,
                       state_abbr,
                       city_name_cleaned,
                       state_abbr_cleaned,
                       MIN(city_id) AS city_id
                FROM t1
                GROUP BY ALL
            ),
            -- 3. filter original us_cities table to only capture cities with
            -- unique city ID in cleaned us_cities columns
            t3 AS (
                SELECT t2.city_id,
                       t2.city_name,
                       t2.state_abbr,
                       t1.city_name_cleaned,
                       t1.state_abbr_cleaned,
                       t1.geo_location
                FROM t1
                INNER JOIN t2 USING (city_id, city_name_cleaned, state_abbr_cleaned)
            ),
            -- 4. clean customer address table columns by removing whitespaces,
            -- changing city to lowercase and keeping the state abbreviation
            -- in uppercase
            t4 AS (
                SELECT customer_id,
                       customer_city,
                       customer_state,
                       -- remove whitespace in city name
                       RTRIM(LTRIM(LOWER(customer_city), ' '), ' ') AS customer_city_cleaned,
                       -- remove whitespace in state abbreviation
                       RTRIM(LTRIM(UPPER(customer_state), ' '), ' ') AS customer_state_cleaned
                FROM customers.customer_address
            ),
            -- 5. perform INNER JOIN using cleaned customer address and us_cities
            -- tables to get geo_location for each customer in the cleaned customer
            -- address table. The `INNER JOIN` selects customers whose city/state
            -- is in the the database and these customers are eligible to place
            -- an order from Virtual Kitchen. Perform the INNER JOIN using the
            -- cleaned columns (columns with the _cleaned suffix). However, in the
            -- output, exclude columns with the _cleaned suffix. Finally, perform
            -- a LEFT JOIN between the the output and the customers_data table in
            -- order to capture the customer's first and last name.
            t5 AS (
                SELECT t4.customer_id,
                       t4.customer_state,
                       cd.first_name,
                       cd.last_name,
                       cd.email,
                       t3.geo_location
                FROM t4
                -- JOIN with cleaned and non-duplicated us_cities table
                INNER JOIN t3 ON t4.customer_city_cleaned = t3.city_name_cleaned
                AND t4.customer_state_cleaned = t3.state_abbr_cleaned
                -- JOIN with customer_data table to get first_name and last_name
                LEFT JOIN (
                    SELECT customer_id,
                           first_name,
                           last_name,
                           email
                    FROM customers.customer_data
                ) AS cd USING (customer_id)
            )
            """

In [6]:
# Keyword arguments for snowflake.connector.connect, each read from an
# UPLIMIT_SNOWFLAKE_* environment variable. A variable that is not set yields
# None for its entry.
snow_connector_dict = {
    "account": os.environ.get("UPLIMIT_SNOWFLAKE_ACCOUNT"),
    "user": os.environ.get("UPLIMIT_SNOWFLAKE_USER"),
    "password": os.environ.get("UPLIMIT_SNOWFLAKE_PASS"),
    "warehouse": os.environ.get("UPLIMIT_SNOWFLAKE_WAREHOUSE"),
    "database": os.environ.get("UPLIMIT_SNOWFLAKE_DB_NAME"),
    "role": os.environ.get("UPLIMIT_SNOWFLAKE_ROLE"),
}

## Connect

In [7]:
# Open the Snowflake session with the settings collected in snow_connector_dict
conn = snowflake.connector.connect(**snow_connector_dict)
# Cursor that the query cells pass, together with conn, to show_sql_df
cur = conn.cursor()

## Tables

In [8]:
%%time
for table_name in tables:
    print(table_name)
    query = f"""
            SELECT *
            FROM {table_name}
            LIMIT 4
            """
    _ = show_sql_df(query, cur, conn, True, True)

customers.customer_data


column,customer_id,first_name,last_name,email,created_at
dtype,string[python],string[python],string[python],string[python],"datetime64[ns, America/Los_Angeles]"
nunique,4,4,4,4,4
missing,0,0,0,0,0
0,52c4ec3c-c93b-4a65-97cc-acd4f72e9196,Roberto,Terry,Roberto.Terry@email.com,2022-06-03 01:01:02-07:00
1,dd3ef9b2-1129-497d-b626-83fd3d6000af,Jacob,Everett,Jacob.Everett@email.com,2022-11-03 04:08:20-07:00
2,0324dd60-c476-4d86-a09a-ae7ec411c723,Sonia,Schulz,Sonia.Schulz@email.com,2022-06-23 09:28:30-07:00
3,f22a3a2f-8889-45e6-a2b1-19de55ba674a,Valentin,Bowman,Valentin.Bowman@email.com,2022-09-07 17:08:32-07:00


customers.customer_address


column,address_id,customer_id,customer_city,customer_state
dtype,string[python],string[python],string[python],string[python]
nunique,4,4,4,3
missing,0,0,0,0
0,17a6902b-d09b-4a4d-9f44-c4aab937d854,dd3ef9b2-1129-497d-b626-83fd3d6000af,Stringtown,TX
1,de9620c9-8ebf-4ebf-89c8-411b84495982,0324dd60-c476-4d86-a09a-ae7ec411c723,Newport,GA
2,9be2a042-0690-44a6-86cf-a6219d8af077,f22a3a2f-8889-45e6-a2b1-19de55ba674a,Glenville,IA
3,e9aa02d8-6331-4ecd-b2b2-f208b90ce65d,7576a8b6-e31a-4451-86ae-f77d9f060347,Woodville,IA


customers.customer_survey


column,customer_id,tag_id,is_active
dtype,string[python],string[python],boolean
nunique,4,3,1
missing,0,0,0
0,39e9e271-42f1-42c5-8fc3-5a237ce5f292,eecdb740-de16-496a-b427-216bd055c21e,True
1,7f5a1254-9eae-460e-842e-3a07f8f919cf,eecdb740-de16-496a-b427-216bd055c21e,True
2,2f6ea9aa-31c6-4e4e-885f-7164eac249a0,e3ac3c87-caee-40da-9859-05b34c531d50,True
3,9e70b283-a8a7-40f7-b0d7-7e15e60360e4,c11c4a6f-2b21-4a3c-9959-cf9af15f7619,True


resources.recipe_tags


column,tag_id,tag_property,tag_random
dtype,string[python],string[python],Int64
nunique,4,4,4
missing,0,0,0
0,eecdb740-de16-496a-b427-216bd055c21e,served-hot,1
1,e3ac3c87-caee-40da-9859-05b34c531d50,shellfish,2
2,c11c4a6f-2b21-4a3c-9959-cf9af15f7619,omelets-and-frittatas,3
3,1a13753e-6a92-4a89-b851-4b4a6a59b2cc,sauces,4


chefs.recipe


column,recipe_id,chef_id,recipe_name,minutes,submitted,tag_list,ingredient_count,ingredients,step_count,steps,recipe_json
dtype,string[python],string[python],string[python],Int64,object,string[python],Int64,string[python],Int64,string[python],string[python]
nunique,4,4,4,3,4,4,3,4,4,4,4
missing,0,0,0,0,0,0,0,0,0,0,0
0,71038e82-1e56-49ba-be57-bc0cc4b1eec9,d0fabf8f-143c-4f6b-9b83-0b0a420ff802,asparagus with orange ginger sauce,18,2022-11-08,"[  ""30-minutes-or-less"",  "" time-to-make"",  "" main-ingredient"",  "" preparation"",  "" fruit"",  "" vegetables"",  "" steam"",  "" diabetic"",  "" vegetarian"",  "" dietary"",  "" citrus"",  "" oranges...",6,"[  ""fresh asparagus"",  "" orange juice"",  "" orange zest"",  "" cornstarch"",  "" fresh gingerroot"",  "" slivered almonds"" ]",8,"[  ""wash asparagus and snap off tough ends"",  "" place in a steamer basket and steam until just tender "",  "" about 5 minutes"",  "" in a small saucepan "",  "" stir together juice "",  "" zest and ...","{  ""recipe_name"": ""asparagus with orange ginger sauce"",  ""recipe_tags"": {  ""tag1"": "" time-to-make"",  ""tag2"": "" main-ingredient"",  ""tag3"": "" preparation"",  ""tag4"": "" fruit"",  ""tag5..."
1,1853d956-0d7f-4a42-aec3-68a3728d9ef6,1c54741b-45b5-4404-bac4-3926d3d4e6f8,asparagus with orange vinaigrette,22,2022-04-20,"[  ""30-minutes-or-less"",  "" time-to-make"",  "" course"",  "" main-ingredient"",  "" preparation"",  "" low-protein"",  "" healthy"",  "" 5-ingredients-or-less"",  "" side-dishes"",  "" vegetables"",  ""...",5,"[  ""asparagus"",  "" orange rind"",  "" olive oil"",  "" hot mustard"",  "" orange juice"" ]",4,"[  ""wash the asparagus and trim them by breaking theim at the point where the tough woody part of the stem meets the tender part"",  "" steam the spears over hot water for about 7 minutes "",  "" d...","{  ""recipe_name"": ""asparagus with orange vinaigrette"",  ""recipe_tags"": {  ""tag1"": "" time-to-make"",  ""tag2"": "" course"",  ""tag3"": "" main-ingredient"",  ""tag4"": "" preparation"",  ""tag5..."
2,fc73c53a-54d5-489f-93ed-abe330913bde,01e538c5-cc42-4b28-84ba-f7d7a47a7823,asparagus with oyster sauce,20,2022-08-01,"[  ""30-minutes-or-less"",  "" time-to-make"",  "" course"",  "" main-ingredient"",  "" cuisine"",  "" preparation"",  "" low-protein"",  "" healthy"",  "" side-dishes"",  "" vegetables"",  "" asian"",  "" c...",10,"[  ""oil"",  "" ginger"",  "" green onion"",  "" garlic"",  "" asparagus"",  "" oyster sauce"",  "" chicken stock"",  "" sugar"",  "" cornstarch"",  "" water"" ]",6,"[  ""heat oil in a wok over medium-low heat"",  "" stir-fry ginger "",  "" scallion "",  "" and garlic until aroma develops "",  "" about 1 minute"",  "" add asparagus and raise heat"",  "" stir-fry for...","{  ""recipe_name"": ""asparagus with oyster sauce"",  ""recipe_tags"": {  ""tag1"": "" time-to-make"",  ""tag2"": "" course"",  ""tag3"": "" main-ingredient"",  ""tag4"": "" cuisine"",  ""tag5"": "" prepa..."
3,0d9ec2b2-323f-4919-a6ab-6db8be83e59a,1d6e42af-eb31-4287-9e6f-02e655221929,asparagus with parmesan crust,20,2022-09-27,"[  ""30-minutes-or-less"",  "" time-to-make"",  "" course"",  "" main-ingredient"",  "" preparation"",  "" occasion"",  "" side-dishes"",  "" eggs-dairy"",  "" vegetables"",  "" easy"",  "" dinner-party"",  ...",6,"[  ""asparagus spear"",  "" olive oil"",  "" parmesan cheese"",  "" black pepper"",  "" salt"",  "" balsamic vinegar"" ]",7,"[  ""set oven to 425 degrees"",  "" place the asparagus on a baking sheet"",  "" drizzle with olive oil"",  "" toss to coat "",  "" then arrange in a single layer"",  "" spread the parmesan cheese over...","{  ""recipe_name"": ""asparagus with parmesan crust"",  ""recipe_tags"": {  ""tag1"": "" time-to-make"",  ""tag2"": "" course"",  ""tag3"": "" main-ingredient"",  ""tag4"": "" preparation"",  ""tag5"": ""..."


CPU times: user 198 ms, sys: 8.69 ms, total: 207 ms
Wall time: 1.17 s


## Queries

### Exercise 2

#### Step 1. Create a query to return those customers who are eligible to order and have at least one food preference selected. Include up to three of their food preferences. If the customer has more than three food preferences, then return the first three, sorting in alphabetical order.

Use the following approach

1. (steps 1 to 5) get customers who are eligible to place an order
2. in the `customer_survey` table (joined with `recipe_tags` to look up each tag)
   - (step 6) clean the `tag_property` column
3. (step 7) perform a `LEFT JOIN` between the eligible customers and the cleaned `customer_survey`, then filter out any customers who do not have at least one food preference (where the cleaned `tag_property` column is missing, i.e. `NULL` in SQL, displayed as `<NA>` by pandas)
4. (step 8) reshape the `JOIN`ed data to convert the food preferences column into separate columns, where each food preference is contained in a separate column

In [9]:
%%time
query = f"""
        -- - bring in customers who are eligible to place an order, which was found in exercise 1 part 1
        -- - clean tag_property column from the customer_survey table
        -- - LEFT JOIN eligible customers with cleaned customer survey & exclude customers without food preference
        -- - reshape (PIVOT) the JOINed output from above to get one row per customer with three columns
        --   for the three required food preferences
        --
        -- CAPTURE ELIGIBLE CUSTOMERS FROM EXERCISE 1 PART 1
        {basequery},
        -- EXERCISE 2 PART 1
        -- 6. clean customer_survey table columns by removing whitespaces and
        -- changing the tag_property to lowercase
        t6 AS (
            SELECT cs.customer_id,
                   rt.tag_property,
                   RTRIM(LTRIM(LOWER(rt.tag_property), ' '), ' ') AS tag_property_cleaned
            FROM customers.customer_survey AS cs
            LEFT JOIN resources.recipe_tags AS rt USING (tag_id)
        ),
        -- 7. perform a LEFT JOIN between the eligible customers and the
        -- cleaned customer_survey, excluding customers who do not have a food
        -- preference selected (where the cleaned tag_property is missing)
        t7 AS (
            SELECT t5.customer_id,
                   t5.first_name,
                   t5.last_name,
                   t5.email,
                   -- t6.tag_property,
                   t6.tag_property_cleaned,
                   RANK() OVER(PARTITION BY customer_id ORDER BY tag_property_cleaned) AS food_preference
            FROM t5
            LEFT JOIN t6 USING (customer_id)
            WHERE tag_property != '<NA>'
        ),
        -- 8. reshape from tidy to untidy data using a PIVOT in order to get
        -- the food preferences (rows) as three columns and rename the columns
        t8 AS (
            SELECT *
            FROM t7
            PIVOT(MIN(tag_property_cleaned) FOR food_preference in (1, 2, 3)) AS p (
                customer_id,
                first_name,
                last_name,
                email,
                food_preference_1,
                food_preference_2,
                food_preference_3
            )
        )
        SELECT *
        FROM t8
        """
_ = show_sql_df(query, cur, conn, True, True)

column,customer_id,first_name,last_name,email,food_preference_1,food_preference_2,food_preference_3
dtype,string[python],string[python],string[python],string[python],string[python],string[python],string[python]
nunique,1048,530,716,1042,440,349,215
missing,0,0,0,0,0,425,725
0,41b5cd42-503b-4175-913e-295b40c478a7,Shelley,David,Shelley.David@email.com,oatmeal,,
1,26b0ad3a-75e8-4e81-8f5d-b8952181d99e,John,Brown,John.Brown@email.com,czech,main-dish-pasta,novelty
2,86fa1c6f-9c57-45ee-8f51-ddfac2182502,Darryl,Davis,Darryl.Davis@email.com,bread-machine,casseroles,eggs
3,82073dcc-3182-4cd4-9f96-23031cb52708,Viola,Pinson,Viola.Pinson@email.com,polish,tex-mex,
4,23411d0e-4ba5-4336-8b81-05f7e75703d0,George,Brown,George.Brown@email.com,leftovers,preparation,smoothies
...,...,...,...,...,...,...,...
1043,9653f472-f15b-4ec0-b7c5-9ae4bc64596f,Nicholas,Hernandez,Nicholas.Hernandez@email.com,mothers-day,,
1044,88b95a8e-b81f-45b1-8b31-37e6b20c174d,Michael,Carnahan,Michael.Carnahan@email.com,lamb-sheep-main-dish,south-american,
1045,6c2ce2d1-0622-4bcb-8496-be6ff6bf8b0a,William,Santos,William.Santos@email.com,marinades-and-rubs,,
1046,63bea6a6-ed1c-4e9f-a297-c70d2c369bcb,Milton,Dixon,Milton.Dixon@email.com,bass,egg-free,high-protein


CPU times: user 99.7 ms, sys: 11.4 ms, total: 111 ms
Wall time: 631 ms


#### Step 2. Add a column to the query from Step 1 that suggests one recipe that matches food preference #1.

Use the following approach

1. (steps 1 to 5) get customers who are eligible to place an order
2. in the `customer_survey` table
   - (step 6) clean the `tag_property` column
3. (step 7) perform a `LEFT JOIN` between the eligible customers and the cleaned `customer_survey`, then filter out any customers who do not have at least one food preference (where the cleaned `tag_property` column is missing, i.e. `NULL` in SQL, displayed as `<NA>` by pandas)
4. (step 8) reshape the `JOIN`ed data to convert the food preferences column into three separate columns, where each of the three required food preferences is contained in a separate column
5. (step 9) flatten and clean the `tag_list` column from the `recipe` table
   - the `tag_list` column is chosen since tags are the same as food preferences
6. (step 10) for each recipe tag, get the last recipe ID
   - since only one recipe is required per tag, select the last one
7. (step 11) for the selected recipe ID, get the recipe name
8. (step 12) `LEFT JOIN` the food preferences per customer with the selected recipe name using the first food preference

In [10]:
%%time
query = f"""
        -- bring in customers who are eligible to place an order, which was found in exercise 1 part 1
        -- exercise 2 part 1
        -- - clean tag_property column from the customer_survey table
        -- - LEFT JOIN eligible customers with cleaned customer survey & exclude customers without food preference
        -- - reshape (PIVOT) the JOINed output from above to get one row per customer with three columns
        --   for the three required food preferences
        -- exercise 2 part 2
        -- - flatten the tag_list from the recipe table to get all tags as separate rows
        -- - for each flattened tag, select the last recipe ID (since only one recipe is required per tag)
        -- - JOIN the above output with the recipe table again to caputre the recipe name for the selected
        --   recipe ID
        -- - LEFT JOIN the PIVOTed eligible customers with the selected recipe using the
        --   first food preference column (customers) and the recipe tag (recipes) to get the recommended recipe
        --   for each customers first food preference
        --
        -- CAPTURE ELIGIBLE CUSTOMERS FROM EXERCISE 1 PART 1
        {basequery},
        -- EXERCISE 2 PART 1
        -- 6. clean customer_survey table columns by removing whitespaces and
        -- changing the tag_property to lowercase
        t6 AS (
            SELECT cs.customer_id,
                   rt.tag_property,
                   RTRIM(LTRIM(LOWER(rt.tag_property), ' '), ' ') AS tag_property_cleaned
            FROM customers.customer_survey AS cs
            LEFT JOIN resources.recipe_tags AS rt USING (tag_id)
        ),
        -- 7. perform a LEFT JOIN between the eligible customers and the
        -- cleaned customer_survey, excluding customers who do not have a food
        -- preference selected (where the cleaned tag_property is missing)
        t7 AS (
            SELECT t5.customer_id,
                   t5.first_name,
                   t5.last_name,
                   t5.email,
                   -- t6.tag_property,
                   t6.tag_property_cleaned,
                   RANK() OVER(PARTITION BY customer_id ORDER BY tag_property_cleaned) AS food_preference
            FROM t5
            LEFT JOIN t6 USING (customer_id)
            WHERE tag_property != '<NA>'
        ),
        -- 8. reshape from tidy to untidy data using a PIVOT in order to get
        -- the three food preferences (rows) as three columns and rename the columns
        t8 AS (
            SELECT *
            FROM t7
            PIVOT(MIN(tag_property_cleaned) FOR food_preference in (1, 2, 3)) AS p (
                customer_id,
                first_name,
                last_name,
                email,
                food_preference_1,
                food_preference_2,
                food_preference_3
            )
        ),
        -- EXERCISE 2 PART 2
        -- 12. flatten the tag_list (list of food preferences) from the recipe
        -- table and clean the flattened version
        t9 AS (
            SELECT rc.recipe_id,
                   rc_flat.value AS recipe_tag,
                   RTRIM(LTRIM(LOWER(rc_flat.value), ' '), ' ') AS recipe_tag_cleaned
            FROM chefs.recipe AS rc, table(flatten(tag_list)) AS rc_flat
        ),
        -- 13. for each cleaned and flattened recipe tag, get one recipe ID
        -- (use the last recipe ID)
        t10 AS (
            SELECT t9.recipe_tag_cleaned,
                   MAX(t9.recipe_id) AS recipe_id
            FROM t9
            GROUP BY recipe_tag_cleaned
        ),
        -- 14. for the selected recipe ID (per recipe tag), get the recipe name
        t11 AS (
            SELECT t10.recipe_tag_cleaned,
                   rc.recipe_name,
            FROM t10
            LEFT JOIN chefs.recipe AS rc USING (recipe_id)
        ),
        -- 15. LEFT JOIN pivotted food preferences for each eligible customer
        -- with the selected recipe. The recipe tags are the same as the
        -- food preferences. Since a recipe is only required for the first
        -- food preference, the JOIN should be performed using the first food
        -- preference column (LHS of JOIN) and recipe tag (RHS of JOIN)
        t12 AS (
            SELECT *
            FROM t8
            LEFT JOIN t11 ON t8.food_preference_1 = t11.recipe_tag_cleaned
            ORDER BY t8.email
        )
        SELECT *
        FROM t12
        """
_ = show_sql_df(query, cur, conn, True, True)

column,customer_id,first_name,last_name,email,food_preference_1,food_preference_2,food_preference_3,recipe_tag_cleaned,recipe_name
dtype,string[python],string[python],string[python],string[python],string[python],string[python],string[python],string[python],string[python]
nunique,1048,530,716,1042,440,349,215,440,288
missing,0,0,0,0,0,425,725,0,0
0,1ae290c8-2962-4ab6-9802-54388e3ac4ed,Aaron,Davis,Aaron.Davis@email.com,beef-liver,healthy-2,oven,beef-liver,chicken liver stroganoff
1,1d02bdbe-38db-4598-8b95-d1227b7094a2,Aaron,Dugan,Aaron.Dugan@email.com,beijing,frozen-desserts,irish-st-patricks-day,beijing,beijing banana toffee deep fried bananas
2,d74a8d6d-3a66-411b-8d84-3fcc1f6b14d5,Aaron,Lentz,Aaron.Lentz@email.com,granola-and-porridge,south-african,,granola-and-porridge,oats n honey granola pie
3,9aae0ee2-e08f-4c4e-9522-76516bf46624,Adam,Whitt,Adam.Whitt@email.com,vegetarian,,,vegetarian,salata arabieh
4,0df5c81f-0313-4fc7-b23b-48f6bd402b8a,Adela,Moore,Adela.Moore@email.com,hidden-valley-ranch,malaysian,,hidden-valley-ranch,breakfast pizza rsc
...,...,...,...,...,...,...,...,...,...
1043,71935817-9cff-4767-a662-b97d2631239e,Winifred,Ramon,Winifred.Ramon@email.com,heirloom-historical-recipes,,,heirloom-historical-recipes,simple hard apple cider
1044,67ed1979-3d7a-4229-a2dd-99b575dba0b1,Yolanda,Metz,Yolanda.Metz@email.com,danish,lettuces,sandwiches,danish,chocolate beer cake
1045,63a0cc0c-4262-417c-bcfc-0fdcf0e940ee,Yolando,Howe,Yolando.Howe@email.com,pies,,,pies,chocolate peanut butter tofu pie
1046,994b6e03-c1c7-4d47-99cf-fde3b4bc064f,Yvonne,Bird,Yvonne.Bird@email.com,ethiopian,fall,tuna,ethiopian,ethiopian honey yeast bread


CPU times: user 72.9 ms, sys: 670 µs, total: 73.6 ms
Wall time: 1.09 s


## SQL for Queries

The SQL for the final query for part 1 from above is shown below

```sql
-- - bring in customers who are eligible to place an order, which was found in exercise 1 part 1
-- - clean tag_property column from the customer_survey table
-- - LEFT JOIN eligible customers with cleaned customer survey & exclude customers without food preference
-- - reshape (PIVOT) the JOINed output from above to get one row per customer with three columns
--   for the three required food preferences
--
-- CAPTURE ELIGIBLE CUSTOMERS FROM EXERCISE 1 PART 1
-- 1. clean us_cities table columns by removing whitespaces,
-- changing city to lowercase and keeping the state abbreviation
-- in uppercase
WITH t1 AS (
    SELECT city_id,
           city_name,
           state_abbr,
           -- remove whitespace in city name
           RTRIM(LTRIM(LOWER(city_name), ' '), ' ') AS city_name_cleaned,
           -- remove whitespace in state abbreviation
           RTRIM(LTRIM(UPPER(state_abbr), ' '), ' ') AS state_abbr_cleaned,
           geo_location
    FROM resources.us_cities
),
-- 2. from cleaned us_cities columns, get cities with the first city ID
t2 AS (
    SELECT city_name,
           state_abbr,
           city_name_cleaned,
           state_abbr_cleaned,
           MIN(city_id) AS city_id
    FROM t1
    GROUP BY ALL
),
-- 3. filter original us_cities table to only capture cities with
-- unique city ID in cleaned us_cities columns
t3 AS (
    SELECT t2.city_id,
           t2.city_name,
           t2.state_abbr,
           t1.city_name_cleaned,
           t1.state_abbr_cleaned,
           t1.geo_location
    FROM t1
    INNER JOIN t2 USING (city_id, city_name_cleaned, state_abbr_cleaned)
),
-- 4. clean customer address table columns by removing whitespaces,
-- changing city to lowercase and keeping the state abbreviation
-- in uppercase
t4 AS (
    SELECT customer_id,
           customer_city,
           customer_state,
           -- remove whitespace in city name
           RTRIM(LTRIM(LOWER(customer_city), ' '), ' ') AS customer_city_cleaned,
           -- remove whitespace in state abbreviation
           RTRIM(LTRIM(UPPER(customer_state), ' '), ' ') AS customer_state_cleaned
    FROM customers.customer_address
),
-- 5. perform INNER JOIN using cleaned customer address and us_cities
-- tables to get geo_location for each customer in the cleaned customer
-- address table. The `INNER JOIN` selects customers whose city/state
-- is in the the database and these customers are eligible to place
-- an order from Virtual Kitchen. Perform the INNER JOIN using the
-- cleaned columns (columns with the _cleaned suffix). However, in the
-- output, exclude columns with the _cleaned suffix. Finally, perform
-- a LEFT JOIN between the the output and the customers_data table in
-- order to capture the customer's first and last name.
t5 AS (
    SELECT t4.customer_id,
           t4.customer_state,
           cd.first_name,
           cd.last_name,
           cd.email,
           t3.geo_location
    FROM t4
    -- JOIN with cleaned and non-duplicated us_cities table
    INNER JOIN t3 ON t4.customer_city_cleaned = t3.city_name_cleaned
    AND t4.customer_state_cleaned = t3.state_abbr_cleaned
    -- JOIN with customer_data table to get first_name and last_name
    LEFT JOIN (
        SELECT customer_id,
               first_name,
               last_name,
               email
        FROM customers.customer_data
    ) AS cd USING (customer_id)
),
-- EXERCISE 2 PART 1
-- 6. clean customer_survey table columns by removing whitespaces and
-- changing the tag_property to lowercase
t6 AS (
    SELECT cs.customer_id,
           rt.tag_property,
           RTRIM(LTRIM(LOWER(rt.tag_property), ' '), ' ') AS tag_property_cleaned
    FROM customers.customer_survey AS cs
    LEFT JOIN resources.recipe_tags AS rt USING (tag_id)
),
-- 7. perform a LEFT JOIN between the eligible customers and the
-- cleaned customer_survey, excluding customers who do not have a food
-- preference selected (where the cleaned tag_property is missing)
t7 AS (
    SELECT t5.customer_id,
           t5.first_name,
           t5.last_name,
           t5.email,
           t6.tag_property_cleaned,
           -- DENSE_RANK numbers each customer's distinct preferences
           -- 1, 2, 3, ... alphabetically without gaps, so a repeated
           -- tag cannot leave one of the pivoted columns empty
           DENSE_RANK() OVER(PARTITION BY customer_id ORDER BY tag_property_cleaned) AS food_preference
    FROM t5
    LEFT JOIN t6 USING (customer_id)
    -- unmatched LEFT JOIN rows carry NULL (pandas displays it as <NA>),
    -- so test for NULL explicitly instead of comparing to a string
    WHERE t6.tag_property_cleaned IS NOT NULL
),
-- 8. reshape from tidy to untidy data using a PIVOT in order to get
-- the food preferences (rows) as three columns and rename the columns
t8 AS (
    SELECT *
    FROM t7
    PIVOT(MIN(tag_property_cleaned) FOR food_preference in (1, 2, 3)) AS p (
        customer_id,
        first_name,
        last_name,
        email,
        food_preference_1,
        food_preference_2,
        food_preference_3
    )
)
SELECT *
FROM t8
```

The SQL for the final query for part 2 from above is shown below

```sql
-- - bring in customers who are eligible to place an order, which was found in exercise 1 part 1
-- exercise 2 part 1
-- - clean tag_property column from the customer_survey table
-- - LEFT JOIN eligible customers with cleaned customer survey & exclude customers without food preference
-- - reshape (PIVOT) the JOINed output from above to get one row per customer with three columns
--   for the three required food preferences
-- exercise 2 part 2
-- - flatten the tag_list from the recipe table to get all tags as separate rows
-- - for each flattened tag, select the last recipe ID (since only one recipe is required per tag)
-- - JOIN the above output with the recipe table again to capture the recipe name for the selected
--   recipe ID
-- - LEFT JOIN the PIVOTed eligible customers with the selected recipe using the
--   first food preference column (customers) and the recipe tag (recipes) to get the recommended recipe
--   for each customers first food preference
--
-- CAPTURE ELIGIBLE CUSTOMERS FROM EXERCISE 1 PART 1
WITH t1 AS (
    SELECT city_id,
           city_name,
           state_abbr,
           -- remove whitespace in city name
           RTRIM(LTRIM(LOWER(city_name), ' '), ' ') AS city_name_cleaned,
           -- remove whitespace in state abbreviation
           RTRIM(LTRIM(UPPER(state_abbr), ' '), ' ') AS state_abbr_cleaned,
           geo_location
    FROM resources.us_cities
),
-- 2. from cleaned us_cities columns, get cities with the first city ID
t2 AS (
    SELECT city_name,
           state_abbr,
           city_name_cleaned,
           state_abbr_cleaned,
           MIN(city_id) AS city_id
    FROM t1
    GROUP BY ALL
),
-- 3. filter original us_cities table to only capture cities with
-- unique city ID in cleaned us_cities columns
t3 AS (
    SELECT t2.city_id,
           t2.city_name,
           t2.state_abbr,
           t1.city_name_cleaned,
           t1.state_abbr_cleaned,
           t1.geo_location
    FROM t1
    INNER JOIN t2 USING (city_id, city_name_cleaned, state_abbr_cleaned)
),
-- 4. clean customer address table columns by removing whitespaces,
-- changing city to lowercase and keeping the state abbreviation
-- in uppercase
t4 AS (
    SELECT customer_id,
           customer_city,
           customer_state,
           -- remove whitespace in city name
           RTRIM(LTRIM(LOWER(customer_city), ' '), ' ') AS customer_city_cleaned,
           -- remove whitespace in state abbreviation
           RTRIM(LTRIM(UPPER(customer_state), ' '), ' ') AS customer_state_cleaned
    FROM customers.customer_address
),
-- 5. perform INNER JOIN using cleaned customer address and us_cities
-- tables to get geo_location for each customer in the cleaned customer
-- address table. The `INNER JOIN` selects customers whose city/state
-- is in the the database and these customers are eligible to place
-- an order from Virtual Kitchen. Perform the INNER JOIN using the
-- cleaned columns (columns with the _cleaned suffix). However, in the
-- output, exclude columns with the _cleaned suffix. Finally, perform
-- a LEFT JOIN between the the output and the customers_data table in
-- order to capture the customer's first and last name.
t5 AS (
    SELECT t4.customer_id,
           t4.customer_state,
           cd.first_name,
           cd.last_name,
           cd.email,
           t3.geo_location
    FROM t4
    -- JOIN with cleaned and non-duplicated us_cities table
    INNER JOIN t3 ON t4.customer_city_cleaned = t3.city_name_cleaned
    AND t4.customer_state_cleaned = t3.state_abbr_cleaned
    -- JOIN with customer_data table to get first_name and last_name
    LEFT JOIN (
        SELECT customer_id,
               first_name,
               last_name,
               email
        FROM customers.customer_data
    ) AS cd USING (customer_id)
),
-- EXERCISE 2 PART 1
-- 6. clean customer_survey table columns by removing whitespaces and
-- changing the tag_property to lowercase
t6 AS (
    SELECT cs.customer_id,
           rt.tag_property,
           RTRIM(LTRIM(LOWER(rt.tag_property), ' '), ' ') AS tag_property_cleaned
    FROM customers.customer_survey AS cs
    LEFT JOIN resources.recipe_tags AS rt USING (tag_id)
),
-- 7. perform a LEFT JOIN between the eligible customers and the
-- cleaned customer_survey, excluding customers who do not have a food
-- preference selected (where the cleaned tag_property is missing)
t7 AS (
    SELECT t5.customer_id,
           t5.first_name,
           t5.last_name,
           t5.email,
           t6.tag_property_cleaned,
           -- DENSE_RANK numbers each customer's distinct preferences
           -- 1, 2, 3, ... alphabetically without gaps, so a repeated
           -- tag cannot leave one of the pivoted columns empty
           DENSE_RANK() OVER(PARTITION BY customer_id ORDER BY tag_property_cleaned) AS food_preference
    FROM t5
    LEFT JOIN t6 USING (customer_id)
    -- unmatched LEFT JOIN rows carry NULL (pandas displays it as <NA>),
    -- so test for NULL explicitly instead of comparing to a string
    WHERE t6.tag_property_cleaned IS NOT NULL
),
-- 8. reshape from tidy to untidy data using a PIVOT in order to get
-- the three food preferences (rows) as three columns and rename the columns
t8 AS (
    SELECT *
    FROM t7
    PIVOT(MIN(tag_property_cleaned) FOR food_preference in (1, 2, 3)) AS p (
        customer_id,
        first_name,
        last_name,
        email,
        food_preference_1,
        food_preference_2,
        food_preference_3
    )
),
-- EXERCISE 2 PART 2
-- 9. flatten the tag_list (list of food preferences) from the recipe
-- table and clean the flattened version
t9 AS (
    SELECT rc.recipe_id,
           rc_flat.value AS recipe_tag,
           RTRIM(LTRIM(LOWER(rc_flat.value), ' '), ' ') AS recipe_tag_cleaned
    FROM chefs.recipe AS rc, table(flatten(tag_list)) AS rc_flat
),
-- 10. for each cleaned and flattened recipe tag, get one recipe ID
-- (use the last recipe ID)
t10 AS (
    SELECT t9.recipe_tag_cleaned,
           MAX(t9.recipe_id) AS recipe_id
    FROM t9
    GROUP BY recipe_tag_cleaned
),
-- 11. for the selected recipe ID (per recipe tag), get the recipe name
t11 AS (
    SELECT t10.recipe_tag_cleaned,
           rc.recipe_name
    FROM t10
    LEFT JOIN chefs.recipe AS rc USING (recipe_id)
),
-- 12. LEFT JOIN pivoted food preferences for each eligible customer
-- with the selected recipe. The recipe tags are the same as the
-- food preferences. Since a recipe is only required for the first
-- food preference, the JOIN should be performed using the first food
-- preference column (LHS of JOIN) and recipe tag (RHS of JOIN)
t12 AS (
    SELECT *
    FROM t8
    LEFT JOIN t11 ON t8.food_preference_1 = t11.recipe_tag_cleaned
)
SELECT *
FROM t12
-- sort in the outermost query: an ORDER BY inside a CTE is not
-- guaranteed to carry through to the final result
ORDER BY email
```

## Disconnect

In [11]:
# Close the cursor first, then the connection it was created from
cur.close()
conn.close()