# Catchall

This notebook creates the `Meal` and `Snack` tables in the staging area. Both source tables in the raw area contain a significant number of duplicate records. For each table, we first determine how the records are duplicated and then decide how to get rid of the duplicates when creating the staging tables.

# Meal

In [None]:
%%bigquery
-- Peek at a handful of raw meal rows (no ORDER BY, so the sample is arbitrary).
SELECT *
FROM airline_raw.meals
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,meal_id,meal_name,meal_image,cat_name,tags,area,ingredient1,ingredient2,ingredient3,ingredient4,ingredient5,source,youtube,load_time
0,52968,Mbuzi Choma (Roasted Goat),https://www.themealdb.com/images/media/meals/c...,Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,,,2024-01-26 22:24:02.252144+00:00
1,52968,Mbuzi Choma (Roasted Goat),https://www.themealdb.com/images/media/meals/c...,Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,,,2024-01-26 22:24:02.252144+00:00
2,52968,Mbuzi Choma (Roasted Goat),https://www.themealdb.com/images/media/meals/c...,Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,,,2024-01-26 22:24:02.252144+00:00
3,52968,Mbuzi Choma (Roasted Goat),https://www.themealdb.com/images/media/meals/c...,Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,,,2024-01-26 22:24:02.252144+00:00
4,52968,Mbuzi Choma (Roasted Goat),https://www.themealdb.com/images/media/meals/c...,Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,,,2024-01-26 22:24:02.252144+00:00


In [None]:
%%bigquery
-- Total rows vs. distinct meal_id values; the gap between the two is the duplicate volume.
SELECT
  COUNT(*) AS raw_meal_total_count,
  COUNT(DISTINCT meal_id) AS raw_meal_distinct_count
FROM airline_raw.meals

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,raw_meal_total_count,raw_meal_distinct_count
0,3322,302


To place the `data_source` field before the `load_time` field, we use the `SELECT * EXCEPT` modifier to remove `load_time` from `*` and then list it again after `data_source`.

In [None]:
%%bigquery
-- Build the staging Meal table from the distinct raw rows.
-- load_time is dropped from * and re-listed last so that data_source sits just before it.
CREATE OR REPLACE TABLE airline_stg.Meal AS
SELECT DISTINCT
  * EXCEPT (load_time),
  'mealdb' AS data_source,
  load_time
FROM airline_raw.meals

Query is running:   0%|          |

In [None]:
%%bigquery
-- Sample a few rows of the new staging table to confirm the column layout.
SELECT *
FROM airline_stg.Meal
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,meal_id,meal_name,meal_image,cat_name,tags,area,ingredient1,ingredient2,ingredient3,ingredient4,ingredient5,source,youtube,data_source,load_time
0,52827,Massaman Beef curry,https://www.themealdb.com/images/media/meals/t...,Beef,Curry,Thai,Peanuts,Coconut cream,Massaman curry paste,Beef,Potatoes,https://www.bbcgoodfood.com/recipes/420631/bee...,https://www.youtube.com/watch?v=mVxgZSCU9_g,mealdb,2024-01-26 22:24:02.252144+00:00
1,52979,Bitterballen (Dutch meatballs),https://www.themealdb.com/images/media/meals/l...,Beef,"DinnerParty,HangoverFood,Alcoholic",Dutch,Butter,Flour,Beef Stock,Onion,Parsley,https://www.holland.com/global/tourism/informa...,https://www.youtube.com/watch?v=q8AKfYUtDuM,mealdb,2024-01-26 22:24:02.252144+00:00
2,53006,Moussaka,https://www.themealdb.com/images/media/meals/c...,Beef,,Greek,Beef,Aubergine,Greek Yogurt,Egg,Parmesan,https://www.bbcgoodfood.com/recipes/must-make-...,https://www.youtube.com/watch?v=8U_29i9Qp5U,mealdb,2024-01-26 22:24:02.252144+00:00
3,53000,Vegetable Shepherd's Pie,https://www.themealdb.com/images/media/meals/w...,Beef,Alcoholic,Irish,Potatoes,Small Potatoes,Salted Butter,Mushrooms,Brown Lentils,,,mealdb,2024-01-26 22:24:02.252144+00:00
4,52781,Irish stew,https://www.themealdb.com/images/media/meals/s...,Beef,"Stew,Meat",Irish,whole wheat,lamb loin chops,olive oil,shallots,carrots,http://www.ottolenghi.co.uk/recipes/meat/irish...,https://www.youtube.com/watch?v=kYH2qJXnSMo,mealdb,2024-01-26 22:24:02.252144+00:00


In [None]:
%%bigquery
-- Row counts after (staging) and before (raw) de-duplication, side by side.
SELECT
  (SELECT COUNT(*) FROM airline_stg.Meal) AS stg_meal_records,
  (SELECT COUNT(*) FROM airline_raw.meals) AS raw_meal_records

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,stg_meal_records,raw_meal_records
0,302,3322


## Primary Key

In [None]:
%%bigquery
-- Declare meal_id as the primary key of the staging table.
-- NOT ENFORCED: the constraint is recorded as metadata only.
ALTER TABLE airline_stg.Meal
  ADD PRIMARY KEY (meal_id) NOT ENFORCED

Query is running:   0%|          |

In [None]:
%%bigquery
-- List every meal_id that occurs more than once; an empty result means meal_id is unique.
SELECT
  meal_id,
  COUNT(*) AS duplicate_records
FROM airline_stg.Meal
GROUP BY meal_id
HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,meal_id,duplicate_records


# Snacks

In [None]:
%%bigquery
-- Peek at a handful of raw snack rows (no ORDER BY, so the sample is arbitrary).
SELECT *
FROM airline_raw.snacks
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,url,product_name,brands,categories,countries_en,ingredients_text,image_url,load_time
0,8606014406504,http://world-en.openfoodfacts.org/product/8606...,,,snacks,,idk,https://images.openfoodfacts.org/images/produc...,2024-02-03 17:32:09.469541+00:00
1,7311041080993,http://world-en.openfoodfacts.org/product/7311...,,,snacks,,"Hvetemel, salt 3,6 %, rapsolje, surhetsreguler...",https://images.openfoodfacts.org/images/produc...,2024-02-03 17:32:09.469541+00:00
2,4901561213587,http://world-en.openfoodfacts.org/product/4901...,焼するめ,,"en:Snacks, en:Salty snacks",,いか (中国)、砂糖、食塩、乳糖/ソルビット、調味料(ア ミノ酸等)、酸味料、リン酸塩(Na...,,2024-02-03 17:32:09.469541+00:00
3,5000159504355,http://world-en.openfoodfacts.org/product/5000...,Peanut Chocolate Treat Bag,,"Snacks, en:Confiseries, en:Snacks sucrés",,R zoz 82 20:00 ¥2170/20 000159 504355&quot;&gt;,https://images.openfoodfacts.org/images/produc...,2024-02-03 17:32:09.469541+00:00
4,4009900526838,http://world-en.openfoodfacts.org/product/4009...,Minis,,"Snacks, Sweet snacks, Confectioneries, Candies",,"sugar, glucose syrup, palm fat, acid citric ac...",https://images.openfoodfacts.org/images/produc...,2024-02-03 17:32:09.469541+00:00


In [None]:
%%bigquery
-- Total rows, distinct codes, and their difference (rows beyond the first per code).
SELECT
  COUNT(*) AS raw_snack_total_count,
  COUNT(DISTINCT code) AS raw_snack_distinct_count,
  COUNT(*) - COUNT(DISTINCT code) AS duplicate_count
FROM airline_raw.snacks

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,raw_snack_total_count,raw_snack_distinct_count,duplicate_count
0,226831,226780,51


In [None]:
%%bigquery
-- First five codes (in code order) that appear on more than one raw row.
SELECT
  code,
  COUNT(*) AS duplicate_count
FROM airline_raw.snacks
GROUP BY code
HAVING COUNT(*) > 1
ORDER BY code
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,duplicate_count
0,1.0841052523204612e+21,2
1,10.0,2
2,1142273.0,2
3,11778.0,2
4,130509.0,2


In [None]:
%%bigquery
-- Compare creation / modification timestamps for a few duplicated codes,
-- newest modification first within each code.
SELECT
  code,
  created_datetime,
  last_modified_datetime
FROM airline_raw.snacks
WHERE code IN ('10', '11778', '130509', '502566', '783033')
ORDER BY
  code,
  last_modified_datetime DESC

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,created_datetime,last_modified_datetime
0,10,2019-06-28 13:22:36+00:00,2023-12-16 12:18:25+00:00
1,10,2016-10-20 07:19:01+00:00,2023-04-29 11:35:19+00:00
2,11778,2016-09-21 20:19:09+00:00,2022-02-11 08:26:09+00:00
3,11778,2017-03-31 15:58:35+00:00,2022-02-10 15:27:26+00:00
4,130509,2019-06-28 10:16:34+00:00,2023-09-06 12:34:25+00:00
5,130509,2019-04-23 20:28:51+00:00,2022-02-11 03:38:22+00:00
6,502566,2018-04-22 07:47:05+00:00,2023-10-27 10:37:21+00:00
7,502566,2021-12-27 11:12:30+00:00,2023-08-25 09:55:29+00:00
8,783033,2023-08-27 23:35:42+00:00,2023-10-26 18:47:39+00:00
9,783033,2018-03-09 08:51:18+00:00,2023-09-27 11:20:44+00:00


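Use a window function to [rank](https://cloud.google.com/bigquery/docs/reference/standard-sql/window-function-calls#compute_rank) the snacks within each code by their last modified date, most recent first. Keeping only the rows ranked 1 filters out the older duplicates. Note that `RANK()` assigns the same rank to ties, so rows that share both a code and a last modified date all receive rank 1 and have to be handled separately.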

In [None]:
%%bigquery
-- Demonstrate the ranking on a few duplicated codes: rank 1 is the most recently modified row.
SELECT
  RANK() OVER newest_first AS rank,
  code,
  product_name,
  last_modified_datetime
FROM airline_raw.snacks
WHERE code IN ('10', '11778', '130509', '502566')
WINDOW newest_first AS (PARTITION BY code ORDER BY last_modified_datetime DESC)
ORDER BY code;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,code,product_name,last_modified_datetime
0,1,10,Vegan 3K-Protein Cheesecake,2023-12-16 12:18:25+00:00
1,2,10,Madeleines nature,2023-04-29 11:35:19+00:00
2,1,11778,All Butter Reduced Fat Stem Ginger Cookies,2022-02-11 08:26:09+00:00
3,2,11778,Stem Ginger Cookies,2022-02-10 15:27:26+00:00
4,1,130509,Wheat Biscuits,2023-09-06 12:34:25+00:00
5,2,130509,24 wholewheat biscuits,2022-02-11 03:38:22+00:00
6,1,502566,Buttery Croissants,2023-10-27 10:37:21+00:00
7,2,502566,all butter croissants,2023-08-25 09:55:29+00:00


Create the intermediate staging table `snacks` using the `rank()` function:

In [None]:
%%bigquery
-- Intermediate staging table: for each code keep the row(s) with the most recent
-- last_modified_datetime. RANK() gives tied rows the same rank, so ties all survive here.
-- code is renamed to snack_id, and load_time is re-listed last so data_source precedes it.
CREATE OR REPLACE TABLE airline_stg.snacks AS
WITH ranked_snacks AS (
  SELECT
    RANK() OVER (PARTITION BY code ORDER BY last_modified_datetime DESC) AS rank,
    *
  FROM airline_raw.snacks
)
SELECT
  code AS snack_id,
  * EXCEPT (code, rank, load_time),
  'open_food_facts' AS data_source,
  load_time
FROM ranked_snacks
WHERE rank = 1

Query is running:   0%|          |

In [None]:
%%bigquery
-- Row counts of the intermediate staging table and of the raw table it was built from.
SELECT
  (SELECT COUNT(*) FROM airline_stg.snacks) AS intermediate_stg_count,
  (SELECT COUNT(*) FROM airline_raw.snacks) AS raw_count

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,intermediate_stg_count,raw_count
0,226782,226831


In [None]:
%%bigquery
-- Spot check: the previously duplicated codes should now have a single row each.
SELECT *
FROM airline_stg.snacks
WHERE snack_id IN ('10', '11778', '130509', '502566')
ORDER BY snack_id;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,image_url,created_datetime,last_modified_datetime,data_source,load_time
0,10,http://world-en.openfoodfacts.org/product/0010...,Vegan 3K-Protein Cheesecake,Nutri+,"Snacks, Sweet snacks, Biscuits and cakes, Cake...","France,Germany,Ireland",,https://images.openfoodfacts.org/images/produc...,2019-06-28 13:22:36+00:00,2023-12-16 12:18:25+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
1,11778,http://world-en.openfoodfacts.org/product/0011...,All Butter Reduced Fat Stem Ginger Cookies,Marks & Spencer,"Snacks, Snacks sucrés, Biscuits et gâteaux, Bi...","France,United Kingdom",Farine de _blé_ (contient _Gluten_) (avec Fari...,https://images.openfoodfacts.org/images/produc...,2016-09-21 20:19:09+00:00,2022-02-11 08:26:09+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
2,130509,http://world-en.openfoodfacts.org/product/0013...,Wheat Biscuits,By Sainsbury's,"Plant-based foods and beverages, Plant-based f...",United Kingdom,"British Wholegrain Wheat (95%), Malted Barley ...",https://images.openfoodfacts.org/images/produc...,2019-06-28 10:16:34+00:00,2023-09-06 12:34:25+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
3,502566,http://world-en.openfoodfacts.org/product/0050...,Buttery Croissants,Marks & Spencer,"Snacks, Sweet snacks, Viennoiseries, Croissant...",France,Not suitabl Wheatflour (contains Gluten). Butt...,,2018-04-22 07:47:05+00:00,2023-10-27 10:37:21+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00


In [None]:
%%bigquery
-- NOTE(review): this cell repeats the preceding spot check verbatim.
SELECT
  *
FROM
  airline_stg.snacks
WHERE
  snack_id IN ('10', '11778', '130509', '502566')
ORDER BY
  snack_id;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,image_url,created_datetime,last_modified_datetime,data_source,load_time
0,10,http://world-en.openfoodfacts.org/product/0010...,Vegan 3K-Protein Cheesecake,Nutri+,"Snacks, Sweet snacks, Biscuits and cakes, Cake...","France,Germany,Ireland",,https://images.openfoodfacts.org/images/produc...,2019-06-28 13:22:36+00:00,2023-12-16 12:18:25+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
1,11778,http://world-en.openfoodfacts.org/product/0011...,All Butter Reduced Fat Stem Ginger Cookies,Marks & Spencer,"Snacks, Snacks sucrés, Biscuits et gâteaux, Bi...","France,United Kingdom",Farine de _blé_ (contient _Gluten_) (avec Fari...,https://images.openfoodfacts.org/images/produc...,2016-09-21 20:19:09+00:00,2022-02-11 08:26:09+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
2,130509,http://world-en.openfoodfacts.org/product/0013...,Wheat Biscuits,By Sainsbury's,"Plant-based foods and beverages, Plant-based f...",United Kingdom,"British Wholegrain Wheat (95%), Malted Barley ...",https://images.openfoodfacts.org/images/produc...,2019-06-28 10:16:34+00:00,2023-09-06 12:34:25+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
3,502566,http://world-en.openfoodfacts.org/product/0050...,Buttery Croissants,Marks & Spencer,"Snacks, Sweet snacks, Viennoiseries, Croissant...",France,Not suitabl Wheatflour (contains Gluten). Butt...,,2018-04-22 07:47:05+00:00,2023-10-27 10:37:21+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00


In [None]:
%%bigquery
-- snack_ids that still occur more than once in the intermediate table.
-- The count column is intentionally left unaliased, as in the original query.
SELECT
  snack_id,
  COUNT(*)
FROM airline_stg.snacks
GROUP BY snack_id
HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,f0_
0,8424465927603,2
1,7071688002962,2


In [None]:
%%bigquery
-- Full rows for the two snack_ids that are still duplicated.
SELECT *
FROM airline_stg.snacks
WHERE snack_id IN ('8424465927603', '7071688002962')

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,image_url,created_datetime,last_modified_datetime,data_source,load_time
0,8424465927603,http://world-en.openfoodfacts.org/product/8424...,Croissant Integral Realfooding,,"Botanas,Snacks dulces,Viennoiseries,Croissants",Spain,"Harina de TRIGO integral 40%, preparado graso ...",https://images.openfoodfacts.org/images/produc...,2022-06-10 08:41:42+00:00,2022-06-13 09:01:09+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
1,8424465927603,http://world-en.openfoodfacts.org/product/8424...,Croissant Integral Realfooding,,"Botanas, Snacks dulces, Viennoiseries, Croissants",Spain,"Harina de TRIGO integral 40%, preparado graso ...",https://images.openfoodfacts.org/images/produc...,2022-06-10 08:41:42+00:00,2022-06-13 09:01:09+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
2,7071688002962,http://world-en.openfoodfacts.org/product/7071...,Tortilla Fyldig Ost,Sørlands Chips,"Snacks,Salty snacks,Appetizers,Chips and fries...",Norway,"Hvit mais (68 %), solsikkeolje og krydder (glu...",https://images.openfoodfacts.org/images/produc...,2021-07-08 10:01:18+00:00,2021-07-13 14:26:35+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
3,7071688002962,http://world-en.openfoodfacts.org/product/7071...,Tortilla Fyldig Ost,Sørlands Chips,"en:Snacks, en:Salty snacks, en:Appetizers, en:...",Norway,"Hvit mais (68 %), solsikkeolje og krydder (glu...",https://images.openfoodfacts.org/images/produc...,2021-07-08 10:01:18+00:00,2021-07-13 14:26:35+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00


In [None]:
%%bigquery
-- Ignore categories: if the rows collapse to one per snack_id, categories is the only differing column.
SELECT DISTINCT
  * EXCEPT (categories)
FROM airline_stg.snacks
WHERE snack_id IN ('8424465927603', '7071688002962')

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,countries_en,ingredients_text,image_url,created_datetime,last_modified_datetime,data_source,load_time
0,8424465927603,http://world-en.openfoodfacts.org/product/8424...,Croissant Integral Realfooding,,Spain,"Harina de TRIGO integral 40%, preparado graso ...",https://images.openfoodfacts.org/images/produc...,2022-06-10 08:41:42+00:00,2022-06-13 09:01:09+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00
1,7071688002962,http://world-en.openfoodfacts.org/product/7071...,Tortilla Fyldig Ost,Sørlands Chips,Norway,"Hvit mais (68 %), solsikkeolje og krydder (glu...",https://images.openfoodfacts.org/images/produc...,2021-07-08 10:01:18+00:00,2021-07-13 14:26:35+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00


In [None]:
%%bigquery
-- Show just the categories values of the still-duplicated rows.
SELECT categories
FROM airline_stg.snacks
WHERE snack_id IN ('8424465927603', '7071688002962')

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,categories
0,"Botanas,Snacks dulces,Viennoiseries,Croissants"
1,"Botanas, Snacks dulces, Viennoiseries, Croissants"
2,"Snacks,Salty snacks,Appetizers,Chips and fries..."
3,"en:Snacks, en:Salty snacks, en:Appetizers, en:..."


In [None]:
%%bigquery
-- snack_ids that still occur more than once in the intermediate table.
-- This must read airline_stg.snacks: the final airline_stg.Snack table is only created
-- further down, so querying it here fails on a fresh run (or reads a stale table).
SELECT
  snack_id,
  COUNT(*)
FROM airline_stg.snacks
GROUP BY snack_id
HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,f0_
0,8424465927603,2
1,7071688002962,2


In [None]:
%%bigquery
-- Rank the remaining duplicates by categories (descending) to see which row rank 1 would keep.
SELECT
  RANK() OVER (PARTITION BY snack_id ORDER BY categories DESC) AS rank,
  snack_id,
  product_name,
  categories,
  last_modified_datetime
FROM airline_stg.snacks
WHERE snack_id IN ('8424465927603', '7071688002962')
ORDER BY snack_id;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,rank,snack_id,product_name,categories,last_modified_datetime
0,1,7071688002962,Tortilla Fyldig Ost,"en:Snacks, en:Salty snacks, en:Appetizers, en:...",2021-07-13 14:26:35+00:00
1,2,7071688002962,Tortilla Fyldig Ost,"Snacks,Salty snacks,Appetizers,Chips and fries...",2021-07-13 14:26:35+00:00
2,1,8424465927603,Croissant Integral Realfooding,"Botanas,Snacks dulces,Viennoiseries,Croissants",2022-06-13 09:01:09+00:00
3,2,8424465927603,Croissant Integral Realfooding,"Botanas, Snacks dulces, Viennoiseries, Croissants",2022-06-13 09:01:09+00:00


In [None]:
%%bigquery
-- Final staging table: exactly one row per snack_id, preferring the row whose categories
-- value sorts last (ORDER BY categories DESC).
-- ROW_NUMBER() is used instead of RANK(): RANK() gives tied rows (equal or NULL categories)
-- the same rank, so they would all pass the "= 1" filter and break primary-key uniqueness.
-- ROW_NUMBER() always numbers rows 1..n within a snack_id; among exact ties the surviving
-- row is arbitrary.
CREATE OR REPLACE TABLE airline_stg.Snack AS
SELECT
  * EXCEPT (row_num)
FROM (
  SELECT
    ROW_NUMBER() OVER (PARTITION BY snack_id ORDER BY categories DESC) AS row_num,
    *
  FROM airline_stg.snacks
)
WHERE row_num = 1

Query is running:   0%|          |

In [None]:
%%bigquery
-- Confirm the final table has no repeated snack_id (expected: empty result).
-- The count column is intentionally left unaliased, as in the original query.
SELECT
  snack_id,
  COUNT(*)
FROM airline_stg.Snack
GROUP BY snack_id
HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,snack_id,f0_


In [None]:
%%bigquery
-- Row counts of the intermediate table and of the final staging table.
SELECT
  (SELECT COUNT(*) FROM airline_stg.snacks) AS intermediate_stg_count,
  (SELECT COUNT(*) FROM airline_stg.Snack) AS final_stg_count

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,intermediate_stg_count,final_stg_count
0,226782,226780


## Primary Key

In [None]:
%%bigquery
-- Declare snack_id as the primary key of the final staging table.
-- NOT ENFORCED: the constraint is recorded as metadata only.
ALTER TABLE airline_stg.Snack
  ADD PRIMARY KEY (snack_id) NOT ENFORCED

Query is running:   0%|          |

In [None]:
%%bigquery
-- List every snack_id that occurs more than once; an empty result means snack_id is unique.
SELECT
  snack_id,
  COUNT(*) AS duplicate_records
FROM airline_stg.Snack
GROUP BY snack_id
HAVING COUNT(*) > 1

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,snack_id,duplicate_records


## Cleanup

In [None]:
%%bigquery
-- Remove the intermediate table now that airline_stg.Snack has been built from it.
-- IF EXISTS keeps this cell safe to re-run after the table is already gone.
DROP TABLE IF EXISTS airline_stg.snacks

Query is running:   0%|          |