## Data Enrichment with AI

### Setup

##### Create a dataset for storing the AI tables and another for storing the AI models

In [13]:
%%bigquery
-- Dataset that stores the raw and formatted AI prediction tables.
-- "if not exists" keeps this cell re-runnable once the dataset has been created.
create schema if not exists `cs329e-sp2024`.airline_stg_ai

Query is running:   0%|          |

In [2]:
%%bigquery
-- Dataset that stores the remote model definitions.
-- "if not exists" keeps this cell re-runnable once the dataset has been created.
create schema if not exists `cs329e-sp2024`.remote_models

Query is running:   0%|          |

##### Before running this cell, create the remote connection and assign the IAM role `Vertex AI User` to the service account associated with the connection.

In [44]:
%%bigquery
-- (Re)creates the remote model remote_models.gemini_pro, which targets the
-- 'gemini-pro' endpoint through the vertex_connection connection.
create or replace model remote_models.gemini_pro
remote with connection `projects/cs329e-sp2024/locations/us/connections/vertex_connection`
options (
  endpoint = 'gemini-pro'
);

Query is running:   0%|          |

### Part 1: Predict the subcategory of a snack

In [46]:
%%bigquery
-- Peek at five snacks that have an ingredient list, leaving out the
-- image and load-bookkeeping columns.
select
  * except (
    image_url,
    created_time,
    last_modified_time,
    data_source,
    load_time
  )
from airline_stg.Snack
where ingredients_text is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text
0,660901066729,http://world-en.openfoodfacts.org/product/0660...,Chocolate Chip Walnut bar,WellBean,Snacks,,"ORGANIC DATES, NAVY BEANS, WALNUTS, ALMONDS, O..."
1,815369010542,http://world-en.openfoodfacts.org/product/0815...,Cheddar Cheese Baked Corn Curls,Cadia,Snacks,,"CORNMEAL, EXPELLER PRESSED VEGETABLE OIL (SUNF..."
2,5059512742484,http://world-en.openfoodfacts.org/product/5059...,10 Meat Free Bangers in Duvets,Tesco,Snacks,,"Water, Wheat Flour (Wheat Flour, Calcium Carbo..."
3,8906125471782,http://world-en.openfoodfacts.org/product/8906...,Finger Stix,Star,Snacks,,"Corn Flour, Potato Powder, Refined Palmolein O..."
4,7311041080993,http://world-en.openfoodfacts.org/product/7311...,,,snacks,,"Hvetemel, salt 3,6 %, rapsolje, surhetsreguler..."


#### Test the generate_text function

In [47]:
%%bigquery
-- Trial run: send the first 10 snacks (ordered by snack_id) to the model
-- and return every output column unchanged.
declare prompt_query string
  default "Suggest a subcategory for each snack. Return the output as json, include the snack_id in the output";

select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- Each prompt is the instruction immediately followed by one snack
    -- serialized as a JSON object.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "url", url,
            "brands", brands,
            "product_name", product_name,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack
    order by snack_id
    limit 10
  ),
  struct(true as flatten_json_output)
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"```json\n{\n ""snacks"": [\n {\n ""snack...",,,Suggest a subcategory for each snack. Return t...
1,"```json\n{\n ""brands"": ""Coup de Pâtes"",\n ""c...",,,Suggest a subcategory for each snack. Return t...
2,"```json\n{\n ""brands"": ""CANDY FLOSS"",\n ""cat...",,,Suggest a subcategory for each snack. Return t...
3,"```json\n{\n ""snack_id"": ""1.0500016938604001e...",,,Suggest a subcategory for each snack. Return t...
4,"```json\n{\n ""brands"": null,\n ""categories"":...",,,Suggest a subcategory for each snack. Return t...
5,"```json\n{\n ""brands"": null,\n ""categories"":...",,,Suggest a subcategory for each snack. Return t...
6,"```json\n{\n ""Botanas"": {\n ""snack_id"": ""1...",,,Suggest a subcategory for each snack. Return t...
7,"```json\n{\n ""brands"": null,\n ""categories"":...",,,Suggest a subcategory for each snack. Return t...
8,"```json\n{\n ""brands"": null,\n ""categories"":...",,,Suggest a subcategory for each snack. Return t...
9,"```json\n{\n ""snacks"": {\n ""subcategory"": ...",,,Suggest a subcategory for each snack. Return t...


#### Tweak the prompt and save the output
##### [More details](https://cloud.google.com/bigquery/docs/generate-text#generate_text_from_text_data_by_using_a_prompt_from_a_query) on `ML.generate_text` parameters

In [48]:
%%bigquery
-- Same request as the trial run, but with a stricter prompt (only snack_id and
-- subcategory in the answer); the result is saved to
-- airline_stg_ai.subcategory_predictions_raw_10.
declare prompt_query string
  default "Suggest a subcategory for each snack. Return the output as json, include only the snack_id and subcategory in the output";

create or replace table airline_stg_ai.subcategory_predictions_raw_10 as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- One prompt per snack: instruction text followed by the snack as JSON.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "url", url,
            "brands", brands,
            "product_name", product_name,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack
    order by snack_id
    limit 10
  ),
  -- Sampling settings: temperature 0, top_p 0.0, top_k 1 -- presumably chosen
  -- to keep responses as repeatable as possible.
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [50]:
%%bigquery
-- Show each saved model response next to the prompt that was sent.
select
  ml_generate_text_llm_result,
  prompt
from airline_stg_ai.subcategory_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"```json\n{\n ""snack_id"": ""1.0841214700531211e...",Suggest a subcategory for each snack. Return t...
1,"```json\n{\n ""snack_id"": ""1.084105251179851e+...",Suggest a subcategory for each snack. Return t...
2,"```json\n{""snack_id"":""1.0479006121e+19"",""subca...",Suggest a subcategory for each snack. Return t...
3,"```json\n{\n ""snack_id"": ""1.084105252184841e+...",Suggest a subcategory for each snack. Return t...
4,"```json\n{""snack_id"":""1"",""subcategory"":""Cookie...",Suggest a subcategory for each snack. Return t...
5,"```json\n{\n ""snack_id"": ""1.036043802595381e+...",Suggest a subcategory for each snack. Return t...
6,"```json\n{\n ""snack_id"": ""1.0500016938604001e...",Suggest a subcategory for each snack. Return t...
7,"```json\n{\n ""snack_id"": ""1.0841052523204611e...",Suggest a subcategory for each snack. Return t...
8,"```json\n{\n ""snack_id"": ""1.0843506381328111e...",Suggest a subcategory for each snack. Return t...
9,"```json\n{\n ""snack_id"": ""1.0841052512999511e...",Suggest a subcategory for each snack. Return t...


#### Format the output to proper json

In [51]:
%%bigquery
-- Preview the cleanup side by side with the raw response: the ```json and ```
-- fence markers and the newline characters are removed, then the result is trimmed.
select
  ml_generate_text_llm_result,
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as formated_result
from airline_stg_ai.subcategory_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,formated_result
0,"```json\n{\n ""snack_id"": ""1.0841214700531211e...","{ ""snack_id"": ""1.0841214700531211e+20"", ""sub..."
1,"```json\n{\n ""snack_id"": ""1.084105251179851e+...","{ ""snack_id"": ""1.084105251179851e+21"", ""subc..."
2,"```json\n{""snack_id"":""1.0479006121e+19"",""subca...","{""snack_id"":""1.0479006121e+19"",""subcategory"":""..."
3,"```json\n{\n ""snack_id"": ""1.084105252184841e+...","{ ""snack_id"": ""1.084105252184841e+21"", ""subc..."
4,"```json\n{""snack_id"":""1"",""subcategory"":""Cookie...","{""snack_id"":""1"",""subcategory"":""Cookies""}"
5,"```json\n{\n ""snack_id"": ""1.036043802595381e+...","{ ""snack_id"": ""1.036043802595381e+43"", ""subc..."
6,"```json\n{\n ""snack_id"": ""1.0500016938604001e...","{ ""snack_id"": ""1.0500016938604001e+19"", ""sub..."
7,"```json\n{\n ""snack_id"": ""1.0841052523204611e...","{ ""snack_id"": ""1.0841052523204611e+21"", ""sub..."
8,"```json\n{\n ""snack_id"": ""1.0843506381328111e...","{ ""snack_id"": ""1.0843506381328111e+21"", ""sub..."
9,"```json\n{\n ""snack_id"": ""1.0841052512999511e...","{ ""snack_id"": ""1.0841052512999511e+21"", ""sub..."


In [52]:
%%bigquery
-- Persist the cleaned responses (fence markers and newlines removed, trimmed)
-- under the original column name.
create or replace table airline_stg_ai.subcategory_predictions_formatted_10 as
select
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from airline_stg_ai.subcategory_predictions_raw_10

Query is running:   0%|          |

In [53]:
%%bigquery
-- Pull the two fields of interest out of each cleaned JSON response.
select
  json_value(ml_generate_text_llm_result, '$.snack_id') as snack_id,
  json_value(ml_generate_text_llm_result, '$.subcategory') as subcategory
from airline_stg_ai.subcategory_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,subcategory
0,1.0841214700531212e+20,Bonbons
1,1.0843506381328111e+21,Bonbons
2,1.0479006121e+19,CANDY FLOSS
3,1.0500016938604e+19,Sausage rolls
4,1.0,Cookies
5,1.0841052512999512e+21,Bonbons
6,1.0841052523204612e+21,Botanas
7,1.084105251179851e+21,Golosinas
8,1.036043802595381e+43,Pains aux raisins
9,1.084105252184841e+21,Golosinas


#### Add the subcategory field to the Snack table

In [54]:
%%bigquery
-- Adds the column that receives the predicted subcategory.
-- "if not exists" lets the cell be re-run without failing on the existing column.
alter table airline_stg.Snack
  add column if not exists subcategory string;

Query is running:   0%|          |

#### Update the Snack records with the predicted subcategory

In [55]:
%%bigquery
-- Copy each predicted subcategory onto the Snack row with the same snack_id.
-- Written as update ... from so that only snacks that have a prediction are
-- modified; rows without one keep their current subcategory instead of being
-- overwritten with null by a table-wide "where 1=1" update.
update airline_stg.Snack s
  set subcategory = json_value(p.ml_generate_text_llm_result, '$.subcategory')
  from airline_stg_ai.subcategory_predictions_formatted_10 p
  where s.snack_id = json_value(p.ml_generate_text_llm_result, '$.snack_id')

Query is running:   0%|          |

#### Inspect the output

In [56]:
%%bigquery
-- List the snacks that now carry a subcategory, without the URL,
-- image and load-bookkeeping columns.
select
  * except (
    url,
    image_url,
    created_time,
    last_modified_time,
    data_source,
    load_time
  )
from airline_stg.Snack
where subcategory is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,brands,categories,countries_en,ingredients_text,subcategory
0,1.0,Test Cookie,MTR,Snacks,Ukraine,,Cookies
1,1.0841052523204612e+21,Fini Fun,,"Botanas, Snacks dulces, Dulces",Spain,,Botanas
2,1.084105251179851e+21,Abichuela golisina,,"Botanas, Snacks dulces, Dulces, Golosinas",Spain,,Golosinas
3,1.084105252184841e+21,Geles dulce pica,Fini,"Botanas, Snacks dulces, Dulces, Golosinas",Spain,,Golosinas
4,1.0479006121e+19,Zuckerwatte,CANDY FLOSS,"Snacks,Sweet snacks,Confectioneries,Candies",Germany,"Zucker, Aroma, Farbstoffe E100, E120, E133",CANDY FLOSS
5,1.0841214700531212e+20,Bonbon,,"Snacks, Snacks sucrés, Confiseries, Bonbons",France,"Colorants : E-100, E-120, E-133.",Bonbons
6,1.0841052512999512e+21,Confiserie gélifiée mures,,"Snacks, Snacks sucrés, Confiseries, Bonbons",France,,Bonbons
7,1.0843506381328111e+21,Cintas pica sandia,King Regal,"Snacks, Snacks sucrés, Confiseries, Bonbons",France,,Bonbons
8,1.0500016938604e+19,Outdoor Bred Pork Sausage Rolls,Waitrose,"Snacks, Salty snacks, Appetizers, Sausage rolls",Singapore,"pork (40%), fortified wheat flour (_wheat_ flo...",Sausage rolls
9,1.036043802595381e+43,60 Pains aux Raisins Prépoussés au Beurre Fin,Coup de Pâtes,"Snacks, Snacks sucrés, Surgelés, Viennoiseries...",France,"Pâte (60.9 %) Farine de _blé_, _beurre_ fin 26...",Pains aux raisins


#### Apply at larger scale

Gemini-pro is slow: at a 1000 QPM limit it would take about **3.8 hours** to process the 226780 rows in the Snack table (226780 / 1000 ≈ 227 min).

And the default quota is only 300 QPM in us-central1.

To process larger volumes, we can request a quota increase by following these steps:
- Go to the [Quota page](https://console.cloud.google.com/iam-admin/quotas)
- Click on "Generate content requests per minute per project per base model per minute per region per base_model" for us-central1 and gemini-pro
- Click the Edit Quotas button
- In the new value field, enter 1000
- In the justification field, enter "To process a 226780 row table"
- Click Next until you get to the last page
- Click Submit
- Wait for a few minutes; you should get an email once it has been approved. It usually takes ~5 minutes.

###### Create a smaller Snack table with 10K records in order to finish within 30 minutes

In [57]:
%%bigquery
-- Build a 10,000-row working copy of Snack limited to rows where brands,
-- categories, ingredients_text and countries_en are all populated.
-- No ordering is applied, so which 10,000 rows are kept is not specified.
create or replace table airline_stg.Snack_10k as
select *
from airline_stg.Snack
where
  brands is not null
  and categories is not null
  and ingredients_text is not null
  and countries_en is not null
limit 10000

Query is running:   0%|          |

In [58]:
%%bigquery
-- Display the full contents of the 10k working table.
select snack_10k.*
from airline_stg.Snack_10k as snack_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,image_url,created_time,last_modified_time,data_source,load_time,subcategory
0,3256225732111,http://world-en.openfoodfacts.org/product/3256...,Cacahuètes enrobées saveur sweet chili,U,"Aliments et boissons à base de végétaux, Alime...",France,"CACAHUÈTES 72%, farine de BLÉ, amidon de pomme...",https://images.openfoodfacts.org/images/produc...,2017-03-06 16:46:30+00:00,2020-09-15 17:14:12+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
1,3256224160502,http://world-en.openfoodfacts.org/product/3256...,Tourte Champignons - surgelée 500 g,U,"Snacks, Snacks salés, Amuse-gueules, Surgelés,...",France,"Pâte 50 % : farine de blé, margarine (graisses...",https://images.openfoodfacts.org/images/produc...,2013-05-26 14:09:26+00:00,2022-02-11 04:14:22+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
2,3256221212266,http://world-en.openfoodfacts.org/product/3256...,Noix de cajou grillées salées,U,"Aliments et boissons à base de végétaux, Alime...",France,"NOIX DE CAJOU 96%, huile de tournesol, sel 1%....",https://images.openfoodfacts.org/images/produc...,2015-11-10 16:49:31+00:00,2023-06-07 10:25:49+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
3,3256229522022,http://world-en.openfoodfacts.org/product/3256...,Snack poppés,U,Snacks,France,"Riz rond, fécule de pommes de terre, pois cass...",https://images.openfoodfacts.org/images/produc...,2023-05-08 11:10:56+00:00,2023-11-20 02:27:04+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
4,3256225732104,http://world-en.openfoodfacts.org/product/3256...,Cacahuètes enrobées saveur wasabi,U,"Aliments et boissons à base de végétaux, Alime...",France,"CACAHUÈTES 71%, farine de BLÉ, amidon de pomme...",https://images.openfoodfacts.org/images/produc...,2017-11-11 14:16:30+00:00,2023-03-07 09:41:20+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,37323118785,http://world-en.openfoodfacts.org/product/0037...,Apple Sauce,Knouse Foods Inc.,Snacks,United States,"Apples, water, ascorbic acid to maintain color...",,2017-03-09 12:33:32+00:00,2020-04-22 17:57:47+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
9996,37323115821,http://world-en.openfoodfacts.org/product/0037...,Apple sauce,Knouse Foods Inc.,Snacks,United States,"Apples, sugar, water, lemon juice and cinnamon...",,2017-03-09 12:13:32+00:00,2020-04-22 17:46:13+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
9997,37323118709,http://world-en.openfoodfacts.org/product/0037...,"Musselman's, lite original apple sauce",Knouse Foods Inc.,Snacks,United States,"Apples, water, ascorbic acid to maintain color...",,2017-03-09 12:33:32+00:00,2020-04-22 17:57:47+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,
9998,8606012185067,http://world-en.openfoodfacts.org/product/8606...,Rocky rice choco orange,"Benlian Food,Benlian","Plant-based foods and beverages, Plant-based f...","France,Hungary,Netherlands","chocolate coating 68% (sugar, cocoa butter*, c...",,2018-07-30 13:50:51+00:00,2023-08-07 07:20:15+00:00,open_food_facts,2024-02-03 19:40:10.880358+00:00,


In [29]:
%%bigquery
-- Run the subcategory prompt over every row of airline_stg.Snack_10k and save
-- the raw model output to airline_stg_ai.subcategory_predictions_raw_10k.
declare prompt_query string
  default "Suggest a subcategory for each snack. Return the output as json, include only the snack_id and subcategory in the output";

create or replace table airline_stg_ai.subcategory_predictions_raw_10k as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- One prompt per snack: instruction text followed by the snack as JSON.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "url", url,
            "brands", brands,
            "product_name", product_name,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack_10k
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [30]:
%%bigquery
-- Fetch the start time, end time and query text of one specific job,
-- identified by its job id.
select
  creation_time,
  end_time,
  query
from `region-us`.INFORMATION_SCHEMA.JOBS
where job_id = 'a441f6b3-d6b4-4b33-a8f0-30e044e7b326'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,creation_time,end_time,query
0,2024-04-05 16:31:53.551000+00:00,2024-04-05 18:33:45.166000+00:00,"declare prompt_query STRING default ""Suggest a..."


##### It actually took 2 hours to process the 10k table. This is with only a 300 QPM limit, without a quota increase.

In [31]:
%%bigquery
-- Persist the cleaned 10k responses: the ```json and ``` fence markers and
-- newline characters are removed and the text is trimmed.
create or replace table airline_stg_ai.subcategory_predictions_formatted_10k as
select
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from airline_stg_ai.subcategory_predictions_raw_10k;

Query is running:   0%|          |

In [39]:
%%bigquery
-- Row count of the formatted 10k prediction table.
select
  count(*) as subcategory_count
from airline_stg_ai.subcategory_predictions_formatted_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,subcategory_count
0,10000


In [40]:
%%bigquery
-- Apply the 10k subcategory predictions to the Snack rows whose snack_id
-- matches the snack_id in the response; other rows are not modified.
update airline_stg.Snack as snack
set subcategory = json_value(pred.ml_generate_text_llm_result, '$.subcategory')
from airline_stg_ai.subcategory_predictions_formatted_10k as pred
where json_value(pred.ml_generate_text_llm_result, '$.snack_id') = snack.snack_id;

Query is running:   0%|          |

In [41]:
%%bigquery
-- How many Snack rows have a subcategory after the update.
select
  count(*) as subcategory_count
from airline_stg.Snack
where subcategory is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,subcategory_count
0,9081


In [42]:
%%bigquery
-- Frequency of each (categories, subcategory) pair among enriched snacks,
-- most common first.
select
  categories,
  subcategory,
  count(*) as count
from airline_stg.Snack
where subcategory is not null
group by
  categories,
  subcategory
order by count(*) desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,categories,subcategory,count
0,"Snacks, Snacks sucrés, Confiseries, Bonbons",Bonbons,957
1,"Botanas, Snacks dulces, Galletas y pasteles, G...",Galletas,189
2,"Snacks, Snacks salés, Amuse-gueules",Amuse-gueules,160
3,"Botanas, Snacks dulces, en:Cocoa and its produ...",Chocolates negros,155
4,"Snacks, Snacks sucrés, Confiseries",Confiseries,152
...,...,...,...
2752,"Snacks, Snacks salés, Chips et frites, Frites",Frites,1
2753,"Snacks, Snacks sucrés, Barres, Barre sucrées",Barre sucrée,1
2754,"Snacks, Snacks sucrés, Produits déshydratés",Produits déshydratés,1
2755,"Snacks, Snacks sucrés, Surgelés, Viennoiseries...",Pains aux raisins,1


In [43]:
%%bigquery
-- Tag every snack that received a subcategory as AI-enriched.
update airline_stg.Snack
set data_source = 'open_food_facts_ai'
where subcategory is not null

Query is running:   0%|          |

### Part 2: Detect the language of the snack product (English, Spanish, Japanese, etc.)

#### Experiment with prompt

In [59]:
%%bigquery
-- Trial run of the language-detection prompt on the first 10 rows of
-- Snack_10k (ordered by snack_id); nothing is saved.
declare prompt_query string
  default "Detect the language of the text. For example, English, French, Spanish, etc. Return the output as json, include the snack_id in the output as well";

select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- One prompt per snack: instruction text followed by the snack as JSON.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "product_name", product_name,
            "brands", brands,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack_10k
    order by snack_id
    limit 10
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,ml_generate_text_rai_result,ml_generate_text_status,prompt
0,"{""language"": ""English"", ""snack_id"": ""1.6128354...",,,"Detect the language of the text. For example, ..."
1,"```json\n{""language"": ""English"", ""snack_id"": ""...",,,"Detect the language of the text. For example, ..."
2,"{""language"":""English"",""snack_id"":""10300048947""}",,,"Detect the language of the text. For example, ..."
3,"```json\n{\n ""language"": ""English"",\n ""snack...",,,"Detect the language of the text. For example, ..."
4,"```json\n{\n ""language"": ""English"",\n ""snack...",,,"Detect the language of the text. For example, ..."
5,"```json\n{\n ""language"": ""English"",\n ""snack...",,,"Detect the language of the text. For example, ..."
6,"{""language"": ""English"", ""snack_id"": ""103000654...",,,"Detect the language of the text. For example, ..."
7,"{""language"": ""English"", ""snack_id"": ""103001063...",,,"Detect the language of the text. For example, ..."
8,"{""language"": ""English"", ""snack_id"": ""103001064...",,,"Detect the language of the text. For example, ..."
9,"```json\n{""language"": ""English"", ""snack_id"": ""...",,,"Detect the language of the text. For example, ..."


In [60]:
%%bigquery
-- Save the language-detection output for the first 10 rows of Snack_10k
-- (ordered by snack_id) to airline_stg_ai.language_predictions_raw_10.
declare prompt_query string
  default "Detect the language of the text. For example, English, French, Spanish, etc. Return the output as json, include the snack_id in the output as well";

create or replace table airline_stg_ai.language_predictions_raw_10 as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- One prompt per snack: instruction text followed by the snack as JSON.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "product_name", product_name,
            "brands", brands,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack_10k
    order by snack_id
    limit 10
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [61]:
%%bigquery
-- Show each saved language response next to the prompt that was sent.
select
  ml_generate_text_llm_result,
  prompt
from airline_stg_ai.language_predictions_raw_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"{""language"": ""English"", ""snack_id"": ""1.6128354...","Detect the language of the text. For example, ..."
1,"```json\n{""language"": ""English"", ""snack_id"": ""...","Detect the language of the text. For example, ..."
2,"{""language"": ""English"", ""snack_id"": ""103001064...","Detect the language of the text. For example, ..."
3,"```json\n{""language"": ""English"", ""snack_id"": ""...","Detect the language of the text. For example, ..."
4,"{""language"": ""English"", ""snack_id"": ""103001063...","Detect the language of the text. For example, ..."
5,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
6,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
7,"{""language"":""English"",""snack_id"":""10300048947""}","Detect the language of the text. For example, ..."
8,"{""language"": ""English"", ""snack_id"": ""103000654...","Detect the language of the text. For example, ..."
9,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."


In [62]:
%%bigquery
-- Persist the cleaned language responses: the ```json and ``` fence markers
-- and newline characters are removed and the text is trimmed.
create or replace table airline_stg_ai.language_predictions_formatted_10 as
select
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from airline_stg_ai.language_predictions_raw_10

Query is running:   0%|          |

In [63]:
%%bigquery
-- Display the cleaned language responses.
select *
from airline_stg_ai.language_predictions_formatted_10

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{""language"": ""English"", ""snack_id"": ""103001063..."
1,"{""language"": ""English"", ""snack_id"": ""103001064..."
2,"{ ""language"": ""English"", ""snack_id"": ""103000..."
3,"{""language"": ""English"", ""snack_id"": ""1.6128354..."
4,"{""language"": ""English"", ""snack_id"": ""103000654..."
5,"{ ""language"": ""English"", ""snack_id"": ""103000..."
6,"{ ""language"": ""English"", ""snack_id"": ""103000..."
7,"{""language"":""English"",""snack_id"":""10300048947""}"
8,"{""language"": ""English"", ""snack_id"": ""102864500..."
9,"{""language"": ""English"", ""snack_id"": ""103001415..."


In [64]:
%%bigquery
-- Adds the column that receives the detected language.
-- "if not exists" lets the cell be re-run without failing on the existing column.
alter table airline_stg.Snack
  add column if not exists language string;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Copy each detected language onto the Snack row with the same snack_id.
-- Written as update ... from so that only snacks that have a prediction are
-- modified; rows without one keep their current language instead of being
-- overwritten with null by a table-wide "where 1=1" update.
update airline_stg.Snack s
  set language = json_value(p.ml_generate_text_llm_result, '$.language')
  from airline_stg_ai.language_predictions_formatted_10 p
  where s.snack_id = json_value(p.ml_generate_text_llm_result, '$.snack_id');

Query is running:   0%|          |

In [None]:
%%bigquery
-- Inspect the rows that received a language.
-- Reads airline_stg.Snack, the table the language column was added to and
-- updated on; Snack_10k was created before that column existed and is not
-- updated anywhere in this notebook.
select
  * except (
    image_url,
    created_time,
    last_modified_time,
    data_source,
    load_time
  )
from airline_stg.Snack
where language is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,subcategory,language
0,10300048947.0,http://world-en.openfoodfacts.org/product/0010...,Chopped Walnuts,Diamond,Snacks,United States,Walnuts.,Nuts,English
1,10300141518.0,http://world-en.openfoodfacts.org/product/0010...,Premium Pecan Halves,Diamond,Snacks,United States,Pecans,Nuts,English
2,10300106395.0,http://world-en.openfoodfacts.org/product/0010...,Glazed Pecans,Emerald,Snacks,United States,"Pecans, sugar, corn syrup, brown sugar, natura...",Nuts,English
3,10300106494.0,http://world-en.openfoodfacts.org/product/0010...,Mixed Nuts,Emerald,Snacks,United States,"Glazed almonds (almonds, sugar, corn syrup, sa...",Nuts,English
4,10300065425.0,http://world-en.openfoodfacts.org/product/0010...,"Emerald, salt & pepper cashews",Emerald,Snacks,United States,"Cashews, vegetable oil (safflower, sunflower, ...",Nuts,English
5,1028645000901.0,http://world-en.openfoodfacts.org/product/1028...,Deluxe Cashews,Pardoe's,Snacks,United States,"Cashews, peanut oil, salt. artificial flavor.",Nuts,English
6,10300064220.0,http://world-en.openfoodfacts.org/product/0010...,Berry Nut Blend Breakfast In The Go!,Emerald,Snacks,United States,"Vanilla granola (rolled oats, brown sugar, whe...",Breakfast,English
7,10300064213.0,http://world-en.openfoodfacts.org/product/0010...,Nut & granola mix,Emerald,Snacks,United States,"Vanilla granola (rolled oats, brown sugar, whe...",Trail Mix,English
8,10300064701.0,http://world-en.openfoodfacts.org/product/0010...,Nut & granola mix snack blends,Emerald,Snacks,United States,Sea salt caramel flavored glazed almonds (almo...,Trail Mix,English
9,1.612835432630006e+19,http://world-en.openfoodfacts.org/product/1612...,Peanut Caramel Bars,Manzela,Snacks,"Mexico,United States","Peanuts, sugar, glucose, brown sugar",Candy Bars,English


#### Apply at larger scale

In [None]:
%%bigquery
-- Run the language-detection prompt over every row of airline_stg.Snack_10k
-- and save the raw output to airline_stg_ai.language_predictions_raw_10k.
declare prompt_query string
  default "Detect the language of the text. For example, English, French, Spanish, etc. Return the output as json, include the snack_id in the output as well";

create or replace table airline_stg_ai.language_predictions_raw_10k as
select *
from ML.generate_text(
  model remote_models.gemini_pro,
  (
    -- One prompt per snack: instruction text followed by the snack as JSON.
    select
      concat(
        prompt_query,
        to_json_string(
          json_object(
            "snack_id", snack_id,
            "product_name", product_name,
            "brands", brands,
            "categories", categories,
            "ingredients_text", ingredients_text
          )
        )
      ) as prompt
    from airline_stg.Snack_10k
    order by snack_id
  ),
  struct(
    0 as temperature,
    8192 as max_output_tokens,
    0.0 as top_p,
    1 as top_k,
    true as flatten_json_output
  )
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Show each saved 10k language response next to the prompt that was sent.
select
  ml_generate_text_llm_result,
  prompt
from airline_stg_ai.language_predictions_raw_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result,prompt
0,"{""language"": ""English"", ""snack_id"": ""111100323...","Detect the language of the text. For example, ..."
1,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
2,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
3,"{""language"":""English"",""snack_id"":""11110914576""}","Detect the language of the text. For example, ..."
4,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
...,...,...
9995,"{""language"": ""English"", ""snack_id"": ""870762114...","Detect the language of the text. For example, ..."
9996,"```json\n{\n ""language"": ""English"",\n ""snack...","Detect the language of the text. For example, ..."
9997,"{""language"": ""English"", ""snack_id"": ""895296001...","Detect the language of the text. For example, ..."
9998,"{""language"": ""English"", ""snack_id"": ""949220329...","Detect the language of the text. For example, ..."


In [None]:
%%bigquery
-- Preview the cleaned 10k language responses: the ```json and ``` fence
-- markers and newline characters are removed and the text is trimmed.
select
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from airline_stg_ai.language_predictions_raw_10k

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,"{""language"": ""English"", ""snack_id"": ""111100323..."
1,"{ ""language"": ""English"", ""snack_id"": ""111108..."
2,"{ ""language"": ""English"", ""snack_id"": ""111108..."
3,"{""language"":""English"",""snack_id"":""11110914576""}"
4,"{ ""language"": ""English"", ""snack_id"": ""111501..."
...,...
9995,"{""language"": ""English"", ""snack_id"": ""870762114..."
9996,"{ ""language"": ""English"", ""snack_id"": ""870762..."
9997,"{""language"": ""English"", ""snack_id"": ""895296001..."
9998,"{""language"": ""English"", ""snack_id"": ""949220329..."


In [None]:
%%bigquery
-- Persist the cleaned 10k language responses (fence markers and newlines
-- removed, trimmed) under the original column name.
create or replace table airline_stg_ai.language_predictions_formatted_10k as
select
  trim(
    replace(
      replace(
        replace(ml_generate_text_llm_result, '```json', ''),
        '```', ''
      ),
      '\n', ''
    )
  ) as ml_generate_text_llm_result
from airline_stg_ai.language_predictions_raw_10k

Query is running:   0%|          |

In [None]:
%%bigquery
-- Display the cleaned 10k language responses.
select formatted.*
from airline_stg_ai.language_predictions_formatted_10k as formatted

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_text_llm_result
0,
1,
2,
3,"{""language"":""English"",""snack_id"":""536769""}"
4,"{""language"":""English"",""snack_id"":""981477""}"
...,...
9995,"{""language"": ""English"",""brands"":""Valued Natura..."
9996,"{""language"":""Italian"",""brands"":""Saladine"",""cat..."
9997,"{""language"": ""English"",""brands"":""Gefen"",""categ..."
9998,"{""language"": ""English"",""brands"":""That's It."",""..."


In [None]:
%%bigquery
-- Makes sure the language column exists before the 10k update.
-- An earlier cell already adds this column, so a plain "add column" would fail
-- here; "if not exists" turns the statement into a no-op in that case.
alter table airline_stg.Snack
  add column if not exists language string;

Query is running:   0%|          |

In [None]:
%%bigquery
-- Copy each detected language from the 10k predictions onto the Snack row with
-- the same snack_id.
-- Written as update ... from so that only snacks that have a prediction are
-- modified; rows without one keep their current language instead of being
-- overwritten with null by a table-wide "where 1=1" update.
update airline_stg.Snack s
  set language = json_value(p.ml_generate_text_llm_result, '$.language')
  from airline_stg_ai.language_predictions_formatted_10k p
  where s.snack_id = json_value(p.ml_generate_text_llm_result, '$.snack_id')

Query is running:   0%|          |

In [None]:
%%bigquery
-- List the snacks that have a language, without the URL, image and
-- load-bookkeeping columns.
select
  * except (
    url,
    image_url,
    created_time,
    last_modified_time,
    data_source,
    load_time
  )
from airline_stg.Snack
where language is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,brands,categories,countries_en,ingredients_text,subcategory,language
0,8901565001728,Piknik Classic Tomato Chilli (fried Indian snack),Piknik,Snacks,India,"Wheat Flour, Edible Starch, Edible Vegetable O...",,English
1,8906081129222,Dryfruit Bar - Blueberries,Happilo,Snacks,India,"Cashews, Dates, Almonds, Black Raisins, Bluebe...",,English
2,8901491208291,Bikaneri Bhujia,Kurkure,Snacks,India,"Edible Vegetable Oil (Rice Brand Oil), Moth Da...",,English
3,8908005583189,cheese balls,time out,Snacks,India,"INGREDIENTS:Corn Grits, Refined Vegetable Oil,...",,English
4,8904406116735,Garlic 100% Veggie Chips,SnackWise,Snacks,India,Garlic and Edible Oil,,English
...,...,...,...,...,...,...,...,...
9992,7610095128904,Kezz Chips Indian Pepper & Sea Salt,"Zweifel, Zweifel KEZZ Kesselchips","Plant-based foods and beverages, Plant-based f...","Austria,France,Switzerland,World,Deutschland,F...","Kartoffeln, Rapsöl 25%, Gewürzzubereitung (Rei...",,English
9993,7613036562287,Beetroot and Apple Fruit Bar,Ohhh Yes!,Snacks,"Czech Republic,France,Ireland,Netherlands,Slov...","Apple juice concentrate, Apple puree, Dried ap...",,English
9994,38000185083,Pringles Pizza,Pringles,"Alimentos y bebidas de origen vegetal, Aliment...","Argentina,Bolivia,Colombia,Costa Rica,Panama,P...","PATATAS DESHIDRATADAS, ACEITE VEGETAL (MAÍZ, S...",,Spanish
9995,38000846748,Pringles Crema y Cebolla,Pringles,"Alimentos y bebidas de origen vegetal, Aliment...","Bolivia,Colombia,Costa Rica,Dominican Republic...","Papas deshidratadas, aceite vegetal, harina de...",,Spanish


In [None]:
%%bigquery
-- List the snacks whose detected language is something other than English.
select
  * except (
    url,
    image_url,
    created_time,
    last_modified_time,
    data_source,
    load_time
  )
from airline_stg.Snack
where
  language is not null
  and language <> "English"

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,brands,categories,countries_en,ingredients_text,subcategory,language
0,8008620012139,Crunchy cereals,Crich,"Aliments et boissons à base de végétaux, Alime...",Italy,"Whole WHEAT flour 46,5%, OAT Hakes 21,2%, sunf...",,French
1,8425514135154,,Mercadona,"Alimentos y bebidas de origen vegetal, Aliment...",Spain,"Agua, harina de trigo (_gluten_) integral (30%...",,Spanish
2,8480000341549,Cacahuete Garrapiñado,Hacendado,"Alimentos y bebidas de origen vegetal, Aliment...",Spain,"_Cacahuete_ 50%, azúcar, miel 1%, aromas (vain...",,Spanish
3,8480000336231,Varitas,Hacendado,"Alimentos y bebidas de origen vegetal, Aliment...",Spain,"Fécula de patata, aceite de girasol alto oleic...",,Spanish
4,8480000333070,,Hacendado,"Aliments i begudes amb base vegetal, Aliments ...",Spain,"potato, sunflower oil (28%), salt,",,Spanish
...,...,...,...,...,...,...,...,...
305,3560071083991,Noix de cajou,Carrefour,"Aliments et boissons à base de végétaux, Alime...","Belgium,France,Italy,Poland,Romania,Spain","_Noix de cajou_ 94,6% (origine Vietnam ou Camb...",,French
306,7501011178854,Stax con sabor a Queso Cheddar,"Lay's,Stax","Alimentos y bebidas de origen vegetal, Aliment...","Argentina,Bolivia,Mexico,Paraguay,Peru,Venezuela","Papas deshidratadas, aceite vegetal de maíz *,...",,Spanish
307,38000185083,Pringles Pizza,Pringles,"Alimentos y bebidas de origen vegetal, Aliment...","Argentina,Bolivia,Colombia,Costa Rica,Panama,P...","PATATAS DESHIDRATADAS, ACEITE VEGETAL (MAÍZ, S...",,Spanish
308,38000846748,Pringles Crema y Cebolla,Pringles,"Alimentos y bebidas de origen vegetal, Aliment...","Bolivia,Colombia,Costa Rica,Dominican Republic...","Papas deshidratadas, aceite vegetal, harina de...",,Spanish


In [68]:
%%bigquery
-- Record that rows carrying a language value were enriched by the AI step.
update airline_stg.Snack
set
  data_source = 'open_food_facts_ai'
where
  language is not null

Query is running:   0%|          |

### Part 3: Detect the product name of a snack from the image

#### Explore the raw data

In [65]:
%%bigquery
-- Snacks with no product name that do have an image URL to work from.
select
  snack_id,
  product_name,
  image_url
from
  airline_stg.Snack
where
  image_url is not null
  and product_name is null
order by
  snack_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,image_url
0,1001776768,,https://images.openfoodfacts.org/images/produc...
1,1002485781,,https://images.openfoodfacts.org/images/produc...
2,10107043,,https://images.openfoodfacts.org/images/produc...
3,1020242,,https://images.openfoodfacts.org/images/produc...
4,104770048009,,https://images.openfoodfacts.org/images/produc...
...,...,...,...
1883,9784001372250,,https://images.openfoodfacts.org/images/produc...
1884,9788426398482,,https://images.openfoodfacts.org/images/produc...
1885,9788492808274,,https://images.openfoodfacts.org/images/produc...
1886,9815203,,https://images.openfoodfacts.org/images/produc...


#### Setup

In [None]:
!pip install requests

In [66]:
# Download the image of each snack that has no product name and copy it into
# the GCS bucket as images/<snack_id>.jpg, so the file name carries the snack_id.
import requests
from google.cloud import bigquery
from google.cloud import storage

bq_client = bigquery.Client()
storage_client = storage.Client()

bucket_name = "cs329e-open-access"
bucket = storage_client.get_bucket(bucket_name)

sql = "select snack_id, image_url from airline_stg.Snack where product_name is null and snack_id != '0' and image_url is not null order by snack_id limit 10"

rows = bq_client.query(sql).result()

for row in rows:
  snack_id = row["snack_id"]
  image_url = row["image_url"].strip()

  # A timeout keeps one unresponsive host from hanging the whole loop, and
  # raise_for_status() stops an HTTP error page from being stored as a .jpg.
  try:
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
  except requests.RequestException as err:
    print("skipping snack {}: could not download {} ({})".format(snack_id, image_url, err))
    continue

  image_name = "{}.jpg".format(snack_id)

  # Upload straight from memory: no temporary file is needed, so the cell no
  # longer depends on the working directory being /content.
  blob = bucket.blob("images/" + image_name)
  blob.upload_from_string(response.content, content_type="image/jpeg")

##### Create a remote connection for Cloud Storage before running the next cell. Assign the `Storage Object Viewer` and `Vertex AI User` roles to the service account associated with the connection.

In [67]:
%%bigquery
-- External table over the snack images uploaded to the bucket, read through
-- the Cloud Storage connection. Metadata caching is set to AUTOMATIC with a
-- staleness bound of one day.
create or replace external table airline_stg_ai.snack_product_images
with connection `projects/cs329e-sp2024/locations/us/connections/gcs-connection`
options (
  uris = ['gs://cs329e-open-access/images/*'],
  object_metadata = 'SIMPLE',
  metadata_cache_mode = 'AUTOMATIC',
  max_staleness = interval 1 day
);

Query is running:   0%|          |

In [None]:
%%bigquery
-- Inspect the rows exposed by the image table.
select
  *
from
  airline_stg_ai.snack_product_images

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,generation,content_type,size,md5_hash,updated,metadata
0,gs://cs329e_datasets/images/1001776768.jpg,1712261719888692,image/jpeg,27642,98983b825a60f39340d56dcb658ec459,2024-04-04 20:15:19.903000+00:00,[]
1,gs://cs329e_datasets/images/1002485781.jpg,1712261720857749,image/jpeg,20589,b62a7eedfa66744603e0516d0cc0023b,2024-04-04 20:15:20.868000+00:00,[]
2,gs://cs329e_datasets/images/10107043.jpg,1712261721717119,image/jpeg,25728,c35b56bdacd005da4406e53d161f2e9d,2024-04-04 20:15:21.727000+00:00,[]
3,gs://cs329e_datasets/images/1020242.jpg,1712261722444152,image/jpeg,12504,3cf5513a9bc1c83a0d312af5583a5c10,2024-04-04 20:15:22.455000+00:00,[]
4,gs://cs329e_datasets/images/104770048009.jpg,1712261723625406,image/jpeg,35556,318600c9a5fb1e608522e7e1223faa13,2024-04-04 20:15:23.633000+00:00,[]
5,gs://cs329e_datasets/images/10700450265.jpg,1712261724541120,image/jpeg,46216,732c049f5c14c1f0e9ab2dbe331ac9b2,2024-04-04 20:15:24.555000+00:00,[]
6,gs://cs329e_datasets/images/11185352.jpg,1712261725307782,image/jpeg,26453,fdd6706a5c718b12ab66d7fcbeef32e2,2024-04-04 20:15:25.317000+00:00,[]
7,gs://cs329e_datasets/images/11300051395.jpg,1712261726187606,image/jpeg,25176,4252019c84028b59909a9481ac389255,2024-04-04 20:15:26.202000+00:00,[]
8,gs://cs329e_datasets/images/1142273.jpg,1712261726946369,image/jpeg,33150,4eef7a08c079cd17b22c6f15fe1100dd,2024-04-04 20:15:26.957000+00:00,[]
9,gs://cs329e_datasets/images/1202.jpg,1712261727665790,image/jpeg,25629,71622a70f7323087141ddf5ebd0aa7c8,2024-04-04 20:15:27.677000+00:00,[]


In [70]:
%%bigquery
-- Register the 'gemini-pro-vision' endpoint as a remote model, reached
-- through the Vertex connection.
create or replace model remote_models.gemini_pro_vision
remote with connection `projects/cs329e-sp2024/locations/us/connections/vertex_connection`
options (
  endpoint = 'gemini-pro-vision'
);

Query is running:   0%|          |

In [71]:
%%bigquery
-- First attempt: ask the vision model for the text found in each image.
select
  uri,
  ml_generate_text_llm_result
from
  ml.generate_text(
    model remote_models.gemini_pro_vision,
    table `cs329e-sp2024.airline_stg_ai.snack_product_images`,
    struct(
      "What is the text in this image?" as prompt,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,ml_generate_text_llm_result
0,gs://cs329e-open-access/images/1001776768.jpg,Nestle TOPPING\nPROFESSIONAL CHOCOLATE BALLS\...
1,gs://cs329e-open-access/images/11300051395.jpg,"The text in this image is: ""Brach's Spiced Je..."
2,gs://cs329e-open-access/images/104770048009.jpg,Haribo\nPandawai Pik\nSans colorant artificie...
3,gs://cs329e-open-access/images/1142273.jpg,4 Blueberry Muffins\nby Sainsbury's\n1 muffin...
4,gs://cs329e-open-access/images/1202.jpg,"The text on the image says ""Coconut Rock""."
5,gs://cs329e-open-access/images/1020242.jpg,"The text in the image is ""Cerelene Galettes""."
6,gs://cs329e-open-access/images/10700450265.jpg,Jolly Rancher\nSour Surge\nHard Candy with So...
7,gs://cs329e-open-access/images/1002485781.jpg,MEZETE\nHUMMUS WITH BREADSTICKS\nMIDDLE EASTE...
8,gs://cs329e-open-access/images/10107043.jpg,The text in the image is:\n\npopchips\nPopped...
9,gs://cs329e-open-access/images/11185352.jpg,"The text on the front of the package says: ""N..."


#### Tweak the prompt

In [72]:
%%bigquery
-- Second attempt: narrow the prompt to the large text and request JSON.
select
  uri,
  ml_generate_text_llm_result
from
  ml.generate_text(
    model remote_models.gemini_pro_vision,
    table `cs329e-sp2024.airline_stg_ai.snack_product_images`,
    struct(
      'What is the large text in this image? Return it as json' as prompt,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,ml_generate_text_llm_result
0,gs://cs329e-open-access/images/1001776768.jpg,"```json\n{\n ""text"": ""TOPPING CHOCOLATE BALL..."
1,gs://cs329e-open-access/images/11300051395.jpg,"```json\n{\n ""text"": ""Spiced""\n}\n```"
2,gs://cs329e-open-access/images/104770048009.jpg,"```json\n{\n ""text"": ""Haribo""\n}\n```"
3,gs://cs329e-open-access/images/1142273.jpg,"```json\n{\n ""text"": ""4 BLUEBERRY MUFFINS""\n..."
4,gs://cs329e-open-access/images/1202.jpg,"```json\n{\n ""text"": ""Coconut Rock""\n}\n```"
5,gs://cs329e-open-access/images/1020242.jpg,"```json\n{\n ""text"": ""Cerelene""\n}\n```"
6,gs://cs329e-open-access/images/10700450265.jpg,"```json\n{\n ""text"": ""Jolly Rancher""\n}\n```"
7,gs://cs329e-open-access/images/1002485781.jpg,"```json\n{\n ""text"": ""MEZETE"",\n ""confidenc..."
8,gs://cs329e-open-access/images/10107043.jpg,"```json\n{\n ""text"": ""popchips""\n}\n```"
9,gs://cs329e-open-access/images/11185352.jpg,"```json\n{\n ""text"": ""PIRE JABUKA MANGO PAPA..."


In [73]:
%%bigquery
-- Third attempt: ask directly for the product name, returned as JSON.
select
  uri,
  ml_generate_text_llm_result
from
  ml.generate_text(
    model remote_models.gemini_pro_vision,
    table `cs329e-sp2024.airline_stg_ai.snack_product_images`,
    struct(
      'What is the product name in this image? Return it as json' as prompt,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,uri,ml_generate_text_llm_result
0,gs://cs329e-open-access/images/1001776768.jpg,"```json\n{\n ""product_name"": ""Topping Chocol..."
1,gs://cs329e-open-access/images/11300051395.jpg,"```json\n{\n ""product_name"": ""Spiced Jelly B..."
2,gs://cs329e-open-access/images/104770048009.jpg,"```json\n{\n ""product_name"": ""Haribo Panda P..."
3,gs://cs329e-open-access/images/1142273.jpg,"```json\n{\n ""product_name"": ""4 Blueberry Mu..."
4,gs://cs329e-open-access/images/1202.jpg,"```json\n{\n ""product_name"": ""Coconut Rock""\..."
5,gs://cs329e-open-access/images/1020242.jpg,"```json\n{\n ""product_name"": ""Galettes""\n}\n```"
6,gs://cs329e-open-access/images/10700450265.jpg,"```json\n{\n ""product_name"": ""Jolly Rancher ..."
7,gs://cs329e-open-access/images/1002485781.jpg,"```json\n{\n ""product_name"": ""Hummus with Br..."
8,gs://cs329e-open-access/images/10107043.jpg,"```json\n{\n ""product_name"": ""Sea Salt & Vin..."
9,gs://cs329e-open-access/images/11185352.jpg,"```json\n{\n ""product_name"": ""Nutrilab Pire ..."


In [75]:
%%bigquery
-- Persist the raw product-name predictions so later cells can reuse them
-- without calling the model again.
create or replace table airline_stg_ai.product_name_predictions_raw_10 as
select
  uri,
  ml_generate_text_llm_result
from
  ml.generate_text(
    model remote_models.gemini_pro_vision,
    table `cs329e-sp2024.airline_stg_ai.snack_product_images`,
    struct(
      'What is the product name in this image? Return it as json' as prompt,
      true as flatten_json_output
    )
  );

Query is running:   0%|          |

#### Format the json

In [76]:
%%bigquery
  -- Clean up the raw model output so it can be parsed with json_value():
  --   * snack_id is the image file name up to its first '.', taken from the
  --     last path segment of the URI so it does not depend on which bucket or
  --     folder the images were uploaded to;
  --   * the markdown code fence and the newlines around the JSON are removed.
  create or replace table airline_stg_ai.product_name_predictions_formatted_10 as
    select
      split(array_reverse(split(uri, '/'))[offset(0)], '.')[offset(0)] as snack_id,
      uri,
      trim(replace(replace(replace(ml_generate_text_llm_result, '```json', ''), '```', ''), '\n', '')) as ml_generate_text_llm_result
    from airline_stg_ai.product_name_predictions_raw_10

Query is running:   0%|          |

In [78]:
%%bigquery
-- Put each snack's current product name next to the model's prediction.
select
  s.snack_id,
  s.product_name,
  p.snack_id,
  p.ml_generate_text_llm_result,
  s.data_source
from
  airline_stg.Snack as s
inner join
  airline_stg_ai.product_name_predictions_formatted_10 as p
  on p.snack_id = s.snack_id
order by
  s.snack_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,snack_id_1,ml_generate_text_llm_result,data_source
0,1001776768,,1001776768,"{ ""product_name"": ""Topping Chocolate Balls""}",open_food_facts
1,1002485781,,1002485781,"{ ""product_name"": ""Hummus with Breadsticks""}",open_food_facts
2,10107043,,10107043,"{ ""product_name"": ""Sea Salt & Vinegar Popchips""}",open_food_facts
3,1020242,,1020242,"{ ""product_name"": ""Galettes""}",open_food_facts
4,104770048009,,104770048009,"{ ""product_name"": ""Haribo Panda Pik""}",open_food_facts
5,10700450265,,10700450265,"{ ""product_name"": ""Jolly Rancher Sour Surge H...",open_food_facts
6,11185352,,11185352,"{ ""product_name"": ""Nutrilab Pire Jabuka Mango...",open_food_facts
7,11300051395,,11300051395,"{ ""product_name"": ""Spiced Jelly Bird Eggs""}",open_food_facts
8,1142273,,1142273,"{ ""product_name"": ""4 Blueberry Muffins""}",open_food_facts
9,1202,,1202,"{ ""product_name"": ""Coconut Rock""}",open_food_facts


#### Update the original table with the product names

In [79]:
%%bigquery
-- Fill in missing product names from the model predictions.
-- Only rows that have a matching prediction are touched: the join in the
-- where clause keeps every other snack's product_name as it is (a correlated
-- subquery with "where 1=1" would overwrite unmatched rows with null).
-- Names that are already present are never overwritten, and a prediction
-- without a "product_name" key is ignored.
-- NOTE: BigQuery raises an error if one snack joins to several prediction rows.
update airline_stg.Snack as s
set
  product_name = json_value(p.ml_generate_text_llm_result, '$.product_name')
from
  airline_stg_ai.product_name_predictions_formatted_10 as p
where
  s.snack_id = p.snack_id
  and s.product_name is null
  and json_value(p.ml_generate_text_llm_result, '$.product_name') is not null

Query is running:   0%|          |

In [None]:
%%bigquery
-- Re-run the comparison to confirm the predicted names reached the table.
select
  s.snack_id,
  s.product_name,
  p.snack_id,
  p.ml_generate_text_llm_result,
  s.data_source
from
  airline_stg.Snack as s
inner join
  airline_stg_ai.product_name_predictions_formatted_10 as p
  on p.snack_id = s.snack_id
order by
  s.snack_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,product_name,snack_id_1,ml_generate_text_llm_result,data_source
0,1001776768,Topping Chocolate Balls,1001776768,"{ ""product_name"": ""Topping Chocolate Balls""}",open_food_facts_ai
1,1002485781,Hummus with Breadsticks,1002485781,"{ ""product_name"": ""Hummus with Breadsticks""}",open_food_facts_ai
2,10107043,Sea Salt & Vinegar Popchips,10107043,"{ ""product_name"": ""Sea Salt & Vinegar Popchips""}",open_food_facts_ai
3,1020242,Galettes,1020242,"{ ""product_name"": ""Galettes""}",open_food_facts_ai
4,104770048009,Haribo Panda Pik,104770048009,"{ ""product_name"": ""Haribo Panda Pik""}",open_food_facts_ai
5,10700450265,Jolly Rancher Sour Surge Hard Candy,10700450265,"{ ""product_name"": ""Jolly Rancher Sour Surge H...",open_food_facts_ai
6,11185352,Nutrilab Pire Jabuka Mango Papaja Banana,11185352,"{ ""product_name"": ""Nutrilab Pire Jabuka Mango...",open_food_facts_ai
7,11300051395,Spiced Jelly Bird Eggs,11300051395,"{ ""product_name"": ""Spiced Jelly Bird Eggs""}",open_food_facts_ai
8,1142273,4 Blueberry Muffins,1142273,"{ ""product_name"": ""4 Blueberry Muffins""}",open_food_facts_ai
9,1202,Coconut Rock,1202,"{ ""product_name"": ""Coconut Rock""}",open_food_facts_ai


In [81]:
%%bigquery
-- Mark every snack that has a row in the predictions table as AI-enriched.
update airline_stg.Snack as s
set
  data_source = 'open_food_facts_ai'
where
  exists (
    select 1
    from airline_stg_ai.product_name_predictions_formatted_10 as p
    where p.snack_id = s.snack_id
  )

Query is running:   0%|          |

In [82]:
%%bigquery
-- Show the full staging rows of the snacks that have a prediction.
select
  *
from
  airline_stg.Snack
where
  snack_id in (
    select snack_id
    from airline_stg_ai.product_name_predictions_formatted_10
  )
order by
  snack_id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,snack_id,url,product_name,brands,categories,countries_en,ingredients_text,image_url,created_time,last_modified_time,data_source,load_time,subcategory,language
0,1001776768,http://world-en.openfoodfacts.org/product/1001...,Topping Chocolate Balls,Nestlé,"Botanas, Snacks dulces, Cacao y sus productos,...",France,"Chocolate con leche [azúcar, cacao, leche desn...",https://images.openfoodfacts.org/images/produc...,2022-11-05 16:43:09+00:00,2023-07-01 21:40:44+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
1,1002485781,http://world-en.openfoodfacts.org/product/1002...,Hummus with Breadsticks,,"Snacks, Snacks salés",France,,https://images.openfoodfacts.org/images/produc...,2023-07-14 10:10:21+00:00,2023-08-28 17:18:46+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
2,10107043,http://world-en.openfoodfacts.org/product/1010...,Sea Salt & Vinegar Popchips,Popchips,"Plant-based foods and beverages, Plant-based f...",United Kingdom,,https://images.openfoodfacts.org/images/produc...,2023-09-07 11:59:45+00:00,2023-11-07 16:48:18+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
3,1020242,http://world-en.openfoodfacts.org/product/1020242,Galettes,,"Snacks,Sweet snacks,Biscuits and cakes,Biscuits",,"Wheat flour, sugar, vegetable fat (palm), inve...",https://images.openfoodfacts.org/images/produc...,2023-12-22 19:05:21+00:00,2023-12-22 19:10:26+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
4,104770048009,http://world-en.openfoodfacts.org/product/0104...,Haribo Panda Pik,Haribo,"Snacks, Snacks sucrés, Confiseries, Bonbons",France,,https://images.openfoodfacts.org/images/produc...,2023-03-16 12:36:15+00:00,2023-05-24 10:50:23+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
5,10700450265,http://world-en.openfoodfacts.org/product/0010...,Jolly Rancher Sour Surge Hard Candy,The Hershey Company,"Snacks, Sweet snacks, Confectioneries, Candies",Pakistan,,https://images.openfoodfacts.org/images/produc...,2019-10-17 20:41:40+00:00,2023-10-08 16:51:30+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
6,11185352,http://world-en.openfoodfacts.org/product/1118...,Nutrilab Pire Jabuka Mango Papaja Banana,nutrino lab,"Plant-based foods and beverages, Plant-based f...","Bosnia and Herzegovina,Serbia",,https://images.openfoodfacts.org/images/produc...,2021-04-01 16:21:52+00:00,2022-04-01 11:52:00+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
7,11300051395,http://world-en.openfoodfacts.org/product/0011...,Spiced Jelly Bird Eggs,Brach's,"Snacks, Snacks sucrés, Confiseries","France,United States","Sugar, corn syrup, modified food starch (corn)...",https://images.openfoodfacts.org/images/produc...,2017-03-30 17:02:35+00:00,2023-04-28 14:33:17+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
8,1142273,http://world-en.openfoodfacts.org/product/0114...,4 Blueberry Muffins,By Sainsbury's,"Snacks,Snacks sucrés,Biscuits et gâteaux,Gâtea...",New Caledonia,,https://images.openfoodfacts.org/images/produc...,2019-11-16 20:17:31+00:00,2023-06-28 01:50:34+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,
9,1202,http://world-en.openfoodfacts.org/product/1202...,Coconut Rock,fanny,"Snacks, Snacks sucrés, Biscuits et gâteaux, Pâ...","France,Sri Lanka","DESICCATED COCONUT SUGAR, CARDAMOM POWDER, ART...",https://images.openfoodfacts.org/images/produc...,2019-09-03 08:44:53+00:00,2022-02-11 03:44:44+00:00,open_food_facts_ai,2024-02-03 19:40:10.880358+00:00,,


#### Apply at larger scale

##### Download the image and copy it into the GCS bucket. Make sure that the file name of the image contains the snack_id

In [None]:
# Download the image of each snack that still has no product name and copy it
# into the GCS bucket as images/<snack_id>.jpg.
import requests
from google.cloud import bigquery
from google.cloud import storage

bq_client = bigquery.Client()
storage_client = storage.Client()

bucket_name = "cs329e-open-access"
bucket = storage_client.get_bucket(bucket_name)

# NOTE(review): the query is still capped at 10 rows per run; raise the limit
# to process a bigger batch in one go.
sql = "select snack_id, image_url from airline_stg.Snack where product_name is null and snack_id != '0' and image_url is not null order by snack_id limit 10"

rows = bq_client.query(sql).result()

for row in rows:
  snack_id = row["snack_id"]
  image_url = row["image_url"].strip()

  # A timeout keeps one unresponsive host from hanging the whole loop, and
  # raise_for_status() stops an HTTP error page from being stored as a .jpg.
  try:
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()
  except requests.RequestException as err:
    print("skipping snack {}: could not download {} ({})".format(snack_id, image_url, err))
    continue

  image_name = "{}.jpg".format(snack_id)

  # Upload straight from memory: no temporary file is needed, so the cell no
  # longer depends on the working directory being /content.
  blob = bucket.blob("images/" + image_name)
  blob.upload_from_string(response.content, content_type="image/jpeg")

##### Re-run the cells in the previous sections of Part 3 to re-process the full batch of images

### Part 4: Merge changes into target table

In [95]:
%%bigquery
-- Add the AI-derived subcategory column to the target table.
-- "if not exists" lets the cell be re-run without failing once the column is there.
alter table airline_csp.Snack
  add column if not exists subcategory string;

Query is running:   0%|          |

In [96]:
%%bigquery
-- Add the AI-derived language column to the target table.
-- "if not exists" lets the cell be re-run without failing once the column is there.
alter table airline_csp.Snack
  add column if not exists language string;

Query is running:   0%|          |

In [93]:
%%bigquery
-- Row count of the target table before the merge.
select
  count(*) as num_records
from
  airline_csp.Snack

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,226780


In [105]:
%%bigquery
-- Count the current target rows (status_flag = true) for which staging can
-- supply a product_name, subcategory or language that the target is missing.
select
  count(*) as num_updates
from
  airline_csp.Snack as t
inner join
  airline_stg.Snack as s
  on t.snack_id = s.snack_id
where
  t.status_flag = true
  and (
    (s.product_name is not null and t.product_name is null)
    or (s.subcategory is not null and t.subcategory is null)
    or (s.language is not null and t.language is null)
  );

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_updates
0,20


In [107]:
%%bigquery
-- Merge the AI-enriched staging rows into airline_csp.Snack by versioning:
-- the current version of each affected snack is closed out and a new current
-- version carrying the staging values is inserted.
declare current_ts timestamp default current_timestamp();

-- Staging rows that supply a product_name, subcategory or language which the
-- current target version (status_flag = true) is still missing.
create temp table updates as
  select s.*
  from airline_csp.Snack as t
  inner join airline_stg.Snack as s
    on t.snack_id = s.snack_id
  where t.status_flag = true
    and (
      (s.product_name is not null and t.product_name is null)
      or (s.subcategory is not null and t.subcategory is null)
      or (s.language is not null and t.language is null)
    );

-- Run the close-out and the insert as one transaction so a failure between
-- the two statements cannot leave a snack without a current version.
begin transaction;

-- Close out only the current version. Filtering on status_flag keeps the
-- discontinue_time of versions that were closed earlier from being overwritten.
update airline_csp.Snack
set
  discontinue_time = timestamp_sub(current_ts, interval 1 second),
  status_flag = false
where status_flag = true
  and snack_id in (select snack_id from updates);

-- Insert the enriched rows as the new current versions, effective from current_ts.
insert into airline_csp.Snack
  (snack_id, url, product_name, brands, categories, countries_en, ingredients_text, image_url,
    subcategory, language, created_time, last_modified_time, data_source, load_time, effective_time, status_flag)
  (select snack_id, url, product_name, brands, categories, countries_en, ingredients_text, image_url,
      subcategory, language, created_time, last_modified_time, data_source, load_time, current_ts, true
    from updates);

commit transaction;

Query is running:   0%|          |

In [108]:
%%bigquery
-- Row count of the target table after the merge.
select
  count(*) as num_records
from
  airline_csp.Snack

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,num_records
0,226800
