#### Final Project Option 1:
##### Using text embeddings for classifying the sentiment of airport reviews

#### Part 1: Setup

##### Create the BQ datasets


In [None]:
from google.cloud import bigquery

# Location of the dataset that holds the airport-review tables built below.
project_id = "cs378-fa2024"
dataset = "fin_air_travel"
region = "us-central1"

bq_client = bigquery.Client()

# Note: despite its name, `dataset_id` is a bigquery.Dataset object, not a string.
dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region

# exists_ok=True lets this cell be re-run when the dataset already exists.
resp = bq_client.create_dataset(dataset_id, exists_ok=True)

# Report the project of the dataset that was actually created. The client's
# default project (bq_client.project) is not necessarily `project_id`.
print("Created dataset {}.{}".format(resp.project, resp.dataset_id))

Created dataset cs378-fa2024.fin_air_travel


In [None]:
from google.cloud import bigquery

# Location of the dataset that holds the remote embedding model.
project_id = "cs378-fa2024"
dataset = "ai_models"
region = "us-central1"

bq_client = bigquery.Client()

# Note: despite its name, `dataset_id` is a bigquery.Dataset object, not a string.
dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region

# exists_ok=True lets this cell be re-run when the dataset already exists.
resp = bq_client.create_dataset(dataset_id, exists_ok=True)

# Report the project of the dataset that was actually created. The client's
# default project (bq_client.project) is not necessarily `project_id`.
print("Created dataset {}.{}".format(resp.project, resp.dataset_id))

Created dataset cs378-fa2024.ai_models


##### Register the embeddings model with BQ
##### Before running the next cell, create a remote connection to Vertex AI and then grant the service account associated with the connection the "Vertex AI User" role

In [None]:
%%bigquery
-- Register (or replace) the remote model ai_models.text_embedding. It is reached
-- through the `remote-connection` connection and backed by the
-- 'text-embedding-004' endpoint.
create or replace model ai_models.text_embedding
  remote with connection `projects/cs378-fa2024/locations/us-central1/connections/remote-connection`
  options (
    endpoint = 'text-embedding-004'
  );

Query is running:   0%|          |

#### Part 2: Create the embeddings

#### Create the embeddings for the airport review records, using the `subject` and `body` fields (and ignoring the remaining ones).
###### More details on `ml.generate_embedding()`: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-generate-embedding#text-embedding

In [None]:
%%bigquery
-- Build one embedding per airport review from its subject and body.
-- Reviews missing either field are skipped.
create or replace table fin_air_travel.tmp_airport_review_embedding as (
  select
    id,
    ml_generate_embedding_result as embedding
  from
    ml.generate_embedding(
      model ai_models.text_embedding,
      -- The text to embed is supplied in the column named `content`.
      (
        select
          id,
          concat(subject, ' ', body) as content
        from air_travel_int.Airport_Review
        where subject is not null
          and body is not null
      ),
      -- task_type is a tuning parameter; 'RETRIEVAL_DOCUMENT' was the
      -- alternative previously used here.
      struct('CLUSTERING' as task_type)
    )
  -- Keep only rows that were embedded successfully. A non-empty status marks a
  -- failed request, whose vector cannot be compared with ml.distance() later.
  where coalesce(ml_generate_embedding_status, '') = ''
);

Query is running:   0%|          |

##### Note: the embeddings get created on the `content` field. Having a `content` field is required when calling `ml.generate_embedding()`

In [None]:
%%bigquery
-- Preview the per-review embeddings (both columns of the table).
select
  id,
  embedding
from fin_air_travel.tmp_airport_review_embedding

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,embedding
0,8248,"[0.04698944091796875, -0.03605397418141365, 0...."
1,8963,"[0.02252197265625, 0.017169218510389328, 0.063..."
2,21850,"[0.028484344482421875, 0.04783940687775612, -0..."
3,1629,"[0.0433502197265625, 0.04099934920668602, 0.04..."
4,1964,"[0.02364349365234375, 0.03252800926566124, 0.0..."
...,...,...
8658,33287,"[-0.005857463460415602, 0.07193433493375778, -..."
8659,24501,"[0.027599332854151726, -0.048200808465480804, ..."
8660,24500,"[0.027599332854151726, -0.048200808465480804, ..."
8661,17220,"[0.04264068230986595, -0.02278808131814003, -0..."


In [None]:
%%bigquery
-- Attach each review's embedding to its original fields. The existing
-- relevant/sentiment columns are renamed with a `_gemini` suffix.
create or replace table fin_air_travel.tmp_airport_review_joined as
select
  ar.id,
  ar.thread_id,
  ar.icao,
  ar.date_created,
  ar.author,
  ar.subject,
  ar.body,
  ar.relevant as relevant_gemini,
  ar.sentiment as sentiment_gemini,
  t.embedding
from air_travel_int.Airport_Review as ar
inner join fin_air_travel.tmp_airport_review_embedding as t
  on ar.id = t.id

Query is running:   0%|          |

In [None]:
%%bigquery
-- Preview the reviews together with their embeddings (all columns of the table).
select
  id,
  thread_id,
  icao,
  date_created,
  author,
  subject,
  body,
  relevant_gemini,
  sentiment_gemini,
  embedding
from fin_air_travel.tmp_airport_review_joined

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,thread_id,icao,date_created,author,subject,body,relevant_gemini,sentiment_gemini,embedding
0,83279,80618,AGGH,2021-11-18 17:20:11,david,Originally Henderson Field,For people interested in Second-World-War hist...,,,"[0.012682422995567322, 0.06087036803364754, 0...."
1,21134,18493,AGGH,2015-08-18 06:08:46,,re: HIR,Reply to @Ozguy: It really needs TLC. Yesterda...,,,"[0.04773791879415512, -0.005082897376269102, 0..."
2,18055,15645,AYPY,2013-06-21 19:00:31,,POM,A reasonable international terminal but nothin...,,,"[0.07391546666622162, 0.019694367423653603, -0..."
3,16447,14605,AYTA,2012-09-15 12:55:51,Bill35,Location region,This airport is in the new province of Hela. h...,,,"[0.00996060948818922, 0.0797303169965744, -0.0..."
4,16709,14769,AYTK,2012-10-27 13:00:58,,One day soon I hope.,Having spent 2 years living in Rabaul when I w...,,,"[-0.033397454768419266, -0.014946848154067993,..."
...,...,...,...,...,...,...,...,...,...,...
8658,1272,1085,ZBAA,2008-02-26 08:17:57,david,New terminal for Olympics,Terminal 3 at PEK will open on a trial basis o...,True,positive,"[0.04803690314292908, 0.039951127022504807, -0..."
8659,5594,4616,ZMBH,2010-05-05 09:08:52,,Bayankhongor Airport Parking,Bayankhongor Airport is the only airport that ...,True,positive,"[-0.027688542380928993, -0.00972745195031166, ..."
8660,537399,82358,ZSQZ,2024-07-14 04:34:25,malaybear,(no subject),welcome to my favorite airport.,True,positive,"[0.03930637985467911, -0.014370210468769073, 0..."
8661,5469,4521,ZSSS,2010-04-07 06:26:52,green_chilli,New terminal just opened April 2010,With all the preparations for the 2010 Shangha...,True,positive,"[0.010875684209167957, -0.0065320804715156555,..."


#### Create the embeddings for the target categories (e.g. "airport was amazing", "airport was good", etc.)

In [None]:
%%bigquery
-- Embed the five target sentiment phrases. All phrases go through a single
-- ml.generate_embedding() call (one input row per phrase), which yields the
-- same (query, embedding) rows as embedding each phrase in its own CTE.
create or replace table fin_air_travel.tmp_airport_review_query as
select
  content as query,
  ml_generate_embedding_result as embedding
from
  ml.generate_embedding(
    model ai_models.text_embedding,
    -- The text to embed is supplied in the column named `content`.
    (
      select content
      from unnest([
        "The airport was amazing",
        "The airport was good",
        "The airport was acceptable",
        "The airport was bad",
        "The airport was terrible"
      ]) as content
    ),
    struct(true as flatten_json_output)
  )

Query is running:   0%|          |

In [None]:
%%bigquery
-- Preview the target-phrase embeddings (both columns of the table).
select
  query,
  embedding
from fin_air_travel.tmp_airport_review_query

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,query,embedding
0,The airport was good,"[0.04038482904434204, -0.020308557897806168, 0..."
1,The airport was bad,"[0.05869739130139351, -0.0489741712808609, 0.0..."
2,The airport was acceptable,"[0.04364384710788727, -0.038139164447784424, 0..."
3,The airport was amazing,"[0.034214235842227936, -0.031752489507198334, ..."
4,The airport was terrible,"[0.06528552621603012, -0.05658840388059616, 0...."


#### Part 3: Assigning the categories
###### More details on the `ml.distance()` function: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-distance

In [None]:
%%bigquery
-- Rank every review by cosine distance to the "amazing" target phrase,
-- closest first.
with amazing as (
  -- embedding of the target phrase whose text ends in "amazing"
  select embedding
  from fin_air_travel.tmp_airport_review_query
  where query like '%amazing'
)
select
  id,
  subject,
  body,
  relevant_gemini,
  sentiment_gemini,
  ml.distance(
    (select embedding from amazing),
    embedding,
    'COSINE'
  ) as distance_amazing
from fin_air_travel.tmp_airport_review_joined
order by distance_amazing;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,subject,body,relevant_gemini,sentiment_gemini,distance_amazing
0,774,airport,its really nice airport,True,positive,0.102952
1,6291,hello,it's an amazing airport..,True,positive,0.106890
2,21689,Amazing Airport,"If every airport was like this one, I wouldn't...",,,0.114719
3,20972,beautiful airport,NICE AIRPORT,,,0.116644
4,27766,Very nice,Very clean and beautiful airport,,,0.129927
...,...,...,...,...,...,...
8658,16325,re: Persian Kittens,It's a scam. Please post the email addresses a...,,,0.824906
8659,26213,re: RE:Source of income,kapag sinabi ko pa ba nag buy and sell ako and...,,,0.828288
8660,18931,english bulldog,I think I have been scam been trying to get a ...,,,0.832294
8661,23509,@degree of consanguinity and affinity,kelangan po pasok sa 4th degree of consanguini...,,,0.834698


In [None]:
%%bigquery
-- Compute the cosine distance between every review and every target category,
-- one output row per <review, category> pair.
-- The category list drives a single join, so the review table is read once
-- rather than once per category.
create or replace table fin_air_travel.tmp_airport_review_distance as
select
  j.id,
  distance_key,
  ml.distance(q.embedding, j.embedding, 'COSINE') as distance_value
from fin_air_travel.tmp_airport_review_joined as j
cross join unnest(['amazing', 'good', 'acceptable', 'bad', 'terrible']) as distance_key
-- Match each category key to the target phrase that ends with it, e.g.
-- 'good' -> "The airport was good".
inner join fin_air_travel.tmp_airport_review_query as q
  on q.query like concat('%', distance_key)

Query is running:   0%|          |

In [None]:
%%bigquery
-- Preview the per-category distances, grouped visually by review id.
select
  id,
  distance_key,
  distance_value
from fin_air_travel.tmp_airport_review_distance
order by id

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,distance_key,distance_value
0,1,bad,0.587674
1,1,good,0.492201
2,1,amazing,0.508092
3,1,terrible,0.587785
4,1,acceptable,0.510464
...,...,...,...
43310,537399,bad,0.360615
43311,537399,good,0.258935
43312,537399,amazing,0.229636
43313,537399,terrible,0.353116


##### Note: for each <airport review, target category> pair, we compute the distance between their embeddings

##### Group the 5 distance values associated with each airport review and then take the min distance from each set

In [None]:
%%bigquery
-- Keep, for each review, the single category with the smallest distance.
-- row_number() guarantees exactly one row per id. Joining back on
-- min(distance_value) would emit one row per tied category and duplicate the
-- review downstream. Ties are broken alphabetically by distance_key so the
-- result is deterministic.
create or replace table fin_air_travel.tmp_airport_review_min_distance as
select
  id,
  distance_value as min_value,
  distance_key
from fin_air_travel.tmp_airport_review_distance
-- A null distance can never be the minimum.
where distance_value is not null
qualify row_number() over (
  partition by id
  order by distance_value, distance_key
) = 1
order by id

Query is running:   0%|          |

In [1]:
%%bigquery
-- Preview the closest category chosen for each review (all columns of the table).
select
  id,
  min_value,
  distance_key
from fin_air_travel.tmp_airport_review_min_distance

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,min_value,distance_key
0,1,0.492201,good
1,2,0.586693,acceptable
2,3,0.435318,good
3,4,0.398733,good
4,5,0.597766,acceptable
...,...,...,...
8658,531049,0.270233,amazing
8659,531615,0.432982,good
8660,535932,0.518727,amazing
8661,536849,0.621275,bad


##### Join the minimum distance and assigned target category with the rest of the airport review record

In [None]:
%%bigquery
-- Preview of the final record: review fields plus the closest category
-- (sentiment_gecko) and its distance (min_distance), most confident first.
select
  j.id,
  j.thread_id,
  j.icao,
  j.date_created,
  j.author,
  j.subject,
  j.body,
  j.relevant_gemini,
  j.sentiment_gemini,
  j.embedding,
  d.min_value as min_distance,
  d.distance_key as sentiment_gecko
from fin_air_travel.tmp_airport_review_joined as j
inner join fin_air_travel.tmp_airport_review_min_distance as d
  on j.id = d.id
order by d.min_value

##### Create the final table in the same dataset

In [None]:
%%bigquery
-- Materialize the final table: review fields plus the closest category
-- (sentiment_gecko) and its distance (min_distance).
create or replace table fin_air_travel.Airport_Review as
select
  j.id,
  j.thread_id,
  j.icao,
  j.date_created,
  j.author,
  j.subject,
  j.body,
  j.relevant_gemini,
  j.sentiment_gemini,
  j.embedding,
  d.min_value as min_distance,
  d.distance_key as sentiment_gecko
from fin_air_travel.tmp_airport_review_joined as j
inner join fin_air_travel.tmp_airport_review_min_distance as d
  on j.id = d.id
order by d.min_value

Query is running:   0%|          |

#### Part 4: Evaluation

##### Visually inspect the output to assess the accuracy of the predicted categories.

##### Note: If the assigned category doesn't make sense, you can go back and tune the `generate_embedding()` parameters and re-run

In [None]:
%%bigquery
-- List the predictions from most to least confident (smallest distance first)
-- for visual inspection. The embedding column is left out.
select
  id,
  icao,
  date_created,
  subject,
  body,
  relevant_gemini,
  sentiment_gemini,
  min_distance,
  sentiment_gecko
from fin_air_travel.Airport_Review
order by min_distance

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,icao,date_created,subject,body,relevant_gemini,sentiment_gemini,min_distance,sentiment_gecko
0,774,OPLA,2007-10-21 05:38:04,airport,its really nice airport,True,positive,0.102952,amazing
1,6291,NSFA,2010-07-25 21:34:15,hello,it's an amazing airport..,True,positive,0.106890,amazing
2,18864,MMBT,2013-11-15 17:01:59,A nice AirPort,It is a nice airport,,,0.107144,good
3,21689,KLGB,2016-04-05 14:27:23,Amazing Airport,"If every airport was like this one, I wouldn't...",,,0.114719,amazing
4,20972,VIDP,2015-06-23 10:07:11,beautiful airport,NICE AIRPORT,,,0.116644,amazing
...,...,...,...,...,...,...,...,...,...
8658,26320,RPLL,2017-06-23 07:13:06,freelancer or unemployed,"pls advice po, d ko alam kng ano dpt ko sabhin...",,,0.803517,good
8659,19538,RPLL,2014-03-29 05:10:54,Immigration,Pwede po b maging sponsor ang boyfriend kahit ...,,,0.804224,amazing
8660,19655,RPLL,2014-04-09 14:10:14,re: my brother is my sponsor,both b. certh nyo or m. cert mo kung married k...,,,0.808316,good
8661,23509,RPLL,2016-11-05 02:11:18,@degree of consanguinity and affinity,kelangan po pasok sa 4th degree of consanguini...,,,0.814569,acceptable


##### Under the embeddings approach, any category with a distance value > 0.5 appears to be unreliable

In [None]:
%%bigquery
-- Show the reviews whose closest category is still farther than 0.5 away,
-- i.e. the predictions considered unreliable. Every column except `embedding`.
select
  id,
  thread_id,
  icao,
  date_created,
  author,
  subject,
  body,
  relevant_gemini,
  sentiment_gemini,
  min_distance,
  sentiment_gecko
from fin_air_travel.Airport_Review
where min_distance > 0.5
order by min_distance

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,thread_id,icao,date_created,author,subject,body,relevant_gemini,sentiment_gemini,min_distance,sentiment_gecko
0,19907,17266,RPVB,2014-06-09 02:34:52,,"To the Guard on duty dated June 9, 2014 (time ...","Unfortunately, my maid did not allowed to ente...",,,0.500120,terrible
1,5340,4432,LPBJ,2010-03-14 09:26:10,,Beja Airport,Any Clues to when it will open anyone?. We can...,True,neutral,0.500214,amazing
2,4574,3787,VARP,2009-10-08 05:51:52,,i want to know about this airport,when did aipur airport inaugerated,True,neutral,0.500296,good
3,18219,15748,RPVW,2013-07-29 03:25:35,,borongan,"baka march 2020 pa magbukas yong airport,,sa t...",,,0.500364,bad
4,21054,18413,YCAR,2015-07-16 12:14:42,,$1.8 million Airport Upgrade and Extention 2017,The state and federal governments in 2017 will...,,,0.500396,amazing
...,...,...,...,...,...,...,...,...,...,...,...
5697,26320,23679,RPLL,2017-06-23 07:13:06,,freelancer or unemployed,"pls advice po, d ko alam kng ano dpt ko sabhin...",,,0.803517,good
5698,19538,16897,RPLL,2014-03-29 05:10:54,,Immigration,Pwede po b maging sponsor ang boyfriend kahit ...,,,0.804224,amazing
5699,19655,17014,RPLL,2014-04-09 14:10:14,,re: my brother is my sponsor,both b. certh nyo or m. cert mo kung married k...,,,0.808316,good
5700,23509,20868,RPLL,2016-11-05 02:11:18,,@degree of consanguinity and affinity,kelangan po pasok sa 4th degree of consanguini...,,,0.814569,acceptable


##### Set the records with a min distance > 0.5 to 'unknown' since we can't trust their assigned categories

In [None]:
%%bigquery
-- Overwrite the predicted category with 'unknown' wherever the closest
-- category is farther than the 0.5 distance cut-off.
update fin_air_travel.Airport_Review
set
  sentiment_gecko = 'unknown'
where
  min_distance > 0.5

Query is running:   0%|          |

#### Part 5: Performance comparison

##### With the prompting approach, how many predictions and nulls did we get

In [7]:
%%bigquery

-- Count, in one pass over the table, how many reviews did and did not receive
-- a sentiment from the prompting approach.
select
  countif(sentiment_gemini is not null) as prompting_predictions,
  countif(sentiment_gemini is null) as prompting_null_predictions
from fin_air_travel.Airport_Review

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,prompting_predictions,prompting_null_predictions
0,2666,5997


##### With the embedding approach, how many predictions and nulls did we get

In [10]:
%%bigquery

-- Count, in one pass over the table, how many reviews kept a predicted category
-- and how many were set to 'unknown' under the embeddings approach.
select
  countif(sentiment_gecko != 'unknown') as embeddings_predictions,
  countif(sentiment_gecko = 'unknown') as embeddings_unknown_predictions
from fin_air_travel.Airport_Review

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,embeddings_predictions,embeddings_unknown_predictions
0,2961,5702


##### **Conclusion:** We still have a high number of records that are not getting correctly categorized.

##### Although we ended up with only 295 additional predictions using the embeddings approach, the efficiency of this approach is clearly superior to that of the prompting technique. The prompting approach took 45-60 minutes whereas the embeddings approach took 5 minutes.

##### In the future, it would be interesting to explore if combining the two approaches can yield greater accuracy.  