## Project 4 work

#### Data anomalies (criteria 6 - 10)

##### Criteria 6

In [12]:
%%bigquery
-- Missing source airports are stored as the literal two-character string \N.
-- Test for equality instead of LIKE: inside a LIKE pattern the backslash acts
-- as an escape character, so the pattern '%\\N%' does not look for a backslash.
select source_airport_id from air_travel_raw.flight_routes
where source_airport_id = '\\N'
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,source_airport_id
0,\N
1,\N
2,\N
3,\N
4,\N


##### Criteria 7

In [1]:
%%bigquery
-- Preview the airport_name values stored in the flight_delays table.
SELECT airport_name
FROM air_travel_raw.flight_delays
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_name
0,"Kona, HI: Ellison Onizuka Kona International a..."
1,"Pensacola, FL: Pensacola International"
2,"Raleigh/Durham, NC: Raleigh-Durham International"
3,"Detroit, MI: Detroit Metro Wayne County"
4,"Salt Lake City, UT: Salt Lake City International"


##### Criteria 8

In [3]:
%%bigquery
-- Show the five longest equipment values that contain at least one space.
SELECT equipment
FROM air_travel_raw.flight_routes
WHERE equipment LIKE '% %'
ORDER BY LENGTH(equipment) DESC
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,equipment
0,320 738 330 319 739 321 343 73W 77W
1,319 320 CR9 321 717 73G 73W 738 736
2,M90 M88 320 753 757 73H 777 319 738
3,320 319 CR9 321 717 73G 73W 738 736
4,321 320 333 772 773 777 744 E70 74E


##### Criteria 9

In [4]:
%%bigquery
-- Preview the two airport identifier columns carried by airport_reviews.
SELECT
  airportRef,
  airportIdent
FROM air_travel_raw.airport_reviews
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airportRef,airportIdent
0,2212,EDDF
1,323797,EBDR
2,46563,IN-0040
3,511051,IT-0718
4,509439,CA-0876


In [5]:
%%bigquery
-- Preview the airport identifier used by the airports table.
SELECT airport_id
FROM air_travel_raw.airports
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_id
0,50
1,80
2,120
3,153
4,215


In [6]:
%%bigquery
-- Preview the airport identifier used by the tsa_traffic table.
SELECT airport_code
FROM air_travel_raw.tsa_traffic
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_code
0,TPA
1,GSP
2,PHL
3,SJC
4,CHS


##### Criteria 10

In [9]:
%%bigquery
-- Show the code columns next to their descriptive name columns.
SELECT
  carrier,
  carrier_name,
  airport,
  airport_name
FROM air_travel_raw.flight_delays
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,carrier,carrier_name,airport,airport_name
0,AS,Alaska Airlines Network,KOA,"Kona, HI: Ellison Onizuka Kona International a..."
1,9E,Endeavor Air Inc.,PNS,"Pensacola, FL: Pensacola International"
2,NK,Spirit Airlines,RDU,"Raleigh/Durham, NC: Raleigh-Durham International"
3,AS,Alaska Airlines Network,DTW,"Detroit, MI: Detroit Metro Wayne County"
4,AS,Alaska Airlines Network,SLC,"Salt Lake City, UT: Salt Lake City International"


###### Note: the `carrier_name` and `airport_name` columns are the redundant ones in this table, since each row already carries the `carrier` and `airport` codes

#### Data enrichment

In [15]:
!pip install json_repair

Collecting json_repair
  Downloading json_repair-0.51.0-py3-none-any.whl.metadata (11 kB)
Downloading json_repair-0.51.0-py3-none-any.whl (26 kB)
Installing collected packages: json_repair
Successfully installed json_repair-0.51.0


##### Define the constants and import the libraries

In [13]:
import pandas, csv, json, time, os
from pathlib import Path
from google import genai
from google.genai.types import CreateBatchJobConfig, JobState
from google.cloud import bigquery
from google.cloud import storage
from google.cloud.storage import transfer_manager

storage_client = storage.Client()
genai_client = genai.Client(vertexai=True, project=project_id, location=region)

In [24]:
project_id = "cs378-fa2025"
region = "us-central1"
bucket_name = "air-travel-open-data"
model_name = "gemini-2.5-pro"
input_file_name = "airport-comments.tsv"
input_file_folder = "initial-loads/our-airports"
input_file_path = f"{input_file_folder}/{input_file_name}" # where our input file is stored
file_chunk_path = f"{input_file_folder}/chunks"
jsonl_folder = "jsonl/our-airports"
input_data_path = f"{jsonl_folder}/input-data.jsonl" # used for both the local file and GCS file, contains the prediction requests
input_uri = f"gs://{bucket_name}/{input_data_path}" # complete path to the file containing the prediction requests
tmp_dataset = "air_travel_tmp"
tmp_table = f"{tmp_dataset}.airport_reviews_data" # output dataset and table in BQ
tmp_table_path = f"bq://{project_id}.{tmp_table}"
tmp_parquet_file = "airport_reviews_data.parquet"
lakehouse_parquet_file = f"lakehouse/our-airports/airport_comments.parquet"
system_instruction = """
You are a helpful airport analyst who reads and interprets user reviews. Your goal is to ensure that each review is categorized properly.
You double-check your answers to make sure you are not mislabeling the review and always return your answer in json format.
"""
prompt = """
What are the primary and secondary categories of this user review? What is its overall sentiment? And what is its spoken language?

Use the broad categories: Transportation & Parking, Check-in & Bag Drop, Terminal Facilities, Security & Immigration, Terminal Experience, Boarding & Departure
Use the subcategories: Security Screening, Immigration & Passport Control, Ambience & Comfort, Food, Beverage & Retail, Amenities & Facilities, Wayfinding & Information, Gate Area, Boarding Process, Deplaning & Arrival, Baggage Claim, Customs, Connecting Flights, Airport Staff, Accessibility, Flight Operations, Safety & Health, Miscellaneous.

Use the sentiments: Positive, Negative, Neutral.
Use the languages: English, Spanish, Italian, German, etc.

Use the schema: {"review_id": "INTEGER", "broad_category": "STRING", "sub_category": "STRING", "sentiment": "STRING", "language": "STRING"}

The review_id will be given to you as part of the input.
"""

In [None]:
def create_gemini_request(file_path):
    """Build the request body for one Gemini batch-prediction entry.

    The request carries the notebook-level ``system_instruction`` and
    ``prompt`` texts plus a reference to the file the model should read.

    Args:
        file_path: URI of the file to attach; it is passed through unchanged
            as ``file_uri`` with a tab-separated-values MIME type.

    Returns:
        A dict with the ``system_instruction``, ``contents`` and
        ``generation_config`` sections of the request.
    """
    # The prompt text comes first, followed by the attached file.
    user_parts = [
        {"text": prompt},
        {"file_data": {"file_uri": file_path, "mime_type": "text/tab-separated-values"}},
    ]

    return {
        "system_instruction": {"parts": [{"text": system_instruction}]},
        "contents": {"role": "user", "parts": user_parts},
        "generation_config": {"temperature": 1},
    }

##### Run the batch job prediction

In [None]:
file_chunk_size = 10  # number of reviews sent to Gemini in a single request

bucket = storage_client.bucket(bucket_name)

# Fetch the complete review file from GCS so it can be split locally.
blob = bucket.blob(input_file_path)
blob.download_to_filename(input_file_name)
print(f"Downloaded {input_file_name} from GCS")

os.makedirs(file_chunk_path, exist_ok=True)


def write_chunk(header, rows, last_row_number):
    """Write one chunk of reviews as a TSV file and return its file name.

    Args:
        header: column names of the input file, repeated at the top of every chunk.
        rows: the parsed data rows belonging to this chunk.
        last_row_number: 1-based number of the last data row in the chunk; used
            as the file name so chunk names are unique.

    Returns:
        The chunk's file name, relative to ``file_chunk_path``.
    """
    file_name = f"{last_row_number}.tsv"
    with open(f"{file_chunk_path}/{file_name}", "w", newline="", encoding="utf-8") as out_file:
        # csv.writer keeps the chunk a real TSV file (the request declares the
        # TSV MIME type) instead of a dump of Python list reprs.
        writer = csv.writer(out_file, delimiter="\t")
        writer.writerow(header)
        writer.writerows(rows)
    return file_name


filenames = []  # chunk file names, relative to file_chunk_path

# split the tsv file because it is too large to pass to Gemini in a single request
with open(input_file_name, "r", newline="", encoding="utf-8") as infile:
    reader = csv.reader(infile, delimiter="\t")
    header = next(reader, None)
    if header is None:
        raise ValueError(f"{input_file_name} is empty")

    chunk_rows = []
    row_number = 0

    # Counting from 1 makes every chunk hold exactly file_chunk_size rows
    # (counting from 0 put 11 rows into the first chunk).
    for row_number, row in enumerate(reader, start=1):
        chunk_rows.append(row)

        if len(chunk_rows) == file_chunk_size:
            filenames.append(write_chunk(header, chunk_rows, row_number))
            chunk_rows = []

    # Whatever is left over forms the final, shorter chunk.
    if chunk_rows:
        filenames.append(write_chunk(header, chunk_rows, row_number))

print(f"There are {len(filenames)} file chunks")
print(f"Using file_chunk_path: {file_chunk_path}")

# Upload from the same relative directory the chunks were written to, so the
# cell does not depend on the working directory being /content.
results = transfer_manager.upload_many_from_filenames(
    bucket, filenames, source_directory=file_chunk_path,
    blob_name_prefix=f"{file_chunk_path}/", max_workers=8)

# Each result is None on success or the exception raised for that file.
failed_uploads = [
    (file_name, result)
    for file_name, result in zip(filenames, results)
    if isinstance(result, Exception)
]
if failed_uploads:
    raise RuntimeError(f"{len(failed_uploads)} chunk uploads failed, first: {failed_uploads[0]}")
print(f"Uploaded {len(filenames)} file chunks to GCS")


# prepare the jsonl file: one request per chunk written by this run, so stale
# chunk files left in the bucket by earlier runs are not submitted again
jsonl_lines = []

for file_name in filenames:
    blob_name = f"{file_chunk_path}/{file_name}"
    request_dict = create_gemini_request(f"gs://{bucket_name}/{blob_name}")

    jsonl_lines.append(json.dumps({
        "key": blob_name,  # row identifier
        "request": request_dict
    }))

num_files = len(jsonl_lines)


# write all the lines to a jsonl file
os.makedirs(jsonl_folder, exist_ok=True)

with open(input_data_path, "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in jsonl_lines)

print(f"Wrote jsonl file: {input_data_path}. There are {len(jsonl_lines)} requests in it")

blob = bucket.blob(input_data_path)
blob.upload_from_filename(input_data_path)

print(f"Uploaded jsonl file to {input_data_path} on GCS")

# Submit the batch job; its responses are written to the temporary BQ table.
job = genai_client.batches.create(
    model=model_name,
    src=input_uri,
    config=CreateBatchJobConfig(dest=tmp_table_path),
)

print(f"Created job: {job.name} with {num_files} requests")
print(f"Job {job.name} is currently in {job.state} state")

Downloaded airport-comments.tsv from GCS
There are 1545 file chunks
Using file_chunk_path: initial-loads/our-airports/chunks
Uploaded the file chunks to GCS: [None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, N

In [None]:
# States in which the batch job will make no further progress on its own.
terminal_states = (JobState.JOB_STATE_SUCCEEDED, JobState.JOB_STATE_FAILED,
                   JobState.JOB_STATE_CANCELLED, JobState.JOB_STATE_PAUSED)
poll_interval_seconds = 45

job = genai_client.batches.get(name=job.name)

# Sleep before each refresh rather than after it, so the loop exits as soon as
# a terminal state is observed instead of waiting one more interval.
while job.state not in terminal_states:
    print(f"Job state: {job.state}")
    time.sleep(poll_interval_seconds)
    job = genai_client.batches.get(name=job.name)

print(f"Job state: {job.state}")

Job state: JobState.JOB_STATE_SUCCEEDED


##### Check the job results

In [3]:
%%bigquery
-- List the batch responses whose status does not mention an error.
SELECT
  key,
  response
FROM air_travel_tmp.airport_reviews_data
WHERE status NOT LIKE '%error%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,key,response
0,initial-loads/our-airports/chunks/13080.tsv,"{""candidates"":[{""avgLogprobs"":-11.405478641904..."
1,initial-loads/our-airports/chunks/1050.tsv,"{""candidates"":[{""avgLogprobs"":-6.8948159053407..."
2,initial-loads/our-airports/chunks/1220.tsv,"{""candidates"":[{""avgLogprobs"":-6.4595905172413..."
3,initial-loads/our-airports/chunks/1040.tsv,"{""candidates"":[{""avgLogprobs"":-10.002239679886..."
4,initial-loads/our-airports/chunks/940.tsv,"{""candidates"":[{""avgLogprobs"":-5.2651424084679..."
...,...,...
1540,initial-loads/our-airports/chunks/3660.tsv,"{""candidates"":[{""avgLogprobs"":-0.4656598522404..."
1541,initial-loads/our-airports/chunks/10.tsv,"{""candidates"":[{""avgLogprobs"":-0.8035627885298..."
1542,initial-loads/our-airports/chunks/920.tsv,"{""candidates"":[{""avgLogprobs"":-0.8527074977976..."
1543,initial-loads/our-airports/chunks/11630.tsv,"{""candidates"":[{""avgLogprobs"":-1.2997814274256..."


##### Create the enriched table

In [29]:
import re

import pandas_gbq
from json_repair import repair_json

# Read every response that did not fail. The closing quote of the LIKE pattern
# used to sit after "limit 1", which made "limit 1" part of the pattern and so
# disabled the error filter.
sql = f"""select response from {tmp_table} where status not like '%error%'
"""

responses_df = pandas_gbq.read_gbq(
    sql,
    project_id=project_id,
    dialect="standard",
)


def parse_predictions(response_json):
    """Extract the prediction dicts from one batch response.

    Args:
        response_json: the JSON text stored in the ``response`` column.

    Returns:
        A list of prediction dicts; empty when the response carries no
        candidate text or the text does not decode to objects.
    """
    response_dict = json.loads(response_json)
    try:
        raw_text = response_dict["candidates"][0]["content"]["parts"][0]["text"]
    except (KeyError, IndexError, TypeError):
        return []  # the response has no candidate text to parse

    # Drop Markdown code fences and the "json" tag that directly follows one;
    # the word "json" anywhere else in the text is left alone.
    cleaned_text = re.sub(r"```(?:json)?", "", raw_text, flags=re.IGNORECASE)

    try:
        parsed = json.loads(cleaned_text)
    except json.JSONDecodeError:
        # the model sometimes returns invalid json, so try to repair it first
        parsed = json.loads(repair_json(cleaned_text))

    # A single object is a valid answer for a one-review chunk.
    if isinstance(parsed, dict):
        parsed = [parsed]
    elif not isinstance(parsed, list):
        return []

    return [item for item in parsed if isinstance(item, dict)]


prediction_results = []  # one dict per categorized review
unparsed_responses = 0

# extract predictions from response
for response_json in responses_df["response"]:
    predictions = parse_predictions(response_json)
    if not predictions:
        unparsed_responses += 1

    for prediction_dict in predictions:
        # review_id is the join key, so a prediction without one is unusable.
        if prediction_dict.get("review_id") is None:
            continue
        # Normalize before building the frame so the ids are never converted
        # to floats by pandas.
        prediction_dict["review_id"] = str(prediction_dict["review_id"]).strip()
        prediction_results.append(prediction_dict)

print(f"number of results: {len(prediction_results)}")
if unparsed_responses:
    print(f"responses without usable predictions: {unparsed_responses}")

if not prediction_results:
    raise ValueError(f"no predictions could be extracted from {tmp_table}")

enriched_df = pandas.DataFrame(prediction_results)

enriched_table_id = f"{tmp_dataset}.airport_reviews_data_enriched"
pandas_gbq.to_gbq(enriched_df, enriched_table_id, project_id=project_id, if_exists='replace')
print(f"\nwrote enriched table to BQ: {enriched_table_id}")

Downloading: 100%|[32m██████████[0m|
number of results: 14791


100%|██████████| 1/1 [00:00<00:00, 7752.87it/s]


wrote enriched table to BQ: air_travel_tmp.airport_reviews_data_enriched





In [28]:
%%bigquery
-- Display the enriched review categories written by the previous cell.
SELECT *
FROM air_travel_tmp.airport_reviews_data_enriched

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,review_id,broad_category,sub_category,sentiment,language
0,35141,Terminal Experience,Accessibility,Negative,English
1,35140,Terminal Experience,Accessibility,Negative,English
2,3371,Terminal Experience,Accessibility,Negative,English
3,8869,Transportation & Parking,Accessibility,Negative,English
4,5596,Terminal Experience,Accessibility,Negative,English
...,...,...,...,...,...
14786,27198,Security & Immigration,Wayfinding & Information,Positive,Filipino
14787,83365,Terminal Experience,Wayfinding & Information,Positive,English
14788,16579,Terminal Facilities,Wayfinding & Information,Positive,English
14789,27299,Terminal Facilities,Wayfinding & Information,Positive,English


##### Join the Iceberg table with the enriched table

In [33]:
import pandas_gbq

# Every column is qualified with its table alias so the query stays valid when
# this cell is re-run after airport_reviews has gained the category columns.
# NOTE(review): this is an inner join, so reviews without a prediction are not
# carried into the parquet file - confirm that dropping them is intended.
sql = f"""select r.id, r.threadRef, r.airportRef, r.airportIdent, r.date, r.memberNickname,
      r.subject, r.body, e.broad_category, e.sub_category, e.sentiment, e.language
      from air_travel_raw.airport_reviews r
      join {enriched_table_id} e on cast(r.id as STRING) = e.review_id
"""

reviews_enriched_df = pandas_gbq.read_gbq(
    sql,
    project_id=project_id,
    dialect="standard",
)
print(f"joined rows: {len(reviews_enriched_df)}")

parquet_file = "airport_reviews.parquet"
reviews_enriched_df.to_parquet(parquet_file)
print("converted to parquet")

# upload to our lakehouse folder in GCS
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blob = bucket.blob(lakehouse_parquet_file)
blob.upload_from_filename(parquet_file)
print(f"wrote parquet file to gs://{bucket_name}/{lakehouse_parquet_file}")

Downloading: 100%|[32m██████████[0m|
converted to parquet
wrote parquet file to gs://air-travel-open-data/lakehouse/our-airports/airport_comments.parquet


##### Create and load the new Iceberg table

In [34]:
%%bigquery

-- Recreate airport_reviews as an Iceberg table with the original review
-- columns followed by the four enrichment columns.
CREATE OR REPLACE TABLE air_travel_raw.airport_reviews
(
  id              INT64,
  threadRef       FLOAT64,
  airportRef      INT64,
  airportIdent    STRING,
  date            STRING,
  memberNickname  STRING,
  subject         STRING,
  body            STRING,
  -- enrichment columns
  broad_category  STRING,
  sub_category    STRING,
  sentiment       STRING,
  language        STRING
)
WITH CONNECTION `988876466742.us-central1.cloud-storage-connection`
OPTIONS (
  file_format  = 'PARQUET',
  table_format = 'ICEBERG',
  storage_uri  = 'gs://air-travel-open-data/lakehouse/our-airports'
);

Query is running:   0%|          |

In [35]:
%%bigquery

-- Load the joined parquet file from the lakehouse folder into the table.
LOAD DATA INTO air_travel_raw.airport_reviews
FROM FILES (
  format = 'parquet',
  uris = ['gs://air-travel-open-data/lakehouse/our-airports/airport_comments.parquet']
);

Query is running:   0%|          |

##### Check the output

In [37]:
%%bigquery
-- Spot-check a few rows of the freshly loaded table.
SELECT *
FROM air_travel_raw.airport_reviews
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,threadRef,airportRef,airportIdent,date,memberNickname,subject,body,broad_category,sub_category,sentiment,language
0,83812,81140.0,351714,AU-0279,2022-01-21 22:31:12,adidas,MISSING FROM WEBSITE,MISSING FROM WEBSITE,Not Applicable,Not Applicable,Neutral,English
1,26961,24320.0,5698,RPME,2017-08-12 9:26:57,,Comical arrival!,The first of two times I landed at BXU I was w...,Boarding & Departure,Baggage Claim,Positive,English
2,20940,18299.0,309577,WIMM,2015-06-14 3:16:02,,Kasar,Masa koperku begitu turun langsung penyok ujun...,Boarding & Departure,Baggage Claim,Negative,Indonesian
3,20653,18012.0,5744,RPVP,2015-02-26 22:06:19,,Cute and friendly,Now double the size but still small :-) Very f...,Boarding & Departure,Baggage Claim,Positive,English
4,82396,79759.0,2563,ELLX,2019-11-21 6:02:27,mannebk,handling fees are a rip off,I have been there with a 172. Not even the ver...,Security & Immigration,Security Screening,Negative,English


##### Add the `_load_time` and `_data_source` fields to the table

In [38]:
%%bigquery

-- Add the two metadata columns to the table.
ALTER TABLE air_travel_raw.airport_reviews
  ADD COLUMN _data_source STRING,
  ADD COLUMN _load_time TIMESTAMP;

Query is running:   0%|          |

In [41]:
%%bigquery

-- Stamp every row with its data source and the current load time.
UPDATE air_travel_raw.airport_reviews
SET
  _data_source = 'our-airports,llm',
  _load_time = CURRENT_TIMESTAMP()
WHERE TRUE

Query is running:   0%|          |

##### Check the final output

In [42]:
%%bigquery
-- Spot-check a few rows, now including the metadata columns.
SELECT *
FROM air_travel_raw.airport_reviews
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,threadRef,airportRef,airportIdent,date,memberNickname,subject,body,broad_category,sub_category,sentiment,language,_data_source,_load_time
0,233,206.0,1304,CNF4,2007-07-22 19:29:03,AnthonyNalli,Buttertarts,Famous for the best buttertarts anywhere!,Terminal Facilities,"Food, Beverage & Retail",Positive,English,"our-airports,llm",2025-09-19 18:37:01.277997+00:00
1,5461,4514.0,22661,N07,2010-04-05 19:08:48,,Ed Gorski,"Wow, cant believe I found this sight. I worked...",Terminal Experience,Miscellaneous,Positive,English,"our-airports,llm",2025-09-19 18:37:01.277997+00:00
2,1772,1526.0,35150,KXSA,2008-05-21 8:05:24,david,New airport,This is a newly-built airport which opened in ...,Terminal Experience,Miscellaneous,Neutral,English,"our-airports,llm",2025-09-19 18:37:01.277997+00:00
3,4712,3897.0,2882,FKKD,2009-11-16 13:57:30,,re: Cute Pure Bred English Bulldog Puppies,Im going through the same thing right with the...,Miscellaneous,Miscellaneous,Negative,English,"our-airports,llm",2025-09-19 18:37:01.277997+00:00
4,7809,6675.0,4533,LTBJ,2011-02-16 11:49:05,maverickk_85,LTBJ,ADNAN MENDERES HAVAALANI,Terminal Experience,Miscellaneous,Neutral,Turkish,"our-airports,llm",2025-09-19 18:37:01.277997+00:00
