## Project 3 work

In [97]:
!pip install json_repair

Collecting json_repair
  Downloading json_repair-0.50.1-py3-none-any.whl.metadata (10 kB)
Downloading json_repair-0.50.1-py3-none-any.whl (26 kB)
Installing collected packages: json_repair
Successfully installed json_repair-0.50.1


#### Define common constants and import common libraries

In [None]:
# Shared settings read by the cells below.
project_id = "cs378-fa2025"  # passed to the genai client and pandas_gbq, and used to build bq:// table paths
region = "us-central1"  # location passed to the genai client
model_name = "gemini-2.5-pro"  # model used for the batch prediction jobs
bucket_name = "air-travel-open-data"  # GCS bucket that holds the inputs, request files and parquet outputs

In [None]:
import pandas, json, time, os
from pathlib import Path
from google import genai
from google.genai.types import CreateBatchJobConfig, JobState
from google.cloud import bigquery
from google.cloud import storage

# Shared clients: GCS for listing/uploading objects, Gen AI (via Vertex AI) for batch jobs.
storage_client = storage.Client()
genai_client = genai.Client(vertexai=True, project=project_id, location=region)

#### Create the temporary dataset in BQ:

In [None]:
%%bigquery

-- Scratch dataset that receives the batch prediction output tables.
-- IF NOT EXISTS keeps this cell re-runnable once the dataset has been created.
CREATE SCHEMA IF NOT EXISTS air_travel_tmp
  OPTIONS (location = 'us-central1');

Query is running:   0%|          |

### Airport Maps

In [None]:
# --- Airport maps: locations ------------------------------------------------
# GCS prefix under which the airport map files live
input_folder_prefix = "initial-loads/airport-maps"
# folder (local and in GCS) that holds the prediction request file
input_folder = "jsonl/airport-maps"
# request file path, used both as the local path and as the GCS object name
input_data_path = f"{input_folder}/input-data.jsonl"
# fully qualified GCS URI of the request file
input_uri = f"gs://{bucket_name}/{input_data_path}"

# BigQuery dataset.table that receives the batch job output
tmp_table = "air_travel_tmp.airport_maps_data"
tmp_table_path = f"bq://{project_id}.{tmp_table}"

# local parquet file and its destination object in the lakehouse folder
tmp_parquet_file = "airport_maps_data.parquet"
lakehouse_parquet_file = "lakehouse/airport-maps/airport_maps_data.parquet"

# --- Airport maps: model instructions ---------------------------------------
system_instruction = """
You are a helpful assistant who reads and understands airport terminal maps. Your goal is to ensure that an airport's businesses are properly reported.
You double-check your answers to make sure you are not mislabeling a business and always return your answers in json format.
"""

prompt = """Which businesses appear on the terminal map of this airport? Return their terminal, business name, category, and the nearest landmarks as they are shown on the map.
If the business is a dining establishment, return the menu items that it is known for.
Make sure your answer conforms to the output schema: {"terminal": "string", "business": "string", "category": "string", "location": "string", "dining": "boolean", "menu_items": "string"}
"""

#### Run the batch job prediction

In [None]:
def create_gemini_request(file_list):
    """Assemble one Gemini batch request covering every file in ``file_list``.

    The user turn starts with the module-level ``prompt`` text, followed by one
    ``file_data`` part per URI (each declared as ``application/pdf``). The
    module-level ``system_instruction`` is attached as the system instruction.

    Args:
        file_list: iterable of file URIs to attach to the request.

    Returns:
        dict with ``system_instruction``, ``contents`` and ``generation_config`` keys.
    """
    attachments = [
        {"file_data": {"file_uri": uri, "mime_type": "application/pdf"}}
        for uri in file_list
    ]
    user_parts = [{"text": prompt}] + attachments

    return {
        "system_instruction": {"parts": [{"text": system_instruction}]},
        "contents": {"role": "user", "parts": user_parts},
        "generation_config": {"temperature": 1},
    }

In [None]:
# Build one prediction request per airport, upload the request file to GCS,
# and submit the Gemini batch job.
bucket = storage_client.bucket(bucket_name)

# airport code -> list of gs:// URIs for that airport's map files.
# Dicts keep insertion order, so requests are emitted in listing order.
files_by_airport = {}

for blob in bucket.list_blobs(prefix=input_folder_prefix):

    # Skip "folder" placeholder objects (names ending in "/"): they are not map
    # files and would otherwise become a request with an empty airport key.
    if blob.name.endswith("/"):
        continue

    # Object names are assumed to look like "<prefix>/<airport>-<rest>", so the
    # airport code is the text between the prefix and the first "-".
    relative_name = blob.name[len(input_folder_prefix):].lstrip("/")
    airport = relative_name.split("-")[0]

    files_by_airport.setdefault(airport, []).append(f"gs://{bucket_name}/{blob.name}")

# make a separate request (one jsonl line) for each airport
jsonl_lines = [
    json.dumps({
        "key": airport,  # row identifier
        "request": create_gemini_request(file_listing)
    })
    for airport, file_listing in files_by_airport.items()
]
num_airports = len(jsonl_lines)

print("number of lines in jsonl file:", len(jsonl_lines))

if not jsonl_lines:
    raise ValueError(f"no input files found under gs://{bucket_name}/{input_folder_prefix}")

# write all the lines to the local jsonl file
Path(input_folder).mkdir(parents=True, exist_ok=True)

with open(input_data_path, 'w') as f:
    for line in jsonl_lines:
        f.write(line + '\n')

# upload the request file; the object name mirrors the local path
blob = bucket.blob(input_data_path)
blob.upload_from_filename(input_data_path)

job = genai_client.batches.create(
    model=model_name,
    src=input_uri,
    config=CreateBatchJobConfig(dest=tmp_table_path),
)

print(f"Created job: {job.name} with {num_airports} airports")
print(f"Job {job.name} is currently in {job.state} state")

number of lines in jsonl file: 88
Created job: projects/988876466742/locations/us-central1/batchPredictionJobs/4094091142990135296 with 88 airports
Job projects/988876466742/locations/us-central1/batchPredictionJobs/4094091142990135296 is currently in JobState.JOB_STATE_PENDING state


In [None]:
# Poll the batch job until it reaches one of the states we stop waiting on.
stop_states = (JobState.JOB_STATE_SUCCEEDED, JobState.JOB_STATE_FAILED,
               JobState.JOB_STATE_CANCELLED, JobState.JOB_STATE_PAUSED)

job = genai_client.batches.get(name=job.name)

while job.state not in stop_states:
    print(f"Job state: {job.state}")
    # Sleep BEFORE refreshing so we neither wait 45 s after the job has already
    # finished nor print the final state twice.
    time.sleep(45)
    job = genai_client.batches.get(name=job.name)

print(f"Job state: {job.state}")

Job state: JobState.JOB_STATE_PENDING
Job state: JobState.JOB_STATE_QUEUED
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_RUNNING
Job state: JobState.JOB_STATE_SUCCEEDED
Job state: JobState.JOB_STATE_SUCCEEDED


#### Check the job results

In [None]:
%%bigquery
-- Preview the batch responses whose status does not mention an error.
SELECT
  key,
  response
FROM air_travel_tmp.airport_maps_data
WHERE status NOT LIKE '%error%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,key,response
0,pns,"{""candidates"":[{""avgLogprobs"":-0.5950318842983..."
1,bzn,"{""candidates"":[{""avgLogprobs"":-0.3319354090813..."
2,cak,"{""candidates"":[{""avgLogprobs"":-0.3857411111538..."
3,oma,"{""candidates"":[{""avgLogprobs"":-0.5188454791998..."
4,lch,"{""candidates"":[{""avgLogprobs"":-0.5910918087562..."
...,...,...
83,las,"{""candidates"":[{""avgLogprobs"":-0.2409803360766..."
84,oak,"{""candidates"":[{""avgLogprobs"":-0.4583189660951..."
85,fll,"{""candidates"":[{""avgLogprobs"":-0.0938675694779..."
86,lax,"{""candidates"":[{""avgLogprobs"":-0.0598059962473..."


#### Create the parquet file

In [None]:
import pandas, pandas_gbq
import json
import re
from google.cloud import storage

# Turn the batch responses into one row per business and publish them as a
# parquet file in the lakehouse folder.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

sql = f"""select key, response from {tmp_table} where status not like '%error%'
"""

responses_df = pandas_gbq.read_gbq(
    sql,
    project_id=project_id,
    dialect="standard",
)

prediction_results = []  # one dict per business, tagged with its airport code

# extract predictions from each response
for _, row in responses_df.iterrows():
    airport = row["key"]
    response_dict = json.loads(row["response"])
    prediction_str = response_dict["candidates"][0]["content"]["parts"][0]["text"].strip()

    # Remove only the surrounding markdown fence (```json ... ```). A blanket
    # replace("json", "") would also eat "json" inside the extracted values.
    prediction_str = re.sub(r"^```(?:json)?\s*", "", prediction_str, flags=re.IGNORECASE)
    prediction_str = re.sub(r"\s*```$", "", prediction_str)

    prediction_list = json.loads(prediction_str)

    # a lone object is treated as a one-element list instead of being
    # iterated key by key (which would discard the whole prediction)
    if isinstance(prediction_list, dict):
        prediction_list = [prediction_list]

    for prediction_dict in prediction_list:
        if not isinstance(prediction_dict, dict):
            print(f"discarding: {prediction_dict}")
            continue
        prediction_dict.update({"airport": airport})
        prediction_results.append(prediction_dict)

print(f"number of results: {len(prediction_results)}")

# convert to parquet file
df = pandas.DataFrame(prediction_results)
df.to_parquet(tmp_parquet_file)
print("converted to parquet")

# upload to our lakehouse folder in GCS
blob = bucket.blob(lakehouse_parquet_file)
blob.upload_from_filename(tmp_parquet_file)
print(f"wrote parquet file to gs://{bucket_name}/{lakehouse_parquet_file}")

Downloading: 100%|[32m██████████[0m|
number of results: 1487
converted to parquet
wrote parquet file to gs://air-travel-open-data/lakehouse/airport-maps/airport_maps_data.parquet


#### Create and load the Iceberg table

In [None]:
%%bigquery

-- (Re)create the Iceberg-format table that holds one row per business found
-- on an airport map. Data files live under the lakehouse folder in GCS.
CREATE OR REPLACE TABLE air_travel_raw.airport_maps (
  airport     STRING,
  terminal    STRING,
  business    STRING,
  category    STRING,
  location    STRING,
  dining      BOOLEAN,
  menu_items  STRING
)
WITH CONNECTION `988876466742.us-central1.cloud-storage-connection`
OPTIONS (
  file_format  = 'PARQUET',
  table_format = 'ICEBERG',
  storage_uri  = 'gs://air-travel-open-data/lakehouse/airport-maps'
);

Query is running:   0%|          |

In [None]:
%%bigquery

-- Append the rows of the uploaded parquet file to the airport_maps table.
LOAD DATA INTO air_travel_raw.airport_maps
FROM FILES (
  format = 'parquet',
  uris = ['gs://air-travel-open-data/lakehouse/airport-maps/airport_maps_data.parquet']
);

Query is running:   0%|          |

#### Check the output

In [None]:
%%bigquery
-- Inspect the loaded rows (all airports).
SELECT *
FROM air_travel_raw.airport_maps

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport,terminal,business,category,location,dining,menu_items
0,pns,T1 - First Floor,Vending Machine,Dining,Near the Baggage Offices and Ticketing counters,True,Menu items not listed on map
1,pns,T2 - Second Floor,CNBC Store,Shopping,Near Gate 2,False,
2,pns,T2 - Second Floor,Gift Shop,Shopping,Near Gate 1,False,
3,pns,T2 - Second Floor,Pensacola Beach House,Dining,Near Gate 4,True,Menu items not listed on map
4,pns,T2 - Second Floor,Chick-fil-A,Dining,Near the USO and Airport Administration,True,Menu items not listed on map
...,...,...,...,...,...,...,...
1482,sfo,Terminal 3,American Express Centurion Lounge,Lounge,"Concourse F, near Gate F2",False,
1483,sfo,Terminal 3,United Global Services,Lounge,Concourse E,False,
1484,sfo,Terminal 3,Yoga Room,Amenity,Concourse E,False,
1485,sfo,Terminal 3,Kids' Spots,Amenity,Near Gates E7 and F18,False,


In [None]:
%%bigquery
-- Spot-check a single airport.
SELECT *
FROM air_travel_raw.airport_maps
WHERE airport = 'aus'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport,terminal,business,category,location,dining,menu_items
0,aus,East Concourse,Mini Market,Shopping,Near Gate 15,False,
1,aus,East Concourse,The Scoreboard,Dining,Near Gate 15,True,"Bar food, beer, cocktails"
2,aus,East Concourse,Hut's Hamburgers,Dining,Near Gate 14,True,"Hamburgers, fries, milkshakes"
3,aus,East Concourse,Pretzels,Dining,Near Gate 14,True,Soft pretzels
4,aus,East Concourse,Starbucks,Dining,Near Gate 14,True,"Coffee, espresso drinks, pastries"
5,aus,East Concourse,Brighton,Shopping,Between Gates 14 and 15,False,
6,aus,East Concourse,Barton Springs,Shopping,Between Gates 13 and 14,False,
7,aus,East Concourse,Taste ATX,Dining,Near Gate 13,True,"Local Austin food, craft beer, cocktails"
8,aus,East Concourse,Second Bar,Dining,Near Gate 13 and Flight Information,True,"Bar food, cocktails, wine"
9,aus,East Concourse,The Beerdrop,Dining,Near Gate 13,True,Craft beer


#### Add the `_load_time` and `_data_source` fields to the table

In [None]:
%%bigquery

-- Add the lineage columns. IF NOT EXISTS keeps the cell re-runnable when the
-- columns were already added by a previous execution.
ALTER TABLE air_travel_raw.airport_maps
  ADD COLUMN IF NOT EXISTS _data_source STRING,
  ADD COLUMN IF NOT EXISTS _load_time TIMESTAMP;

Query is running:   0%|          |

In [None]:
%%bigquery

-- Stamp every row with its source and load time; the always-true predicate
-- makes the update apply to the whole table.
UPDATE air_travel_raw.airport_maps
SET
  _data_source = 'airport.guide',
  _load_time = CURRENT_TIMESTAMP()
WHERE 1 = 1

Query is running:   0%|          |

#### Check final output

In [None]:
%%bigquery

-- Final spot-check, now including the lineage columns.
SELECT *
FROM air_travel_raw.airport_maps
WHERE airport = 'lax'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport,terminal,business,category,location,dining,menu_items,_data_source,_load_time
0,lax,1,Einstein Bros. Bagels,Dining,Near Gate 14,True,"Bagels, breakfast sandwiches, coffee",airport.guide,2025-09-12 22:27:13.846732+00:00
1,lax,4,LA Tapenade,Dining,Central Hall,True,Mediterranean sandwiches and salads,airport.guide,2025-09-12 22:27:13.846732+00:00
2,lax,7,Wolfgang Puck Express,Dining,Near Gate 74B,True,"Gourmet pizza, salads, sandwiches",airport.guide,2025-09-12 22:27:13.846732+00:00
3,lax,8,Engine Co No. 28,Dining,Near Gate 85,True,"American comfort food, steaks, pasta",airport.guide,2025-09-12 22:27:13.846732+00:00
4,lax,2,Barney's Beanery,Dining,Near Gate 23A,True,"American diner food, burgers, chili, breakfast",airport.guide,2025-09-12 22:27:13.846732+00:00
...,...,...,...,...,...,...,...,...,...
165,lax,International,Hugo Boss,Shopping,Great Hall,False,,airport.guide,2025-09-12 22:27:13.846732+00:00
166,lax,6,The Marketplace by Wolfgang Puck,Dining,Near Gate 65A,True,"Multiple food concepts including pizza, salads...",airport.guide,2025-09-12 22:27:13.846732+00:00
167,lax,International,iStore Boutique,Shopping,Great Hall,False,,airport.guide,2025-09-12 22:27:13.846732+00:00
168,lax,1,Trejo's Tacos,Dining,Near Gate 13,True,"Tacos, burritos, bowls",airport.guide,2025-09-12 22:27:13.846732+00:00


### TSA Traffic

In [None]:
# --- TSA traffic: locations --------------------------------------------------
input_folder_prefix = "initial-loads/tsa-traffic" # where our input pdfs are located
input_folder = "jsonl/tsa-traffic"
input_data_path = f"{input_folder}/input-data.jsonl" # used for both the local file and GCS file, contains the prediction requests
input_uri = f"gs://{bucket_name}/{input_data_path}" # complete path to file containing the prediction requests
tmp_table = "air_travel_tmp.tsa_traffic_data" # output dataset and table in BQ
tmp_table_path = f"bq://{project_id}.{tmp_table}"
tmp_parquet_file = "tsa_traffic_data.parquet"
lakehouse_parquet_file = "lakehouse/tsa-traffic/tsa_traffic_data.parquet"

# --- TSA traffic: model instructions ------------------------------------------
system_instruction = """
You are a helpful assistant who reads and understands tsa traffic reports. Your goal is to ensure that an airport's traffic patterns is properly reported.
You double-check your answers to make sure you are not mislabeling a checkpoint or reporting wrong numbers and always return your answers in json format.
"""
# NOTE: the schema example must itself be well-formed; it previously lacked the
# closing quote after the "event_hour" type, showing the model a broken template.
prompt = """Convert the file to json format. Return the date, hour, airport code, airport name, city, state, checkpoint name, and total traffic reported for each time period and location.
Please make sure that your answer conforms to the output schema:
  {"event_date": "string", "event_hour": "string", "airport_code": "string", "airport_name": "string", "airport_city": "string", "airport_state": "string", "security_checkpoint": "string", "total_traffic": "integer"}
"""

#### Run the batch job prediction

In [None]:
def create_gemini_request(file_path):
    """Build one Gemini batch request for the given file(s).

    This definition replaces the earlier list-based ``create_gemini_request``
    in the kernel, so it accepts both call styles: a single URI string (one
    request per pdf) or an iterable of URIs (several files in one request).

    The user turn starts with the module-level ``prompt`` text, followed by one
    ``file_data`` part per URI (each declared as ``application/pdf``). The
    module-level ``system_instruction`` is attached as the system instruction.

    Args:
        file_path: a file URI string, or an iterable of file URI strings.

    Returns:
        dict with ``system_instruction``, ``contents`` and ``generation_config`` keys.
    """
    # A bare string is one file; anything else is treated as a collection of URIs.
    file_uris = [file_path] if isinstance(file_path, str) else list(file_path)

    parts = [
        {
            "text": prompt
        }
    ]

    for file_uri in file_uris:
        parts.append({
            "file_data": {"file_uri": file_uri, "mime_type": "application/pdf"}
        })

    request_dict = {
        "system_instruction": {
            "parts": [
                {
                    "text": system_instruction
                }
            ]
        },
        "contents": {
            "role": "user",
            "parts": parts
        },
        "generation_config": {
            "temperature": 1,
        },
    }

    return request_dict

In [None]:
# Build one prediction request per TSA report, upload the request file to GCS,
# and submit the Gemini batch job.
jsonl_lines = []

# get the file listings for the reports that we're going to process
bucket = storage_client.bucket(bucket_name)

for blob in bucket.list_blobs(prefix=input_folder_prefix):

    # Skip "folder" placeholder objects (names ending in "/"): they are not
    # reports and cannot be processed as a pdf.
    if blob.name.endswith("/"):
        continue

    # make a separate request per pdf
    # Blob names never include the bucket name, so strip the folder prefix
    # itself to get the bare file name used as the row key.
    file_name = blob.name[len(input_folder_prefix):].lstrip("/")
    file_path = f"gs://{bucket_name}/{blob.name}"

    request_dict = create_gemini_request(file_path)

    jsonl_line = json.dumps({
        "key": file_name,  # row identifier
        "request": request_dict
    })

    jsonl_lines.append(jsonl_line)

num_reports = len(jsonl_lines)

print("number of lines in jsonl file:", len(jsonl_lines))

if not jsonl_lines:
    raise ValueError(f"no input files found under gs://{bucket_name}/{input_folder_prefix}")

# write all the lines to the local jsonl file
Path(input_folder).mkdir(parents=True, exist_ok=True)

with open(input_data_path, 'w') as f:
    for line in jsonl_lines:
        f.write(line + '\n')

# upload the request file; the object name mirrors the local path
blob = bucket.blob(input_data_path)
blob.upload_from_filename(input_data_path)

job = genai_client.batches.create(
    model=model_name,
    src=input_uri,
    config=CreateBatchJobConfig(dest=tmp_table_path),
)

print(f"Created job: {job.name} with {num_reports} tsa reports")
print(f"Job {job.name} is currently in {job.state} state")

number of lines in jsonl file: 61
Created job: projects/988876466742/locations/us-central1/batchPredictionJobs/6246618250826743808 with 61 tsa reports
Job projects/988876466742/locations/us-central1/batchPredictionJobs/6246618250826743808 is currently in JobState.JOB_STATE_PENDING state


In [None]:
# Poll the batch job until it reaches one of the states we stop waiting on.
stop_states = (JobState.JOB_STATE_SUCCEEDED, JobState.JOB_STATE_FAILED,
               JobState.JOB_STATE_CANCELLED, JobState.JOB_STATE_PAUSED)

job = genai_client.batches.get(name=job.name)

while job.state not in stop_states:
    print(f"Job state: {job.state}")
    # Sleep BEFORE refreshing so we neither wait 45 s after the job has already
    # finished nor print the final state twice.
    time.sleep(45)
    job = genai_client.batches.get(name=job.name)

print(f"Job state: {job.state}")

Job state: JobState.JOB_STATE_SUCCEEDED


#### Check the job results

In [None]:
%%bigquery
-- Preview the batch output rows whose status does not mention an error.
SELECT *
FROM air_travel_tmp.tsa_traffic_data
WHERE status NOT LIKE '%error%'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,key,status,processed_time,request,response
0,initial-loads/tsa-traffic/tsa_throughput_data_...,,2025-09-13 13:42:16.285000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0232383894120..."
1,initial-loads/tsa-traffic/tsa_throughput_data_...,,2025-09-13 13:32:05.318000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0423213001000..."
2,initial-loads/tsa-traffic/october-15-2023-to-o...,,2025-09-13 13:21:43.196000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0537476771644..."
3,initial-loads/tsa-traffic/august-7-2022-to-aug...,,2025-09-13 13:21:43.101000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0855125221046..."
4,initial-loads/tsa-traffic/august-20-2023-to-au...,,2025-09-13 13:21:46.608000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0499218318876..."
...,...,...,...,...,...
56,initial-loads/tsa-traffic/july-10-2022-to-july...,,2025-09-13 13:32:05.640000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0349749120919..."
57,initial-loads/tsa-traffic/june-12-2022-to-june...,,2025-09-13 13:42:16.209000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0456072423814..."
58,initial-loads/tsa-traffic/october-23-2022-to-o...,,2025-09-13 13:42:16.204000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0592483836132..."
59,initial-loads/tsa-traffic/july-9-2023-to-july-...,,2025-09-13 13:51:23.243000+00:00,"{""contents"":{""parts"":[{""file_data"":null,""text""...","{""candidates"":[{""avgLogprobs"":-0.0529459942869..."


#### Create the parquet file

In [115]:
import pandas, pandas_gbq
import json, re
from json_repair import repair_json

# Turn the batch responses into one row per (date, hour, checkpoint) and
# publish them as a parquet file in the lakehouse folder.
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

sql = f"""select key, response from {tmp_table} where status not like '%error%'
"""

responses_df = pandas_gbq.read_gbq(
    sql,
    project_id=project_id,
    dialect="standard",
)

columns_expected = ['event_date', 'event_hour', 'airport_code', 'airport_name', 'airport_city', 'airport_state', 'security_checkpoint', 'total_traffic']

prediction_results = []  # one dict per traffic record extracted from the reports

# extract predictions from each response
for _, row in responses_df.iterrows():
    response_dict = json.loads(row["response"])
    prediction_str = response_dict["candidates"][0]["content"]["parts"][0]["text"].strip()

    # Remove only the surrounding markdown fence (```json ... ```). A blanket
    # replace("json", "") would also eat "json" inside the extracted values.
    prediction_str = re.sub(r"^```(?:json)?\s*", "", prediction_str, flags=re.IGNORECASE)
    prediction_str = re.sub(r"\s*```$", "", prediction_str)

    try:
        prediction_list = json.loads(prediction_str)
    except ValueError:
        # the model sometimes emits invalid json objects; let json_repair fix them up
        prediction_list = json.loads(repair_json(prediction_str))

    if isinstance(prediction_list, dict):
        # a lone object is treated as a one-element list
        prediction_list = [prediction_list]
    elif not isinstance(prediction_list, list):
        print(f"discarding unusable response for {row['key']}")
        continue

    for prediction_dict in prediction_list:
        if not isinstance(prediction_dict, dict):
            continue

        # Normalise the count to a plain digit string: drop thousands separators
        # here (on this field only) rather than regex-editing the whole document.
        traffic = prediction_dict.get("total_traffic")
        if traffic is not None:
            prediction_dict["total_traffic"] = str(traffic).replace(",", "").strip()

        prediction_results.append(prediction_dict)

print(f"number of results: {len(prediction_results)}")

# debug dump of the parsed records, one per line
with open("final_prediction_str", 'w') as f:
    for line in prediction_results:
        f.write(str(line) + "\n")

# convert to parquet file
df = pandas.DataFrame(prediction_results)
print("initial columns in dataframe:", df.columns.tolist())

# Keep exactly the expected columns; stray keys produced by malformed objects
# are dropped and any missing expected column is created empty.
df = df.reindex(columns=columns_expected)
print("final columns in dataframe:", df.columns.tolist())

# Missing counts stay null instead of becoming the literal string "nan".
df['total_traffic'] = df['total_traffic'].astype(object).where(df['total_traffic'].notna(), None)

df.to_parquet(tmp_parquet_file)
print("converted to parquet")

# upload to our lakehouse folder in GCS
blob = bucket.blob(lakehouse_parquet_file)
blob.upload_from_filename(tmp_parquet_file)
print(f"wrote parquet file to gs://{bucket_name}/{lakehouse_parquet_file}")

Downloading: 100%|[32m██████████[0m|
number of results: 23449
initial columns in dataframe: ['event_date', 'event_hour', 'airport_code', 'airport_name', 'airport_city', 'airport_state', 'security_checkpoint', 'total_traffic', 'airport_code BQN', 'total_traffic- 32\n  },\n  {\n    "event_date', 'airport name', 'total_traffic- 358\n  },\n  {\n    "event_date', 'security_checkpoint- "EAST Checkpoint', 'airport_code MDW', 'eventdate', 'A CONC', 'OAK', 'Metropolitan Oakland International', 'Oakland', 'CA', 'TERM2', 'code', 'security_checkpoint A21', 'airport_code_', 'security_checkpoint_name', 'security_checkpoint.1', 'airport_codeDCA', 'e', 'security checkpoint', 'security_checkpoint A-East', 'airport city', 'airport state', 'PRCON', 'Security Checkpoint', 'Terminal A', 'Checkpoint BC', 'Gates B', 'Main Support', 'South Checkpoint', 'Rafael Hernandez A', 'Main', 'Consolidated Checkpoint', 'C1/ETS', 'C3/ETS-1', 'Main-FAI', 'A Checkpoint', 'B Checkpoint', 'Main Checkpoint', 'Checkpoint 1',

#### Create and load the Iceberg table

In [116]:
%%bigquery

-- (Re)create the Iceberg-format table that holds the extracted TSA traffic
-- records. Every field, including the count, is kept as a string at this stage.
CREATE OR REPLACE TABLE air_travel_raw.tsa_traffic (
  event_date           STRING,
  event_hour           STRING,
  airport_code         STRING,
  airport_name         STRING,
  airport_city         STRING,
  airport_state        STRING,
  security_checkpoint  STRING,
  total_traffic        STRING
)
WITH CONNECTION `988876466742.us-central1.cloud-storage-connection`
OPTIONS (
  file_format  = 'PARQUET',
  table_format = 'ICEBERG',
  storage_uri  = 'gs://air-travel-open-data/lakehouse/tsa-traffic'
);

Query is running:   0%|          |

In [117]:
%%bigquery

-- Append the rows of the uploaded parquet file to the tsa_traffic table.
LOAD DATA INTO air_travel_raw.tsa_traffic
FROM FILES (
  format = 'parquet',
  uris = ['gs://air-travel-open-data/lakehouse/tsa-traffic/tsa_traffic_data.parquet']
);

Query is running:   0%|          |

#### Check the output

In [118]:
%%bigquery
-- Inspect the loaded rows.
SELECT *
FROM air_travel_raw.tsa_traffic

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,event_date,event_hour,airport_code,airport_name,airport_city,airport_state,security_checkpoint,total_traffic
0,11/19/2017,00:00,BQN,Rafael Hernandez,Aguadilla,PR,Rafael Hernandez Air,20
1,11/19/2017,00:00,FAI,Fairbanks International,Fairbanks,AK,ASAA-FAI,108
2,11/19/2017,00:00,FAT,Fresno Air Terminal,Fresno,CA,FAT 01,11
3,11/19/2017,00:00,GUM,Antonio B. Won Pat International,Tamuning,GU,GUM01,260
4,11/19/2017,00:00,IAG,Niagara Falls International,Niagara Falls,NY,IAG-01,18
...,...,...,...,...,...,...,...,...
23444,06/28/2022,0220-0320,TPA,Tampa International,Tampa,FL,TPA-C,347
23445,06/28/2022,0220-0320,TPA,Tampa International,Tampa,FL,TPA-F,5
23446,7/17/2023,0400-0500,TPA,Tampa International,Tampa,FL,TPA-C,328
23447,7/17/2023,0400-0500,TPA,Tampa International,Tampa,FL,TPA-E,251


#### Add the `_load_time` and `_data_source` fields to the table

In [120]:
%%bigquery

-- Add the lineage columns. IF NOT EXISTS keeps the cell re-runnable when the
-- columns were already added by a previous execution.
ALTER TABLE air_travel_raw.tsa_traffic
  ADD COLUMN IF NOT EXISTS _data_source STRING,
  ADD COLUMN IF NOT EXISTS _load_time TIMESTAMP;

Query is running:   0%|          |

In [121]:
%%bigquery

-- Stamp every row with its source and load time; the always-true predicate
-- makes the update apply to the whole table.
UPDATE air_travel_raw.tsa_traffic
SET
  _data_source = 'tsa',
  _load_time = CURRENT_TIMESTAMP()
WHERE 1 = 1

Query is running:   0%|          |

#### Check final output

In [122]:
%%bigquery

-- Final check, now including the lineage columns.
SELECT *
FROM air_travel_raw.tsa_traffic

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,event_date,event_hour,airport_code,airport_name,airport_city,airport_state,security_checkpoint,total_traffic,_data_source,_load_time
0,11/19/2017,03:00,GSP,Greenville-Spartanburg,Green,SC,Checkpoint,147,tsa,2025-09-13 17:22:26.134702+00:00
1,11/19/2017,04:00,CHS,Charleston County International,Charleston,SC,Checkpoint,156,tsa,2025-09-13 17:22:26.134702+00:00
2,11/19/2017,04:00,TRI,Tri City Regional,Blountville,TN,TRI-A,1,tsa,2025-09-13 17:22:26.134702+00:00
3,10/23/2023,0300-0400,HPN,Westchester County,White Plains,NY,Main Terminal,1,tsa,2025-09-13 17:22:26.134702+00:00
4,10/23/2023,0300-0400,ICT,Wichita Dwight D. Eisenhower National,Wichita,KS,Wichita Main,4,tsa,2025-09-13 17:22:26.134702+00:00
...,...,...,...,...,...,...,...,...,...,...
23444,11/26/2017,02:00,TPA,Tampa International,Tampa,FL,TPA E,0,tsa,2025-09-13 17:22:26.134702+00:00
23445,11/26/2017,03:00,TPA,Tampa International,Tampa,FL,TPA F,50,tsa,2025-09-13 17:22:26.134702+00:00
23446,11/13/2023,04:00 - 04:59,TPA,Tampa International,Tampa,FL,TPA-F,147,tsa,2025-09-13 17:22:26.134702+00:00
23447,2/12/2024,0100-0200,TPA,Tampa International,Tampa,FL,TPA C,34,tsa,2025-09-13 17:22:26.134702+00:00
