### staging dataset

In [None]:
from google.cloud import bigquery

project_id = "cs378-fa2024"
dataset = "air_travel_stg"
region = "us-central1"

bq_client = bigquery.Client()

dataset_id = bigquery.Dataset(f"{project_id}.{dataset}")
dataset_id.location = region
resp = bq_client.create_dataset(dataset_id, exists_ok=True)
print("Created dataset {}.{}".format(bq_client.project, resp.dataset_id))

Created dataset cs378-fa2024.air_travel_stg


### `aircrafts` table

##### Replace `'\\N'` with null

In [None]:
%%bigquery
select aircraft_name, iata_code,
case icao_code when '\\N' then null else icao_code end as icao_code,
  _data_source, _load_time
from air_travel_raw.aircrafts;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,aircraft_name,iata_code,icao_code,_data_source,_load_time
0,Airbus A330,330,,openflights,2024-08-25 16:15:09.744963+00:00
1,Airbus A340,340,,openflights,2024-08-25 16:15:09.744963+00:00
2,Airbus A350,350,,openflights,2024-08-25 16:15:09.744963+00:00
3,Airbus A380,380,,openflights,2024-08-25 16:15:09.744963+00:00
4,BAe 146,146,,openflights,2024-08-25 16:15:09.744963+00:00
...,...,...,...,...,...
241,Tupolev Tu-204,T20,T204,openflights,2024-08-25 16:15:09.744963+00:00
242,Pilatus Britten-Norman BN-2A Mk III Trislander,BNT,TRIS,openflights,2024-08-25 16:15:09.744963+00:00
243,Yakovlev Yak-40,YK4,YK40,openflights,2024-08-25 16:15:09.744963+00:00
244,Yakovlev Yak-42,YK2,YK42,openflights,2024-08-25 16:15:09.744963+00:00


#### Create staging table

In [18]:
%%bigquery
create or replace table air_travel_stg.aircrafts as
  select aircraft_name as name,
  case iata_code when '\\N' then null else iata_code end as iata,
  case icao_code when '\\N' then null else icao_code end as icao,
  _data_source,
  _load_time
  from air_travel_raw.aircrafts;

Query is running:   0%|          |

### `airlines` table

##### Replace `'\\N'` with null

In [None]:
%%bigquery
select airline_id as id,
name,
case alias when '\\N' then null else alias end as alias,
iata, icao, callsign,
case country when '\\N' then null else country end as country,
active, _data_source, _load_time
from air_travel_raw.airlines;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,alias,iata,icao,callsign,country,active,_data_source,_load_time
0,415,Aerojet de Costa Rica,,,ARJ,S.A.,,False,openflights,2024-08-25 15:52:55.956309+00:00
1,1509,Bellview Airlines,,,BVU,Sierra Leone,,False,openflights,2024-08-25 15:52:55.956309+00:00
2,1516,BAX Global,,8W,,,,False,openflights,2024-08-25 15:52:55.956309+00:00
3,2080,Electronic Data Systems,,,1Y,,,False,openflights,2024-08-25 15:52:55.956309+00:00
4,2636,Gulf African Airlines - Gambia,,,GUF,GULF AFRICAN,,False,openflights,2024-08-25 15:52:55.956309+00:00
...,...,...,...,...,...,...,...,...,...,...
6157,2773,Hong Kong Airlines,,HX,CRK,BAUHINIA,Hong Kong SAR of China,True,openflights,2024-08-25 15:52:55.956309+00:00
6158,2774,Hong Kong Express Airways,,UO,HKE,HONGKONG SHUTTLE,Hong Kong SAR of China,True,openflights,2024-08-25 15:52:55.956309+00:00
6159,3233,Lao Airlines,,QV,LAO,LAO,Lao Peoples Democratic Republic,True,openflights,2024-08-25 15:52:55.956309+00:00
6160,11724,SVG Air,,,SVG,Grenadines,Saint Vincent and the Grenadines,True,openflights,2024-08-25 15:52:55.956309+00:00


##### Look for empty strings

In [None]:
%%bigquery
select count(*) as empty_alias
from air_travel_raw.airlines
where alias = '';

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_alias
0,505


In [None]:
%%bigquery
select count(*) as empty_country
from air_travel_raw.airlines
where country = '';

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_country
0,15


##### Putting it all together

In [None]:
%%bigquery
select airline_id as id,
name,
case alias
  when '\\N' then null
  when '' then null
  else alias
  end as alias,
case iata
  when '' then null
  else iata
  end as iata,
case icao
  when '' then null
  else icao
  end as icao,
case callsign
  when '' then null
  else callsign
  end as callsign,
case country
  when '\\N' then null
  when '' then null
  else country end as country,
active, _data_source, _load_time
from air_travel_raw.airlines;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,alias,iata,icao,callsign,country,active,_data_source,_load_time
0,415,Aerojet de Costa Rica,,,ARJ,S.A.,,False,openflights,2024-08-25 15:52:55.956309+00:00
1,1509,Bellview Airlines,,,BVU,Sierra Leone,,False,openflights,2024-08-25 15:52:55.956309+00:00
2,1516,BAX Global,,8W,,,,False,openflights,2024-08-25 15:52:55.956309+00:00
3,2080,Electronic Data Systems,,,1Y,,,False,openflights,2024-08-25 15:52:55.956309+00:00
4,2636,Gulf African Airlines - Gambia,,,GUF,GULF AFRICAN,,False,openflights,2024-08-25 15:52:55.956309+00:00
...,...,...,...,...,...,...,...,...,...,...
6157,2773,Hong Kong Airlines,,HX,CRK,BAUHINIA,Hong Kong SAR of China,True,openflights,2024-08-25 15:52:55.956309+00:00
6158,2774,Hong Kong Express Airways,,UO,HKE,HONGKONG SHUTTLE,Hong Kong SAR of China,True,openflights,2024-08-25 15:52:55.956309+00:00
6159,3233,Lao Airlines,,QV,LAO,LAO,Lao Peoples Democratic Republic,True,openflights,2024-08-25 15:52:55.956309+00:00
6160,11724,SVG Air,,,SVG,Grenadines,Saint Vincent and the Grenadines,True,openflights,2024-08-25 15:52:55.956309+00:00


##### Create staging table

In [None]:
%%bigquery
create or replace table air_travel_stg.airlines as
  select airline_id as id,
  name,
  case alias
    when '\\N' then null
    when '' then null
    else alias
    end as alias,
  case iata
    when '' then null
    else iata
    end as iata,
  case icao
    when '' then null
    else icao
    end as icao,
  case callsign
    when '' then null
    else callsign
    end as callsign,
  case country
    when '\\N' then null
    when '' then null
    else country end as country,
  active, _data_source, _load_time
  from air_travel_raw.airlines;

Query is running:   0%|          |

### `airport_businesses` table

#### Profile category values

In [None]:
%%bigquery
select distinct category
from air_travel_raw.airport_businesses
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category
0,ATM
1,Administration
2,Air Service
3,Air Transportation
4,Airline
...,...
184,restaurant
185,security
186,shipping company
187,shopping


In [None]:
%%bigquery
select category, count(*) as airline_categories
from air_travel_raw.airport_businesses
where category like '%Airline%'
group by category
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category,airline_categories
0,Airline,108
1,Airline Club & Lounges,3
2,Airline Office,2
3,Airline Services,1
4,Airline Ticket Counters,1
5,Airline Ticketing,3
6,Airlines,29


In [None]:
%%bigquery
select category, count(*) as airport_categories
from air_travel_raw.airport_businesses
where category like '%Airport%'
group by category
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category,airport_categories
0,Airport Facility,1
1,Airport Security,2
2,Airport Service,40
3,Airport Services,108


In [None]:
%%bigquery
select category, count(*) as food_categories
from air_travel_raw.airport_businesses
where category like '%Food%'
group by category
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category,food_categories
0,Fast Food,13
1,Food,2
2,Food & Drink,4
3,Food Court,2
4,Food/Drink,3


In [None]:
%%bigquery
select category, count(*) as news_categories
from air_travel_raw.airport_businesses
where category like '%News%'
group by category
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category,news_categories
0,News,2
1,News & Media,5
2,News Kiosk,1
3,News Stand,2
4,Newsagent,1
5,Newsstand,39
6,Newsstand/Coffee Shop,1


In [None]:
%%bigquery
select category, count(*) as shop_categories
from air_travel_raw.airport_businesses
where category like 'Shop%'
group by category
order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category,shop_categories
0,Shop,2
1,Shopping,160


#### Standardize the category field with LLM

In [None]:
import vertexai
from vertexai.generative_models import GenerativeModel, Part

project_id = "cs378-fa2024"
region = "us-central1"
model_name = "gemini-1.5-flash-001"
prompt = """Look for categories that have similar meanings, but are distinct from each other.
Replace them with a standard category.
For example, some records have a category of 'Airline Ticket Counters' and others
have a category of 'Airline Ticketing' and they both mean similar things.
Map 'Airline Ticket Counters' to 'Airline Ticketing'.
Return the entire list of original categories along with the new categories.
Format the results as json with the schema: current_category:string, new_category:string.

Below is the list of current_categories:
Airline
Airline Club & Lounges
Airline Office
Airline Services
Airline Ticket Counters
Airline Ticketing
Airlines
"""

vertexai.init(project=project_id, location=region)
model = GenerativeModel(model_name)
resp = model.generate_content([prompt])
resp_text = resp.text.replace("```json", "").replace("```", "").replace("\n", "")
print(resp_text)

[  {    "current_category": "Airline",    "new_category": "Airlines"  },  {    "current_category": "Airline Club & Lounges",    "new_category": "Airline Services"  },  {    "current_category": "Airline Office",    "new_category": "Airlines"   },  {    "current_category": "Airline Services",    "new_category": "Airline Services"  },  {    "current_category": "Airline Ticket Counters",    "new_category": "Airline Ticketing"  },  {    "current_category": "Airline Ticketing",    "new_category": "Airline Ticketing"  },  {    "current_category": "Airlines",    "new_category": "Airlines"  }] 


#### Refine the prompt and make it more dynamic

In [None]:
import json
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import bigquery

project_id = "cs378-fa2024"
region = "us-central1"
model_name = "gemini-1.5-flash-001"
prompt = """Go through this list of categories and look for categories that have similar meanings, but were given different names.
For example, 'Airline Ticket Counters' and 'Airline Ticketing' have similar meanings and 'Mother Room and 'Mother's Room' have similar meanings.
Suggest a standard category, mapping the current one to the new one.
Return the list of original categories along with their new categories.
Format the results as a json object with the schema: current_category:string, new_category:string.
Do not include any unchanged categories with your answer.
Do not include an explanation with your answer.
"""
category_sql = """select distinct category from air_travel_raw.airport_businesses
where category like 'R%' order by category limit 30"""

airport_businesses_sql = """select airport_code, terminal, business, category, location
from air_travel_raw.airport_businesses where category like 'R%'
and category != 'Restaurant'
order by category limit 30"""

bq_client = bigquery.Client()
rows = bq_client.query_and_wait(category_sql)

category_list = []
for row in rows:
    category_list.append(row["category"])
category_str = '\n'.join(category_list)
print("category_str:", category_str)

vertexai.init(project=project_id, location=region)
model = GenerativeModel(model_name)
resp = model.generate_content([category_str, prompt])
resp_text = resp.text.replace("```json", "").replace("```", "").replace("\n", "")
print(resp_text)
categories = json.loads(resp_text)
print("categories:", categories)

# filter out any unchanged categories
replacements = {}
for old, new in categories.items():
    if old == new:
        continue
    else:
        replacements[old] = new
print("replacements:", replacements)

df = bq_client.query_and_wait(airport_businesses_sql).to_dataframe()
print("orig df:", df)

df["category"] = df["category"].map(replacements).fillna(df["category"])
print("new df:", df)

category_str: Rental
Restaurant
Restroom
Restrooms
Retail
{  "Restroom": "Restrooms",  "Retail": "Restaurant"}
categories: {'Restroom': 'Restrooms', 'Retail': 'Restaurant'}
replacements: {'Restroom': 'Restrooms', 'Retail': 'Restaurant'}
orig df:    airport_code terminal                 business   category  \
0           syr        1  Automated Rental Lounge     Rental   
1           abq        1             Nursing Room   Restroom   
2           pgd        1            Mother's Room  Restrooms   
3           blv        1         Family Restrooms  Restrooms   
4           atw        1              Mother Room  Restrooms   
5           atw        1              Mother Room  Restrooms   
6           gtr        1         Vending machines     Retail   
7           chs        1               Amazon One     Retail   
8           fsd        1                Gift Shop     Retail   
9           fsd        1                Gift Shop     Retail   
10          lax        3           Hudson Nonstop 

#### Process entire table and save results to BQ

In [None]:
import json
import pandas_gbq
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import bigquery

project_id = "cs378-fa2024"
region = "us-central1"
model_name = "gemini-1.5-flash-001"
prompt = """Go through this list of categories and look for categories that have similar meanings, but were given different names.
For example, 'Airline Ticket Counters' and 'Airline Ticketing' have similar meanings and 'Mother Room and 'Mother's Room' have similar meanings.
Suggest a standard category, mapping the current one to the new one.
Return the list of original categories along with their new categories.
Format the results as a json object with the schema: current_category:string, new_category:string.
Do not include any unchanged categories with your answer.
Do not include an explanation with your answer.
"""
category_sql = "select distinct category from air_travel_raw.airport_businesses"

airport_businesses_sql = "select * from air_travel_raw.airport_businesses"

bq_client = bigquery.Client()
rows = bq_client.query_and_wait(category_sql)

category_list = []
for row in rows:
    category_list.append(row["category"])
category_str = '\n'.join(category_list)
#print("category_str:", category_str)

vertexai.init(project=project_id, location=region)
model = GenerativeModel(model_name)
resp = model.generate_content([category_str, prompt])
resp_text = resp.text.replace("```json", "").replace("```", "").replace("\n", "")
print(resp_text)
categories = json.loads(resp_text)
print(type(categories))

replacements = {} # will store the new categories

# categories can be either a dictionary or list type (depending on what LLM decides to do!)
if type(categories) == dict:
    for old, new in categories.items():
        if old == new:
            continue
        else:
            replacements[old] = new

if type(categories) == list:
    for cat_entry in categories:
        if cat_entry['current_category'] == cat_entry['new_category']:
            continue
        else:
            replacements[cat_entry['current_category']] = cat_entry['new_category']

print("replacements:", replacements)

# read the table and merge in the changes
df = bq_client.query_and_wait(airport_businesses_sql).to_dataframe()
print("orig df:", df)

df["category"] = df["category"].map(replacements).fillna(df["category"])
print("new df:", df)

table_id = "air_travel_stg.tmp_airport_businesses"
pandas_gbq.to_gbq(df, table_id, project_id=project_id, if_exists="replace")

{  "Airline Ticket Counters": "Airline Ticketing",  "Beverage": "Food & Drink",  "Snacks": "Food & Drink",  "Food/Drink": "Food & Drink",  "News & Media": "News",  "Banking": "Financial Services",  "Public Space": "Airport Facility",  "Work Space": "Office",  "Government": "Administration",  "Convenience Store": "Shop",  "Community": "Airport Facility",  "Airline Ticketing": "Airline Services",  "Dessert": "Food & Drink",  "Pizza": "Food & Drink",  "Shoe Shine": "Services",  "Cosmetics": "Beauty Products",  "Sunglasses": "Accessories",  "Salon": "Beauty Services",  "Liquor Store": "Shop",  "Lottery": "Other",  "Jewelry": "Accessories",  "Sporting Goods": "Shop",  "Travel Agency": "Services",  "Sporting Goods Store": "Shop",  "Pretzels": "Food & Drink",  "Wine Bar": "Bar/Restaurant",  "Grocery Store": "Shop",  "Newsagent": "News",  "Café": "Coffee Shop",  "Observation Point": "Airport Facility",  "Juice Bar": "Food & Drink",  "Phone Service": "Communication",  "Healthcare": "Medical",  

100%|██████████| 1/1 [00:00<00:00, 5454.23it/s]


#### Check the output table

In [None]:
%%bigquery
select distinct category from air_travel_stg.tmp_airport_businesses order by category

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,category
0,Accessories
1,Administration
2,Airline Services
3,Airline Ticketing
4,Airlines
5,Airport Facility
6,Airport Services
7,Art
8,Art Exhibit
9,Baby Care


#### Now do the same thing with the business names, i.e. standardize them to reduce inconsistent spellings

In [None]:
import json
import pandas_gbq
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import bigquery

project_id = "cs378-fa2024"
region = "us-central1"
model_name = "gemini-1.5-flash-001"
prompt = """Go through this list of business names and standardize them to remove the variations in spelling.
For example, 'Sweet Jill Bakery' and 'Sweet Jill's Bakery' both refer to the same business, so standardize on one or the other.
Another example is, 'Alaska Airline' and 'Alaska Airlines'. Since they both refer to the same business, standardize on 'Alaska Airlines'.
Suggest a standard name, mapping the current one to the new one.
Return the list of original business names along with their new names.
Format the results as a json object with the schema: current_name:string, new_new:string.
Do not include any unchanged business names with your answer.
Do not include an explanation with your answer.
"""
business_names_sql = "select distinct business from air_travel_stg.tmp_airport_businesses order by business"

airport_businesses_sql = "select * from air_travel_stg.tmp_airport_businesses"


def do_inference(input_str):

    vertexai.init(project=project_id, location=region)
    model = GenerativeModel(model_name)
    resp = model.generate_content([input_str, prompt])
    resp_text = resp.text.replace("```json", "").replace("```", "").replace("\n", "")
    print(resp_text)

    names = json.loads(resp_text)
    print(type(names))

    replacements = {}

    # names can be either a dictionary or list type (depending on what the LLM decides to do!)
    if type(names) == dict:
        for old, new in names.items():
            if old == new:
                continue
            else:
                replacements[old] = new

    if type(names) == list:
        for name_entry in names:
            if name_entry['current_name'] == name_entry['new_name']:
                continue
            else:
                replacements[cat_entry['current_name']] = cat_entry['new_name']

    return replacements


bq_client = bigquery.Client()
rows = bq_client.query_and_wait(business_names_sql)

batch_size = 500
business_names = []
combined_replacements = {}

for i, row in enumerate(rows):

    business_names.append(row["business"])

    if i > 0 and i % batch_size == 0:
        # process batch
        print("processing batch")
        business_names_str = '\n'.join(business_names)
        replacements = do_inference(business_names_str)
        combined_replacements.update(replacements)

        # reset business_names to process next batch
        business_names = []

if len(business_names) > 0:
    print("processing last batch")
    business_names_str = '\n'.join(business_names)
    replacements = do_inference(business_names_str)
    combined_replacements.update(replacements)

print("replacements:", replacements)

# read the table and merge in the changes
df = bq_client.query_and_wait(airport_businesses_sql).to_dataframe()
print("orig df:", df)

df["business"] = df["business"].map(replacements).fillna(df["business"])
print("new df:", df)

table_id = "air_travel_stg.airport_businesses" # output table
pandas_gbq.to_gbq(df, table_id, project_id=project_id, if_exists="replace")

processing batch
{"13th Street Pub and Grill": "13th Street Pub","3 Daughters Brewing at PIE": "3 Daughters Brewing","49 Mile Market": "49th Mile Market","Air Canada Airlines": "Air Canada","Airline Ticketing Lobbe": "Airline Ticketing","Airport Administration Office": "Airport Administration","Airport Management Offices": "Airport Management","Airport Offices": "Airport Office","Alamo Rent A Car": "Alamo Car Rental","Alamo Rental Car": "Alamo Car Rental","Alaska Airline": "Alaska Airlines","Alaska Airlines Bag Claim Office": "Alaska Airlines Baggage Claim","Alaska Airlines Lounge": "Alaska Airlines Lounge","Alaska Airlines Ticketing": "Alaska Airlines Ticketing","Alaska Offices": "Alaska Office","Alaska Seaplanes": "Alaska Seaplane","Alaska Ticketing": "Alaska Airlines Ticketing","Allegiant Air": "Allegiant Airlines","Allegiant Airline": "Allegiant Airlines","American Air": "American Airlines","American Air Lines": "American Airlines","American Airline": "American Airlines","American 

100%|██████████| 1/1 [00:00<00:00, 6615.62it/s]


In [None]:
%%bigquery
select * from air_travel_stg.airport_businesses
order by business desc

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_code,terminal,business,category
0,lax,international,iStore Boutique,Electronics
1,bdl,1,iStore,Shopping
2,hou,1,iStore,Electronics Store
3,sfo,1,iStore,Electronics
4,las,1,iCandy,Candy Store
...,...,...,...,...
1569,lgb,1,4th Vine,Restaurant
1570,sfo,2,49 Mile Market,Shopping
1571,pie,1,3 Daughters Brewing at PIE,Dining
1572,boi,1,13th Street Pub and Grill,Dining


#### Look for certain unwanted characters in the name of the business

In [None]:
%%bigquery
select * from air_travel_stg.airport_businesses
where business like '“%'
order by business

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_code,terminal,business,category,location,menu_items,_data_source,_load_time
0,sba,1,“Fiesta,Entertainment,Near gates G3,,airportguide,2024-08-25 15:16:58.467182+00:00
1,sba,1,“Good Time Clock,Entertainment,Near Car Rental National,,airportguide,2024-08-25 15:16:58.467182+00:00
2,sba,1,“SB Sky Gazing,Entertainment,Near gates G3,,airportguide,2024-08-25 15:16:58.467182+00:00


#### Create staging table and remove unwanted characters

In [None]:
%%bigquery
create or replace table air_travel_stg.airport_businesses as
    select airport_code, terminal, replace(business, '“', '') as business,
      category, location, menu_items, _data_source, _load_time
    from air_travel_stg.airport_businesses

Query is running:   0%|          |

In [None]:
%%bigquery
select * from air_travel_stg.airport_businesses
where business like '“%'
order by business

Query is running:   0%|          |

Downloading: |          |

Unnamed: 0,airport_code,terminal,business,category,location,menu_items,_data_source,_load_time


#### Cleanup

In [None]:
%%bigquery
drop table air_travel_stg.tmp_airport_businesses

Query is running:   0%|          |

### `airport_reviews` table

#### Rename fields and create staging table

In [None]:
%%bigquery
create or replace table air_travel_stg.airport_reviews as
	select id,
	threadRef as thread_id,
	airportRef as airport_id,
	airportIdent as airport_code,
	date as date_created,
	memberNickname as author,
	subject,
	body,
	_data_source,
	_load_time
	from air_travel_raw.airport_reviews

Query is running:   0%|          |

### `airports` table



#### Profile `source` and `type` fields

In [None]:
%%bigquery
select count(*) source_count
from air_travel_raw.airports
where source != '\\N' and source is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,source_count
0,11017


In [None]:
%%bigquery
select count(*) type_count
from air_travel_raw.airports
where type != '\\N' and type is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,type_count
0,11017


In [None]:
%%bigquery
select distinct source
from air_travel_raw.airports
where source != '\\N' and source is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,source
0,User
1,Legacy
2,OurAirports


In [None]:
%%bigquery
select distinct type
from air_travel_raw.airports
where type != '\\N' and type is not null
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,type
0,unknown
1,airport
2,station
3,port


In [None]:
%%bigquery
	select safe_cast(airport_id as INTEGER) as airport_id,
	airport_name,
	city,
	country,
	case iata
		when '\\N' then null
		else iata
		end as iata,
	case icao
		when '\\N' then null
		else icao
		end as icao,
	latitude,
	longitude,
	altitude,
	safe_cast(timezone as INTEGER) as timezone_delta,
  daylight_savings_time,
  case tz_database_timezone
		when '\\N' then null
		else tz_database_timezone
		end as timezone_name,
	case type
		when '\\N' then null
		else type
		end as type,
	case source
		when '\\N' then null
		else source
		end as source,
	_data_source
	_load_time
	from air_travel_raw.airports

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_id,airport_name,city,country,iata,icao,latitude,longitude,altitude,timezone_delta,daylight_savings_time,timezone_name,type,source,_load_time
0,11746,Kona Dock,Kailua-Kona Hawaii,United States,,,19.63858700000000000000000000000000000000,-155.99673900000000000000000000000000000000,0,-10,A,,,,openflights
1,12060,Kanuhuraa Island,Kanuhuraa Island,Maldives,,,5.53413100000000000000000000000000000000,73.50598100000000000000000000000000000000,0,5,N,,,,openflights
2,12061,Kwara Airstrip,Kwara Camp,Botswana,,,-19.10142900000000000000000000000000000000,23.28379800000000000000000000000000000000,0,1,U,,,,openflights
3,12062,Matemo Island,Matemo Island,Mozambique,,,-12.20077300000000000000000000000000000000,40.56896900000000000000000000000000000000,0,1,U,,,,openflights
4,12063,Medjumbe Island,Medjumbe Island,Mozambique,,,-11.81743200000000000000000000000000000000,40.60365600000000000000000000000000000000,0,1,U,,,,openflights
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12663,2500,Santa Cruz Airport,Santa Cruz,Argentina,RZA,SAWU,-50.01650000000000000000000000000000000000,-68.57920000000000000000000000000000000000,364,-3,N,America/Argentina/Rio_Gallegos,airport,OurAirports,openflights
12664,4061,El Calafate Airport,El Calafate,Argentina,FTE,SAWC,-50.28030000000000000000000000000000000000,-72.05310100000000000000000000000000000000,669,-3,N,America/Argentina/Rio_Gallegos,airport,OurAirports,openflights
12665,6028,Las Heras Airport,Las Heras,Argentina,LHS,SAVH,-46.53829956049999000000000000000000000000,-68.96530151370000000000000000000000000000,1082,-3,N,America/Argentina/Rio_Gallegos,airport,OurAirports,openflights
12666,6030,Lago Argentino Airport,El Calafate,Argentina,ING,SAWA,-50.33610200000000000000000000000000000000,-72.24859600000000000000000000000000000000,732,-3,N,America/Argentina/Rio_Gallegos,airport,OurAirports,openflights


In [None]:
%%bigquery
select count(*) as type_not_null
from air_travel_raw.airports
where type is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,type_not_null
0,12668


In [None]:
%%bigquery
select count(*) as source_not_null
from air_travel_raw.airports
where source is not null

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,source_not_null
0,12668


#### Create staging table

In [None]:
%%bigquery
create or replace table air_travel_stg.airports as
	select safe_cast(airport_id as INTEGER) as airport_id,
	airport_name,
	city,
	country,
	case iata
		when '\\N' then null
		else iata
		end as iata,
	case icao
		when '\\N' then null
		else icao
		end as icao,
	latitude,
	longitude,
	altitude,
	safe_cast(timezone as INTEGER) as timezone_delta,
  daylight_savings_time,
  case tz_database_timezone
		when '\\N' then null
		else tz_database_timezone
		end as timezone_name,
	case type
		when '\\N' then null
		else type
		end as type,
	case source
		when '\\N' then null
		else source
		end as source,
	_data_source,
	_load_time
	from air_travel_raw.airports

Query is running:   0%|          |

### `countries` table

##### Profile `country_name`, `iso_code`, and `dafif_code`

In [None]:
%%bigquery
select count(*) as empty_country_name
from air_travel_raw.countries
where country_name = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_country_name
0,0


In [None]:
%%bigquery
select count(*) as empty_iso_code
from air_travel_raw.countries
where iso_code = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_iso_code
0,19


In [None]:
%%bigquery
select count(*) as empty_dafif_code
from air_travel_raw.countries
where dafif_code = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_dafif_code
0,0


#### Replace `'\\N'` with null

In [None]:
%%bigquery
select country_name as name,
case iso_code
  when '\\N' then null
  else iso_code
  end as iso_code,
dafif_code
from air_travel_raw.countries

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,name,iso_code,dafif_code
0,United Arab Emirates,AE,AE
1,Afghanistan,AF,AF
2,Antigua and Barbuda,AG,AC
3,Anguilla,AI,AV
4,Albania,AL,AL
...,...,...,...
256,Midway Islands,,MQ
257,Paracel Islands,,PF
258,Spratly Islands,,PG
259,Tromelin Island,,TE


#### Create staging table

In [None]:
%%bigquery
create or replace table air_travel_stg.countries as
    select country_name as name,
    case iso_code
      when '\\N' then null
      else iso_code
      end as iso_code,
    dafif_code,
    _data_source,
    _load_time
    from air_travel_raw.countries

Query is running:   0%|          |

### `flight_delays` table

In [None]:
%%bigquery
select airport_name
from air_travel_raw.flight_delays
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport_name
0,"Albany, NY: Albany International"
1,"Atlanta, GA: Hartsfield-Jackson Atlanta Intern..."
2,"Birmingham, AL: Birmingham-Shuttlesworth Inter..."
3,"Burlington, VT: Burlington International"
4,"Buffalo, NY: Buffalo Niagara International"


#### Split `airport_name` into three components: city, state, airport

In [None]:
%%bigquery
select split(airport_name, ':')[0] as city_state, split(airport_name, ':')[1] as airport
from air_travel_raw.flight_delays
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,city_state,airport
0,"Albany, NY",Albany International
1,"Atlanta, GA",Hartsfield-Jackson Atlanta International
2,"Birmingham, AL",Birmingham-Shuttlesworth International
3,"Burlington, VT",Burlington International
4,"Buffalo, NY",Buffalo Niagara International


In [None]:
%%bigquery
select split(city_state, ',')[0] as city, split(city_state, ',')[1] as state, airport
from
  (select split(airport_name, ':')[0] as city_state, split(airport_name, ':')[1] as airport
  from air_travel_raw.flight_delays
  limit 5)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,city,state,airport
0,Albany,NY,Albany International
1,Atlanta,GA,Hartsfield-Jackson Atlanta International
2,Birmingham,AL,Birmingham-Shuttlesworth International
3,Burlington,VT,Burlington International
4,Buffalo,NY,Buffalo Niagara International


#### Putting it all together

In [16]:
%%bigquery
select event_month, carrier, carrier_name, airport_code,
  split(city_state, ',')[0] as airport_city, split(city_state, ',')[1] as airport_state,
  airport_name, * except (event_month, carrier, carrier_name, airport_code, airport_name, city_state)
from
  (select date(year, month, 01) as event_month, carrier, carrier_name, airport as airport_code,
  split(airport_name, ':')[0] as city_state, split(airport_name, ':')[1] as airport_name,
  safe_cast(arr_flights as INTEGER) as arr_total, safe_cast(arr_cancelled as INTEGER) as arr_cancelled,
  safe_cast(arr_diverted as INTEGER) as arr_diverted, safe_cast(arr_delay as INTEGER) as arr_delay_min,
  safe_cast(weather_delay as INTEGER) as weather_delay_min, safe_cast(nas_delay as INTEGER) as nas_delay_min,
  safe_cast(late_aircraft_delay as INTEGER) as late_aircraft_delay_min, _data_source, _load_time
  from air_travel_raw.flight_delays
  limit 5)

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,event_month,carrier,carrier_name,airport_code,airport_city,airport_state,airport_name,arr_total,arr_cancelled,arr_diverted,arr_delay_min,weather_delay_min,nas_delay_min,late_aircraft_delay_min,_data_source,_load_time
0,2024-04-01,9E,Endeavor Air Inc.,ALB,Albany,NY,Albany International,115,0,0,865,41,88,219,transtats,2024-08-25 15:45:31.691980+00:00
1,2024-04-01,9E,Endeavor Air Inc.,ATL,Atlanta,GA,Hartsfield-Jackson Atlanta International,2032,3,1,16233,1528,1745,7223,transtats,2024-08-25 15:45:31.691980+00:00
2,2024-04-01,9E,Endeavor Air Inc.,BHM,Birmingham,AL,Birmingham-Shuttlesworth International,95,1,0,898,55,130,610,transtats,2024-08-25 15:45:31.691980+00:00
3,2024-04-01,9E,Endeavor Air Inc.,BTV,Burlington,VT,Burlington International,142,5,1,1546,9,121,1260,transtats,2024-08-25 15:45:31.691980+00:00
4,2024-04-01,9E,Endeavor Air Inc.,BUF,Buffalo,NY,Buffalo Niagara International,193,1,1,668,60,197,184,transtats,2024-08-25 15:45:31.691980+00:00


#### Create staging table

In [17]:
%%bigquery
create or replace table air_travel_stg.flight_delays as
    select event_month, carrier, carrier_name, airport_code,
    split(city_state, ',')[0] as airport_city, split(city_state, ',')[1] as airport_state,
    airport_name, * except (event_month, carrier, carrier_name, airport_code, airport_name, city_state)
    from
        (select date(year, month, 01) as event_month, carrier, carrier_name, airport as airport_code,
          split(airport_name, ':')[0] as city_state, split(airport_name, ':')[1] as airport_name,
          safe_cast(arr_flights as INTEGER) as arr_total, safe_cast(arr_cancelled as INTEGER) as arr_cancelled,
          safe_cast(arr_diverted as INTEGER) as arr_diverted, safe_cast(arr_delay as INTEGER) as arr_delay_min,
          safe_cast(weather_delay as INTEGER) as weather_delay_min, safe_cast(nas_delay as INTEGER) as nas_delay_min,
          safe_cast(late_aircraft_delay as INTEGER) as late_aircraft_delay_min, _data_source, _load_time
          from air_travel_raw.flight_delays)

Query is running:   0%|          |

### `flight_routes` table

#### Profile `source_airport`, `source_airport_id`, `dest_airport`, `dest_airport_id`

In [None]:
%%bigquery
select count(*) as empty_source_airport
from air_travel_raw.flight_routes
where source_airport = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_source_airport
0,0


In [None]:
%%bigquery
select count(*) as empty_source_airport_id
from air_travel_raw.flight_routes
where source_airport_id = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_source_airport_id
0,220


In [None]:
%%bigquery
select count(*) as empty_dest_airport
from air_travel_raw.flight_routes
where dest_airport = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_dest_airport
0,0


In [None]:
%%bigquery
select count(*) as empty_dest_airport_id
from air_travel_raw.flight_routes
where dest_airport_id = '\\N'

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,empty_dest_airport_id
0,221


#### Cast airline_id, replace `'\\N'` with null and rename airport fields

In [5]:
%%bigquery
select airline_code,
  safe_cast(airline_id as INTEGER) as airline_id,
  source_airport as source_airport_code,
  case source_airport_id
    when '\\N' then null
    else safe_cast(source_airport_id as INTEGER)
    end as source_airport_id,
  dest_airport as dest_airport_code,
  case dest_airport_id
    when '\\N' then null
    else safe_cast(dest_airport_id as INTEGER)
    end as dest_airport_id,
  codeshare,
  stops,
  equipment,
  _data_source,
  _load_time
from air_travel_raw.flight_routes
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airline_code,airline_id,source_airport_code,source_airport_id,dest_airport_code,dest_airport_id,codeshare,stops,equipment,_data_source,_load_time
0,2B,410,AER,2965,KZN,2990,,0,CR2,openflights,2024-08-25 16:19:41.311112+00:00
1,2B,410,ASF,2966,KZN,2990,,0,CR2,openflights,2024-08-25 16:19:41.311112+00:00
2,2B,410,ASF,2966,MRV,2962,,0,CR2,openflights,2024-08-25 16:19:41.311112+00:00
3,2B,410,CEK,2968,KZN,2990,,0,CR2,openflights,2024-08-25 16:19:41.311112+00:00
4,2B,410,CEK,2968,OVB,4078,,0,CR2,openflights,2024-08-25 16:19:41.311112+00:00


#### Create staging table

In [6]:
%%bigquery
create or replace table air_travel_stg.flight_routes as
  select airline_code,
    safe_cast(airline_id as INTEGER) as airline_id,
    source_airport,
    case source_airport_id
      when '\\N' then null
      else safe_cast(source_airport_id as INTEGER)
      end as source_airport_id,
    dest_airport,
    case dest_airport_id
      when '\\N' then null
      else safe_cast(dest_airport_id as INTEGER)
      end as dest_airport_id,
    codeshare,
    stops,
    equipment,
    _data_source,
    _load_time
  from air_travel_raw.flight_routes

Query is running:   0%|          |

### `tsa_traffic` table

#### Cast `date` from STRING to DATE

In [None]:
%%bigquery
select date as orig_date, safe_cast(date as DATE) as new_date
from air_travel_raw.tsa_traffic
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orig_date,new_date
0,5/8/2024,NaT
1,7/24/2024,NaT
2,3/27/2024,NaT
3,8/17/2022,NaT
4,6/7/2023,NaT


In [None]:
%%bigquery
select date as orig_date, safe_cast(date as DATE format 'MM/DD/YYYY') as new_date
from air_travel_raw.tsa_traffic
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,orig_date,new_date
0,5/8/2024,2024-05-08
1,7/24/2024,2024-07-24
2,3/27/2024,2024-03-27
3,8/17/2022,2022-08-17
4,6/7/2023,2023-06-07


#### Putting it all together

In [None]:
%%bigquery
select safe_cast(date as DATE format 'MM/DD/YYYY') as event_date,
  safe_cast(hour as INTEGER) as event_hour,
  airport_code,
  airport_name,
  city as airport_city,
  state as airport_state,
  checkpoint as tsa_checkpoint,
  total_count as passenger_count
from air_travel_raw.tsa_traffic
limit 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,event_date,event_hour,airport_code,airport_name,airport_city,airport_state,tsa_checkpoint,passenger_count
0,2024-05-08,12,ADK,Adak,Adak,AK,Checkpoint1,23
1,2024-07-24,13,ADK,Adak,Adak,AK,Checkpoint1,1
2,2024-03-27,12,ADK,Adak,Adak,AK,Checkpoint1,7
3,2022-08-17,13,ADK,Adak Naval Air Station,Adak,AK,Checkpoint1,67
4,2023-06-07,12,ADK,Adak Naval Air Station,Adak,AK,Checkpoint1,3


#### Create staging table

In [None]:
%%bigquery
create or replace table air_travel_stg.tsa_traffic as
  select safe_cast(date as DATE format 'MM/DD/YYYY') as event_date,
    safe_cast(hour as INTEGER) as event_hour,
    airport_code,
    airport_name,
    city as airport_city,
    state as airport_state,
    checkpoint as tsa_checkpoint,
    total_count as passenger_count,
    _data_source,
    _load_time
  from air_travel_raw.tsa_traffic

Query is running:   0%|          |