# Data Ingest Notebook

We're using the 'us' region to store our BQ tables so that we can make use of the `%%bigquery` built-in magic when querying the tables.

In [2]:
from google.cloud import storage
from google.cloud import bigquery

# API clients, both constructed with no explicit arguments.
bq_client = bigquery.Client()
storage_client = storage.Client()

# Source: files are read from gs://<bucket_name>/<folder_name>/<file_name>.
bucket_name = "cs329e-open-access"
folder_name = "initial_load"

# Destination: tables are written to <project_id>.<dataset_name>.<table_name>.
project_id = "cs329e-sp2024"
dataset_name = "airline_raw"

# Location passed to query jobs issued through bq_client.
region = "us"

In the next sections, we load the CSV files into BQ tables. Each section loads a different file.

Refer to the [SDK documentation](https://cloud.google.com/python/docs/reference/bigquery/latest/index.html) for details on `LoadJobConfig`.

In [9]:
def create_load_table(file_name, table_name, schema, delimiter=","):
  """Create a BigQuery table and load one delimited file from GCS into it.

  The table <project_id>.<dataset_name>.<table_name> is created with the full
  `schema` (an existing table is left in place), then the file at
  gs://<bucket_name>/<folder_name>/<file_name> is loaded into it with
  WRITE_TRUNCATE. The function blocks until the load job finishes and prints
  the resulting row count.

  Args:
    file_name: name of the source file inside the bucket folder.
    table_name: name of the destination table inside the dataset.
    schema: list of bigquery.SchemaField describing the table. The LAST entry
      is expected to be the auto-generated load_time column; it is left out
      of the load job. The list itself is not modified.
    delimiter: field separator used by the source file (default ",").
  """
  uri = f"gs://{bucket_name}/{folder_name}/{file_name}"
  table_id = f"{project_id}.{dataset_name}.{table_name}"

  table = bq_client.create_table(
      bigquery.Table(table_id, schema=schema), exists_ok=True
  )
  print(f"Created table {table.table_id}")

  # Leave the trailing load_time field out of the load schema; its value is
  # auto-generated. Build a new list rather than deleting from `schema`, so
  # the caller's list stays intact and the function is safe to call again
  # with the same schema object.
  load_schema = list(schema[:-1])

  job_config = bigquery.LoadJobConfig(
      schema=load_schema,
      skip_leading_rows=1,  # the first line of the file is a header row
      source_format=bigquery.SourceFormat.CSV,
      write_disposition="WRITE_TRUNCATE",
      field_delimiter=delimiter,
  )

  load_job = bq_client.load_table_from_uri(uri, table_id, job_config=job_config)
  load_job.result()  # wait for the load to complete (raises if the job failed)

  destination_table = bq_client.get_table(table_id)
  print(f"Loaded {destination_table.num_rows} rows.")


# air_carriers

In [10]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'air_carriers.csv'
table_name = 'air_carriers'

# Columns supplied by the file, as (name, type, mode).
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("code", "INTEGER", "REQUIRED"),
    ("description", "STRING", "REQUIRED"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table air_carriers
Loaded 1656 rows.


# bird_airports

In [11]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'bird_airports.csv'
table_name = 'bird_airports'

# Columns supplied by the file, as (name, type, mode).
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("code", "STRING", "NULLABLE"),
    ("description", "STRING", "NULLABLE"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table bird_airports
Loaded 6510 rows.


# faker_airports

In [12]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'faker_airports.csv'
table_name = 'faker_airports'
# This file is pipe-separated rather than comma-separated.
delimiter = "|"

# Columns supplied by the file, as (name, type, mode).
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("airport", "STRING", "REQUIRED"),
    ("iata", "STRING", "REQUIRED"),
    ("icao", "STRING", "NULLABLE"),
    ("city", "STRING", "NULLABLE"),
    ("state", "STRING", "NULLABLE"),
    ("country", "STRING", "NULLABLE"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema, delimiter)

Created table faker_airports
Loaded 397 rows.


# airlines

In [13]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'airlines.csv'
table_name = 'airlines'

# Columns supplied by the file, as (name, type, mode).
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("fl_date", "STRING", "REQUIRED"),
    ("op_carrier_airline_id", "INTEGER", "REQUIRED"),
    ("tail_num", "STRING", "NULLABLE"),
    ("op_carrier_fl_num", "INTEGER", "NULLABLE"),
    ("origin_airport_id", "INTEGER", "NULLABLE"),
    ("origin_airport_seq_id", "INTEGER", "NULLABLE"),
    ("origin_city_market_id", "INTEGER", "NULLABLE"),
    ("origin", "STRING", "NULLABLE"),
    ("dest_airport_id", "INTEGER", "NULLABLE"),
    ("dest_airport_seq_id", "INTEGER", "NULLABLE"),
    ("dest_city_market_id", "INTEGER", "NULLABLE"),
    ("dest", "STRING", "NULLABLE"),
    ("crs_dep_time", "INTEGER", "NULLABLE"),
    ("dep_time", "INTEGER", "NULLABLE"),
    ("dep_delay", "INTEGER", "NULLABLE"),
    ("dep_delay_new", "INTEGER", "NULLABLE"),
    ("arr_time", "INTEGER", "NULLABLE"),
    ("arr_delay", "INTEGER", "NULLABLE"),
    ("arr_delay_new", "INTEGER", "NULLABLE"),
    ("cancelled", "INTEGER", "NULLABLE"),
    ("cancellation_code", "STRING", "NULLABLE"),
    ("crs_elapsed_time", "INTEGER", "NULLABLE"),
    ("actual_elapsed_time", "INTEGER", "NULLABLE"),
    ("carrier_delay", "INTEGER", "NULLABLE"),
    ("weather_delay", "INTEGER", "NULLABLE"),
    ("nas_delay", "INTEGER", "NULLABLE"),
    ("security_delay", "INTEGER", "NULLABLE"),
    ("late_aircraft_delay", "INTEGER", "NULLABLE"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table airlines
Loaded 701352 rows.


# meals

In [14]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'meals.csv'
table_name = 'meals'
# This file is pipe-separated rather than comma-separated.
delimiter = '|'

# Columns supplied by the file, as (name, type, mode).
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("meal_id", "INTEGER", "REQUIRED"),
    ("meal_name", "STRING", "REQUIRED"),
    ("meal_image", "STRING", "NULLABLE"),
    ("cat_name", "STRING", "NULLABLE"),
    ("tags", "STRING", "NULLABLE"),
    ("area", "STRING", "NULLABLE"),
    ("ingredient1", "STRING", "NULLABLE"),
    ("ingredient2", "STRING", "NULLABLE"),
    ("ingredient3", "STRING", "NULLABLE"),
    ("ingredient4", "STRING", "NULLABLE"),
    ("ingredient5", "STRING", "NULLABLE"),
    ("source", "STRING", "NULLABLE"),
    ("youtube", "STRING", "NULLABLE"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema, delimiter)

Created table meals
Loaded 3322 rows.


# snacks

In [15]:
# Source file in the bucket folder and the table it is loaded into.
file_name = 'snacks.csv'
table_name = 'snacks'

# Columns supplied by the file, as (name, type, mode).
# NOTE(review): `code` is loaded as FLOAT; if these values are product
# barcodes, a float column may not preserve every digit -- confirm the type.
schema = [
  bigquery.SchemaField(col_name, col_type, mode=col_mode)
  for col_name, col_type, col_mode in (
    ("code", "FLOAT", "REQUIRED"),
    ("url", "STRING", "NULLABLE"),
    ("product_name", "STRING", "NULLABLE"),
    ("brands", "STRING", "NULLABLE"),
    ("categories", "STRING", "NULLABLE"),
    ("countries_en", "STRING", "NULLABLE"),
    ("ingredients_text", "STRING", "NULLABLE"),
    ("image_url", "STRING", "NULLABLE"),
  )
]
# load_time is filled by its default expression. It must stay the last
# field: create_load_table leaves the final field out of the load job.
schema.append(
  bigquery.SchemaField(
    "load_time",
    "TIMESTAMP",
    mode="REQUIRED",
    default_value_expression="CURRENT_TIMESTAMP",
  )
)

create_load_table(file_name, table_name, schema)

Created table snacks
Loaded 226831 rows.


# Verify loads

In [18]:
# List every table in the dataset, in name order, to confirm that each load
# cell above produced a table.
sql = "select table_name from {}.INFORMATION_SCHEMA.TABLES order by table_name".format(dataset_name)

query_job = bq_client.query(
    sql,
    location=region,
)

# result() blocks until the query has finished and returns the row iterator.
results = query_job.result()

# Iterate the fetched rows and read the column by name. A dedicated loop
# variable keeps the notebook-level `table_name` from being overwritten.
for row in results:
    print("table:", row["table_name"])

table: air_carriers
table: airlines
table: bird_airports
table: faker_airports
table: meals
table: snacks


In [20]:
%%bigquery
-- Preview a few rows of the loaded table (no ORDER BY, so the rows returned are arbitrary).
SELECT *
FROM airline_raw.air_carriers
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,description,load_time
0,19031,Mackey International Inc.: MAC,2024-01-26 17:03:56.222532+00:00
1,19032,Munz Northern Airlines Inc.: XY,2024-01-26 17:03:56.222532+00:00
2,19033,Cochise Airlines Inc.: COC,2024-01-26 17:03:56.222532+00:00
3,19034,Golden Gate Airlines Inc.: GSA,2024-01-26 17:03:56.222532+00:00
4,19035,Aeromech Inc.: RZZ,2024-01-26 17:03:56.222532+00:00


In [21]:
%%bigquery
-- Preview a few rows of the loaded table (no ORDER BY, so the rows returned are arbitrary).
SELECT *
FROM airline_raw.airlines
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,fl_date,op_carrier_airline_id,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,origin_city_market_id,origin,dest_airport_id,dest_airport_seq_id,...,cancelled,cancellation_code,crs_elapsed_time,actual_elapsed_time,carrier_delay,weather_delay,nas_delay,security_delay,late_aircraft_delay,load_time
0,2018/8/1,20368,247NV,1682,14082,1408202,34082,PGD,10135,1013505,...,0,,155,153,0.0,41.0,0.0,0.0,0.0,2024-01-26 17:05:18.456648+00:00
1,2018/8/3,20368,257NV,1682,14082,1408202,34082,PGD,10135,1013505,...,0,,155,159,,,,,,2024-01-26 17:05:18.456648+00:00
2,2018/8/6,20368,258NV,1682,14082,1408202,34082,PGD,10135,1013505,...,0,,155,156,,,,,,2024-01-26 17:05:18.456648+00:00
3,2018/8/8,20368,258NV,1682,14082,1408202,34082,PGD,10135,1013505,...,0,,155,155,,,,,,2024-01-26 17:05:18.456648+00:00
4,2018/8/10,20368,260NV,1682,14082,1408202,34082,PGD,10135,1013505,...,0,,155,153,,,,,,2024-01-26 17:05:18.456648+00:00


In [22]:
%%bigquery
-- Preview a few rows of the loaded table (no ORDER BY, so the rows returned are arbitrary).
SELECT *
FROM airline_raw.bird_airports
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,description,load_time
0,CBK,"Colby, KS: Murray",2024-01-26 17:04:04.138900+00:00
1,ANP,"Annapolis, MD: Lee",2024-01-26 17:04:04.138900+00:00
2,KCA,"Kuqa, China: Kuche",2024-01-26 17:04:04.138900+00:00
3,MOL,"Molde, Norway: Aro",2024-01-26 17:04:04.138900+00:00
4,OIC,"Norwich, NY: Eaton",2024-01-26 17:04:04.138900+00:00


In [23]:
%%bigquery
-- Preview a few rows of the loaded table (no ORDER BY, so the rows returned are arbitrary).
SELECT *
FROM airline_raw.faker_airports
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,airport,iata,icao,city,state,country,load_time
0,Tarapoto airport,TPP,,Tarapoto,San Martin,Peru,2024-01-26 17:04:45.652290+00:00
1,El Loa airport,CJC,,Calama,Antofagasta,Chile,2024-01-26 17:04:45.652290+00:00
2,La Florida airport,LSC,,Compañía Alta,Coquimbo,Chile,2024-01-26 17:04:45.652290+00:00
3,Hefei-Luogang airport,HFE,,Hefei,Anhui,China,2024-01-26 17:04:45.652290+00:00
4,Guizhou,KWE,,Guiyang,Guizhou,China,2024-01-26 17:04:45.652290+00:00


In [24]:
%%bigquery
-- Preview a few rows, leaving out the long URL columns so the output stays readable.
-- There is no ORDER BY, so the rows returned are arbitrary.
SELECT * EXCEPT (meal_image, source, youtube)
FROM airline_raw.meals
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,meal_id,meal_name,cat_name,tags,area,ingredient1,ingredient2,ingredient3,ingredient4,ingredient5,load_time
0,52968,Mbuzi Choma (Roasted Goat),Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,2024-01-26 17:06:07.349652+00:00
1,52968,Mbuzi Choma (Roasted Goat),Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,2024-01-26 17:06:07.349652+00:00
2,52968,Mbuzi Choma (Roasted Goat),Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,2024-01-26 17:06:07.349652+00:00
3,52968,Mbuzi Choma (Roasted Goat),Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,2024-01-26 17:06:07.349652+00:00
4,52968,Mbuzi Choma (Roasted Goat),Goat,"BBQ,Meat",Kenyan,Goat Meat,Corn Flour,Tomatoes,Salt,Onion,2024-01-26 17:06:07.349652+00:00


In [25]:
%%bigquery
-- Preview a few rows, leaving out the long URL columns so the output stays readable.
-- There is no ORDER BY, so the rows returned are arbitrary.
SELECT * EXCEPT (url, image_url)
FROM airline_raw.snacks
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,code,product_name,brands,categories,countries_en,ingredients_text,load_time
0,8606014000000.0,,,snacks,,idk,2024-01-26 17:06:44.361891+00:00
1,7311041000000.0,,,snacks,,"Hvetemel, salt 3,6 %, rapsolje, surhetsreguler...",2024-01-26 17:06:44.361891+00:00
2,4901561000000.0,焼するめ,,"en:Snacks, en:Salty snacks",,いか (中国)、砂糖、食塩、乳糖/ソルビット、調味料(ア ミノ酸等)、酸味料、リン酸塩(Na...,2024-01-26 17:06:44.361891+00:00
3,5000160000000.0,Peanut Chocolate Treat Bag,,"Snacks, en:Confiseries, en:Snacks sucrés",,R zoz 82 20:00 ¥2170/20 000159 504355&quot;&gt;,2024-01-26 17:06:44.361891+00:00
4,4009901000000.0,Minis,,"Snacks, Sweet snacks, Confectioneries, Candies",,"sugar, glucose syrup, palm fat, acid citric ac...",2024-01-26 17:06:44.361891+00:00
