### Load change records into the raw layer of the Air Travel warehouse


##### Don't run this notebook before you have:
*   copied your raw tables to the `inc_` dataset
*   run `orchestrate_first.sh` to create and populate the snapshot and model tables with the initial data


##### SDK documentation links (in case you need to modify the common function):
*   [BQ Client](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.client.Client)
*   [LoadJobConfig](https://cloud.google.com/python/docs/reference/bigquery/latest/google.cloud.bigquery.job.LoadJobConfig)


#### Common functions

In [7]:
from google.cloud import bigquery

# Shared settings read by load_table_from_csv() below.
project_id = "cs378-fa2024"      # project that owns the destination dataset
bucket = "air-travel-data"       # Cloud Storage bucket holding the source files
parent_folder = "incrementals"   # top-level folder inside the bucket
region = "us-central1"           # NOTE(review): not referenced by the code in this section
dataset = "inc_air_travel_raw" # be sure to add the "inc_" prefix

# One client shared by every load in this notebook.
# NOTE(review): created with no arguments, so it presumably picks up the
# project and credentials from the environment - confirm for your setup.
bq_client = bigquery.Client()

def load_table_from_csv(folder, file_name, table, schema, delimiter=",", quote_character="\""):
  """Append a delimited file from Cloud Storage to an existing BigQuery table.

  The source is read from gs://{bucket}/{parent_folder}/{folder}/{file_name}
  and loaded into {project_id}.{dataset}.{table}, all built from the
  notebook-level settings. The call blocks until the load job finishes and
  then prints the row count of the destination table.

  Args:
    folder: Sub-folder under `parent_folder` that contains the file.
    file_name: Name of the file to load.
    table: Destination table name (without project or dataset).
    schema: List of bigquery.SchemaField describing the file's columns.
    delimiter: Field separator used in the file.
    quote_character: Character used to quote field values in the file.
  """
  source_uri = f"gs://{bucket}/{parent_folder}/{folder}/{file_name}"
  destination_id = f"{project_id}.{dataset}.{table}"

  load_options = {
    "schema": schema,
    "source_format": bigquery.SourceFormat.CSV,
    # The first row of the file is a header, not data.
    "skip_leading_rows": 1,
    "field_delimiter": delimiter,
    "quote_character": quote_character,
    # The table must already exist; new rows are added to what is there.
    "create_disposition": bigquery.CreateDisposition.CREATE_NEVER,
    "write_disposition": bigquery.WriteDisposition.WRITE_APPEND,
    # Be lenient about rows with missing trailing columns or extra values.
    "allow_jagged_rows": True,
    "ignore_unknown_values": True,
  }
  job = bq_client.load_table_from_uri(
    source_uri,
    destination_id,
    job_config=bigquery.LoadJobConfig(**load_options),
  )
  job.result()  # wait for the load job to finish before reading the row count

  loaded_table = bq_client.get_table(destination_id)
  print("Table has {} rows after load.".format(loaded_table.num_rows))

#### Load `airports`
##### Removed the `_load_time` and `_data_source` fields from the schema below because we are not creating a new table, just loading into **an** existing one

In [8]:
# Source file and destination table for the airports change records.
folder, file_name, table = "openflights", "airports_ext.csv", "airports"
delimiter = ","

# (column name, BigQuery type) pairs in file order; every column is REQUIRED.
schema = [
  bigquery.SchemaField(column, column_type, mode="REQUIRED")
  for column, column_type in (
    ("airport_id", "STRING"),
    ("airport_name", "STRING"),
    ("city", "STRING"),
    ("country", "STRING"),
    ("iata", "STRING"),
    ("icao", "STRING"),
    ("latitude", "BIGNUMERIC"),
    ("longitude", "BIGNUMERIC"),
    ("altitude", "INTEGER"),
    ("timezone", "STRING"),
    ("daylight_savings_time", "STRING"),
    ("tz_database_timezone", "STRING"),
    ("type", "STRING"),
    ("source", "STRING"),
  )
]

# The default quote character of load_table_from_csv is used here.
load_table_from_csv(folder, file_name, table, schema, delimiter=delimiter)

Table has 25336 rows after load.


#### Load `airport_reviews`

In [9]:
# Source file and destination table for the airport review change records.
folder, file_name, table = "our-airports", "airport-comments.tsv", "airport_reviews"
# The file is tab-separated and uses single quotes around quoted values.
delimiter, quote_character = "\t", "'"

# (column name, BigQuery type, mode) triples in file order; only `id` is REQUIRED.
schema = [
  bigquery.SchemaField(column, column_type, mode=column_mode)
  for column, column_type, column_mode in (
    ("id", "INTEGER", "REQUIRED"),
    ("threadRef", "INTEGER", "NULLABLE"),
    ("airportRef", "INTEGER", "NULLABLE"),
    ("airportIdent", "STRING", "NULLABLE"),
    ("date", "DATETIME", "NULLABLE"),
    ("memberNickname", "STRING", "NULLABLE"),
    ("subject", "STRING", "NULLABLE"),
    ("body", "STRING", "NULLABLE"),
  )
]

load_table_from_csv(
  folder,
  file_name,
  table,
  schema,
  delimiter=delimiter,
  quote_character=quote_character,
)

Table has 30942 rows after load.
