
# Unit 2 — Team Classification (Flights, BQML)

**Goal (team):** Build an *ops-ready* classifier in **BigQuery ML** to predict **`diverted`** on U.S. flights. Minimal handholding by design.

**What you deliver (inside this notebook):**
- One **LOGISTIC_REG** model (baseline), one **engineered** model using `TRANSFORM`
- **Evaluation** via `ML.EVALUATE` and **confusion matrices** (default 0.5 + your custom threshold)
- **Threshold choice** + 3–5 sentence ops justification
- Embedded **rubric** below (self-check before submission)

> Choose *one* dataset table that exists at your institution:  
> • `bigquery-public-data.faa.us_flights` **or** `bigquery-public-data.flights.*`  
> Make sure the table has `carrier`, `dep_delay`, `arr_delay` (for filters), `origin`, `dest`, `diverted` (or equivalent).


In [2]:

# --- Minimal setup (edit 3 vars) ---
from google.colab import auth
auth.authenticate_user()

import os
from google.cloud import bigquery

PROJECT_ID = "mgmt467-lab"      # e.g., mgmt-467-47888
REGION     = "us-central1"
TABLE_PATH = "mgmt467-lab.assignment_2flights.assignment2_BQML"   # or your `bigquery-public-data.flights` table/view

os.environ["PROJECT_ID"] = PROJECT_ID
os.environ["REGION"]     = REGION
bq = bigquery.Client(project=PROJECT_ID)

print("BQ Project:", PROJECT_ID)
print("Source table:", TABLE_PATH)


BQ Project: mgmt467-lab
Source table: mgmt467-lab.assignment_2flights.assignment2_BQML


### Quick sanity check

In [3]:

preview_sql = f"SELECT * FROM `{TABLE_PATH}` LIMIT 5"
bq.query(preview_sql).result().to_dataframe()


Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,5,2,6,AS,64,N615AS,PSG,WRG,1530,...,1542,-10,0,0,,,,,,
1,2015,1,18,7,AS,64,N768AS,PSG,WRG,1524,...,1549,0,0,0,,,,,,
2,2015,11,15,7,AS,65,N706AS,WRG,PSG,1106,...,1136,7,0,0,,,,,,
3,2015,10,2,5,AS,64,N764AS,14256,15841,1523,...,1541,-4,0,0,,,,,,
4,2015,4,4,6,AS,65,N792AS,WRG,PSG,1101,...,1058,-23,0,0,,,,,,



## 1) Canonical mapping (adjust as needed)
Map to a minimal schema used in the rest of the notebook:
- `flight_date` (DATE), `dep_delay` (NUM), `distance` (NUM), `carrier` (STRING), `origin` (STRING), `dest` (STRING), `diverted` (BOOL)


In [4]:
# Adjust ONLY if your table uses different column names.
CANONICAL_BASE_SQL = f'''
WITH canonical_flights AS (
  SELECT
    PARSE_DATE('%Y-%m-%d', CONCAT(CAST(YEAR AS STRING), '-', CAST(MONTH AS STRING), '-', CAST(DAY AS STRING))) AS flight_date,
    CAST(DEPARTURE_DELAY AS FLOAT64) AS dep_delay,
    CAST(DISTANCE  AS FLOAT64) AS distance,
    CAST(AIRLINE   AS STRING)  AS carrier,
    CAST(ORIGIN_AIRPORT AS STRING)  AS origin,
    CAST(DESTINATION_AIRPORT AS STRING) AS dest,
    CAST((CASE WHEN SAFE_CAST(DIVERTED AS INT64)=1 OR LOWER(CAST(DIVERTED AS STRING))='true' THEN TRUE ELSE FALSE END) AS BOOL) AS diverted,
    DAY_OF_WEEK AS day_of_week_alias
  FROM `{TABLE_PATH}`
  WHERE DEPARTURE_DELAY IS NOT NULL
)
'''
print(CANONICAL_BASE_SQL[:600] + "\n...")


WITH canonical_flights AS (
  SELECT
    PARSE_DATE('%Y-%m-%d', CONCAT(CAST(YEAR AS STRING), '-', CAST(MONTH AS STRING), '-', CAST(DAY AS STRING))) AS flight_date,
    CAST(DEPARTURE_DELAY AS FLOAT64) AS dep_delay,
    CAST(DISTANCE  AS FLOAT64) AS distance,
    CAST(AIRLINE   AS STRING)  AS carrier,
    CAST(ORIGIN_AIRPORT AS STRING)  AS origin,
    CAST(DESTINATION_AIRPORT AS STRING) AS dest,
    CAST((CASE WHEN SAFE_CAST(DIVERTED AS INT64)=1 OR LOWER(CAST(DIVERTED AS STRING))='true' THEN TRUE ELSE FALSE END) AS BOOL) AS diverted,
    DAY_OF_WEEK AS day_of_week_alias
  FROM `mgmt467-lab.ass
...


### 2) Split (80/20)

In [5]:

SPLIT_CLAUSE = r'''
, split AS (
  SELECT cf.*,
         CASE WHEN RAND(12345) < 0.8 THEN 'TRAIN' ELSE 'EVAL' END AS split
  FROM canonical_flights cf
)
'''
print(SPLIT_CLAUSE)



, split AS (
  SELECT cf.*,
         CASE WHEN RAND(12345) < 0.8 THEN 'TRAIN' ELSE 'EVAL' END AS split
  FROM canonical_flights cf
)



In [6]:
# Step 1: Create canonical flights temporary table
CANONICAL_TEMP_TABLE = f"{PROJECT_ID}.unit2_flights.canonical_flights_temp"

# Extract the inner SELECT statement from CANONICAL_BASE_SQL
select_start = CANONICAL_BASE_SQL.find("AS (") + 4
select_end = CANONICAL_BASE_SQL.rfind(")")
inner_select_sql = CANONICAL_BASE_SQL[select_start:select_end].strip()


create_canonical_temp_sql = f'''
CREATE OR REPLACE TABLE `{CANONICAL_TEMP_TABLE}`
OPTIONS(
  expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR) -- Temporary table
) AS
{inner_select_sql}
;
'''

print("Creating temporary table for canonical flights:", CANONICAL_TEMP_TABLE)
job = bq.query(create_canonical_temp_sql); _ = job.result()
print("Temporary table created:", CANONICAL_TEMP_TABLE)

Creating temporary table for canonical flights: mgmt467-lab.unit2_flights.canonical_flights_temp
Temporary table created: mgmt467-lab.unit2_flights.canonical_flights_temp


In [7]:
# Step 2: Create split data temporary table
SPLIT_TEMP_TABLE = f"{PROJECT_ID}.unit2_flights.split_data_temp"

create_split_temp_sql = f'''
CREATE OR REPLACE TABLE `{SPLIT_TEMP_TABLE}`
OPTIONS(
  expiration_timestamp=TIMESTAMP_ADD(CURRENT_TIMESTAMP(), INTERVAL 1 HOUR) -- Temporary table
) AS
SELECT *,
       CASE WHEN MOD(ABS(FARM_FINGERPRINT(CONCAT(CAST(flight_date AS STRING), carrier, origin, dest))), 10) < 8 THEN 'TRAIN' ELSE 'EVAL' END AS split
FROM `{CANONICAL_TEMP_TABLE}`
;
'''

print("Creating temporary table for split data:", SPLIT_TEMP_TABLE)
job = bq.query(create_split_temp_sql); _ = job.result()
print("Temporary table created:", SPLIT_TEMP_TABLE)

Creating temporary table for split data: mgmt467-lab.unit2_flights.split_data_temp
Temporary table created: mgmt467-lab.unit2_flights.split_data_temp



## 3) Baseline model — LOGISTIC_REG (`diverted`)
Use **only** a small set of signals for the baseline (keep it honest).


In [8]:
MODEL_BASE = f"{PROJECT_ID}.unit2_flights.clf_diverted_base"

sql_baseline = f'''
CREATE SCHEMA IF NOT EXISTS `{PROJECT_ID}.unit2_flights`;

CREATE OR REPLACE MODEL `{MODEL_BASE}`
OPTIONS (MODEL_TYPE='LOGISTIC_REG', INPUT_LABEL_COLS=['diverted']) AS
SELECT
  diverted,
  dep_delay, distance, carrier, origin, dest,
  day_of_week_alias
FROM `{SPLIT_TEMP_TABLE}`
WHERE split='TRAIN'
;

SELECT * FROM ML.EVALUATE(
  MODEL `{MODEL_BASE}`,
  (
    SELECT
      diverted,
      dep_delay, distance, carrier, origin, dest,
      day_of_week_alias
    FROM `{SPLIT_TEMP_TABLE}`
    WHERE split='EVAL'
  )
);
'''
job = bq.query(sql_baseline); _ = job.result()
print("Baseline model trained:", MODEL_BASE)

Baseline model trained: mgmt467-lab.unit2_flights.clf_diverted_base


### Confusion matrix — default 0.5 threshold

In [9]:
cm_default_sql = f'''
WITH scored AS (
  SELECT
    t.diverted AS label,
    p.predicted_diverted AS pred_label,
    p.predicted_diverted_probs[OFFSET(0)].prob AS score,
    t.dep_delay, t.distance, t.carrier, t.origin, t.dest, t.day_of_week_alias -- Include join columns from t
  FROM `{SPLIT_TEMP_TABLE}` t
  JOIN ML.PREDICT(MODEL `{MODEL_BASE}`,
      (SELECT dep_delay, distance, carrier, origin, dest, day_of_week_alias FROM `{SPLIT_TEMP_TABLE}` WHERE split='EVAL')) AS p
  ON t.dep_delay = p.dep_delay
     AND t.distance = p.distance
     AND t.carrier = p.carrier
     AND t.origin = p.origin
     AND t.dest = p.dest
     AND t.day_of_week_alias = p.day_of_week_alias
  WHERE t.split='EVAL'
)
SELECT
  SUM(CASE WHEN label=TRUE  AND pred_label=TRUE  THEN 1 ELSE 0 END) AS TP,
  SUM(CASE WHEN label=FALSE AND pred_label=TRUE  THEN 1 ELSE 0 END) AS FP,
  SUM(CASE WHEN label=TRUE  AND pred_label=FALSE THEN 1 ELSE 0 END) AS FN,
  SUM(CASE WHEN label=FALSE AND pred_label=FALSE THEN 1 ELSE 0 END) AS TN
FROM scored;
'''
bq.query(cm_default_sql).result().to_dataframe()

Unnamed: 0,TP,FP,FN,TN
0,0,0,7517,3641333


### Confusion matrix — your custom threshold

In [10]:
CUSTOM_THRESHOLD = 0.75  # TODO: justify in ops terms

cm_thresh_sql = f'''
WITH scored AS (
  SELECT
    t.diverted AS label,
    CAST(p.predicted_diverted_probs[OFFSET(0)].prob >= {CUSTOM_THRESHOLD} AS BOOL) AS pred_label,
    p.predicted_diverted_probs[OFFSET(0)].prob AS score,
    t.dep_delay, t.distance, t.carrier, t.origin, t.dest, t.day_of_week_alias -- Include join columns from t
  FROM `{SPLIT_TEMP_TABLE}` t
  JOIN ML.PREDICT(MODEL `{MODEL_BASE}`,
      (SELECT dep_delay, distance, carrier, origin, dest, day_of_week_alias FROM `{SPLIT_TEMP_TABLE}` WHERE split='EVAL')) AS p
  ON t.dep_delay = p.dep_delay
     AND t.distance = p.distance
     AND t.carrier = p.carrier
     AND t.origin = p.origin
     AND t.dest = p.dest
     AND t.day_of_week_alias = p.day_of_week_alias
  WHERE t.split='EVAL'
)
SELECT
  SUM(CASE WHEN label=TRUE  AND pred_label=TRUE  THEN 1 ELSE 0 END) AS TP,
  SUM(CASE WHEN label=FALSE AND pred_label=TRUE  THEN 1 ELSE 0 END) AS FP,
  SUM(CASE WHEN label=TRUE  AND pred_label=FALSE THEN 1 ELSE 0 END) AS FN,
  SUM(CASE WHEN label=FALSE AND pred_label=FALSE THEN 1 ELSE 0 END) AS TN
FROM scored;
'''
bq.query(cm_thresh_sql).result().to_dataframe()

Unnamed: 0,TP,FP,FN,TN
0,0,0,7517,3641333



## 4) Engineered model — `TRANSFORM` (same label, stricter bar)
Create **route**, extract **day_of_week**, and **bucketize dep_delay**. Compare metrics to baseline.


In [11]:
MODEL_XFORM = f"{PROJECT_ID}.unit2_flights.clf_diverted_xform"

sql_xform = f'''
CREATE OR REPLACE MODEL `{MODEL_XFORM}`
TRANSFORM (
  CONCAT(origin, '-', dest) AS route,
  EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week,
  CASE
    WHEN dep_delay < -5  THEN 'early'
    WHEN dep_delay <=  5 THEN 'on_time'
    WHEN dep_delay <= 15 THEN 'minor'
    WHEN dep_delay <= 45 THEN 'moderate'
    ELSE 'major'
  END AS dep_delay_bucket,
  dep_delay, distance, carrier, origin, dest, diverted, day_of_week_alias -- Include original features and label
)
OPTIONS (MODEL_TYPE='LOGISTIC_REG', INPUT_LABEL_COLS=['diverted']) AS
SELECT * FROM `{SPLIT_TEMP_TABLE}` WHERE split='TRAIN'
;

SELECT 'baseline' AS model_version, * FROM ML.EVALUATE(
  MODEL `{MODEL_BASE}`,
  (SELECT
     dep_delay, distance, carrier, origin, dest, day_of_week_alias, diverted
   FROM `{SPLIT_TEMP_TABLE}` WHERE split='EVAL')
)
UNION ALL
SELECT 'engineered' AS model_version, * FROM ML.EVALUATE(
  MODEL `{MODEL_XFORM}`,
  (SELECT * FROM `{SPLIT_TEMP_TABLE}` WHERE split='EVAL')
);
'''
job = bq.query(sql_xform);
evaluation_results = job.result().to_dataframe()
print("Engineered model trained:", MODEL_XFORM)
display(evaluation_results)

Engineered model trained: mgmt467-lab.unit2_flights.clf_diverted_xform


Unnamed: 0,model_version,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,baseline,0.0,0.0,0.997386,0.0,0.017711,0.698338
1,engineered,0.211268,0.004998,0.99735,0.009766,0.017601,0.701851



### Write-up (concise)
- **Threshold chosen & ops rationale:**

    - For the customized shreshold, I stayed with 0.75 because the matrix did not change as the threshold changes(0.2 - 0.8) except for 0.1. Although it has approximately 3 less than the 0.75 one, but it trades with 11 more false positives which is more costly on the airline side. It may cost a big financial expenses on things like fuel dumps, landing fees, and passenger care which they don't even have to pay at the first place. The balance of FP and FN is also not even, so it is better to stick with threshold 0.75.
- **Baseline vs engineered — observed changes in AUC/precision/recall:**

    - From the comparison of both model, we can see that the engineered did perform a slightly better compare to the baseline model. Although both has a very high accuracy score, but with a very imbalanced data it can be very misleading. By looking at the precision and recall of the engineered model, we can see that it prediction of diverted flight are about 21% correct, and it correctly identify 0.5% of actual diverted flights. With f1 score, log_loss, and ROC are all slightly better than the baseline model, we can be sure that it is a better version compare to the baseline model. But it left a hugh growing space for other new models, and the engineered model may not be significantly important.
- **Risk framing:** cost of FP vs FN for diversion planning; what is your acceptable FN-rate?

    - For diversion planning, false positive will cost some unnecessary expenses of fuel disposal,landing fees, and passenger care if the divert flight did not happen. For false negative, it will rely solely on the pilot and if the airport they ask to divert has an empty space for the plane to land. So, in this case the FN rate should be as low as possible and at least under 10-15 percent.



---

## Rubric (Flights, 100 pts)
**Team-only deliverable in this notebook**

- Baseline LOGISTIC_REG + evaluation (AUC + confusion @0.5) — **20**  
- Custom threshold confusion matrix + ops justification — **20**  
- Engineered model with `TRANSFORM` (route, DOW, delay bucket) — **20**  
- Comparison table (baseline vs engineered) + 3–5 sentence interpretation — **20**  
- Reproducibility: parameters clear, no hidden magic; schema mapping documented — **10**  
- Governance notes: assumptions/limitations + slices you would monitor — **10**

> **Strictness:** No screenshots; use actual results cells. Keep explanations concise (bullet points OK).


# Code For Model C

In [12]:
MODEL_LOCALIZED = f"{PROJECT_ID}.unit2_flights.clf_diverted_localized"

sql_localized = f'''
CREATE OR REPLACE MODEL `{MODEL_LOCALIZED}`
TRANSFORM (
  CONCAT(origin, '-', dest) AS route,
  EXTRACT(DAYOFWEEK FROM flight_date) AS day_of_week,
  CASE
    WHEN dep_delay < -5  THEN 'early'
    WHEN dep_delay <=  5 THEN 'on_time'
    WHEN dep_delay <= 15 THEN 'minor'
    WHEN dep_delay <= 45 THEN 'moderate'
    ELSE 'major'
  END AS dep_delay_bucket,
  dep_delay, distance, carrier, origin, dest, diverted, day_of_week_alias -- Include original features and label
)
OPTIONS (MODEL_TYPE='LOGISTIC_REG', INPUT_LABEL_COLS=['diverted']) AS
SELECT * FROM `{SPLIT_TEMP_TABLE}`
WHERE split='TRAIN'
  AND origin IN ('ATL', 'ORD', 'JFK')
;

SELECT 'localized' AS model_version, * FROM ML.EVALUATE(
  MODEL `{MODEL_LOCALIZED}`,
  (SELECT * FROM `{SPLIT_TEMP_TABLE}`
   WHERE split='EVAL' AND origin IN ('ATL', 'ORD', 'JFK'))
);
'''
print("Creating and evaluating localized model:", MODEL_LOCALIZED)
job = bq.query(sql_localized);
localized_evaluation_results = job.result().to_dataframe()
print("Localized model trained and evaluated:", MODEL_LOCALIZED)
display(localized_evaluation_results)

Creating and evaluating localized model: mgmt467-lab.unit2_flights.clf_diverted_localized
Localized model trained and evaluated: mgmt467-lab.unit2_flights.clf_diverted_localized


Unnamed: 0,model_version,precision,recall,accuracy,f1_score,log_loss,roc_auc
0,localized,1.0,0.00627,0.997778,0.012461,0.015474,0.663949


In [13]:
cm_localized_default_sql = f'''
SELECT
  *
FROM
  ML.CONFUSION_MATRIX(MODEL `{MODEL_LOCALIZED}`,
    (SELECT * FROM `{SPLIT_TEMP_TABLE}`
     WHERE split='EVAL' AND origin IN ('ATL', 'ORD', 'JFK')))
;
'''
print("Confusion matrix for localized model at default 0.5 threshold:")
job = bq.query(cm_localized_default_sql);
localized_cm_default = job.result().to_dataframe()
display(localized_cm_default)

Confusion matrix for localized model at default 0.5 threshold:


Unnamed: 0,expected_label,FALSE,TRUE
0,False,142346,0
1,True,317,2
