<a href="https://colab.research.google.com/github/ethandlouiee/MGMT467_Team11/blob/main/team/Final_Project/MGMT467_FinalProject_Batch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Step 1: Setup & Authentication
# Install required libraries for Google Cloud and Kaggle
!pip install -q kaggle google-cloud-bigquery google-cloud-storage pandas db-dtypes

# Authenticate User for Google Cloud access
# This will trigger a popup to allow access to your GCP resources
from google.colab import auth
auth.authenticate_user()

print("Libraries installed and Google Cloud successfully authenticated!")

Libraries installed and Google Cloud successfully authenticated!


In [3]:
# @title Step 2: Configuration & Kaggle JSON Upload
import os
import json
from google.colab import files
from google.cloud import bigquery
from google.cloud import storage

# --- Google Cloud Config ---
# Colab form fields: change these two values to target another project/region.
project_id = "mgmt-467-nh" # @param {type:"string"}
region = "us-central1" # @param {type:"string"}

# Publish the project id through the environment before any client is built.
os.environ.update({"GOOGLE_CLOUD_PROJECT": project_id})

# Resource names shared by the cells below.
# The project id suffix is intended to keep the bucket name unique.
table_name = "sensor_data"
dataset_name = "air_quality_dataset"
bucket_name = f"air_quality_raw_{project_id}"

# One client per service, both bound explicitly to the configured project.
storage_client = storage.Client(project=project_id)
bq_client = bigquery.Client(project=project_id)

# Echo the effective configuration so the reader can verify it at a glance.
print(
    f"Configuration set for Project: {project_id}",
    f"Target Bucket: {bucket_name}",
    "-------------------------------------------------",
    sep="\n",
)

# --- Kaggle Authentication ---
# Installs the uploaded API key at ~/.kaggle/kaggle.json with owner-only
# permissions (required by the Kaggle API).
print("Please upload your kaggle.json file now...")
uploaded = files.upload()  # maps each uploaded file name to its contents

# Stop here if the key is missing: continuing would only make the Kaggle
# download in the next step fail with a less obvious error.
if 'kaggle.json' not in uploaded:
    raise FileNotFoundError(
        "kaggle.json not found. Please run the cell again and upload the correct file."
    )

kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_path = os.path.join(kaggle_dir, "kaggle.json")

# Create the file as 0600 from the start so the key is never readable by
# other users, not even briefly between creation and chmod.
kaggle_fd = os.open(kaggle_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(kaggle_fd, "wb") as kaggle_file:
    kaggle_file.write(uploaded['kaggle.json'])
# A pre-existing file keeps its old mode when truncated, so enforce it again.
os.chmod(kaggle_path, 0o600)

# The upload widget also saved a copy in the working directory; remove it so
# the credential lives in exactly one place.
if os.path.exists("kaggle.json"):
    os.remove("kaggle.json")

print("\nSUCCESS: kaggle.json uploaded and permissions set.")

Configuration set for Project: mgmt-467-nh
Target Bucket: air_quality_raw_mgmt-467-nh
-------------------------------------------------
Please upload your kaggle.json file now...


Saving kaggle.json to kaggle.json

SUCCESS: kaggle.json uploaded and permissions set.


In [4]:
# @title Step 3: Ingest from Kaggle to GCS (Raw Data Lake)
from google.cloud import storage
import os

# 1. Download Dataset from Kaggle
print("Downloading data from Kaggle...")
!kaggle datasets download -d fedesoriano/air-quality-data-set

# 2. Unzip the file
print("Unzipping data...")
!unzip -o air-quality-data-set.zip

# Find the .csv file name (it varies sometimes)
files = [f for f in os.listdir('.') if f.endswith('.csv')]
if not files:
    raise ValueError("No CSV file found in the downloaded dataset!")
source_file_name = files[0]
print(f"Found raw file: {source_file_name}")

# 3. Create GCS Bucket (if it doesn't exist)
# Start from a local handle, then either reuse the bucket or create it in the
# configured region.
bucket = storage_client.bucket(bucket_name)
if bucket.exists():
    print(f"Bucket {bucket_name} already exists.")
else:
    print(f"Creating bucket {bucket_name}...")
    bucket = storage_client.create_bucket(bucket_name, location=region)

# 4. Upload File to GCS
# The object keeps the local file name, so the raw copy is easy to trace back.
blob_name = source_file_name
blob = bucket.blob(blob_name)

print(f"Uploading {source_file_name} to gs://{bucket_name}/{blob_name}...")
blob.upload_from_filename(source_file_name)

print("Success! Raw data is now stored in Google Cloud Storage.")


Downloading data from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/fedesoriano/air-quality-data-set
License(s): copyright-authors
Downloading air-quality-data-set.zip to /content
  0% 0.00/248k [00:00<?, ?B/s]
100% 248k/248k [00:00<00:00, 518MB/s]
Unzipping data...
Archive:  air-quality-data-set.zip
  inflating: AirQuality.csv          
Found raw file: AirQuality.csv
Creating bucket air_quality_raw_mgmt-467-nh...
Uploading AirQuality.csv to gs://air_quality_raw_mgmt-467-nh/AirQuality.csv...
Success! Raw data is now stored in Google Cloud Storage.


In [6]:
# @title Step 4 (Retry): Curated Load to BigQuery (Schema & Partitioning)
import pandas as pd
import re
from google.cloud import bigquery

# 1. Read Raw Data from GCS
source_uri = f"gs://{bucket_name}/{source_file_name}"
print(f"Reading raw data from {source_uri}...")
# The file is parsed as semicolon-delimited with a comma as the decimal mark.
df = pd.read_csv(source_uri, sep=';', decimal=',')

# 2. Data Curation
# Drop empty columns AND empty rows (artifacts). Rows with no values at all
# would otherwise get a null Date and be loaded as all-NULL records.
df = (
    df.dropna(axis=1, how='all')
      .dropna(axis=0, how='all')
      .reset_index(drop=True)
)
# NOTE(review): the upstream dataset description reportedly marks missing
# sensor readings with -200; confirm, and decide whether to null them here
# or downstream.

# Fix Date Format for Partitioning (DD/MM/YYYY -> YYYY-MM-DD)
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y').dt.date

# --- FIX: Robust Column Renaming ---
# Function to clean column names for BigQuery (only Alphanumeric and _)
def clean_col_name(name):
    """Return ``name`` rewritten as a BigQuery-safe column identifier.

    Each character outside ``[a-zA-Z0-9]`` is replaced one-for-one by ``_``
    (e.g. ``"CO(GT)"`` becomes ``"CO_GT_"``). If the result is empty or starts
    with a digit, a leading ``_`` is added, because standard BigQuery column
    names must begin with a letter or an underscore.

    Args:
        name: The original column label. Non-string labels (for example
            integer headers) are converted with ``str()`` first.

    Returns:
        The sanitized column name as a ``str``.
    """
    # Replace non-alphanumeric characters (like . or () ) with _
    clean = re.sub(r'[^a-zA-Z0-9]', '_', str(name))
    # After the substitution only ASCII letters, digits and "_" remain, so
    # checking the first character is enough to enforce the start rule.
    if not clean or clean[0].isdigit():
        clean = f"_{clean}"
    return clean

# Run every header through the sanitizer and install the results as the
# frame's new column labels.
df.columns = list(map(clean_col_name, df.columns))

print("Cleaned Column Names:", df.columns.tolist())

# 3. Define BigQuery Schema & Partitioning
# Every measurement column is declared FLOAT. The names are the sanitized
# headers, listed in the order they appear in the frame.
sensor_float_columns = [
    "CO_GT_",
    "PT08_S1_CO_",
    "NMHC_GT_",
    "C6H6_GT_",
    "PT08_S2_NMHC_",
    "NOx_GT_",
    "PT08_S3_NOx_",
    "NO2_GT_",
    "PT08_S4_NO2_",
    "PT08_S5_O3_",
    "T",
    "RH",
    "AH",
]

job_config = bigquery.LoadJobConfig(
    schema=[
        bigquery.SchemaField("Date", "DATE"),
        bigquery.SchemaField("Time", "STRING"),
        *[bigquery.SchemaField(column, "FLOAT") for column in sensor_float_columns],
    ],
    # --- PARTITIONING ---
    # Daily partitions keyed on the Date column.
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY,
        field="Date",
    ),
    # Overwrite the table on each load so re-running the cell does not
    # append duplicate rows.
    write_disposition="WRITE_TRUNCATE",
)

# 4. Load to BigQuery
from google.api_core.exceptions import NotFound  # local import: only needed here

# Build the references directly; Client.dataset() is deprecated.
dataset_ref = bigquery.DatasetReference(project_id, dataset_name)
table_ref = dataset_ref.table(table_name)

# Ensure dataset exists.
# Only "not found" triggers creation. Any other failure (authentication,
# permissions, network) propagates instead of being mistaken for a missing
# dataset.
try:
    bq_client.get_dataset(dataset_ref)
except NotFound:
    bq_client.create_dataset(dataset_ref)

print(f"Loading data into {project_id}.{dataset_name}.{table_name}...")
job = bq_client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()  # block until the load job finishes; raises if the job failed

print("Success! Data loaded with Cleaned Schema and Partitioning.")


Reading raw data from gs://air_quality_raw_mgmt-467-nh/AirQuality.csv...
Cleaned Column Names: ['Date', 'Time', 'CO_GT_', 'PT08_S1_CO_', 'NMHC_GT_', 'C6H6_GT_', 'PT08_S2_NMHC_', 'NOx_GT_', 'PT08_S3_NOx_', 'NO2_GT_', 'PT08_S4_NO2_', 'PT08_S5_O3_', 'T', 'RH', 'AH']
Loading data into mgmt-467-nh.air_quality_dataset.sensor_data...
Success! Data loaded with Cleaned Schema and Partitioning.
