In [None]:
# Cell 1: Install necessary libraries
# Run this cell if you haven't installed the BigQuery client library yet.
!pip install google-cloud-bigquery



In [None]:
# Cell 2: Import libraries
import json
from google.cloud import bigquery
from datetime import datetime
from google.cloud import storage
import os
from google.api_core import exceptions # Import for catching Google API exceptions



In [None]:
# Cell 3: Configuration
# Everything a reader may need to tune lives here; edit the values to point
# the notebook at a different project, table, or bucket.

# --- Destination: BigQuery ---
PROJECT_ID = "jellyfish-training-demo-6"  # Google Cloud project ID
DATASET_ID = "dsl_project"                # dataset holding the target table
TABLE_ID = "web_visits"                   # target table name

# --- Source: Google Cloud Storage ---
# Bucket that stores the .jsonl input files.
GCS_BUCKET_NAME = "guillermo-lake"

# --- Insert tuning ---
# Number of rows sent per BigQuery insert request. Lower it when individual
# rows are large; typical starting points are 1000 to 10000 rows.
BIGQUERY_INSERT_BATCH_SIZE = 5_000

In [None]:
# Cell 4: BigQuery Client Initialization
# Authenticate to GCP before running this cell, for example by setting the
# GOOGLE_APPLICATION_CREDENTIALS environment variable or by running
# `gcloud auth application-default login` in the terminal that launches
# Jupyter (or in a separate notebook cell on a GCP managed notebook).

# Reset both handles first: if this cell is re-run and fails, a client or
# table reference left over from an earlier execution must not stay bound in
# the kernel and be used silently by later cells.
client = None
table_ref = None
try:
    client = bigquery.Client(project=PROJECT_ID)
    # Client.dataset() is deprecated in google-cloud-bigquery; build the
    # table reference from an explicit DatasetReference instead.
    table_ref = bigquery.DatasetReference(PROJECT_ID, DATASET_ID).table(TABLE_ID)
    print(f"BigQuery client initialized for project: {PROJECT_ID}")
    print(f"Target table: {DATASET_ID}.{TABLE_ID}")
except Exception as e:
    # Best-effort by design: report the problem and leave client/table_ref as
    # None so later cells can detect the failed initialization.
    print(f"Error initializing BigQuery client: {e}")
    print("Please ensure your Google Cloud credentials are set up correctly.")

In [None]:
# Cell 5: Function to transform a single JSON line to BigQuery row format
def _coerce_number(value, caster, default, field_name):
    """Convert ``value`` with ``caster`` (``float`` or ``int``); return ``default`` if it is None or unconvertible."""
    if value is None:
        return default
    try:
        return caster(value)
    except (TypeError, ValueError, OverflowError):
        print(f"Warning: Invalid {field_name} value {value!r}, defaulting to {default}.")
        return default


def _parse_iso_timestamp(raw):
    """Parse an ISO-8601 timestamp string; return None when ``raw`` is not a parseable string."""
    if not isinstance(raw, str):
        return None
    # datetime.fromisoformat() accepts a trailing 'Z' only from Python 3.11 on;
    # rewrite it as an explicit UTC offset so older interpreters parse it too.
    text = raw[:-1] + '+00:00' if raw.endswith('Z') else raw
    try:
        return datetime.fromisoformat(text)
    except ValueError:
        return None


def transform_json_to_bigquery_row(json_line: str) -> dict:
    """
    Transforms a single JSON line from the input file into a dictionary
    suitable for insertion into the BigQuery table.

    Malformed optional parts of the record degrade to NULL/default values
    (with a printed warning) instead of aborting the whole row:
    a non-string or unparseable ``geolocation`` becomes None, events whose
    timestamp is missing/invalid are ignored when computing the visit
    window, ``null`` values for ``events``/``event``/``details``/``items``
    are treated as empty, and unconvertible ``price``/``quantity``/``amount``
    values fall back to 0.0/0.

    Args:
        json_line (str): A single line of JSON string from the input file.

    Returns:
        dict: A dictionary representing a single row in the BigQuery table,
        with keys ``session_id``, ``user_id``, ``device_type``,
        ``geolocation``, ``user_agent``, ``visit_start_time``,
        ``visit_end_time`` and ``events``.

    Raises:
        json.JSONDecodeError: If ``json_line`` is not valid JSON.
        TypeError: If the event timestamps mix naive and offset-aware values,
            which cannot be ordered against each other.
    """
    data = json.loads(json_line)

    # An explicit `"events": null` is handled like a missing key.
    raw_events = data.get('events') or []

    # --- Geolocation Transformation ---
    # Input: "latitude,longitude" (e.g., "31.764576,28.583238")
    # BigQuery GEOGRAPHY type expects WKT format: "POINT(longitude latitude)"
    geolocation_wkt = None
    raw_geolocation = data.get('geolocation')
    if raw_geolocation:
        try:
            lat, lon = map(float, raw_geolocation.split(','))
            geolocation_wkt = f"POINT({lon} {lat})"
        except (AttributeError, TypeError, ValueError):
            # AttributeError/TypeError cover non-string values (numbers, dicts, ...).
            print(f"Warning: Invalid geolocation format '{raw_geolocation}', setting to NULL for this row.")

    # --- Determine visit_start_time and visit_end_time from events ---
    # visit_start_time is the timestamp of the earliest event.
    # visit_end_time is the timestamp of the latest event.
    visit_start_time = None
    visit_end_time = None
    if raw_events:
        # (parsed datetime, original string) pairs: each timestamp is parsed
        # once, and the original text is what ends up in the row.
        timed_events = []
        for entry in raw_events:
            payload = entry.get('event') if isinstance(entry, dict) else None
            if not isinstance(payload, dict) or 'timestamp' not in payload:
                continue
            raw_timestamp = payload['timestamp']
            parsed = _parse_iso_timestamp(raw_timestamp)
            if parsed is None:
                print(f"Warning: Invalid timestamp format '{raw_timestamp}' for an event. Skipping this event for time calculation.")
                continue
            timed_events.append((parsed, raw_timestamp))

        if timed_events:
            # Stable sort on the parsed value only, so ties keep input order.
            timed_events.sort(key=lambda pair: pair[0])
            visit_start_time = timed_events[0][1]
            visit_end_time = timed_events[-1][1]
        else:
            print("Warning: No valid event timestamps found to determine visit start/end times.")

    # --- Transform events array ---
    transformed_events = []
    for event_data in raw_events:
        # `or {}` / `or []` also cover keys that are present but null.
        event_details = event_data.get('event') or {}
        event_type = event_details.get('event_type')
        event_timestamp = event_details.get('timestamp')
        details_payload = event_details.get('details') or {}

        # Initialize all possible event structs to None for BigQuery's NULL
        page_view_struct = None
        add_cart_struct = None
        purchase_struct = None

        # Populate the specific struct based on event_type
        if event_type == 'page_view':
            page_view_struct = {
                'page_url': details_payload.get('page_url'),
                'referrer_url': details_payload.get('referrer_url')
            }
        elif event_type == 'add_item_to_cart':
            add_cart_struct = {
                'product_id': details_payload.get('product_id'),
                'product_name': details_payload.get('product_name'),
                'category': details_payload.get('category'),
                # Default to 0.0/0 if missing or invalid
                'price': _coerce_number(details_payload.get('price'), float, 0.0, 'price'),
                'quantity': _coerce_number(details_payload.get('quantity'), int, 0, 'quantity')
            }
        elif event_type == 'purchase':
            purchase_items = [
                {
                    'product_id': item.get('product_id'),
                    'product_name': item.get('product_name'),
                    'category': item.get('category'),
                    'price': _coerce_number(item.get('price'), float, 0.0, 'price'),
                    'quantity': _coerce_number(item.get('quantity'), int, 0, 'quantity')
                }
                for item in (details_payload.get('items') or [])
            ]
            purchase_struct = {
                'order_id': details_payload.get('order_id'),
                'amount': _coerce_number(details_payload.get('amount'), float, 0.0, 'amount'),
                'currency': details_payload.get('currency'),
                'items': purchase_items
            }

        transformed_events.append({
            'event_type': event_type,
            'event_timestamp': event_timestamp,
            'page_view': page_view_struct,
            'add_cart': add_cart_struct,
            'purchase': purchase_struct
        })

    # Construct the final row dictionary matching BigQuery schema
    return {
        'session_id': data.get('session_id'),
        'user_id': data.get('user_id'),
        'device_type': data.get('device_type'),
        'geolocation': geolocation_wkt,
        'user_agent': data.get('user_agent'),
        'visit_start_time': visit_start_time,
        'visit_end_time': visit_end_time,
        'events': transformed_events
    }


# Cell 6: Main script execution function
def load_data_to_bigquery():
    """
    Reads every ``.jsonl`` object in the configured Google Cloud Storage
    bucket, transforms each line with ``transform_json_to_bigquery_row`` and
    inserts the resulting rows into the configured BigQuery table in batches
    of ``BIGQUERY_INSERT_BATCH_SIZE``.

    Note: the bucket is listed with an empty prefix and no delimiter, so
    objects under "folder" prefixes are included, not only root-level ones.
    Pass ``delimiter='/'`` to ``list_blobs`` to restrict the scan to the root.

    Uses the module-level ``client`` and ``table_ref`` created by the client
    initialization cell. If they are missing or None the function reports it
    and returns before downloading anything.

    All problems (unreadable files, malformed lines, failed batches) are
    reported via ``print``; nothing is raised to the caller.

    Returns:
        None
    """
    # Resolve the BigQuery handles up front so a failed initialization is
    # reported once, instead of after every file has been downloaded and
    # then again for every batch.
    try:
        bq_client, target_table = client, table_ref
    except NameError:
        bq_client = target_table = None
    if bq_client is None or target_table is None:
        print("Error: BigQuery client is not initialized. Run the client initialization cell successfully before loading data.")
        return

    all_rows_to_insert = [] # Collect all rows from all files
    print(f"Attempting to read all .jsonl files from gs://{GCS_BUCKET_NAME}...")

    try:
        storage_client = storage.Client(project=PROJECT_ID)
        bucket = storage_client.get_bucket(GCS_BUCKET_NAME)

        # Empty prefix and no delimiter: iterates over every object in the bucket.
        blobs = bucket.list_blobs(prefix='')

        processed_files_count = 0
        for blob in blobs:
            if not blob.name.endswith('.jsonl'):
                continue
            print(f"  Processing file: {blob.name}")
            processed_files_count += 1
            try:
                jsonl_content = blob.download_as_text()

                for line_number, line in enumerate(jsonl_content.splitlines(), start=1):
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        all_rows_to_insert.append(transform_json_to_bigquery_row(line))
                    except json.JSONDecodeError as e:
                        print(f"    Error decoding JSON on line {line_number} in {blob.name}: {e}. Skipping line: {line[:100]}...")
                    except Exception as e:
                        print(f"    An unexpected error occurred processing line {line_number} in {blob.name}: {e}. Skipping line: {line[:100]}...")
            except Exception as e:
                # One unreadable file must not stop the remaining files.
                print(f"  Error downloading or processing file {blob.name}: {e}")

        if processed_files_count == 0:
            print(f"No .jsonl files found in bucket '{GCS_BUCKET_NAME}'. Nothing to insert.")
            return

        if not all_rows_to_insert:
            print("No valid rows were extracted from the JSONL files. Nothing to insert.")
            return

        print(f"Successfully processed {len(all_rows_to_insert)} rows from {processed_files_count} JSONL files.")
        print(f"Attempting to insert {len(all_rows_to_insert)} rows into BigQuery table '{DATASET_ID}.{TABLE_ID}' in batches...")

        # --- Batching for BigQuery Inserts ---
        total_inserted_rows = 0
        batch_starts = range(0, len(all_rows_to_insert), BIGQUERY_INSERT_BATCH_SIZE)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = all_rows_to_insert[start:start + BIGQUERY_INSERT_BATCH_SIZE]
            print(f"  Inserting batch {batch_number} ({len(batch)} rows)...")
            try:
                errors = bq_client.insert_rows_json(target_table, batch)
                if errors:
                    print(f"    Encountered errors in batch {batch_number}:")
                    for error in errors:
                        print(f"      Row index in batch: {error.get('index', 'N/A')}, Errors: {error.get('errors', 'N/A')}")
                else:
                    print(f"    Batch {batch_number} successfully inserted.")
                    # Only batches that came back without errors are counted.
                    total_inserted_rows += len(batch)
            except exceptions.GoogleAPICallError as e:
                # Catching specific HTTP errors like 413, 500, etc.
                print(f"    Google API Call Error inserting batch {batch_number}: {e}")
                print("    This batch might have failed. Please review the error details.")
                # You might want to add more robust error handling here, e.g.,
                # logging failed rows for retry, or stopping if error is critical.
            except Exception as e:
                print(f"    An unexpected error occurred during batch insertion: {e}")

        print(f"\nBigQuery data loading process completed. Total successfully inserted rows: {total_inserted_rows} into '{DATASET_ID}.{TABLE_ID}'.")

    except exceptions.NotFound as e:
        print(f"Error: Bucket '{GCS_BUCKET_NAME}' or an object within it not found. Please check the name and permissions. Details: {e}")
    except Exception as e:
        print(f"An unexpected error occurred during GCS file processing or BigQuery operation: {e}")


In [None]:
# Cell 7: Execute the data loading process
# load_data_to_bigquery() returns nothing and reports failures only by
# printing, so the closing message must not claim the load succeeded.
if __name__ == "__main__":
    load_data_to_bigquery()
    print("\nData loading run finished. Review the messages above for the outcome.")