In [None]:
from google.cloud import bigquery
from google.cloud import storage

import json
from datetime import datetime

import os

from dotenv import load_dotenv
load_dotenv('../.env')

In [None]:
# Put the sibling dsllib directory on the import path, then import parse_visit.
import sys

# os.path.abspath() resolves a relative path against the current working
# directory, so this is <cwd>/../dsllib in normalized form.
lib_path = os.path.abspath(os.path.join(os.pardir, 'dsllib'))
if sys.path.count(lib_path) == 0:
    print(f"Appending path {lib_path}")
    sys.path.append(lib_path)

# NOTE(review): the directory added above is dsllib itself, while the import
# below resolves `dsllib` as a package name -- confirm the package lives inside
# that directory (or is installed in the environment) so this works on a fresh kernel.
from dsllib.visits import parse_visit

In [None]:
# Resolve the default Google credentials for this environment.
import google.auth
# google.auth.default() yields a (credentials, project) pair; the project is
# printed so the reader can check which project the credentials resolve to.
# NOTE(review): `project` here is separate from PROJECT_ID (read from the
# environment in a later cell) -- confirm the two are expected to match.
credentials, project = google.auth.default()
print(project)

In [None]:
# Read the notebook configuration from the process environment.
# Each constant is None when its variable is not set.
PROJECT_ID, REGION, BUCKET, DATASET, TABLE = (
    os.environ.get(variable)
    for variable in ("GOOGLE_CLOUD_PROJECT", "REGION", "BUCKET", "DATASET", "TABLE")
)
# Show the resolved project as the cell output.
PROJECT_ID

In [None]:
# Pull in the table schema stored under ../dsllib.
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform locale, which can mis-decode non-ASCII text on some systems.
with open('../dsllib/table_schema.json', 'r', encoding='utf-8') as f:
    bq_schema = json.load(f)

In [None]:
#Testing date.
#timestamp = "2024-07-01T20:40:00"
#timestamp2 = datetime.fromisoformat(timestamp)
#timestamp2 #Succeeds

In [None]:
def load_visits(filename):
    """Read a local JSON Lines file and parse each non-blank line.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        list: The result of ``parse_visit`` for every non-blank line, in
        file order. Empty when the file has no non-blank lines.
    """
    # Decode as UTF-8 explicitly, matching insert_from_gcs, instead of relying
    # on the platform's locale default.
    with open(filename, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line) so
        # they are never handed to parse_visit; insert_from_gcs filters the
        # same way. Non-blank lines are passed through unchanged.
        return [parse_visit(line) for line in file if line.strip()]

In [None]:
def insert_visits(filename, bq_client, table):
    """Load one local JSONL file and insert its records into BigQuery.

    Args:
        filename: Path of the ``.jsonl`` file to load with ``load_visits``.
        bq_client: Client object exposing ``insert_rows(table, rows)``.
        table: Destination table passed through to ``insert_rows``.

    Returns:
        The error list returned by ``bq_client.insert_rows`` (empty on
        success), or an empty list when the file produced no records.
    """
    records = load_visits(filename)

    # An insert request with zero rows is rejected by the BigQuery API, so an
    # empty file is reported and skipped rather than sent.
    if not records:
        print(f"No records found in {filename}; nothing to insert")
        return []

    errors = bq_client.insert_rows(table, records)
    if errors:
        print(errors)
    else:
        print(f"Inserted successfully from {filename}")

    return errors

In [None]:
#test_data = load_visits('../challenge-clickstream/data/visits-2024-07-01.jsonl')

In [None]:
#Create the BigQuery client

#should already be loaded at setup.
#PROJECT_ID = os.environ.get("PROJECT_ID")
#DATASET = os.environ.get("DATASET")
#TABLE = os.environ.get("TABLE")
#TABLE = "web_visits" #Hardcoded for testing

bq_client = bigquery.Client(project=PROJECT_ID)
# Client.dataset() is deprecated; build the reference directly instead.
# bq_client.project reproduces the old fallback: it is PROJECT_ID when that is
# set, otherwise the project the client resolved on its own.
dataset_ref = bigquery.DatasetReference(bq_client.project, DATASET)
table_ref = dataset_ref.table(TABLE)
# Fetch the table's metadata from the API so inserts can use its schema.
table = bq_client.get_table(table_ref)

# Display the live table schema as the cell output.
table.schema

In [None]:
#test_file = '../challenge-clickstream/data/visits-2024-07-02.jsonl'
#errors = insert_visits(test_file, bq_client, table)

In [None]:
# Walk a local directory of JSONL files, in name order, for loading into BigQuery.
data_dir = '../challenge-clickstream/data'

for filename in sorted(os.listdir(data_dir)):
    # Only JSON Lines files are candidates; ignore everything else.
    if not filename.endswith(".jsonl"):
        continue

    file_path = os.path.join(data_dir, filename)
    print(f"Processing file: {file_path}")

    # The insert itself is currently switched off, so this loop only lists the
    # files it would process; restore the call below to actually load them.
    #errors = insert_visits(file_path, bq_client, table)
    errors = None
    if errors:
        print(f"Errors occurred while inserting data from {filename}: {errors}")

In [None]:
def insert_from_gcs(bucket_name: str, gcs_prefix: str, bq_client, table, project: str):
    """
    Batch inserts JSONL files from a Cloud Storage bucket into a BigQuery table.

    Every blob under ``gcs_prefix`` whose name ends in ``.jsonl`` is downloaded,
    each non-blank line is parsed with ``parse_visit``, and the resulting
    records are inserted with one ``insert_rows`` call per blob. The outcome of
    each blob is printed; a failure on one blob does not stop the others.

    Args:
        bucket_name (str): The name of the Cloud Storage bucket.
        gcs_prefix (str): The prefix for the JSONL files in the bucket (e.g., 'data/').
        bq_client (google.cloud.bigquery.Client): The BigQuery client.
        table (google.cloud.bigquery.Table): The BigQuery table object.
        project (str): The Google Cloud project ID used for accessing resources.

    Returns:
        None. Progress and insert errors are reported via ``print``.
    """

    storage_client = storage.Client(project=project)
    bucket = storage_client.get_bucket(bucket_name)

    for blob in bucket.list_blobs(prefix=gcs_prefix):
        # Only JSON Lines objects are loaded; skip anything else under the prefix.
        if not blob.name.endswith(".jsonl"):
            continue

        blob_uri = f"gs://{bucket_name}/{blob.name}"
        print(f"Processing file: {blob_uri}")

        # download_as_string() is deprecated; download_as_text() returns the
        # decoded content directly. The encoding is pinned to UTF-8 so the
        # result matches the previous explicit .decode("utf-8").
        jsonl_text = blob.download_as_text(encoding="utf-8")

        # One record per non-blank line.
        records = [parse_visit(line) for line in jsonl_text.splitlines() if line.strip()]

        # An insert request with zero rows is rejected by the BigQuery API, so
        # an empty blob is reported and skipped instead of aborting the batch.
        if not records:
            print(f"No records found in {blob_uri}; skipping")
            continue

        errors = bq_client.insert_rows(table, records)
        if errors:
            print(f"Errors occurred while inserting data from {blob_uri}: {errors}")
        else:
            print(f"Inserted successfully from {blob_uri}")


In [None]:
# Insert jsonl data from Cloud Storage to BigQuery.
gcs_prefix = "data/"
insert_from_gcs(BUCKET, gcs_prefix, bq_client, table, PROJECT_ID)