In [8]:
# imports
import pandas as pd
import numpy as np
import duckdb

import os
import uuid

from google.colab import userdata

import vertexai
from vertexai.language_models import TextEmbeddingInput, TextEmbeddingModel
from vertexai.generative_models import GenerativeModel, ChatSession
from vertexai.generative_models import Part
from vertexai.generative_models import (
    Content,
    FunctionDeclaration,
    GenerationConfig,
    Tool,
    ToolConfig
)

In [2]:
# Easiest path: authenticate interactively with the BU account you use on GCP.
# NOTE(review): a later cell also sets GOOGLE_APPLICATION_CREDENTIALS to a
# service-account key file -- confirm which credential the BigQuery / Vertex AI
# clients are actually meant to use.
from google.colab import auth
auth.authenticate_user()

In [1]:
# Mount Google Drive at /content/drive. The service-account key path assigned to
# GOOGLE_APPLICATION_CREDENTIALS further down lives under this mount point, so
# this cell has to run before any cell that needs that key file.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Set the service account: point GOOGLE_APPLICATION_CREDENTIALS at a JSON key
# file stored on the mounted Google Drive (requires the drive.mount cell above).
# NOTE(review): this is a hard-coded path into one person's Drive; anyone else
# running the notebook must change it. Presumably the Google client libraries
# used below read this variable for their default credentials -- confirm.

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/drive/MyDrive/Boston Uni/2024 Fall/BA882 D1 Deploying Analytics Pipelines/strava-etl-e190fe552de1.json'

In [14]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting, Part
from google.cloud import bigquery

# Step 1: Fetch data from BigQuery
def fetch_data_from_bigquery():
    """Query the Strava activities table in BigQuery and return the result rows.

    Runs ``SELECT * ... LIMIT 20`` against ``strava-etl.strava_data.activities``
    and waits for the query job to finish. The query has no ORDER BY, so the
    statement itself does not pin down which 20 rows are returned.

    Returns:
        The object produced by ``query_job.result()`` (not a list). Callers
        iterate over it to obtain one row per activity.
    """
    # Fully-qualified table reference: project.dataset.table
    fq_table = ".".join(("strava-etl", "strava_data", "activities"))

    sql = f"""
        SELECT *
        FROM `{fq_table}`
        LIMIT 20
    """

    # Submit the query and block until it completes before handing back rows.
    return bigquery.Client().query(sql).result()

# Step 2: Generate content using Vertex AI model
def multiturn_generate_content(activity_row):
    """Ask the Gemini model for a short social-media post about one activity.

    Vertex AI is initialised and a new model plus chat session are created on
    every call. Despite the function name, exactly one message is sent on the
    chat session.

    Args:
        activity_row: Mapping of column name to value for a single activity.
            Its string form is embedded verbatim into the message sent to the
            model.

    Returns:
        The ``text`` attribute of the model's response.
    """
    vertexai.init(project="strava-etl", location="us-central1")
    session = GenerativeModel("gemini-1.5-flash-002").start_chat()

    # The whole row is serialised as-is and handed to the model.
    payload = f"""Here is my activity data:
    {activity_row}"""

    # Instructions that shape the length, content and tone of the post.
    instructions = """
    Based on the provided Strava activity data, generate a casual and engaging social media post of around 50-100 words.
    Highlight key aspects of the activity such as weather, distance, elevation, heart rate, and average speed.
    Sometimes, compare the current performance to the past 30 days and reflect on any improvements or changes.
    The tone should be suitable for a fitness enthusiast's social media post.
    Do not mention location. Avoid mentioning missing data.
    """

    # Temperature is deliberately high so that posts vary from one another.
    sampling = dict(max_output_tokens=8192, temperature=2.0, top_p=0.95)

    reply = session.send_message([instructions, payload], generation_config=sampling)
    return reply.text

# Step 3: Collect the generated content in a DataFrame
def generate_and_store_content(results):
    """Generate a social-media post for every activity row and tabulate them.

    Args:
        results: Iterable of rows. Each row must be convertible with
            ``dict(row)`` and contain the keys ``'id'`` and ``'name'``.

    Returns:
        pandas.DataFrame with the columns ``activity_id``, ``activity_name``
        and ``generated_text``, one row per input row. When ``results`` is
        empty the frame is empty but still has these columns.

    Raises:
        KeyError: If a row lacks ``'id'`` or ``'name'``.
    """
    import re  # local import: only needed for the placeholder clean-up below

    columns = ['activity_id', 'activity_name', 'generated_text']
    records = []

    for row in results:
        # Work on a plain dict so the row can be embedded in the model message.
        activity_row = dict(row)

        # Generate content using Vertex AI
        activity_text = multiturn_generate_content(activity_row)

        # Missing values show up in the row as the literal token "None" and can
        # leak into the generated text. Remove it as a whole word only, so that
        # ordinary words containing it (e.g. "Nonetheless") are left intact.
        activity_text = re.sub(r"\bNone\b", "", activity_text)

        records.append({
            'activity_id': activity_row['id'],
            'activity_name': activity_row['name'],
            'generated_text': activity_text,
        })

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=columns)

# Fetch the activity rows from BigQuery (up to 20, per the LIMIT in the query).
activity_data = fetch_data_from_bigquery()

# Generate one post per activity; this makes one model call for every row, so
# the cell's run time grows with the number of rows fetched.
activity_text_df = generate_and_store_content(activity_data)

# Display the DataFrame of generated texts as the cell's output.
activity_text_df

Unnamed: 0,activity_id,activity_name,generated_text
0,11333547845,Afternoon Run,Crushed an 8k run this afternoon! ☀️ Despite ...
1,11345802647,Morning Run,Crushed a 9.7km run this morning! ☀️ Despite ...
2,8539379751,Afternoon Run,Crushed a 2.8km run this afternoon! 💨 Despite...
3,9047742185,Beer Mile WU,Crushed a 1.6 mile Beer Mile warm-up this afte...
4,8922404852,Evening Run,Crushed a 13k evening run! 🏃💨 The cool air ma...
5,8462274491,Afternoon Run,Crushed a 10.4km run this afternoon! ☀️ The w...
6,8749498523,Evening Run,Crushed a 9.7K evening run! 💨 Despite the coo...
7,8805887200,Evening Run,Crushed a 7-mile evening run! 💨 The cool air ...
8,8987077739,Afternoon Run,Crushed a 6.5km run this afternoon! ☀️ Despi...
9,8820119132,Lunch Run,Crushed a 7-mile lunch run today! ☀️ The weat...


In [17]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, SafetySetting, Part
from google.cloud import bigquery
from datetime import datetime

# Step 1: Fetch data from BigQuery
def fetch_data_from_bigquery(limit=20):
    """Fetch the most recent Strava activities from BigQuery.

    Queries ``strava-etl.strava_data.activities`` ordered by ``start_date``
    descending and waits for the query job to finish.

    Args:
        limit: Maximum number of activities to return. Defaults to 20, which
            is the value that used to be hard-coded in the query. Must be
            convertible to a non-negative integer.

    Returns:
        The object produced by ``query_job.result()`` (not a list). Callers
        iterate over it to obtain one row per activity.

    Raises:
        ValueError: If ``limit`` is negative or cannot be converted to int.
    """
    # Coerce to int before interpolating so nothing but a number can end up in
    # the SQL text.
    row_limit = int(limit)
    if row_limit < 0:
        raise ValueError(f"limit must be non-negative, got {limit!r}")

    # Construct a BigQuery client object.
    client = bigquery.Client()

    # Set project, dataset, and table IDs.
    project_id = 'strava-etl'
    dataset_id = 'strava_data'
    table_name = 'activities'

    # Construct the query. The text after '#' on the ORDER BY line is a SQL
    # comment and is sent to BigQuery as part of the statement.
    query = f"""
        SELECT *
        FROM `{project_id}.{dataset_id}.{table_name}`
        ORDER BY start_date DESC  # Sort by start_date in descending order
        LIMIT {row_limit}
    """

    # Execute the query and fetch results.
    query_job = client.query(query)
    results = query_job.result()  # Wait for query to finish

    return results

# Step 2: Generate content using Vertex AI model
def multiturn_generate_content(activity_row):
    """Ask the Gemini model for a short social-media post about one activity.

    Vertex AI is initialised and a new model plus chat session are created on
    every call. Despite the function name, exactly one message is sent on the
    chat session.

    Side effect: when ``activity_row['start_date']`` is present and truthy it
    is overwritten IN PLACE with a ``YYYY-MM-DD`` string, so the caller sees
    the shortened date on its own dict after this function returns.

    Args:
        activity_row: Mutable mapping of column name to value for a single
            activity. ``start_date`` may be a string of the form
            ``"%Y-%m-%d %H:%M:%S UTC"`` or an object with a ``strftime``
            method.

    Returns:
        The ``text`` attribute of the model's response.

    Raises:
        ValueError: If ``start_date`` is a string that does not match the
            expected format.
    """
    vertexai.init(project="strava-etl", location="us-central1")
    session = GenerativeModel("gemini-1.5-flash-002").start_chat()

    # Reduce start_date to its date part before the row is serialised.
    raw_start = activity_row.get('start_date', None)
    if raw_start:
        when = (
            datetime.strptime(raw_start, "%Y-%m-%d %H:%M:%S UTC")
            if isinstance(raw_start, str)
            else raw_start  # already datetime-like; use it directly
        )
        activity_row['start_date'] = when.strftime("%Y-%m-%d")

    # The whole row (with the shortened start_date) is handed to the model.
    payload = f"""Here is my activity data:
    {activity_row}"""

    # Instructions that shape the length, content and tone of the post.
    instructions = """
    Based on the provided Strava activity data, generate a casual and engaging social media post of around 50-100 words.
    Highlight key aspects of the activity such as weather, distance, elevation, heart rate, and average speed.
    Sometimes, compare the current performance to the past 30 days and reflect on any improvements or changes.
    The tone should be suitable for a fitness enthusiast's social media post.
    Avoid mentioning missing data.
    """

    # Temperature is deliberately high so that posts vary from one another.
    sampling = dict(max_output_tokens=8192, temperature=2.0, top_p=0.95)

    reply = session.send_message([instructions, payload], generation_config=sampling)
    return reply.text

# Step 3: Collect the generated content in a DataFrame
def generate_and_store_content(results):
    """Generate a social-media post for every activity row and tabulate them.

    Args:
        results: Iterable of rows. Each row must be convertible with
            ``dict(row)`` and contain the keys ``'id'``, ``'name'`` and
            ``'start_date'``.

    Returns:
        pandas.DataFrame with the columns ``activity_id``, ``activity_name``,
        ``activity_date`` and ``generated_text``, one row per input row. When
        ``results`` is empty the frame is empty but still has these columns.

    Raises:
        KeyError: If a row lacks ``'id'``, ``'name'`` or ``'start_date'``.
    """
    import re  # local import: only needed for the placeholder clean-up below

    columns = ['activity_id', 'activity_name', 'activity_date', 'generated_text']
    records = []

    for row in results:
        # Work on a plain dict so the row can be embedded in the model message.
        activity_row = dict(row)

        # Generate content using Vertex AI
        activity_text = multiturn_generate_content(activity_row)

        # Missing values show up in the row as the literal token "None" and can
        # leak into the generated text. Remove it as a whole word only, so that
        # ordinary words containing it (e.g. "Nonetheless") are left intact.
        activity_text = re.sub(r"\bNone\b", "", activity_text)

        # 'start_date' is read AFTER the generation call on purpose: this is
        # the same dict that was passed to multiturn_generate_content, so any
        # change that call makes to 'start_date' is what gets stored here.
        records.append({
            'activity_id': activity_row['id'],
            'activity_name': activity_row['name'],
            'activity_date': activity_row['start_date'],
            'generated_text': activity_text,
        })

    # Explicit columns keep the schema stable even when there are no records.
    return pd.DataFrame(records, columns=columns)

# Fetch the most recent activity rows from BigQuery (newest start_date first).
activity_data = fetch_data_from_bigquery()

# Generate one post per activity; this makes one model call for every row, so
# the cell's run time grows with the number of rows fetched.
activity_text_df = generate_and_store_content(activity_data)

# Display the DataFrame of generated texts as the cell's output.
activity_text_df

Unnamed: 0,activity_id,activity_name,activity_date,generated_text
0,13059717485,16x 400m intervals,2024-12-06,Crushed a 16x400m interval session this mornin...
1,13054569722,changeup,2024-12-05,Crushed a 13.5km run this afternoon! 🏃💨 Despi...
2,13045346401,spotify wrapped,2024-12-04,Crushed a 12.9K run this morning! 🏃‍♀️💨 Despi...
3,13041672276,4x 5min threshold,2024-12-03,Crushed a 4x5min threshold run today! 💪 Cover...
4,13038538369,3x 10min HM,2024-12-03,Crushed a 3x10min HM run this morning! 💪 Desp...
5,13030707634,cold hands!,2024-12-02,"BRR! Cold hands, but a hot run this morning! 🥶..."
6,13017059642,sa2 battle,2024-11-30,Crushed a 3.5K run this morning! 💨 The chilly...
7,13009754435,lead foot,2024-11-29,Crushed a 8-mile run this morning! 💨 Despite...
8,12997094833,bv ballin,2024-11-27,Crushed a 9.3k run today! 🏃💨 The weather was ...
9,12990455950,press garden,2024-11-26,Crushed a 5.1-mile run in the crisp autumn air...
