In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import jupyter_black
import instructor
from pydantic import BaseModel, Field, field_validator, ValidationError
import os
import sys
from openai import OpenAI
from enum import Enum
import pandas
import pandas as pd
import io
from hashlib import md5
from typing import Literal
import contextlib

# Accumulates results across cells: keyed by the md5 id of an insight's observation
# (see string_to_id); each value is the insight as a dict, later extended with a
# "code" entry holding the generated pandas snippet.
data_dict = {}

# Enable black auto-formatting for the cells of this notebook.
jupyter_black.load()

In [2]:
# Load the activity export and, in the same expression, give the measurement
# columns unit-explicit names so later prompts and generated code are unambiguous.
running_activities = pandas.read_csv("../static/dummy/running_activities.csv").rename(
    columns={
        "distance": "distance_meters",
        "moving_time": "moving_time_seconds",
        "start_date_local": "date",
        "average_speed": "average_speed_meters_per_second",
    }
)
print(running_activities.shape)

running_activities.head(10)

(126, 8)


Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Evening Run,5971.8,1920,Run,2024-01-12T20:10:16Z,3.11,167.2,182.0
1,Afternoon Run,8005.5,2593,Run,2024-01-13T15:02:48Z,3.087,152.2,181.0
2,Lunch Run,7316.4,2284,Run,2024-01-14T11:15:14Z,3.203,152.3,170.0
3,Morning Run,5060.6,1662,Run,2024-01-15T10:24:27Z,3.045,149.6,172.0
4,Morning Run,5050.0,1826,Run,2024-01-16T08:07:55Z,2.766,131.5,145.0
5,Come back to CPH they said 🧊,8505.7,2850,Run,2024-01-17T07:17:52Z,2.984,145.0,164.0
6,Morning Run,5079.3,1640,Run,2024-01-18T07:35:21Z,3.097,147.1,157.0
7,Afternoon Run,5620.0,1773,Run,2024-01-19T13:49:06Z,3.17,158.6,187.0
8,With Vitto,5092.3,1849,Run,2024-01-20T16:35:05Z,2.754,138.4,148.0
9,Morning Run,16926.3,5103,Run,2024-01-21T10:40:50Z,3.317,162.0,195.0


In [3]:
# Plain-text formatter used by fetch_transcript to flatten transcript snippets.
formatter = TextFormatter()
# Candidate YouTube video ids tried so far; only the uncommented assignment is active.
# video_id = "3HyQ7Q-XVKA"  # runwise podcast french
# video_id = "2_rfOfLYpXM"  # planted runner english
# video_id = "FhOr5RLt5HI"  # prep running addict french
# video_id = "CtyIAzfdBUI"
# video_id = "kBo-q4DwqPM"
video_id = "59aEpVvRXxI"


def fetch_transcript(video_id: str, language_out: str = "en") -> str:
    """Return the transcript of a YouTube video as plain text in ``language_out``.

    If a transcript already exists in ``language_out`` it is fetched directly.
    Otherwise the first listed transcript is translated to ``language_out``.

    Args:
        video_id: The YouTube video identifier.
        language_out: Language code the returned text should be in.

    Returns:
        The transcript formatted by the module-level ``formatter``.

    Raises:
        Whatever ``youtube_transcript_api`` raises when the video has no
        transcript, or when the chosen transcript cannot be translated.
    """
    # List the transcripts once: every listing is a network round-trip.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    available_languages = [t.language_code for t in transcript_list]

    if language_out in available_languages:
        # Request the language explicitly. The previous get_transcript(video_id)
        # call always asked for the library default ("en"), regardless of
        # language_out.
        transcript = transcript_list.find_transcript([language_out]).fetch()
    else:
        transcript = (
            transcript_list.find_transcript(available_languages[0:1])
            .translate(language_out)
            .fetch()
        )

    return formatter.format_transcript(transcript)


# Fetch the transcript for the selected video and preview its first 240 characters.
transcript = fetch_transcript(video_id=video_id)
print(transcript[:240], "....")

on October 17, 2021 in front of more than a hundred
thousand spectators Domingo India and
the best known streamer on Twitch
ran the Paris marathon in three
hours 24 minutes and 25 seconds
sporting events in which I had
the honor of particip ....


In [4]:
# Show the renamed frame; the notebook display elides the middle rows.
running_activities

Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Evening Run,5971.8,1920,Run,2024-01-12T20:10:16Z,3.110,167.2,182.0
1,Afternoon Run,8005.5,2593,Run,2024-01-13T15:02:48Z,3.087,152.2,181.0
2,Lunch Run,7316.4,2284,Run,2024-01-14T11:15:14Z,3.203,152.3,170.0
3,Morning Run,5060.6,1662,Run,2024-01-15T10:24:27Z,3.045,149.6,172.0
4,Morning Run,5050.0,1826,Run,2024-01-16T08:07:55Z,2.766,131.5,145.0
...,...,...,...,...,...,...,...,...
121,Morning Run,6916.7,2225,Run,2024-07-04T07:51:50Z,3.109,154.8,165.0
122,Morning Run,5036.6,1568,Run,2024-07-05T08:19:27Z,3.212,156.8,176.0
123,Afternoon Run,6230.2,2355,Run,2024-07-07T13:48:08Z,2.646,151.2,168.0
124,Afternoon Trail Run,2216.5,1549,TrailRun,2024-07-07T14:37:04Z,1.431,144.4,176.0


In [5]:
# Columns of the activities frame that an extracted insight may rely on; the values
# match the renamed column names of `running_activities`.
# NOTE(review): kept as `#` comments on purpose — a docstring here could end up in the
# JSON schema generated for the LLM and change the request; confirm before converting.
class DataField(Enum):
    DISTANCE_METERS = "distance_meters"
    MOVING_TIME_SECONDS = "moving_time_seconds"
    DATE = "date"
    AVERAGE_SPEED_METERS_PER_SECOND = "average_speed_meters_per_second"
    MAX_HEARTRATE = "max_heartrate"
    AVERAGE_HEARTRATE = "average_heartrate"


# One coaching insight extracted from a piece of content. Used as the item type of
# ExtractedInsights, i.e. as part of the structured-output schema.
# NOTE(review): documented with `#` comments rather than a docstring, since a model
# docstring may be included in the schema sent to the LLM — confirm before converting.
class Insight(BaseModel):
    # What to look for in the athlete's data.
    observation: str = Field(
        description="The metric observed in the data over a period of time"
    )
    # Why the observation matters for performance.
    outcome: str = Field(
        description="How the metric observed affects the performance of the athlete"
    )
    # Columns needed to evaluate the observation, restricted to DataField members.
    data_fields: list[DataField] = Field(
        description="The data fields required to compute the observation"
    )


# Top-level response model for the extraction call: a list of Insight items.
# NOTE(review): `#` comment instead of a docstring so the generated schema is unchanged.
class ExtractedInsights(BaseModel):
    insights: list[Insight]


# System prompt for the extraction call: frames the model as a running coach and lists
# the per-run fields it may rely on by interpolating every DataField value.
system_message = f"""
You are a very experienced running coach.
You are conducting research for your athletes so that they can improve their running performance.
Your goal is to give valuable feedback to an athlete when looking at their running data.
For each athlete you have a table with all their runs year-to-date. For each run, you have the following information: {[e.value for e in DataField]}

You will be given a piece of content related to running. 
Your goal is to analyse the piece of content, and extract valuable insights that can help analyse the current running form of the athlete.
For each insight, you need to ensure it can be calculated with the data you have available. 
The insights should be easy to compute and understand to your athletes.
Only extract insights that are relevant in the context of looking at the last year of running data of an athlete. (You don't know ANY other details)
For example, don't extract observations related to sickness, weather, food, etc. (You don't have data for these.)
""".strip()

# User turn: the fetched transcript wrapped in === delimiters.
user_message = f"""
Content:
===
{transcript}
===
""".strip()

In [6]:
# Wrap the OpenAI client with instructor so completions accept `response_model`.
# NOTE(review): no API key is passed here — presumably it is read from the environment; confirm.
i_client = instructor.from_openai(OpenAI())

# Ask gpt-4o to extract insights from the transcript, parsed into ExtractedInsights.
# This is a network call and is repeated every time the cell runs.
extracted_insights = i_client.chat.completions.create(
    model="gpt-4o",
    response_model=ExtractedInsights,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    temperature=0,
)

In [7]:
def string_to_id(string_: str) -> str:
    """Return the hexadecimal MD5 digest of ``string_`` (encoded with the default UTF-8).

    The digest is used as a stable dictionary key, not for any security purpose.
    """
    hasher = md5()
    hasher.update(string_.encode())
    return hasher.hexdigest()


# Print each extracted insight and register it in data_dict under a stable id derived
# from its observation text.
for insight in extracted_insights.insights:
    print(f"{insight.observation=}")
    print(f"{insight.outcome=}")
    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    data_dict[string_to_id(insight.observation)] = insight.model_dump()
    print("=====")

insight.observation='Consistent long runs with progressive increase in duration.'
insight.outcome='Helps build endurance and prepares the body for the marathon distance.'
=====
insight.observation='Incorporation of low-intensity runs (fundamental endurance).'
insight.outcome='Improves aerobic capacity and aids in recovery.'
=====
insight.observation='Structured training with a mix of endurance, interval, and long runs.'
insight.outcome='Balanced training approach that enhances overall running performance.'
=====
insight.observation='Gradual increase in interval training intensity.'
insight.outcome='Improves speed and running economy.'
=====
insight.observation='Monitoring of heart rate during runs.'
insight.outcome='Helps in maintaining appropriate training intensity and avoiding overtraining.'
=====


In [8]:
def _run_generated_code(code: str, df: pandas.DataFrame) -> str:
    """Execute ``code`` against a copy of ``df`` and return everything it printed.

    The code runs in a single namespace: a shallow copy of this module's globals
    (so names such as ``pd`` resolve) plus ``df``. Passing separate globals and
    locals to ``exec`` would run the code as if it were a class body, where
    functions and lambdas defined by the code cannot see ``df`` or any other name
    the code assigns. Using a copy also keeps the code's assignments out of the
    notebook's own globals.

    SECURITY: ``exec`` runs model-generated code with the full privileges of the
    kernel. Only use this with trusted models and data, or inside a sandbox.

    Raises:
        Any exception raised by the executed code.
    """
    namespace = dict(globals())
    namespace["df"] = df.copy()  # copy so the caller's frame is never mutated
    captured = io.StringIO()
    with contextlib.redirect_stdout(captured):
        exec(code, namespace)
    return captured.getvalue()


# Structured response holding a pandas snippet generated by the LLM.
# NOTE(review): documented with `#` comments rather than a class docstring, since a
# model docstring may be included in the schema sent to the LLM — confirm before converting.
class PandasDataFrameQA(BaseModel):
    code: str = Field(description="The code to execute")
    # Set by get_code when no valid snippet could be produced.
    cannot_compute: bool = Field(default=False)

    def execute(self, df: pandas.DataFrame):
        """Run ``self.code`` against a copy of ``df`` and return its captured stdout."""
        return _run_generated_code(self.code, df)

    @field_validator("code")
    @classmethod
    def validate_code(cls, v: str):
        """Accept ``v`` only if it executes without error on ``running_activities``.

        Runs whenever the model is constructed, so building an instance executes
        the snippet once against the module-level ``running_activities`` frame;
        anything it prints is discarded.

        Raises:
            ValueError: if executing the snippet raises any ``Exception``.
        """
        try:
            _run_generated_code(v, running_activities)
        except Exception as e:
            # Chain the original error so its traceback is kept for debugging.
            raise ValueError(f"Invalid code: {e}") from e
        return v


def get_code(insight: Insight) -> PandasDataFrameQA:
    """Ask the LLM for a pandas snippet that checks ``insight`` against the data.

    The system prompt shows the model ``running_activities.head()`` and the rules
    the snippet must follow; the user prompt states the observation and outcome.

    Returns:
        The parsed ``PandasDataFrameQA``. If a ``ValidationError`` is raised, the
        error is printed and an instance with empty code and
        ``cannot_compute=True`` is returned instead.
    """
    instructions = f"""
    You are working with a pandas dataframe in Python.
    The name of the dataframe is `df`.
    This is the result of `print(df.head())`:
    {running_activities.head()}

    Follow these instructions:
    1. Write some code that will help you answer the user question.
    2. The code should include print statements where appropriate.
    3. The code will be called with the exec() function.
    4. Do not include any other imports other than the pandas library.
    5. Do not include any plots, graphs, or other visualizations.
    6. Print the necessary variables so that once read, we can answer the user question.
    """.strip()

    question = f""" 
    I would like to understand if we are seeing the following: {insight.observation}
    My goal is to understand if it will lead to the following outcome: {insight.outcome}
    """.strip()

    chat = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": question},
    ]
    try:
        return i_client.chat.completions.create(
            model="gpt-4o",
            response_model=PandasDataFrameQA,
            messages=chat,
            temperature=0,
        )
    except ValidationError as e:
        print(f"An error occurred: {e}")
        return PandasDataFrameQA(code="", cannot_compute=True)

In [9]:
# Generate a pandas snippet for every insight and store it next to the insight that
# was registered in data_dict under the same observation id.
for insight in extracted_insights.insights:
    print(f"{insight.observation=}")
    pandasqa = get_code(insight)
    print(f"{pandasqa.code=}")
    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    data_dict[string_to_id(insight.observation)]["code"] = pandasqa.model_dump()

insight.observation='Consistent long runs with progressive increase in duration.'
pandasqa.code="# Convert the 'date' column to datetime format\ndf['date'] = pd.to_datetime(df['date'])\n\n# Sort the dataframe by date\ndf = df.sort_values(by='date')\n\n# Calculate the average distance and moving time per week\ndf['week'] = df['date'].dt.isocalendar().week\nweekly_stats = df.groupby('week').agg({'distance_meters': 'mean', 'moving_time_seconds': 'mean'}).reset_index()\n\n# Print the weekly statistics to observe the trend\nprint(weekly_stats)"
insight.observation='Incorporation of low-intensity runs (fundamental endurance).'
pandasqa.code="# To understand if there is an incorporation of low-intensity runs, we can look at the average and max heart rates.\n# Typically, low-intensity runs have a lower average and max heart rate.\n\n# Let's calculate the average of the average_heartrate and max_heartrate\naverage_heartrate_mean = df['average_heartrate'].mean()\nmax_heartrate_mean = df['max_hea



An error occurred: 1 validation error for PandasDataFrameQA
code
  Value error, Invalid code: agg function failed [how->mean,dtype->object] [type=value_error, input_value='import pandas as pd\n\n#...)\nprint(average_stats)', input_type=str]
    For further information visit https://errors.pydantic.dev/2.7/v/value_error
pandasqa.code=''
insight.observation='Gradual increase in interval training intensity.'
pandasqa.code="# To understand if there is a gradual increase in interval training intensity, we can look at the average and maximum heartrate over time.\n# We will also check if there is an improvement in speed and running economy by looking at the average speed over time.\n\n# Convert the 'date' column to datetime format for easier sorting and analysis\ndf['date'] = pd.to_datetime(df['date'])\n\n# Sort the dataframe by date\ndf = df.sort_values(by='date')\n\n# Print the sorted dataframe to check the order\nprint(df)\n\n# Calculate the correlation between date and average_heartrate, 

In [10]:
def build_feedback_prompt(
    observation: str, outcome: str, code: PandasDataFrameQA, user_data: pandas.DataFrame
) -> str:
    """Assemble the coaching prompt for one insight.

    The prompt quotes the observation and outcome, shows the generated snippet,
    and embeds the text the snippet prints when executed on ``user_data``.

    Note:
        Calling this function executes ``code`` via ``code.execute``.
    """
    snippet = code.code
    snippet_output = code.execute(df=user_data)

    prompt = f"""
You are a very experienced running coach.
Your research has lead to the conclusion that the following observation:

'{observation}'

Leads to the following outcome:

'{outcome}'

When investigating this, you ran the following code on the athletes's data:

```python
{snippet}
```

Which, when executed, produced the following output:

```
{snippet_output}
```

What recommendation would you give the athlete?
    """.strip()
    return prompt

In [11]:
# Structured response for the feedback call: the coach's advice plus a self-assessed
# relevance label.
# NOTE(review): `#` comments instead of a docstring so the schema sent to the LLM is
# unchanged; the field name `usefullness` is part of that schema and is left as is.
class TrainingFeedback(BaseModel):
    feedback: str = Field(
        description="The feedback to the user, given the observation and outcome."
    )
    usefullness: Literal["useful", "irrelevant"] = Field(
        description="Whether the feedback is useful or not."
    )


def give_feedback_to_user(prompt: str) -> TrainingFeedback:
    """Send ``prompt`` to gpt-4o as the only (system) message and return the parsed feedback."""
    system_turn = {"role": "system", "content": prompt}
    return i_client.chat.completions.create(
        model="gpt-4o",
        response_model=TrainingFeedback,
        messages=[system_turn],
        temperature=0,
    )

In [13]:
# For every stored insight, build the coaching prompt from its generated snippet and
# ask the LLM for feedback.
for item, data in data_dict.items():

    observation = data["observation"]
    outcome = data["outcome"]

    # Skip insights that have no runnable snippet (never generated, or flagged
    # cannot_compute after a validation failure). Asking for feedback on empty code
    # and empty output only produces an ungrounded answer and wastes an API call.
    code_payload = data.get("code")
    if code_payload is None or code_payload.get("cannot_compute"):
        print(f"Skipping {observation!r}: no runnable code was generated.")
        print("====")
        continue

    code = PandasDataFrameQA(**code_payload)
    prompt = build_feedback_prompt(observation, outcome, code, running_activities)
    feedback = give_feedback_to_user(prompt)

    print("Prompt:")
    print(prompt)

    print("Feedback:")
    # model_dump() is the pydantic v2 replacement for the deprecated .dict().
    print(feedback.model_dump())

    print("====")

Prompt:
You are a very experienced running coach.
Your research has lead to the conclusion that the following observation:

'Consistent long runs with progressive increase in duration.'

Leads to the following outcome:

'Helps build endurance and prepares the body for the marathon distance.'

When investigating this, you ran the following code on the athletes's data:

```python
# Convert the 'date' column to datetime format
df['date'] = pd.to_datetime(df['date'])

# Sort the dataframe by date
df = df.sort_values(by='date')

# Calculate the average distance and moving time per week
df['week'] = df['date'].dt.isocalendar().week
weekly_stats = df.groupby('week').agg({'distance_meters': 'mean', 'moving_time_seconds': 'mean'}).reset_index()

# Print the weekly statistics to observe the trend
print(weekly_stats)
```

Which, when executed, produced the following output:

```
    week  distance_meters  moving_time_seconds
0      2      7097.900000          2265.666667
1      3      7333.457143

In [14]:
# Inspect the accumulated insights and their generated code.
data_dict

{'5feea5c7455e3c79cc759ea26989a87e': {'observation': 'Consistent long runs with progressive increase in duration.',
  'outcome': 'Helps build endurance and prepares the body for the marathon distance.',
  'data_fields': [<DataField.DISTANCE_METERS: 'distance_meters'>,
   <DataField.DATE: 'date'>],
  'code': {'code': "# Convert the 'date' column to datetime format\ndf['date'] = pd.to_datetime(df['date'])\n\n# Sort the dataframe by date\ndf = df.sort_values(by='date')\n\n# Calculate the average distance and moving time per week\ndf['week'] = df['date'].dt.isocalendar().week\nweekly_stats = df.groupby('week').agg({'distance_meters': 'mean', 'moving_time_seconds': 'mean'}).reset_index()\n\n# Print the weekly statistics to observe the trend\nprint(weekly_stats)",
   'cannot_compute': False}},
 '6652b7fcb26a8536dff0eef3a18fd11d': {'observation': 'Incorporation of low-intensity runs (fundamental endurance).',
  'outcome': 'Improves aerobic capacity and aids in recovery.',
  'data_fields': [<D