In [1]:
import contextlib
import io
import os
import sys
from enum import Enum
from pprint import pprint
from typing import Optional

import diskcache
import instructor
import jupyter_black
import pandas
from openai import OpenAI
from pydantic import BaseModel, Field, field_validator
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# On-disk cache rooted at ./.cache.
# NOTE(review): no other cell shown in this notebook reads from or writes to `cache`.
cache = diskcache.Cache("./.cache")

# Load the jupyter_black extension for this notebook session.
jupyter_black.load()

In [2]:
# Load the dummy activity export and give the measured columns unit-explicit
# names in a single expression (no in-place mutation of the loaded frame).
running_activities = pandas.read_csv(
    "../static/dummy/running_activities.csv"
).rename(
    columns={
        "distance": "distance_meters",
        "moving_time": "moving_time_seconds",
        "start_date_local": "date",
        "average_speed": "average_speed_meters_per_second",
    }
)
# Renaming columns does not change the shape, so this prints the same
# (rows, columns) tuple as the freshly loaded frame.
print(running_activities.shape)

running_activities.head(10)

(124, 8)


Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
5,Morning Run,4994.0,1801,Run,2024-01-08T07:41:12Z,2.773,124.4,144.0
6,Morning Run,5002.4,1806,Run,2024-01-09T08:32:21Z,2.77,124.8,133.0
7,Evening Run,5971.8,1920,Run,2024-01-12T20:10:16Z,3.11,167.2,182.0
8,Afternoon Run,8005.5,2593,Run,2024-01-13T15:02:48Z,3.087,152.2,181.0
9,Lunch Run,7316.4,2284,Run,2024-01-14T11:15:14Z,3.203,152.3,170.0


In [3]:
# Module-level formatter; fetch_transcript calls formatter.format_transcript(...)
# to turn the fetched transcript into a single text string.
formatter = TextFormatter()
# Other candidate video ids, kept commented out; only the last assignment is active.
# video_id = "3HyQ7Q-XVKA"  # runwise podcast french
# video_id = "2_rfOfLYpXM"  # planted runner english
# video_id = "FhOr5RLt5HI"  # prep running addict french
video_id = "CtyIAzfdBUI"


def fetch_transcript(video_id: str, language_out: str = "en") -> str:
    """Return the transcript of a YouTube video as plain text in ``language_out``.

    If a transcript in ``language_out`` exists it is fetched directly; otherwise
    the first listed transcript is translated to ``language_out``.

    Args:
        video_id: The YouTube video identifier.
        language_out: Language code of the desired transcript text.

    Returns:
        The transcript formatted by the module-level ``formatter``.

    Raises:
        Whatever ``youtube_transcript_api`` raises when the video has no
        transcripts or the requested translation is unavailable; errors are
        not caught here.
    """
    # List the transcripts once and reuse the result for both the language
    # check and the fetch, instead of issuing the listing request twice.
    transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    available_languages = [t.language_code for t in transcript_list]

    if language_out in available_languages:
        # Ask for the requested language explicitly. The previous
        # `get_transcript(video_id)` call relied on the library default
        # language and therefore ignored `language_out`.
        transcript = transcript_list.find_transcript([language_out]).fetch()
    else:
        # Fall back to translating the first listed transcript; the [0:1]
        # slice keeps this an (at most one element) list as the API expects.
        transcript = (
            transcript_list.find_transcript(available_languages[0:1])
            .translate(language_out)
            .fetch()
        )

    return formatter.format_transcript(transcript)


# Fetch the transcript for the selected video, then preview its first 240 characters.
transcript = fetch_transcript(video_id=video_id)
print(transcript[:240], "....")

hello machines 2nd FAQ we did one
in 2023 so we're doing one in 2024 we're
trying to select a few
different questions to talk about
training, nutrition, more
personal stuff, well, not so personal about the future,
drunwise, etc., sorry  if  ....


In [4]:
# Display the renamed frame; the rendered output elides the middle rows.
running_activities

Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
...,...,...,...,...,...,...,...,...
119,Morning Run,5056.8,1683,Run,2024-06-24T07:17:39Z,3.005,140.0,149.0
120,Morning Run,8535.4,2862,Run,2024-06-25T07:31:22Z,2.982,144.7,171.0
121,Morning Run,5041.1,1682,Run,2024-06-26T08:01:43Z,2.997,145.1,156.0
122,Morning Run,5073.2,1676,Run,2024-06-27T10:12:31Z,3.027,148.9,162.0


In [5]:
# Columns of the activity table that an insight may reference. Each value is the
# column name as it appears in `running_activities` after the rename step.
# Documented with `#` comments (not a docstring) so the class object itself is
# left exactly as it was.
class DataField(Enum):
    DISTANCE_METERS = "distance_meters"
    MOVING_TIME_SECONDS = "moving_time_seconds"
    DATE = "date"
    AVERAGE_SPEED_METERS_PER_SECOND = "average_speed_meters_per_second"
    MAX_HEARTRATE = "max_heartrate"
    AVERAGE_HEARTRATE = "average_heartrate"


# One insight extracted from a piece of content: what was observed in training,
# what it leads to, and which activity columns are needed to compute it.
# NOTE(review): "athelete" in the data_fields description is a typo; it is left
# untouched here because the description is a runtime string (presumably part of
# the schema handed to the model — confirm before editing).
class Insight(BaseModel):
    observation: str = Field(description="The characteristic observed during training")
    outcome: str = Field(description="The outcome of the insight in the performance")
    data_fields: list[DataField] = Field(
        description="A list of the data fields required to calculate the observation for the athelete (only the ones required)"
    )


# Top-level response model for the extraction call: a list of Insight objects.
class ExtractedInsights(BaseModel):
    insights: list[Insight]


# System prompt for insight extraction. The list of available columns is
# interpolated from DataField so the prompt and the response schema stay in sync.
# NOTE(review): "Priotize" below is a typo inside the runtime prompt text; left as is.
system_message = f"""
You are a very experienced running coach.
You are conducting research for your athletes so that they can improve their running performance.
Your goal is to give valuable feedback to an athlete when looking at their running data.
For each athlete you have a table with all their runs for the past year. For each run, you have the following information: {[e.value for e in DataField]}

You will be given a piece of content related to running. 
Your goal is to analyse the piece of content, and extract valuable insights that can help analyse the current running form of the athlete.
For each insight, you need to ensure it can be calculated with the data you have available. 
Priotize insights that can be easily calculated but are very valuable for the athlete.
""".strip()

# User message: the full transcript text fetched earlier, fenced by === markers.
user_message = f"""
Content:
===
{transcript}
===
""".strip()

In [6]:
# Wrap the OpenAI client with instructor; the `response_model` argument below
# names the pydantic model the completion is returned as.
i_client = instructor.from_openai(OpenAI())

# Extract insights from the transcript.
insights = i_client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=[
        dict(role="system", content=system_message),
        dict(role="user", content=user_message),
    ],
    response_model=ExtractedInsights,
)

In [7]:
# `.dict()` is deprecated in pydantic v2 (which this notebook uses, see
# `field_validator`); `.model_dump()` is the supported equivalent.
pprint(insights.model_dump())

{'insights': [{'data_fields': [<DataField.DISTANCE_METERS: 'distance_meters'>,
                               <DataField.MOVING_TIME_SECONDS: 'moving_time_seconds'>,
                               <DataField.DATE: 'date'>],
               'observation': 'Training Volume',
               'outcome': 'Maintaining a high training volume is crucial for '
                          'marathon performance and overall endurance.'},
              {'data_fields': [<DataField.AVERAGE_SPEED_METERS_PER_SECOND: 'average_speed_meters_per_second'>,
                               <DataField.DATE: 'date'>],
               'observation': 'Training Intensity',
               'outcome': 'Balancing high-intensity sessions with easy runs '
                          'can improve speed and recovery.'},
              {'data_fields': [<DataField.MAX_HEARTRATE: 'max_heartrate'>,
                               <DataField.AVERAGE_HEARTRATE: 'average_heartrate'>,
                               <DataField.DATE: 'date'>

In [17]:
# System prompt for the code-generation call. It embeds the text rendering of
# `running_activities.head()` so the model sees the column names and sample rows.
# NOTE(review): the name `system_prompt` is reassigned by a later cell with a
# different prompt, so the value depends on which cell ran last.
system_prompt = f"""
You are working with a pandas dataframe in Python.
The name of the dataframe is `df`.
This is the result of `print(df.head())`:
{running_activities.head()}

Follow these instructions:
1. Write some code that will help you answer the user question.
2. The code should include print statements where appropriate.
3. The code will be called with the exec() function.
4. Do not include any other imports other than the pandas library.
5. Do not include any plots, graphs, or other visualizations.
6. Print the necessary variables so that once read, we can answer the user question.
7. If it's not possible to answer the user question, use the cannot_compute field.
"""

# The question passed as the user message of the code-generation call.
question = "Is the runner increasing their weekly training volume?"


# Structured response holding pandas code that answers a question about `df`.
#
# Documented with `#` comments rather than a class docstring so the model class
# (and whatever schema is derived from it) is not altered by this documentation.
#
# SECURITY: both methods run model-generated code with exec() inside the kernel
# process. Nothing here sandboxes that code; only use with trusted prompts/data.
class PandasDataFrameQA(BaseModel):
    code: str = Field(description="The code to execute")
    cannot_compute: bool = Field(default=False)

    def execute(self, df: pandas.DataFrame) -> str:
        """Run ``self.code`` against a copy of ``df`` and return what it printed.

        Args:
            df: Frame exposed to the code under the name ``df``. A copy is
                passed, so the caller's frame is not modified.

        Returns:
            Everything the code wrote to stdout, as one string.

        Raises:
            Any exception raised by the executed code; stdout is restored
            before the exception propagates.
        """
        captured = io.StringIO()
        # redirect_stdout restores sys.stdout on exit, including on error.
        with contextlib.redirect_stdout(captured):
            # A single namespace (notebook globals plus `df`) is used so that
            # functions and comprehensions inside the generated code can see
            # `df`; with separate globals/locals dicts they cannot.
            exec(self.code, {**globals(), "df": df.copy()})

        return captured.getvalue()

    @field_validator("code")
    @classmethod
    def validate_code(cls, v: str) -> str:
        """Reject code that fails when run against ``running_activities``.

        The code is executed once, with stdout discarded, on a copy of the
        module-level ``running_activities`` frame.

        Args:
            v: The candidate code string.

        Returns:
            ``v`` unchanged when it runs without raising.

        Raises:
            ValueError: If executing the code raises; the original exception
                is chained as the cause.
        """
        try:
            # Both context managers unwind on failure, so the devnull handle is
            # closed and sys.stdout is restored even when the code raises.
            # Previously a failing snippet left stdout pointing at devnull.
            with open(os.devnull, "w") as sink, contextlib.redirect_stdout(sink):
                exec(v, {**globals(), "df": running_activities.copy()})
        except Exception as e:
            raise ValueError(f"Invalid code: {e}") from e
        return v


# Ask the model for pandas code answering `question`. The response is parsed
# into PandasDataFrameQA, whose `code` validator executes the snippet against
# `running_activities` during parsing.
code = i_client.chat.completions.create(
    model="gpt-4o",
    temperature=0,
    messages=[
        dict(role="system", content=system_prompt),
        dict(role="user", content=question),
    ],
    response_model=PandasDataFrameQA,
)



In [16]:
# `.dict()` is deprecated in pydantic v2; `.model_dump()` returns the same mapping.
code.model_dump()

{'code': "import pandas as pd\n\ndf['date'] = pd.to_datetime(df['date'])\ndf.set_index('date', inplace=True)\ndf['week'] = df.index.to_period('W')\nweekly_volume = df.groupby('week')['distance_meters'].sum()\nprint(weekly_volume)",
 'cannot_compute': False}

In [9]:
# Show the generated snippet first, then run it on the activities frame and
# print whatever it wrote to stdout.
print(code.code)
print(code.execute(running_activities))

import pandas as pd

df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
df['week'] = df.index.to_period('W')
weekly_volume = df.groupby('week')['distance_meters'].sum()
print(weekly_volume)
week
2024-01-01/2024-01-07    29040.5
2024-01-08/2024-01-14    31290.1
2024-01-15/2024-01-21    51334.2
2024-01-22/2024-01-28    33955.7
2024-01-29/2024-02-04    55945.0
2024-02-05/2024-02-11    54177.3
2024-02-12/2024-02-18    66538.0
2024-02-19/2024-02-25    68761.0
2024-02-26/2024-03-03    60003.4
2024-03-04/2024-03-10    64366.2
2024-03-11/2024-03-17    46182.0
2024-03-18/2024-03-24    45794.7
2024-03-25/2024-03-31    15070.4
2024-04-01/2024-04-07    57266.3
2024-04-08/2024-04-14    46870.9
2024-04-15/2024-04-21    59101.6
2024-04-22/2024-04-28    18421.1
2024-04-29/2024-05-05    35707.2
2024-05-06/2024-05-12    31922.3
2024-05-13/2024-05-19    31672.4
2024-05-20/2024-05-26    37659.8
2024-05-27/2024-06-02    35180.0
2024-06-03/2024-06-09    24954.2
2024-06-10/2024-06-16 



In [13]:
from typing import Literal


# Observation/outcome pair used for the feedback prompt.
# NOTE(review): these are hard-coded here rather than read from `insights`.
observation = "Increased weekly training volume"
outcome = "Improved endurance and overall performance"

# Feedback prompt: embeds the generated code and its output. Building this
# string calls `code.execute(...)`, so the generated code is executed again here.
# NOTE(review): this reassigns `system_prompt`, replacing the code-generation
# prompt defined in an earlier cell.
system_prompt = f"""
You are a very experienced running coach.
Your research has lead to the conclusion that the following observation:

'{observation}'

Leads to the following outcome:

'{outcome}'

When investigating this, you ran the following code on the athletes's data:

```python
{code.code}
```

Which, when executed, produced the following output:

```
{code.execute(df=running_activities)}
```

What recommendation would you give the athlete?
""".strip()


# Response model for the feedback call: the feedback text plus a two-valued
# usefulness label.
# NOTE(review): the field name `usefullness` is misspelled, but it is part of the
# model's interface (it appears as a key in the dumped output), so it is kept.
class TrainingFeedback(BaseModel):
    feedback: str = Field(
        description="The feedback to the user, given the observation and outcome."
    )
    usefullness: Literal["useful", "irrelevant"] = Field(
        description="Whether the feedback is useful or not."
    )


# Request the recommendation; the whole prompt is sent as one system message.
feedback = i_client.chat.completions.create(
    # model="gpt-3.5-turbo",
    model="gpt-4o",
    temperature=0,
    messages=[dict(role="system", content=system_prompt)],
    response_model=TrainingFeedback,
)



In [15]:
# A bare print() only emits a blank line; the output recorded for this cell is
# the assembled feedback prompt, so print that prompt explicitly.
print(system_prompt)

"You are a very experienced running coach.\nYour research has lead to the conclusion that the following observation:\n\n'Increased weekly training volume'\n\nLeads to the following outcome:\n\n'Improved endurance and overall performance'\n\nWhen investigating this, you ran the following code on the athletes's data:\n\n```python\nimport pandas as pd\n\ndf['date'] = pd.to_datetime(df['date'])\ndf.set_index('date', inplace=True)\ndf['week'] = df.index.to_period('W')\nweekly_volume = df.groupby('week')['distance_meters'].sum()\nprint(weekly_volume)\n```\n\nWhich, when executed, produced the following output:\n\n```\nweek\n2024-01-01/2024-01-07    29040.5\n2024-01-08/2024-01-14    31290.1\n2024-01-15/2024-01-21    51334.2\n2024-01-22/2024-01-28    33955.7\n2024-01-29/2024-02-04    55945.0\n2024-02-05/2024-02-11    54177.3\n2024-02-12/2024-02-18    66538.0\n2024-02-19/2024-02-25    68761.0\n2024-02-26/2024-03-03    60003.4\n2024-03-04/2024-03-10    64366.2\n2024-03-11/2024-03-17    46182.0\n

In [14]:
# `.dict()` is deprecated in pydantic v2; `.model_dump()` returns the same mapping.
feedback.model_dump()

{'feedback': 'Based on the data, it appears that your weekly training volume has fluctuated significantly over the past few months. To improve your endurance and overall performance, I recommend aiming for a more consistent weekly training volume. Gradually increase your mileage to avoid injury and ensure steady progress. For example, try to maintain your weekly distance within a 10-15% range of your target volume, and avoid sudden spikes or drops in your training load.',
 'usefullness': 'useful'}

In [11]:
# Print only the feedback text (without the usefulness label) for easier reading.
print(feedback.feedback)

Based on the data, it appears that your weekly training volume has varied significantly over the past few months. To improve your endurance and overall performance, I recommend aiming for a more consistent weekly training volume. Gradually increase your weekly distance, ensuring you do not have drastic drops or spikes, as these can lead to injury or burnout. Aim for a steady increase of about 10% per week, and include rest weeks with reduced volume to allow for recovery.
