In [7]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import jupyter_black
import instructor
from pydantic import BaseModel, Field, field_validator
import os
import sys
from openai import OpenAI
from enum import Enum
import pandas
import pandas as pd
import io
from hashlib import md5
from typing import Literal

data_dict = {}

jupyter_black.load()

In [8]:
running_activities = pandas.read_csv("../static/dummy/running_activities.csv")
print(running_activities.shape)

running_activities.rename(
    columns={
        "distance": "distance_meters",
        "moving_time": "moving_time_seconds",
        "start_date_local": "date",
        "average_speed": "average_speed_meters_per_second",
    },
    inplace=True,
)

running_activities.head(10)

(124, 8)


Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
5,Morning Run,4994.0,1801,Run,2024-01-08T07:41:12Z,2.773,124.4,144.0
6,Morning Run,5002.4,1806,Run,2024-01-09T08:32:21Z,2.77,124.8,133.0
7,Evening Run,5971.8,1920,Run,2024-01-12T20:10:16Z,3.11,167.2,182.0
8,Afternoon Run,8005.5,2593,Run,2024-01-13T15:02:48Z,3.087,152.2,181.0
9,Lunch Run,7316.4,2284,Run,2024-01-14T11:15:14Z,3.203,152.3,170.0


In [9]:
formatter = TextFormatter()
# video_id = "3HyQ7Q-XVKA"  # runwise podcast french
# video_id = "2_rfOfLYpXM"  # planted runner english
# video_id = "FhOr5RLt5HI"  # prep running addict french
# video_id = "CtyIAzfdBUI"
# video_id = "kBo-q4DwqPM"
video_id = "59aEpVvRXxI"


def fetch_transcript(video_id: str, language_out: str = "en"):
    available_languages = [
        t.language_code for t in YouTubeTranscriptApi.list_transcripts(video_id)
    ]
    if language_out in available_languages:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    else:
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(available_languages[0:1])
            .translate(language_out)
            .fetch()
        )

    return formatter.format_transcript(transcript)


transcript = fetch_transcript(video_id=video_id)
print(transcript[0:240], "....")

on October 17, 2021 in front of more than a hundred
thousand spectators Domingo India and
the best known streamer on Twitch
ran the Paris marathon in three
hours 24 minutes and 25 seconds
sporting events in which I had
the honor of particip ....


In [10]:
running_activities

Unnamed: 0,name,distance_meters,moving_time_seconds,sport_type,date,average_speed_meters_per_second,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
...,...,...,...,...,...,...,...,...
119,Morning Run,5056.8,1683,Run,2024-06-24T07:17:39Z,3.005,140.0,149.0
120,Morning Run,8535.4,2862,Run,2024-06-25T07:31:22Z,2.982,144.7,171.0
121,Morning Run,5041.1,1682,Run,2024-06-26T08:01:43Z,2.997,145.1,156.0
122,Morning Run,5073.2,1676,Run,2024-06-27T10:12:31Z,3.027,148.9,162.0


In [11]:
class DataField(Enum):
    DISTANCE_METERS = "distance_meters"
    MOVING_TIME_SECONDS = "moving_time_seconds"
    DATE = "date"
    AVERAGE_SPEED_METERS_PER_SECOND = "average_speed_meters_per_second"
    MAX_HEARTRATE = "max_heartrate"
    AVERAGE_HEARTRATE = "average_heartrate"


class Insight(BaseModel):
    observation: str = Field(
        description="The metric observed in the data over a period of time"
    )
    outcome: str = Field(
        description="How the metric observed affects the performance of the athlete"
    )
    data_fields: list[DataField] = Field(
        description="The data fields required to compute the observation"
    )


class ExtractedInsights(BaseModel):
    insights: list[Insight]


system_message = f"""
You are a very experienced running coach.
You are conducting research for your athletes so that they can improve their running performance.
Your goal is to give valuable feedback to an athlete when looking at their running data.
For each athlete you have a table with all their runs year-to-date. For each run, you have the following information: {[e.value for e in DataField]}

You will be given a piece of content related to running. 
Your goal is to analyse the piece of content, and extract valuable insights that can help analyse the current running form of the athlete.
For each insight, you need to ensure it can be calculated with the data you have available. 
The insights should be easy to compute and understand to your athletes.
Only extract insights that are relevant in the context of looking at the last year of running data of an athlete. (You don't know ANY other details)
""".strip()

user_message = f"""
Content:
===
{transcript}
===
""".strip()

In [12]:
i_client = instructor.from_openai(OpenAI())

extracted_insights = i_client.chat.completions.create(
    model="gpt-4o",
    response_model=ExtractedInsights,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    temperature=0,
)

In [13]:
def string_to_id(string_: str) -> str:
    return md5(string_.encode()).hexdigest()


for insight in extracted_insights.insights:
    print(f"{insight.observation=}")
    print(f"{insight.outcome=}")
    data_dict[string_to_id(insight.observation)] = insight.dict()
    print("=====")

insight.observation='Consistent long runs with progressive increase in duration.'
insight.outcome='Helps build endurance and prepares the body for the marathon distance.'
=====
insight.observation='Incorporation of low-intensity runs (fundamental endurance).'
insight.outcome='Improves aerobic capacity and reduces the risk of injury.'
=====
insight.observation='High-intensity interval training (splits) with progressive increase in intensity.'
insight.outcome='Enhances speed and cardiovascular fitness.'
=====
insight.observation='Four training sessions per week including endurance, interval, and long runs.'
insight.outcome='Balanced training approach that targets different aspects of running fitness.'
=====
insight.observation='Avoidance of major injuries despite aggressive training plan.'
insight.outcome='Indicates effective recovery and injury prevention strategies.'
=====
insight.observation='Significant improvement in running performance over a short period.'
insight.outcome='Demonst

In [15]:
class PandasDataFrameQA(BaseModel):
    code: str = Field(description="The code to execute")
    cannot_compute: bool = Field(default=False)

    def execute(self, df: pandas.DataFrame):
        output = io.StringIO()
        original_stdout = sys.stdout
        sys.stdout = output
        try:
            exec(self.code, globals(), {"df": df.copy()})
        finally:
            sys.stdout = original_stdout

        return output.getvalue()

    @field_validator("code")
    @classmethod
    def validate_code(cls, v: str):
        try:
            original_stdout = sys.stdout
            sys.stdout = open(os.devnull, "w")
            exec(v, globals(), {"df": running_activities.copy()})
            sys.stdout.close()
            sys.stdout = original_stdout
            return v
        except Exception as e:
            raise ValueError(f"Invalid code: {e}")


def get_code(insight: Insight) -> PandasDataFrameQA:
    system_prompt = f"""
    You are working with a pandas dataframe in Python.
    The name of the dataframe is `df`.
    This is the result of `print(df.head())`:
    {running_activities.head()}

    Follow these instructions:
    1. Write some code that will help you answer the user question.
    2. The code should include print statements where appropriate.
    3. The code will be called with the exec() function.
    4. Do not include any other imports other than the pandas library.
    5. Do not include any plots, graphs, or other visualizations.
    6. Print the necessary variables so that once read, we can answer the user question.
    """.strip()

    user_prompt = f""" 
    I would like to understand if we are seeing the following: {insight.observation}
    My goal is to understand if it will lead to the following outcome: {insight.outcome}
    """.strip()

    code = i_client.chat.completions.create(
        model="gpt-4o",
        response_model=PandasDataFrameQA,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
    )
    return code

In [16]:
for insight in extracted_insights.insights:
    data_dict[string_to_id(insight.observation)]["code"] = get_code(
        insight=insight
    ).dict()

In [21]:
def build_feedback_prompt(
    observation: str, outcome: str, code: str, user_data: pandas.DataFrame
):

    return f"""
    You are a very experienced running coach.
    Your research has lead to the conclusion that the following observation:

    '{observation}'

    Leads to the following outcome:

    '{outcome}'

    When investigating this, you ran the following code on the athletes's data:

    ```python
    {code.code}
    ```

    Which, when executed, produced the following output:

    ```
    {code.execute(df=user_data)}
    ```

    What recommendation would you give the athlete?
    """.strip()

In [25]:
for element in data_dict:
    print(data_dict[element])
    print(
        build_feedback_prompt(
            observation=data_dict[element]["observation"],
            outcome=data_dict[element]["outcome"],
            code=data_dict[element]["code"]["code"],
            user_data=running_activities,
        )
    )
    print("====")

{'observation': 'Consistent long runs with progressive increase in duration.', 'outcome': 'Helps build endurance and prepares the body for the marathon distance.', 'data_fields': [<DataField.DISTANCE_METERS: 'distance_meters'>, <DataField.DATE: 'date'>], 'code': {'code': "# Convert the 'date' column to datetime format for easier manipulation\ndf['date'] = pd.to_datetime(df['date'])\n\n# Sort the dataframe by date to analyze the progression over time\ndf_sorted = df.sort_values(by='date')\n\n# Print the sorted dataframe to understand the progression\nprint(df_sorted[['date', 'distance_meters', 'moving_time_seconds']])\n\n# Calculate the difference in distance and moving time between consecutive runs\ndf_sorted['distance_diff'] = df_sorted['distance_meters'].diff()\ndf_sorted['time_diff'] = df_sorted['moving_time_seconds'].diff()\n\n# Print the differences to see if there is a progressive increase\nprint(df_sorted[['date', 'distance_meters', 'moving_time_seconds', 'distance_diff', 'time_

AttributeError: 'str' object has no attribute 'code'

In [13]:


class TrainingFeedback(BaseModel):
    feedback: str = Field(
        description="The feedback to the user, given the observation and outcome."
    )
    usefullness: Literal["useful", "irrelevant"] = Field(
        description="Whether the feedback is useful or not."
    )


feedback = i_client.chat.completions.create(
    # model="gpt-3.5-turbo",
    model="gpt-4o",
    response_model=TrainingFeedback,
    messages=[
        {"role": "system", "content": system_prompt},
    ],
    temperature=0,
)



In [15]:
print()

"You are a very experienced running coach.\nYour research has lead to the conclusion that the following observation:\n\n'Increased weekly training volume'\n\nLeads to the following outcome:\n\n'Improved endurance and overall performance'\n\nWhen investigating this, you ran the following code on the athletes's data:\n\n```python\nimport pandas as pd\n\ndf['date'] = pd.to_datetime(df['date'])\ndf.set_index('date', inplace=True)\ndf['week'] = df.index.to_period('W')\nweekly_volume = df.groupby('week')['distance_meters'].sum()\nprint(weekly_volume)\n```\n\nWhich, when executed, produced the following output:\n\n```\nweek\n2024-01-01/2024-01-07    29040.5\n2024-01-08/2024-01-14    31290.1\n2024-01-15/2024-01-21    51334.2\n2024-01-22/2024-01-28    33955.7\n2024-01-29/2024-02-04    55945.0\n2024-02-05/2024-02-11    54177.3\n2024-02-12/2024-02-18    66538.0\n2024-02-19/2024-02-25    68761.0\n2024-02-26/2024-03-03    60003.4\n2024-03-04/2024-03-10    64366.2\n2024-03-11/2024-03-17    46182.0\n

In [14]:
feedback.dict()

{'feedback': 'Based on the data, it appears that your weekly training volume has fluctuated significantly over the past few months. To improve your endurance and overall performance, I recommend aiming for a more consistent weekly training volume. Gradually increase your mileage to avoid injury and ensure steady progress. For example, try to maintain your weekly distance within a 10-15% range of your target volume, and avoid sudden spikes or drops in your training load.',
 'usefullness': 'useful'}

In [11]:
print(feedback.feedback)

Based on the data, it appears that your weekly training volume has varied significantly over the past few months. To improve your endurance and overall performance, I recommend aiming for a more consistent weekly training volume. Gradually increase your weekly distance, ensuring you do not have drastic drops or spikes, as these can lead to injury or burnout. Aim for a steady increase of about 10% per week, and include rest weeks with reduced volume to allow for recovery.
