In [33]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import jupyter_black
import instructor
from pydantic import BaseModel, Field
from openai import OpenAI
from enum import Enum
import pandas

jupyter_black.load()

In [36]:
running_activities = pandas.read_csv("../static/dummy/running_activities.csv")
print(running_activities.shape)

running_activities.head(10)

(124, 8)


Unnamed: 0,name,distance,moving_time,sport_type,start_date_local,average_speed,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
5,Morning Run,4994.0,1801,Run,2024-01-08T07:41:12Z,2.773,124.4,144.0
6,Morning Run,5002.4,1806,Run,2024-01-09T08:32:21Z,2.77,124.8,133.0
7,Evening Run,5971.8,1920,Run,2024-01-12T20:10:16Z,3.11,167.2,182.0
8,Afternoon Run,8005.5,2593,Run,2024-01-13T15:02:48Z,3.087,152.2,181.0
9,Lunch Run,7316.4,2284,Run,2024-01-14T11:15:14Z,3.203,152.3,170.0


In [86]:
formatter = TextFormatter()
# video_id = "3HyQ7Q-XVKA"  # runwise podcast french
# video_id = "2_rfOfLYpXM"  # planted runner english
video_id = "FhOr5RLt5HI"  # prep running addict french


def fetch_transcript(video_id: str, language_out: str = "en"):
    available_languages = [
        t.language_code for t in YouTubeTranscriptApi.list_transcripts(video_id)
    ]
    if language_out in available_languages:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    else:
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(available_languages[0:1])
            .translate(language_out)
            .fetch()
        )

    return formatter.format_transcript(transcript)
    # return transcript


transcript = fetch_transcript(video_id=video_id)
print(transcript[0:240], "....")

Nicolas, better known as the
running addict, recently broke his
marathon record going from 2h37 which
was already a good time to 2h33 how
did he do what
the training of someone who runs in
2h33 looks like in detail  on marathon what has he  ....


In [87]:
running_activities

Unnamed: 0,name,distance,moving_time,sport_type,start_date_local,average_speed,average_heartrate,max_heartrate
0,Afternoon Run,5000.0,1802,Run,2024-01-01T13:29:04Z,2.775,141.9,151.0
1,Morning Run,4943.9,1805,Run,2024-01-03T07:52:38Z,2.739,127.4,139.0
2,Morning Run,6031.7,2037,Run,2024-01-04T07:51:24Z,2.961,161.1,182.0
3,Afternoon Run,6051.1,2303,Run,2024-01-05T16:45:20Z,2.627,148.5,168.0
4,Lunch Run,7013.8,2629,Run,2024-01-07T11:47:59Z,2.668,151.3,166.0
...,...,...,...,...,...,...,...,...
119,Morning Run,5056.8,1683,Run,2024-06-24T07:17:39Z,3.005,140.0,149.0
120,Morning Run,8535.4,2862,Run,2024-06-25T07:31:22Z,2.982,144.7,171.0
121,Morning Run,5041.1,1682,Run,2024-06-26T08:01:43Z,2.997,145.1,156.0
122,Morning Run,5073.2,1676,Run,2024-06-27T10:12:31Z,3.027,148.9,162.0


In [88]:
class DataField(Enum):
    DISTANCE_METERS = "distance_meters"
    MOVING_TIME_SECONDS = "moving_time_seconds"
    DATE = "date"
    AVERAGE_SPEED_METERS_PER_SECOND = "average_speed_meters_per_second"
    MAX_HEARTRATE = "max_heartrate"
    AVERAGE_HEARTRATE = "average_heartrate"


class Insight(BaseModel):
    text: str = Field(description="The definition of the insight")
    data_fields: list[DataField] = Field(
        description="A list of the datafields that the insight is based on"
    )


class ExtractedInsights(BaseModel):
    insights: list[Insight]


system_message = f"""
You are a very experienced running coach.
You are conducting research for your athletes so that they can improve their running performance.
Your goal is to give valuable feedback to an athlete when looking at their running data.
For each athlete you have a table with all their runs for the past year. For each run, you have the following information: {[e.value for e in DataField]}

You will be given a piece of content related to running. 
Your goal is to analyse the piece of content, and extract valuable insights that can help analyse the current running form of the athlete.
For each insight, you need to ensure it can be calculated with the data you have available. 
The insights should be easy to compute and understand to your athletes.

""".strip()

user_message = f"""
Content:
===
{transcript}
===
""".strip()

In [89]:
client = instructor.from_openai(OpenAI())

insights = client.chat.completions.create(
    model="gpt-4o",
    response_model=ExtractedInsights,
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ],
    temperature=0,
)

In [90]:
for i in insights.insights:
    print(i.text)
    print(i.data_fields)
    print("====")

Weekly training volume and duration are crucial metrics to track for marathon preparation. Higher weekly volumes and longer durations, especially in the final weeks leading up to the marathon, can indicate better preparedness.
[<DataField.DISTANCE_METERS: 'distance_meters'>, <DataField.MOVING_TIME_SECONDS: 'moving_time_seconds'>, <DataField.DATE: 'date'>]
====
Tracking the number of long runs (over 2 hours) can help identify improvements in endurance. An increase in the number of long runs over time can indicate better endurance and preparedness for a marathon.
[<DataField.DISTANCE_METERS: 'distance_meters'>, <DataField.MOVING_TIME_SECONDS: 'moving_time_seconds'>, <DataField.DATE: 'date'>]
====
Monitoring the average and maximum heart rate during runs can help assess the athlete's cardiovascular fitness and recovery. Consistently high heart rates may indicate overtraining or insufficient recovery.
[<DataField.AVERAGE_HEARTRATE: 'average_heartrate'>, <DataField.MAX_HEARTRATE: 'max_heart

In [102]:
running_activities.head()

Unnamed: 0,name,distance,moving_time,sport_type,start_date_local,average_speed,average_heartrate,max_heartrate,date
0,Afternoon Run,5000.0,1802,Run,2024-01-01 13:29:04+00:00,2.775,141.9,151.0,2024-01-01 13:29:04+00:00
1,Morning Run,4943.9,1805,Run,2024-01-03 07:52:38+00:00,2.739,127.4,139.0,2024-01-03 07:52:38+00:00
2,Morning Run,6031.7,2037,Run,2024-01-04 07:51:24+00:00,2.961,161.1,182.0,2024-01-04 07:51:24+00:00
3,Afternoon Run,6051.1,2303,Run,2024-01-05 16:45:20+00:00,2.627,148.5,168.0,2024-01-05 16:45:20+00:00
4,Lunch Run,7013.8,2629,Run,2024-01-07 11:47:59+00:00,2.668,151.3,166.0,2024-01-07 11:47:59+00:00


In [103]:
def analyze_recovery_periods(df):
    # Convert the 'date' column to datetime
    df["start_date_local"] = pandas.to_datetime(df["start_date_local"])

    # Sort the dataframe by date
    df = df.sort_values("start_date_local")

    # Calculate the intervals between consecutive runs
    df["interval_days"] = df["start_date_local"].diff().dt.days

    # Count the number of recovery days (days with no runs)
    recovery_days = df["interval_days"].dropna().sum() - (len(df) - 1)

    # Calculate the average recovery period
    avg_recovery_period = df["interval_days"].mean()

    # Output the results
    print(f"Total recovery days: {recovery_days}")
    print(f"Average recovery period: {avg_recovery_period:.2f} days")

    return recovery_days, avg_recovery_period


analyze_recovery_periods(running_activities)

Total recovery days: -7.0
Average recovery period: 0.94 days


(np.float64(-7.0), np.float64(0.943089430894309))

In [97]:
import ast


def run(code):
    tree = ast.parse(code)
    last_node = tree.body[-1] if tree.body else None

    # If the last node is an expression, modify the AST to capture the result
    if isinstance(last_node, ast.Expr):
        tgts = [ast.Name(id="_result", ctx=ast.Store())]
        assign = ast.Assign(targets=tgts, value=last_node.value)
        tree.body[-1] = ast.fix_missing_locations(assign)

    ns = {}
    exec(compile(tree, filename="<ast>", mode="exec"), ns)
    return ns.get("_result", None)


def python(code:str):
    "Return result of executing `code` using python. If execution not permitted, returns `#FAIL#`"
    go = input(f'Proceed with execution?\n```\n{code}\n```\n')
    if go.lower()!='y': return '#FAIL#'
    return run(code)


run(
    """
a=1
b=2
a+b
"""
)

3