In [1]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import jupyter_black
import instructor
from pydantic import BaseModel, Field
from openai import OpenAI
from enum import Enum

jupyter_black.load()

In [2]:
formatter = TextFormatter()
# video_id = "3HyQ7Q-XVKA"  # runwise podcast french
# video_id = "2_rfOfLYpXM"  # planted runner english
video_id = "FhOr5RLt5HI"  # prep running addict french


def fetch_transcript(video_id: str, language_out: str = "en"):
    available_languages = [
        t.language_code for t in YouTubeTranscriptApi.list_transcripts(video_id)
    ]
    if language_out in available_languages:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    else:
        transcript = (
            YouTubeTranscriptApi.list_transcripts(video_id)
            .find_transcript(available_languages[0:1])
            .translate(language_out)
            .fetch()
        )

    return formatter.format_transcript(transcript)
    # return transcript


transcript = fetch_transcript(video_id=video_id)
print(transcript[0:240], "....")

Nicolas, better known as the
running addict, recently broke his
marathon record going from 2h37 which
was already a good time to 2h33 how
did he do what
the training of someone who runs in
2h33 looks like in detail  on marathon what has he  ....


In [3]:
class DataField(Enum):
    HEART_RATE = "HEART_RATE"
    PACE = "PACE"
    DISTANCE = "DISTANCE"
    DATE = "DATE"


class Insight(BaseModel):
    text: str = Field(description="The text of the insight")
    data_fields: list[DataField] = Field(
        description="The data fields required to compute the insight for a given athlete"
    )


class Insights(BaseModel):
    insights: list[Insight]


client = instructor.from_openai(OpenAI())

instruction = f"""
You are a running coach conducting research for your athletes. 
You have data regarding every one of your athletes (running activities, pace, heart rate, etc).
Your goal is to analyze the following piece of text, and retrieve a list of insights.
To every insight, you must specify the data fields required. 

Below is the piece of text you must analyze:
===
{transcript}
===
""".strip()

insights = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Insights,
    messages=[{"role": "user", "content": instruction}],
)

In [4]:
for i in insights.insights:
    print(i.text)
    print(i.data_fields)
    print("====")

Nicolas's marathon preparation focuses on gradual increase in average weekly volume and density of long outings to target a specific muscular limiting factor.
[<DataField.DISTANCE: 'DISTANCE'>, <DataField.DISTANCE: 'DISTANCE'>, <DataField.DISTANCE: 'DISTANCE'>]
====
The use of Sweet Spot cycle in the preparation allows for intense stress without lasting too long, providing an alternative for targeting the limiting factor.
[<DataField.DISTANCE: 'DISTANCE'>, <DataField.DISTANCE: 'DISTANCE'>, <DataField.DISTANCE: 'DISTANCE'>]
====
The management of fatigue in athletes includes progressive session intensities, session format adjustments, and prioritizing duration over intensity when necessary.
[<DataField.PACE: 'PACE'>, <DataField.PACE: 'PACE'>, <DataField.PACE: 'PACE'>]
====
The analysis of performance standards throughout the preparation helps in projecting probabilities of success and adjusting training goals accordingly.
[<DataField.PACE: 'PACE'>, <DataField.PACE: 'PACE'>, <DataField.P