In [595]:
# import pandas numpy os sys
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split


# Preprocessing
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/deancochran/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/deancochran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/deancochran/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [596]:


# The intervals are an array of objects
# Each object contains the duration and intensity of the interval

class Interval():
    duration:float = 0.0
    intensity:float = 0.0
    def __init__(self, obj):
        self.duration = float(obj['duration']) 
        self.intensity = float(obj['intensity'])
        
def eval_intervals(intervals:str):
    parsed = eval(intervals)
    for i, item in enumerate(parsed):
        interval = Interval(item)
        parsed[i] = [interval.duration, interval.intensity]
    return parsed

def get_text(x):
    try:
        return BeautifulSoup(x, 'lxml').get_text()
    except: 
        return np.nan

In [597]:
# import the dataframe from the data/prepped folder
df = pd.read_csv('data/prepped/zwift_workouts_all_collections_ordered_Nov20.csv')
df['name'] = df['name'].apply(lambda x: get_text(x))
df['description'] = df['description'].apply(lambda x: get_text(x))
# df['sportType'] = df['sportType'].apply(lambda x: get_text(x))
df['intervals'] = df['intervals'].apply(lambda x: eval_intervals(x))
df.shape

(1493, 4)

In [598]:
# Count the number of rows in the dataframe with nan
count_nan = len(df) - df.count()
count_nan

name             0
description      1
sportType      276
intervals        0
dtype: int64

In [599]:
# TODO: Drop all rows with nan in the dataframe
df = df.dropna()
df.shape

(1217, 4)

In [600]:
# Calculate the total duration of workout interval array in seconds
def total_duration(intervals: list) -> int:
    total_duration=0
    for interval in intervals:
        total_duration = total_duration + interval[0]
    return total_duration

# Calculate the intensity_score of workout interval array in seconds
def workout_intensity_score(intervals: list[object]) -> int:
    intensity_score=float(0)
    for interval in intervals:
        intensity_score = intensity_score + interval[1]*interval[0]

    return intensity_score
df['total_duration'] = df['intervals'].apply(lambda x: float(total_duration(x)))
# df['intensity_score'] = df['intervals'].apply(lambda x: workout_intensity_score(x))
# df['interval_count'] = df['intervals'].apply(lambda x: len(x))

df =df[df['total_duration'] != 0]
df.shape

(1160, 5)

In [601]:
def preprocess(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in words]
    
    return ' '.join(tokens)

df['processed_description'] = df['description'].apply(preprocess)
df['processed_name'] = df['name'].apply(preprocess)


In [602]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['processed_description'])

# K-Means clustering
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(tfidf_matrix)

# Map cluster labels to category names
cluster_mapping = {
    0: 'recovery',
    1: 'endurance',
    2: 'tempo',
    3: 'threshold',
    4: 'vo2',
    5: 'anaerobic'
}

# Add cluster labels to dataframe
df['category'] = kmeans.labels_
df['category'] = df['category'].map(cluster_mapping)
df.columns


Index(['name', 'description', 'sportType', 'intervals', 'total_duration',
       'processed_description', 'processed_name', 'category'],
      dtype='object')

In [603]:

# Create OneHotEncoder object
encoder = OneHotEncoder(sparse_output=False)

# Separate categorical feature
categorical_feature = df[['category']]

# Fit the encoder on the categorical feature
encoder.fit(categorical_feature)

encoded_category_names = encoder.get_feature_names_out(['category'])
# Transform the categorical feature into a one-hot encoded DataFrame
encoded_features = pd.DataFrame(encoder.transform(categorical_feature), columns=encoded_category_names).astype(int)

encoded_df = pd.DataFrame(np.hstack([df,encoded_features]),columns=[*df.columns,*encoded_features.columns])
encoded_df

Unnamed: 0,name,description,sportType,intervals,total_duration,processed_description,processed_name,category,category_endurance,category_recovery,category_tempo,category_threshold,category_vo2
0,Fun is Staying Cool,Anna van der Breggen is arguably one of the mo...,<sportType>bike</sportType>,"[[500.0, 0.5], [60.0, 0.5], [60.0, 1.176125], ...",2480.0,anna van der breggen arguably one versatile ri...,fun staying cool,endurance,1,0,0,0,0
1,Fun is Going Full Gas,"Since he was a Junior, Mathieu van der Poel (M...",<sportType>bike</sportType>,"[[600.0, 0.5], [60.0, 1.0], [60.0, 0.8], [60.0...",2700.0,since junior mathieu van der poel mvdp dominan...,fun going full gas,endurance,1,0,0,0,0
2,Fun is Flying Uphill,"Steady tempo efforts are great, but by incorpo...",<sportType>bike</sportType>,"[[480.0, 0.5], [120.0, 0.5], [30.000002, 1.0],...",2640.000012,steady tempo effort great incorporating effort...,fun flying uphill,vo2,0,0,0,0,1
3,70.3 Development,Looking for some brick work? You can pair toda...,<sportType>bike</sportType>,"[[300.0, 0.35], [30.0, 1.0], [30.0, 0.55], [30...",3600.0,looking brick work pair today session run work...,development,endurance,1,0,0,0,0
4,Long VO2 max,By now you should have already been through se...,<sportType>bike</sportType>,"[[240.0, 0.35], [30.0, 1.02], [30.0, 0.55], [2...",3600.0,already several short interval getting long in...,long max,recovery,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,Week 4.1 - Wave Rider,"They call it the ""Wave Rider"" because the work...",<sportType>bike</sportType>,"[[480.0, 0.5], [15.0, 1.5], [300.0, 0.88], [15...",1800.0,call wave rider workout graphic look like wave...,week wave rider,endurance,1,0,0,0,0
1156,Week 2.1 - Twenty Torch,"You can do anything for 20min, right!? Good! B...",<sportType>bike</sportType>,"[[180.0, 0.65], [30.0, 0.7], [60.0, 0.82], [20...",1200.0,anything right good workout torch daily calori...,week twenty torch,endurance,1,0,0,0,0
1157,Week 2.2 - Calorie Crush,Back at it with another super fun workout! We'...,<sportType>bike</sportType>,"[[180.0, 0.65], [20.0, 0.9], [20.0, 1.0], [20....",1500.0,back another super fun workout going push work...,week calorie crush,endurance,1,0,0,0,0
1158,Week 1.2 - Instant Inferno,Let's keep the consistency going with a quick ...,<sportType>bike</sportType>,"[[60.0, 0.65], [30.0, 0.76], [30.0, 0.82], [30...",1080.0,let keep consistency going quick workout sessi...,week instant inferno,endurance,1,0,0,0,0


In [604]:
def de_normalize_duration(normalized_duration, total_workout_duration):
    """
    De-normalizes predicted normalized duration based on total workout duration.
    """
    return normalized_duration * total_workout_duration

def de_normalize_intensity(normalized_intensity, min_intensity, max_intensity):
    """
    De-normalizes predicted normalized intensity based on min and max intensity values.
    """
    return normalized_intensity * (max_intensity - min_intensity) + min_intensity

def normalize_interval_sequence(row, min_intensity, max_intensity):
    """
    Normalizes interval sequence within a workout.

    Args:
        interval_sequence: List of intervals within a workout (e.g., [[duration1, intensity1], [duration2, intensity2]]).
        total_workout_duration: Total duration of the workout.
        min_intensity: Minimum intensity value found across all workouts in the dataset.
        max_intensity: Maximum intensity value found across all workouts in the dataset.

    Returns:
        A list of normalized intervals for the workout.
    """
    intervals=row['intervals']
    total_workout_duration = row['total_duration']
    normalized_intervals = []
    for interval in intervals:
    # Assuming first element is duration, second is intensity (adjust based on structure)
        duration, intensity = interval[0], interval[1]
        normalized_duration = duration / total_workout_duration

        # Normalize intensity using min and max values from entire dataset
        normalized_intensity = (intensity - min_intensity) / (max_intensity - min_intensity)

        normalized_intervals.append([normalized_duration, normalized_intensity])
    return normalized_intervals

def find_global_max_intensity(intervals_array):
    """
    Finds the single global maximum intensity value across all intervals in the data.

    Args:
        data: A list of records, where each record contains an array of intervals 
            (e.g., [[duration1, intensity1], [duration2, intensity2]]).

    Returns:
        The single global maximum intensity value found in any interval across all records.
    """
    global_max_intensity = float('-inf')  # Initialize with negative infinity

    for intervals in intervals_array:
        for interval in intervals:
            intensity = interval[1]  # Assuming intensity is the second element
            global_max_intensity = max(global_max_intensity, intensity)

    
    return global_max_intensity

max_intensity = find_global_max_intensity(df['intervals'])

def normalize_data(x):
    """
    Normalizes data between min_value and max_value (default 0-1).
    """
    scaler = MinMaxScaler()
    return scaler.fit_transform(x.reshape(-1, 1))




encoded_df['total_duration_normalized'] = normalize_data(df['total_duration'].to_numpy())
encoded_df['intervals_normalized'] = pd.Series([normalize_interval_sequence({'intervals': row['intervals'], 'total_duration': row['total_duration']},  min_intensity=0, max_intensity=max_intensity) for i, row in encoded_df.iterrows()], name='intervals_normalized')

In [609]:
MAX_INTERVAL_LENGTH = 150
# Function to pad intervals to max_interval_length
def pad_intervals(interval_rows, max_interval_length):
    new_rows = []
    for intervals in interval_rows:
        if len(intervals) < max_interval_length:
            num_missing_intervals = max_interval_length - len(intervals)
            new_intervals = intervals + [[0, 0]] * num_missing_intervals
            new_rows.append(new_intervals)
    return new_rows
encoded_df['intervals_normalized'] = pad_intervals(encoded_df['intervals_normalized'], MAX_INTERVAL_LENGTH)


In [610]:
def mask_intervals(rows):
    new_rows=[]
    for intervals in rows:
        mask=[]
        for interval in intervals:
            if(interval[0]==0 and interval[1]==0):
                mask.append(0)
            else:
                mask.append(1)
        new_rows.append(mask)
    return new_rows
    
# encoded_df['intervals_mask'] = mask_intervals(encoded_df['intervals_normalized'])


In [611]:
encoded_df

Unnamed: 0,name,description,sportType,intervals,total_duration,processed_description,processed_name,category,category_endurance,category_recovery,category_tempo,category_threshold,category_vo2,total_duration_normalized,intervals_normalized
0,Fun is Staying Cool,Anna van der Breggen is arguably one of the mo...,<sportType>bike</sportType>,"[[500.0, 0.5], [60.0, 0.5], [60.0, 1.176125], ...",2480.0,anna van der breggen arguably one versatile ri...,fun staying cool,endurance,1,0,0,0,0,0.136232,"[[0.20161290322580644, 0.16641703002715727], [..."
1,Fun is Going Full Gas,"Since he was a Junior, Mathieu van der Poel (M...",<sportType>bike</sportType>,"[[600.0, 0.5], [60.0, 1.0], [60.0, 0.8], [60.0...",2700.0,since junior mathieu van der poel mvdp dominan...,fun going full gas,endurance,1,0,0,0,0,0.152174,"[[0.2222222222222222, 0.16641703002715727], [0..."
2,Fun is Flying Uphill,"Steady tempo efforts are great, but by incorpo...",<sportType>bike</sportType>,"[[480.0, 0.5], [120.0, 0.5], [30.000002, 1.0],...",2640.000012,steady tempo effort great incorporating effort...,fun flying uphill,vo2,0,0,0,0,1,0.147826,"[[0.18181818099173555, 0.16641703002715727], [..."
3,70.3 Development,Looking for some brick work? You can pair toda...,<sportType>bike</sportType>,"[[300.0, 0.35], [30.0, 1.0], [30.0, 0.55], [30...",3600.0,looking brick work pair today session run work...,development,endurance,1,0,0,0,0,0.217391,"[[0.08333333333333333, 0.11649192101901008], [..."
4,Long VO2 max,By now you should have already been through se...,<sportType>bike</sportType>,"[[240.0, 0.35], [30.0, 1.02], [30.0, 0.55], [2...",3600.0,already several short interval getting long in...,long max,recovery,0,1,0,0,0,0.217391,"[[0.06666666666666667, 0.11649192101901008], [..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1155,Week 4.1 - Wave Rider,"They call it the ""Wave Rider"" because the work...",<sportType>bike</sportType>,"[[480.0, 0.5], [15.0, 1.5], [300.0, 0.88], [15...",1800.0,call wave rider workout graphic look like wave...,week wave rider,endurance,1,0,0,0,0,0.086957,"[[0.26666666666666666, 0.16641703002715727], [..."
1156,Week 2.1 - Twenty Torch,"You can do anything for 20min, right!? Good! B...",<sportType>bike</sportType>,"[[180.0, 0.65], [30.0, 0.7], [60.0, 0.82], [20...",1200.0,anything right good workout torch daily calori...,week twenty torch,endurance,1,0,0,0,0,0.043478,"[[0.15, 0.21634213903530447], [0.025, 0.232983..."
1157,Week 2.2 - Calorie Crush,Back at it with another super fun workout! We'...,<sportType>bike</sportType>,"[[180.0, 0.65], [20.0, 0.9], [20.0, 1.0], [20....",1500.0,back another super fun workout going push work...,week calorie crush,endurance,1,0,0,0,0,0.065217,"[[0.12, 0.21634213903530447], [0.0133333333333..."
1158,Week 1.2 - Instant Inferno,Let's keep the consistency going with a quick ...,<sportType>bike</sportType>,"[[60.0, 0.65], [30.0, 0.76], [30.0, 0.82], [30...",1080.0,let keep consistency going quick workout sessi...,week instant inferno,endurance,1,0,0,0,0,0.034783,"[[0.05555555555555555, 0.21634213903530447], [..."


In [612]:
dataset_columns = [col for col in encoded_df.columns if col not in ['category', 'name', 'description', 'sportType', 'processed_description', 'processed_name', 'intervals','total_duration']]
dataset_columns

['category_endurance',
 'category_recovery',
 'category_tempo',
 'category_threshold',
 'category_vo2',
 'total_duration_normalized',
 'intervals_normalized']

In [635]:
dataset = encoded_df[[
    'intervals_normalized',
    'category_endurance',
 'category_recovery',
 'category_tempo',
 'category_threshold',
 'category_vo2',
 'total_duration_normalized']]
dataset.head()

Unnamed: 0,intervals_normalized,category_endurance,category_recovery,category_tempo,category_threshold,category_vo2,total_duration_normalized
0,"[[0.20161290322580644, 0.16641703002715727], [...",1,0,0,0,0,0.136232
1,"[[0.2222222222222222, 0.16641703002715727], [0...",1,0,0,0,0,0.152174
2,"[[0.18181818099173555, 0.16641703002715727], [...",0,0,0,0,1,0.147826
3,"[[0.08333333333333333, 0.11649192101901008], [...",1,0,0,0,0,0.217391
4,"[[0.06666666666666667, 0.11649192101901008], [...",0,1,0,0,0,0.217391


In [636]:
y = dataset['intervals_normalized']    # Target dataframe
X = dataset.drop('intervals_normalized', axis=1)
X = np.asarray(X).astype(np.float32)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [637]:
# Convert intervals to numpy arrays
y_train = np.array(y_train.tolist())
y_test = np.array(y_test.tolist())

# Reshape the input data to fit LSTM input shape
X_train_lstm = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
X_test_lstm = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
# Reshape the target intervals to match the output shape
y_train_lstm = np.reshape(y_train, (y_train.shape[0], y_train.shape[1], 2))
y_test_lstm = np.reshape(y_test, (y_test.shape[0], y_test.shape[1], 2))
print('X_train_lstm.shape: ', X_train_lstm.shape)
print('y_train_lstm.shape: ', y_train_lstm.shape)

X_train_lstm.shape:  (928, 6, 1)
y_train_lstm.shape:  (928, 150, 2)


In [638]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking, RepeatVector


# Build and compile the model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(RepeatVector(y_train.shape[1]))  # Repeat the output sequence
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(Dense(2))  # Output layer for [duration, intensity]

# Compile model
model.compile(optimizer='adam', loss='mse')

# # Print model summary
model.summary()

  super().__init__(**kwargs)


In [662]:
y_train_lstm

array([[[0.05084746, 0.14977533],
        [0.01694915, 0.19970044],
        [0.01694915, 0.21634214],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.11666667, 0.16791477],
        [0.08333333, 0.3176901 ],
        [0.08333333, 0.18455648],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.11904762, 0.16641703],
        [0.11904762, 0.21634214],
        [0.04761905, 0.18305873],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       ...,

       [[1.        , 0.13313362],
        [0.        , 0.        ],
        [0.        , 0.        ],
        ...,
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.04761905, 0.18305874],
        [0.04761905, 0.23298384],
        [0.04761905, 0.28290896],
        .

In [639]:
# Train the model
model.fit(X_train_lstm, y_train_lstm, epochs=50, batch_size=32)

Epoch 1/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - loss: 0.0064
Epoch 2/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 31ms/step - loss: 0.0061
Epoch 3/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step - loss: 0.0041
Epoch 4/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - loss: 0.0038
Epoch 5/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 0.0038
Epoch 6/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - loss: 0.0038
Epoch 7/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 37ms/step - loss: 0.0038
Epoch 8/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 35ms/step - loss: 0.0037
Epoch 9/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - loss: 0.0038
Epoch 10/50
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 34ms/step - loss: 0.0035

<keras.src.callbacks.history.History at 0x147b20170>

In [660]:
def category_to_one_hot_array(category):
    all_categories = ["endurance", "tempo", "vo2", "recovery", "threshold"]
    # Create an array of zeros with the same length as all_categories
    one_hot_array = np.zeros(len(all_categories))
    
    # Find the index of the category in all_categories
    index = all_categories.index(category)
    
    # Set the corresponding index to 1.0
    one_hot_array[index] = float(1)
    
    return one_hot_array

def reshape_input(inputs):
    return np.reshape(inputs, (1, inputs.shape[0], 1))



array([[[1.],
        [0.],
        [0.],
        [0.],
        [0.]]])

In [661]:
# Predict unique intervals
predicted_intervals_unique = model.predict(reshape_input(category_to_one_hot_array("endurance")))

print("Predicted Unique Intervals:", predicted_intervals_unique)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 391ms/step
Predicted Unique Intervals: [[[ 1.85754538e-01  1.49234951e-01]
  [ 7.83205777e-02  2.03545392e-01]
  [ 3.90300751e-02  2.18539298e-01]
  [ 2.88055018e-02  2.21685022e-01]
  [ 2.92776078e-02  2.21388131e-01]
  [ 3.31830978e-02  2.19824791e-01]
  [ 3.62432152e-02  2.18525320e-01]
  [ 3.78117897e-02  2.16659516e-01]
  [ 3.80547307e-02  2.13548303e-01]
  [ 3.72046828e-02  2.08794981e-01]
  [ 3.55110541e-02  2.02250898e-01]
  [ 3.31928134e-02  1.93924218e-01]
  [ 3.04253623e-02  1.83930397e-01]
  [ 2.76751742e-02  1.73549563e-01]
  [ 2.49116570e-02  1.62816018e-01]
  [ 2.22657099e-02  1.51873648e-01]
  [ 1.99155509e-02  1.41000912e-01]
  [ 1.77585706e-02  1.30211264e-01]
  [ 1.57322250e-02  1.19512923e-01]
  [ 1.38933025e-02  1.09044701e-01]
  [ 1.23892166e-02  9.90694612e-02]
  [ 1.11247189e-02  8.95449668e-02]
  [ 1.00325346e-02  8.04365873e-02]
  [ 9.06554610e-03  7.17146769e-02]
  [ 8.19094852e-03  6.33532256e-02]