In [1]:
import pandas as pd
import numpy as np
# import requests
from bs4 import BeautifulSoup
import re

In [2]:
# URL = "data/Micro0 Gonzalo septiembre2021.html"
# page = requests.get(URL)
# print(page.text)

# 1. Get data from .html

The storage structure is as follows (JSON compatibl-ish):
```
{micro_name: [
      {"workout_desc": "Pierna",
       "date_workout": session1_date,
       "exercises": [
              {"nombre": "sentadilla",
               "series": 4, 
               ...},
              {"nombre": "peso muerto",
               "series": 4}, 
              ...
              ]
      },
      {"workout_desc": "Torso",
       "date_workout": session2_date,
       "exercises": [
              {"nombre": "dominadas",
               "series": 4, 
               ...},
              {"nombre": "press banca",
               "series": 4}, 
              ...
              ]
      },
      ...
] }
```

In [6]:
def get_data_from_html(file):
# "data/Micro0 Gonzalo septiembre2021.html"  

    with open(file) as html_file:
        soup = BeautifulSoup(html_file, "html.parser")

        sessions_list = []
        sessions = soup.find_all("div", class_="dia")
        for session in sessions:
            session_dict = {}

            workout_desc = session.find("div", class_="titulo").text.lower()
            session_dict["workout_desc"] = workout_desc

            day = int(session.find("div", id="dia").text)
            month = int(session.find("div", id="mes").text)
            year = int(session.find("div", id="anyo").text)
            date = pd.Timestamp(day=day, month=month, year=year).date()
            session_dict["date_workout"] = date
            
            exercises_list = []
            exercises = session.find("div", class_="cuerpo-boxdia").find_all("div", re.compile("ejercicio.*"))
            for exercise in exercises:
                exercise_dict = {}
                for element in exercise.find_all("div"):
                    exercise_dict[element.attrs['class'][0]] = element.text
                exercises_list.append(exercise_dict)
            session_dict["exercises"] = exercises_list

            sessions_list.append(session_dict)

        block_name = soup.find(id="microciclo").text
        micro_dict = {block_name: sessions_list}

    return micro_dict

get_data_from_html("data/Micro2 Gonzalo septiembre2021.html")

{'Micro2 Gonzalo septiembre2021': [{'workout_desc': 'pierna ',
   'date_workout': datetime.date(2021, 9, 20),
   'exercises': [{'nombre': 'Sentadilla excéntrica 3"',
     'series': '4',
     'cargas': '70 %',
     'kilos': '70 Kg',
     'repeticiones': '7',
     'rpe': '8',
     'descanso': '3 min'},
    {'nombre': 'Peso muerto excéntrico 3"',
     'series': '4',
     'cargas': '65 %',
     'kilos': '85 Kg',
     'repeticiones': '7',
     'rpe': '8',
     'descanso': '3 min'},
    {'nombre': 'Front squat',
     'series': '5',
     'cargas': '70 %',
     'kilos': '30 Kg',
     'repeticiones': '12',
     'rpe': '8',
     'descanso': '2 min'},
    {'nombre': 'Sentadilla búlgara',
     'series': '3',
     'cargas': '70 %',
     'kilos': '10 Kg',
     'repeticiones': '8',
     'rpe': '7',
     'descanso': '1 min'},
    {'nombre': 'Rueda abd',
     'series': '4',
     'cargas': '70 %',
     'kilos': '2 Kg',
     'repeticiones': '12',
     'rpe': '7',
     'descanso': '1 min'},
    {'nombre':

# 2. Curate data

In [46]:
SESSION_COL_NAMES = ["Ejercicio", "Series", "Cargas (%)", 
                     "Kilos", "Repeticiones", "RPE", "Descanso (min)"]

def curate_exercises_data(exercises:list, col_names:list):

    df_exercises = pd.DataFrame(exercises)
    # # Data quality criteria for workout session table
    # Standard names (only if matching length, else keep originals)
    if len(df_exercises.columns) == len(col_names):
        df_exercises.columns = col_names
    # Lowercase exercise names
    df_exercises.iloc[:, 0] = df_exercises.iloc[:, 0].str.lower()
    # Convert to numbers
    df_exercises.iloc[:, 1] = df_exercises.iloc[:, 1].str.extract("(\d+)").astype(int).values
    df_exercises.iloc[:, 2] = df_exercises.iloc[:, 2].str.extract("(\d+)").astype(float).values
    df_exercises.iloc[:, 3] = df_exercises.iloc[:, 3].str.extract("(\d+)").astype(float).values
    df_exercises.iloc[:, 4] = df_exercises.iloc[:, 4].str.extract("(\d+)").astype(int).values
    df_exercises.iloc[:, 5] = df_exercises.iloc[:, 5].str.extract("(\d+)").astype(int).values
    df_exercises.iloc[:, 6] = df_exercises.iloc[:, 6].str.extract("(\d+)").astype(float).values
    # In "Cargas (%)" and "Descanso (min)" change 0 --> NULL
    df_exercises.iloc[:, 2] = df_exercises.iloc[:, 2].replace(0.0, np.nan)
    df_exercises.iloc[:, 6] = df_exercises.iloc[:, 6].replace(0.0, np.nan)

    return df_exercises


block_dict = get_data_from_html("data/Micro2 Gonzalo septiembre2021.html")
for block_name, workouts_list in block_dict.items():
    print(block_name)
    print("-"*30)
    for workout in workouts_list:
        print(f"{workout['date_workout']} - {workout['workout_desc']}")
        df = curate_exercises_data(workout["exercises"], SESSION_COL_NAMES)
        display(df)
        # data_ex = df["Ejercicio"].unique()
        # db_ex = [i[0] for i in session.query(Exercise.exercise_desc).all()]
        # print(set(data_ex))
        # print(list(set(data_ex) - set(db_ex)))

Micro2 Gonzalo septiembre2021
------------------------------
2021-09-20 - pierna 


Unnamed: 0,Ejercicio,Series,Cargas (%),Kilos,Repeticiones,RPE,Descanso (min)
0,"sentadilla excéntrica 3""",4,70.0,70.0,7,8,3.0
1,"peso muerto excéntrico 3""",4,65.0,85.0,7,8,3.0
2,front squat,5,70.0,30.0,12,8,2.0
3,sentadilla búlgara,3,70.0,10.0,8,7,1.0
4,rueda abd,4,70.0,2.0,12,7,1.0
5,"plancha abd 30""",4,70.0,2.0,1,7,1.0


2021-09-21 - torso


Unnamed: 0,Ejercicio,Series,Cargas (%),Kilos,Repeticiones,RPE,Descanso (min)
0,press de banca con pausa,4,75.0,75.0,6,7,3.0
1,fondos en paralelas (abiertos),4,70.0,80.0,10,8,2.0
2,flexiones de pecho,4,70.0,40.0,10,5,2.0
3,dominadas de biceps lastradas +25kg,4,85.0,105.0,5,8,3.0
4,"dominadas de biceps excéntricas 3""",4,70.0,80.0,6,7,2.0
5,extension de tríceps en polea con cuerda,6,70.0,20.0,15,8,1.0


2021-09-23 - acc piern pst


Unnamed: 0,Ejercicio,Series,Cargas (%),Kilos,Repeticiones,RPE,Descanso (min)
0,peso muerto sumo,4,85.0,85.0,10,8,3.0
1,hip thrust,3,85.0,120.0,10,8,2.0
2,hip thrust,3,75.0,100.0,12,8,2.0
3,hip thrust,3,65.0,85.0,15,10,2.0
4,femoral en maquina,5,75.0,30.0,12,7,1.0
5,femoral con mancuernas,5,70.0,15.0,12,8,1.0


2021-09-25 - acc torso


Unnamed: 0,Ejercicio,Series,Cargas (%),Kilos,Repeticiones,RPE,Descanso (min)
0,press inclinado,4,60.0,60.0,10,7,2.0
1,apertura de mancuernas en banco inclinado,4,70.0,12.0,10,8,1.0
2,remo barra,4,60.0,60.0,10,8,2.0
3,remo con mancuernas en banco inclinado,4,80.0,20.0,12,8,1.0
4,press militar,4,80.0,40.0,10,8,2.0
5,press militar,4,70.0,30.0,15,9,2.0
6,face pull,4,80.0,30.0,12,8,1.0
7,face pull,4,70.0,20.0,15,8,1.0


In [55]:
for index, row in df.iterrows():
    for wod_set in range(row["Series"]):
        set_id = wod_set + 1
        prev_ex = df["Ejercicio"].iloc[index-1]
        if prev_ex == row["Ejercicio"]:
            set_id += row["Series"]
        print(f"{row['Ejercicio']} --> {set_id} ({prev_ex})")

press inclinado --> 1 (face pull)
press inclinado --> 2 (face pull)
press inclinado --> 3 (face pull)
press inclinado --> 4 (face pull)
apertura de mancuernas en banco inclinado --> 1 (press inclinado)
apertura de mancuernas en banco inclinado --> 2 (press inclinado)
apertura de mancuernas en banco inclinado --> 3 (press inclinado)
apertura de mancuernas en banco inclinado --> 4 (press inclinado)
remo barra --> 1 (apertura de mancuernas en banco inclinado)
remo barra --> 2 (apertura de mancuernas en banco inclinado)
remo barra --> 3 (apertura de mancuernas en banco inclinado)
remo barra --> 4 (apertura de mancuernas en banco inclinado)
remo con mancuernas en banco inclinado --> 1 (remo barra)
remo con mancuernas en banco inclinado --> 2 (remo barra)
remo con mancuernas en banco inclinado --> 3 (remo barra)
remo con mancuernas en banco inclinado --> 4 (remo barra)
press militar --> 1 (remo con mancuernas en banco inclinado)
press militar --> 2 (remo con mancuernas en banco inclinado)
pr

In [None]:
df.iloc[:, 2].str.extract("(\d+)").astype

In [5]:
# for index, row in df.iterrows():
#     for set in range(row["Series"]):
#         set_id = set + 1
#         print(row.drop("Series")["Ejercicio"], row.drop("Series")["Cargas (%)"], set_id)

# session.query(Exercise.exercise_id).filter_by(exercise_desc="press banca").scalar()

In [6]:
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from models import Program, Block, Workout, Workout_set, Exercise

engine = create_engine(f"sqlite:///data/db/gym_database.db")
Session = sessionmaker(bind=engine)
session = Session()

In [8]:
# Adding block of program from personal coach html file
def add_block(session, source_file=None, program:str=None):
    
    COL_NAMES = ["Ejercicio", "Series", "Cargas (%)", 
                 "Kilos", "Repeticiones", "RPE", "Descanso (min)"]

    # Check if program description already exists
    if program:
        program = (
            session.query(Program)
            .filter(Program.program_desc==program)
            .one_or_none()
        )
        program_id = program.program_id
    # If not (or not provided), create new generic program
    if program is None:
        program = Program()
        session.add(program)
        # If new, the program_id will be last element added
        program_id = session.query(Program).count()

    block_dict = get_data_from_html(source_file)

    for block_name, workouts_list in block_dict.items():
        # Check if block already exist (matching both name and program)
        block = (
            session.query(Block)
            .filter(Block.block_desc==block_name, Block.program_id==program_id)
            .one_or_none()
        )
        if block:
            block_id = block.block_id
        # If not, create it
        if block is None:
            block = Block(block_desc=block_name, program_id=program_id)
            session.add(block)
            # If new, the block_id will be last element added
            block_id = session.query(Block).count() 

            # The loading of workouts and sets is only done if block doesn't exist
            # If block already exists we could branch an update function inside
            # (inside this add_block() or outside as a different call)
            for wod in workouts_list:
                workout = Workout(workout_desc=wod["workout_desc"], 
                                  block_id=block_id, 
                                  date_workout=wod["date_workout"])
                session.add(workout)
                workout_id = session.query(Workout).count() 
                df_exercises = curate_exercises_data(wod["exercises"], COL_NAMES)
                
                # If exercise is not in Exercise lookup table, add it previously
                data_ex = df_exercises["Ejercicio"].unique()
                db_ex = [i[0] for i in session.query(Exercise.exercise_desc).all()]
                for new_exercise in list(set(data_ex) - set(db_ex)):
                    exercise = Exercise(exercise_desc=new_exercise)
                    session.add(exercise)
                
                # Then, insert row by row the results (exploding for as many series
                # per exercise there are)
                for index, row in df_exercises.iterrows():
                    for wod_set in range(row["Series"]):
                        set_id = wod_set + 1
                        workout_set = Workout_set(
                            workout_id=workout_id,
                            exercise_id=session.query(Exercise.exercise_id)
                                               .filter_by(exercise_desc=row["Ejercicio"])
                                               .scalar(),
                            set_id=set_id,
                            no_reps=row["Repeticiones"],
                            weight=row["Kilos"],
                            perc_rm=row["Cargas (%)"] if row["Cargas (%)"] != np.nan else None,
                            max_rpe=row["RPE"],
                            rest_min=row["Descanso (min)"] if row["Descanso (min)"] != np.nan else None
                        )
                        session.add(workout_set)

    pass

add_block(session, "data/Micro0 Gonzalo septiembre2021.html")

In [10]:
# session.rollback()

# session.query(Program).all()
# session.query(Program).order_by(Program.program_id.desc()).first().program_id
# session.query(Program).count()

# session.query(Block).filter(Block.program_id==4).first().program
# session.query(Block).all()

# session.query(Exercise).all()

# session.query(Workout).all()

# session.query(Workout_set).all()
session.query(Workout_set.perc_rm).all()

[(92.5,),
 (92.5,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (60.0,),
 (60.0,),
 (60.0,),
 (60.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (50.0,),
 (50.0,),
 (50.0,),
 (50.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (None,),
 (50.0,),
 (50.0,),
 (50.0,),
 (50.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,),
 (70.0,)]

In [12]:
session.close()
engine.dispose()

In [13]:
session

<sqlalchemy.orm.session.Session at 0x7f19e1736190>