In [109]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import google_drive
import re

### Todo:

- Standardize methods to clean data including checking for normalizing names. This should be done before starting any analysis and also before setting up the data fram we'll work with 
- Get graph output for individual exercises by name - will need a function to do this
- Get graph output of muscle groups - will also need function for this
- Add date range functionality to graphing functions

### Goal graph outputs
- Line graph x = time y = lift weight
- Do I want tabular output?
- Total volume in kg lifted is an interesting stat or total reps
- Do I want estimated one rep max?
- Track estiamted 1rm? 

Estimated 1rm
Max wt
Workout volume
Total reps
max reps
weight and reps

max distance
max time
max speed
max pace
total distance
total time

Stats for time period
- total workouts
- total sets
- total raeps
- total volume
- max wt/volume
- max reps
- estimated 1rm

Workouts by day of the week in box plots or bar graph

- could consider adding workout time started and ended

### Ideal data formating for analysis

- To start off with, I want to get the data into a nice format for analysis. I think having each row indexed by date and representing one exercise would be idea. I'm not sure if I should have each row represent just a set of an exercise though. 

- My goal plots will be time vs exercise - where exercise can be either the max lifted that day or a 1 rep max.

- I also want to see a table with date as column 1 and full set list for a specified exercise 

- These things should be in functions



## Naming conventions

Chest
- Bench press = BP
- Dumbell incline press = DB incline press
- Barbell incline press = BB incline press
- Machine fly

Tricep
- Cable pullover
- Cable pushdown
- Close grip bench press = Close grip BP
- Barbell reverse tricep extension = Barbell reverese tricep ext

Back
- Cable row
- Lat pulldown
- Dumbell row = DB row
- Smith bent row

Bicep
- BARBELL CURL = BB Curl
- CABLE CURL
- Dumbell curl = DB curl 

Shoulders
- Cable side raise
- Standing bumbell shoudler press = Standing DB shoulder press
- Standing barbell shoudler press = Standing BB shoudler press
- Sitting bumbell shoudler press = Standing DB shoulder press
- Sitting barbell shoudler press = Standing BB shoudler press
- Smith seated press



Legs
- Deadlift = DL
- Squat
- Leg extension = Leg ext
- Leg curl
- Smith deadlift = Smith DL
- Smith squat

Other

In [206]:
df = pd.read_excel(google_drive.get_file(), usecols=2)
df['Exercise'] = df.Exercise.str.lower().str.strip()

In [104]:
#list of acceptable naming of exercises
exercises = {
    'chest': ['bench press','bp','dumbell incline press','db incline press',
             'barbell incline press','bb incline press','machine fly'],
    
    'tricep': ['cable pullover','cable pushdown','close grip bench press',
              'close grip bp','barbell reverse tricep extension',
              'barbell reverse tricep ext'],
    
    'back': ['cable row','lat pulldown','dumbbell row','db row','smith bent row'],
    
    'bicep': ['barbell curl','bb curl','cable curl','dumbell curl','db curl'],
    
    'shoulders': ['cable side raise',' standing dumbell shoulder press',
                 'standing db shoulder press','standing barbell shoulder press',
                  'standing bb shoulder press','sitting barbell shoulder press',
                  'sitting bb shoulder press','sitting dumbbell shoulder press',
                  'sitting dumbell shoulder press','smith seated press'],
    
    'legs':['deadlift','squat','leg ext','leg extension','leg curl',
           'smith deadlift','smith dl','smith squat']
}

In [255]:
def check_names(df):
    # Checks names of exercises
    #
    # If any are not in the exercise list, it will print
    # the bad indices and the exercise in question
    # 
    # Returns bool dataframe of indices that have
    # exercises within the list
    
    all_exercises = sum(exercises.values(), [])
    check = df.Exercise.str.lower().isin(all_exercises)
    if len(check == False) > 0:        
        print("Please check naming at indeces: \n")
        print(df[check == False].Exercise)
        print('\n')
    else: 
        print('Data all good')
    return check
    
def check_sets(df):
    # Checks that set format is: reps*weight,....,reps*weight
    # 
    # If any are not in this format, will print bad indices and set in question
    #
    # Returns bool dataframe with indices that have set with proper formatting
    
    r = re.compile(r'(\d+\*\d+\.\d+,|\d+\*\d+,)*(\d+\*\d+\.\d+|\d+\*\d+)')
    s = df.Sets.astype(str)
    m = s.apply(lambda x: len(r.findall(x)) == 1)
    if len(m == False) > 0:
        print('Please check set formatting at the following indices:\n')
        print(df[m == False].Sets)
    else:
        print('Set formatting all good')
    return m
check_sets(df)

Please check set formatting at the following indices:

26                             10*100,8*120,8*120*6*120
32                                                  NaN
86    12*87.5,8*100,8*112.5,8*125,5*125,8*87.5,5*125...
89                                                  NaN
Name: Sets, dtype: object


0      True
1      True
2      True
3      True
4      True
5      True
6      True
7      True
8      True
9      True
10     True
11     True
12     True
13     True
14     True
15     True
16     True
17     True
18     True
19     True
20     True
21     True
22     True
23     True
24     True
25     True
26    False
27     True
28     True
29     True
      ...  
60     True
61     True
62     True
63     True
64     True
65     True
66     True
67     True
68     True
69     True
70     True
71     True
72     True
73     True
74     True
75     True
76     True
77     True
78     True
79     True
80     True
81     True
82     True
83     True
84     True
85     True
86    False
87     True
88     True
89    False
Name: Sets, Length: 90, dtype: bool

In [251]:
df.iloc[86,2]

'12*87.5,8*100,8*112.5,8*125,5*125,8*87.5,5*125*8*87.5'

In [239]:
def transform(df):
    df['Date'] = df.Date.fillna(method='ffill')
    df = df.loc[(check_names(df) == True) & (check_sets(df) == True)].copy()
    df = (df.set_index(['Date','Exercise'])
         .stack()
         .str.split(',',expand=True)
         .stack()
         .unstack(-2)
         .reset_index(-1, drop=True)
         .reset_index()
         )
    return df

In [241]:
cleaned_df = transform(df)

Please check naming at indeces: 

8     standing dumbbell  shoulder press
89                                  NaN
Name: Exercise, dtype: object


Please check set formatting at the following indices:

32    NaN
89    NaN
Name: Sets, dtype: object


In [11]:
def plot_exercise(name):
    #displays time vs max weight on a the lift 'name'
    pass
def plot_muscle_group(name):
    #plots time vs muscle group in [chest, tricep, back, bicep, shoulders, legs]
    pass

In [242]:
cleaned_df.head(5)

Unnamed: 0,Date,Exercise,Sets
0,2019-02-03,bp,12*70
1,2019-02-03,bp,8*90
2,2019-02-03,bp,8*90
3,2019-02-03,bp,8*90
4,2019-02-03,bp,6*100


In [245]:
cleaned_df.Sets.str.split('*', expand=True)

Unnamed: 0,0,1,2,3
0,12,70,,
1,8,90,,
2,8,90,,
3,8,90,,
4,6,100,,
5,6,100,,
6,14,30,,
7,8,42.5,,
8,8,42.5,,
9,8,42.5,,


In [247]:
cleaned_df.tail(5)

Unnamed: 0,Date,Exercise,Sets
317,2019-03-12,lat pulldown,8*112.5
318,2019-03-12,lat pulldown,8*125
319,2019-03-12,lat pulldown,5*125
320,2019-03-12,lat pulldown,8*87.5
321,2019-03-12,lat pulldown,5*125*8*87.5
