# Goals of this script:
1. Convert TCX file to TXT file with position data
2. Write out a summary of workout info to compare with what was downloaded
---
## Next steps after this works:
3. Identify which TCX files are mislabled => Relabel or re-download them
4. Add field to database to indicate which TCX files are correctly converted to TXT
5. Finish downloading the rest of existing TCX files and convert
6. Quick research about how to store spatial data
7. Can I visualize the converted data?
---
## Prereqs: 
- Need to have TCX files downloaded locally  (see Automator script)
- To do that, I used the workoutIds found in the CSV file of exported workout_history from MapMyRun

## Preprocessing & Helper functions: 
- Setup a list of TCX files to loop through & label summary file
- Define Helper funcitons

In [2]:
import os

mainfolder = '/Users/barbaraihidalgo-sotelo/PROJECTS/build_workout_dashboard/build_workout_dashboard/'
summaryfile = mainfolder + 'workout_position_summary.csv'

# Label summary file, if it hasnt been created yet
if not(os.path.isfile(summaryfile)):
    fh = open(summaryfile,"a+")
    fh.writelines('WorkoutId,TimeStamp,WorkoutDateOrig,Calories,TotalTimeSeconds,TotalTimeSecondsOrig,MaximumSpeed,#TxtFileRows,#SamplingInterval'+'\n')
    fh.close()

In [3]:
import datetime as dt
from scipy import stats

def calc_Time(Time):
    timestamps, dates, times = [], [], []
    for samp in Time:
        date_time_obj = dt.datetime.strptime(samp[0:19].replace('T',' '), '%Y-%m-%d %H:%M:%S') 
        timestamps.append(date_time_obj.date())
        dates.append(date_time_obj.time())
        times.append(date_time_obj)

    # Calculate sampling interval
    sampInt = []
    for i in range(1,len(times)):
        sampInt.append((times[i] - times[i-1]).total_seconds())
    typSampInterval = int(stats.mode(sampInt)[0])

    return typSampInterval
#     return typSampInterval, timestamps, dates, times

#     print('Date:', date_time_obj.date())
#     print('Time:', date_time_obj.time())
#     print('Date-time:', date_time_obj)


In [4]:
def extractSummaryInfo(root):
    base = '{http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2}'

    TimeStamp = [m.text for m in root.iter(base+'Id')]
    Calories = [m.text for m in root.iter(base+'Calories')]
    TotalTimeSec = [m.text for m in root.iter(base+'TotalTimeSeconds')]
    MaxSpeed = [m.text for m in root.iter(base+'MaximumSpeed')]
    if len(TimeStamp)==0:
        TimeStamp = ''
    else:
        TimeStamp = TimeStamp[0]
        # Trip Timestamp to something easier to convert to datetime format
        TimeStamp = TimeStamp[0:19].replace('T',' ')
    if len(Calories)==0:
        Calories = ''
    else:
        Calories = Calories[0]    
    if len(TotalTimeSec)==0:
        TotalTimeSec = ''
    else:
        TotalTimeSec = TotalTimeSec[0] 
    if len(MaxSpeed)==0:
        MaxSpeed = ''
    else:
        MaxSpeed = MaxSpeed[0]  
    
    return TimeStamp, Calories, TotalTimeSec, MaxSpeed


In [5]:
def extractPositionData(root):
    base = '{http://www.garmin.com/xmlschemas/TrainingCenterDatabase/v2}'
    Time, Pos, LatPts, LongPts, AltM, DistM = [], [], [], [], [], []
    [Time.append(m.text) for m in root.iter(base+'Time')]
    [Pos.append(m.text) for m in root.iter(base+'Position')]
    [LatPts.append(m.text) for m in root.iter(base+'LatitudeDegrees')]
    [LongPts.append(m.text) for m in root.iter(base+'LongitudeDegrees')]
    [AltM.append(m.text) for m in root.iter(base+'AltitudeMeters')]
    [DistM.append(m.text) for m in root.iter(base+'DistanceMeters')]

    if len(DistM)<1:
        return '', '', '', '', '', ''
    
    elif float(DistM[0]) > 10:
#         # Check whether first element in the list is the TOTAL distance meters
#         print(f'(first 3 elements: {DistM[0]}, {DistM[1]}, {DistM[2]})')
        TotalDistMeters = DistM[0]
        DistM = DistM[1:]
    
    TimeConv = []
    for samp in Time:
        TimeConv.append(samp[0:19].replace('T',' '))
    
    return TimeConv, Pos, LatPts, LongPts, AltM, DistM

In [6]:
def qualityChecks(Time, Pos, LatPts, LongPts, AltM, DistM):
    print('Time: ' + str(len(Time)))
    print('Position: ' + str(len(Pos)))
    print('LatDeg: ' + str(len(LatPts)))
    print('LongDeg: ' + str(len(LongPts)))
    print('AltitudeMeters: ' + str(len(AltM)))
    print('DistanceMeters: ' + str(len(DistM)))

#     print(f"\nNote: Length of these lists should be roughly equal to TotalTimeSeconds, {TotalTimeSec}")
    # print(f"Total Time (seconds): {TotalTimeSec}")
#     print('(But not always old files greater than 1sec btwn samples)')

    return Time, Pos, LatPts, LongPts, AltM, DistM

In [7]:
def writePositionFile(writefile, Time, LatPts, LongPts, AltM, DistM):
    # Save position data
    numLines=0
    for l in range(0,len(Time)):
        # Open file handle to write data:
        fh = open(writefile,"a+")
        # Write data
        try:
            L = Time[l]+'\t'+LatPts[l]+'\t'+LongPts[l]+'\t'+AltM[l]+'\t'+DistM[l]+'\n'
        except IndexError:
        # print('There was an index error; probably one array is shorter than others')
            if len(LatPts)<=numLines:
                LatTemp = 'NaN'
            else:
                LatTemp = LatPts[l]
            if len(LongPts)<=numLines:
                LonTemp = 'NaN'
            else:
                LonTemp = LongPts[l]
            if len(AltM)<=numLines:
                AltMTemp = 'NaN'
            else:
                AltMTemp = AltM[l]
            if len(DistM)<=numLines:
                DistMTemp = 'NaN'
            else:
                DistMTemp =  DistM[l]
            L = Time[l]+'\t'+LatTemp+'\t'+LonTemp+'\t'+AltMTemp+'\t'+DistMTemp+'\n'
        fh.writelines(L)
        # Close file handle
        fh.close()
        numLines+=1
    return numLines
    
def writeSummaryData(summaryfile,workoutId,TimeStamp,TimeStampOrig, Calories,TotalTimeSec,TotalTimeSecOrig, MaxSpeed,numLines,sampInterval, ):
    # Save summary data
    fh = open(summaryfile,"a+")

    # Write data
    L = workoutId+','+TimeStamp+','+TimeStampOrig+','+Calories+','+TotalTimeSec+','+TotalTimeSecOrig+','+MaxSpeed+','+str(numLines)+' lines in txt file'+','+str(sampInterval)+'\n'
    fh.writelines(L)

    # Close file handle
    fh.close()


## Loop through TCX files...
1: Load data  
2: Extract summary info  
3: Extract position info  
4: Quality data checks  
5: Write out text file   
- Write a new file for workout position data ("workoutId.txt")
- Append summary stats to file with summary workout info (workout_position_summary.csv)

__NOTE__:  
If you run the cell below again, it'll add a new row to the workout_summary for each of the workouts that doesn't have a .txt file (i.e. no position data) ...  
- As of Jan 2021, there were 39 of those, all except two from 2011 and 2013
- That leaves 969 workouts with position data (as of 1/1/2021)

In [8]:
import lxml.etree as ET

readfolder = "/Users/barbaraihidalgo-sotelo/PROJECTS/build_workout_dashboard/workouts_timedata/"
writefolder = "/Users/barbaraihidalgo-sotelo/PROJECTS/build_workout_dashboard/workouts_timedata_TXT/"

workoutsummary = '/Users/barbaraihidalgo-sotelo/PROJECTS/build_workout_dashboard/build_workout_dashboard/user2632022_workout_history.csv'


In [9]:
import pandas as pd

workoutsummary = pd.read_csv(workoutsummary)
workoutsummary['workoutid'] = workoutsummary['Link'].apply( lambda x: x.replace('http://www.mapmyfitness.com/workout/','') )


In [12]:
# Identify number of TCX files in folder
nFiles=0
for file in os.listdir(readfolder): 
    if file.endswith('.tcx'):
        nFiles+=1 

# Check each of them in this loop
cnt=0
for file in os.listdir(readfolder): 
    if file.endswith('.tcx'):
        cnt+=1
        print(f"Processing file {cnt} of {nFiles}")
        workoutId = file[0:-4]

        # Quality checks:
        #qualityChecks(Time, Pos, LatPts, LongPts, AltM, DistM)
        WorkoutDateOrig = workoutsummary[workoutsummary.workoutid == workoutId][['Workout Date']]  #'workoutid', 'Workout Date'
        WorkoutTimeOrig = workoutsummary[workoutsummary.workoutid == workoutId][['Workout Time (seconds)']]  #'workoutid', 'Workout Date'
                        
        # Check whether this data exists already
        writefile = writefolder + workoutId + '.txt'
        if not(os.path.isfile(writefile)):

            # Label new file
            fh = open(writefile,"a+")
            fh.writelines('TimeStamp\tLatitude\tLongitude\tAltitude(m)\tDistance(m)\t')
            fh.close()

            # LOAD XML FILE:
            tree =  ET.parse(readfolder + file)

            # Identify top-level element:
            root = tree.getroot();

            # --- TO GET FAMILIAR WITH FILE CONTENTS: --- 
            # Print XML in "readable" format
            # print(ET.tostring(root, pretty_print=True).strip())  

            # These are the elements in the XML tree:
            # xmlElem = [elem.tag for elem in root.iter()]
            # print(set(xmlElem))
            
            # Extract workout summary info:
            TimeStamp,Calories,TotalTimeSec,MaxSpeed = extractSummaryInfo(root)
            print('---------------------------------')
            print(f"Workout id: {workoutId} \nDate/Time: {TimeStamp} \nCalories: {Calories} \nTotal Time (seconds): {TotalTimeSec}, {WorkoutTimeOrig} \nMax Speed (seconds): {MaxSpeed}")

            # Extract position data:
            Time, Pos, LatPts, LongPts, AltM, DistM = extractPositionData(root)
            # Check whether any position data exists:
            if len(Time) == 0:
                os.remove(writefile)
                print(f'No data to write to {writefile}')
                numLines, sampInterval = 'NaN','NaN'
            else:
                # (wonky workaround for when there's one extra Time row)
                if (len(Time)-len(Pos)) == 1:
                    Time = Time[0:-1]  

                # Write Position Data 
                numLines = writePositionFile(writefile, Time, LatPts, LongPts, AltM, DistM)
                print(f"Saved txt file with {numLines} lines")

                # Add sample interval
                sampInterval = calc_Time(Time)

            # Write Summary Info 
            writeSummaryData(summaryfile,workoutId,TimeStamp,WorkoutDateOrig, Calories,TotalTimeSec,WorkoutTimeOrig, MaxSpeed,numLines, sampInterval)
            #print(f"Appended workout {workoutId}, {TotalTimeSec} sec, to summary data\n")

Processing file 1 of 441
Processing file 2 of 441
Processing file 3 of 441
Processing file 4 of 441
Processing file 5 of 441
Processing file 6 of 441
Processing file 7 of 441
Processing file 8 of 441
Processing file 9 of 441
Processing file 10 of 441
Processing file 11 of 441
Processing file 12 of 441
---------------------------------
Workout id: 309076653 
Date/Time: 2013-06-19 17:00:00 
Calories: 383.999999778816 
Total Time (seconds): 1680.0,        Workout Time (seconds)
2090                    1680 
Max Speed (seconds): 
No data to write to /Users/barbaraihidalgo-sotelo/PROJECTS/build_workout_dashboard/workouts_timedata_TXT/309076653.txt
Processing file 13 of 441
Processing file 14 of 441
Processing file 15 of 441
Processing file 16 of 441
---------------------------------
Workout id: 371883583 
Date/Time: 2013-08-31 17:00:00 
Calories: 106.99999993836799 
Total Time (seconds): 600.0,        Workout Time (seconds)
2041                     600 
Max Speed (seconds): 
No data to writ

### Notes

In [None]:
# workoutId,TimeStamp,Calories,TotalTimeSec,MaxSpeed,numLines
TotalTimeSec
TimeStamp

In [None]:
# --- Get Position Info ---
# Approach 1 => use etree with "iter" to get tags directly
# # 
# Time, Pos, LatPts, LongPts, AltM, DistM = [], [], [], [], [], []

# [Time.append(m.text) for m in root.iter(base+'Time')]
# [Pos.append(m.text) for m in root.iter(base+'Position')]
# [LatPts.append(m.text) for m in root.iter(base+'LatitudeDegrees')]
# [LongPts.append(m.text) for m in root.iter(base+'LongitudeDegrees')]
# [AltM.append(m.text) for m in root.iter(base+'AltitudeMeters')]
# [DistM.append(m.text) for m in root.iter(base+'DistanceMeters')]

# # Check whether first element in the list is the TOTAL distance meters
# if float(DistM[0]) > 10:
#     print('The first element of DistanceMeters is probably the summary...')
#     print('Thus removing it from array')
#     print(f'(first 3 elements: {DistM[0]}, {DistM[1]}, {DistM[2]})')
#     TotalDistMeters = DistM[0]
#     DistM = DistM[1:]

# Potential Approach 2 => use etree with "iter" to get children of a higher level tag')
# for trackPt in root.iter(base + 'Trackpoint'):
#     for child in trackPt:
#         print(child.tag,'----', child.text)

# Potential Approach 3 => Use RegEx