## Process Sessions

This script processes the sessions data directly downloaded from Garmin Connect 
and compiles all lengths into a single csv file with the following columns:
    
> datetime  |  interval  |  order_in_interval  |  order_in_session  |  style  |  distance  |  time  |  strokes

In [39]:
import pandas as pd
from datetime import datetime
from os import listdir
from os.path import isfile, join

In [77]:
def process_single_session(session_path):
    
    """
    Process a single swimming session.
    Reads the file provided as argument and returns the processed session as a dataframe.
    """
    
    print('Start processing file {}'.format(session_path))
    
    # load single data file
    session_datetime = datetime.strptime(session_path.split(sep='_')[2][:12], '%Y%m%d%H%M')
    session_raw = pd.read_csv(session_path)
    #print('{} lines in raw file'.format(len(session_raw)))
    
    # drop rows: summary of interval, summary of session
    session_processed = session_raw.drop(session_raw[session_raw['Lengths'] > 1.0].index)

    # drop error lengths with '--' in style column
    session_processed = session_processed.drop(session_processed[session_processed['Swim Stroke'] == '--'].index)
    
    # get columns we need and set final columns
    session_final = session_processed[['Split','Distance', 'Time', 'Swim Stroke', 'Total Strokes']]
    session_final['datetime'] = session_datetime
    session_final['interval'] = session_final.apply( 
        lambda x: x['Split'].split('.')[0] if '.' in x['Split'] else '',
        axis=1
    )
    session_final['order_in_interval'] = session_final.apply( 
        lambda x: x['Split'].split('.')[1] if '.' in x['Split'] else '',
        axis=1
    )
    session_final['style'] = session_final.apply(
        lambda x: x['Swim Stroke'] if x['Swim Stroke'] == x['Swim Stroke'] else x['Split'] ,
        axis=1
    )
    session_final['distance'] = session_final.apply( 
        lambda x: x['Distance'],
        axis=1
    )
    session_final['time'] = session_final.apply(
        lambda x: (datetime.strptime(x['Time'], '%H:%M:%S.%f')-datetime.strptime('00:00:00.000', '%H:%M:%S.%f')).total_seconds(),
        axis=1
    )
    session_final['strokes'] = session_final.apply(
        lambda x: x['Total Strokes'] if x['Total Strokes'] != '--' else 0,
        axis=1
    )

    # TODO: add interval and length in rest rows

    # add length index within the session
    session_final=session_final.reset_index(drop=True)
    session_final['order_in_sesion'] = session_final.index + 1 # +1 to avoid 0

    # drop original columns
    session_final = session_final.drop(['Split','Distance', 'Time', 'Swim Stroke', 'Total Strokes'], axis=1)
    
    #print('{} lines in final file'.format(len(session_final)))
    
    # return final dataframe (sorting the columns)
    return session_final[['datetime', 'interval', 'order_in_interval', 'order_in_sesion', 'style', 'distance', 'time', 'strokes']]

    # dump dataframe into csv
    #session_final.to_csv('lengths.csv', header=False, index=False, mode='a')

In [83]:
# get all activity files in data
sessions_folder = 'sessions'
sessions_files = sorted([f for f in listdir('sessions') if ( isfile(join('sessions', f)) and f.startswith('activity'))])

In [88]:
# loop files and process all sessions
all_sessions = pd.DataFrame()
for sf in sessions_files:
    session_df = process_single_session(join(sessions_folder, sf))
    all_sessions = all_sessions.append(session_df)

Start processing file sessions/activity_1255537501_201607141156.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

Start processing file sessions/activity_1259535835_201607151216.csv
Start processing file sessions/activity_1379959220_201609061806.csv
Start processing file sessions/activity_1379959252_201609121854.csv
Start processing file sessions/activity_1379959270_201609151815.csv
Start processing file sessions/activity_1379959299_201609191837.csv
Start processing file sessions/activity_1379959323_201609221800.csv
Start processing file sessions/activity_1379959351_201609261744.csv
Start processing file sessions/activity_1400932066_201609291737.csv
Start processing file sessions/activity_1400932077_201610031737.csv
Start processing file sessions/activity_1400932086_201610051827.csv
Start processing file sessions/activity_1400932096_201610090939.csv
Start processing file sessions/activity_1408867640_201610121001.csv
Start processing file sessions/activity_1408867644_201610150825.csv
Start processing file sessions/activity_1422683148_201610171834.csv
Start processing file sessions/activity_14226831

In [89]:
# reset the indexes after concatenation 
all_sessions = all_sessions.reset_index(drop=True)

In [90]:
print('{} rows'.format(len(all_sessions)))
#print('{} rows | {} distance'.format(len(all_sessions), sum(all_sessions['distance'])))

5638 rows


In [91]:
# clear csv and write header in file where sessions data will be dumped
with open('lengths.csv', 'w') as f:
    f.write('datetime,interval,order_in_interval,order_in_session,style,distance,time,strokes\n')

# dump dataframe into csv
all_sessions.to_csv('lengths.csv', header=False, index=False, mode='a')

### (Manually) Mark Wrong Lengths 

In [48]:
# read the csv file with wrong lengths
wrong_lengths = pd.read_csv('wrong_lengths.csv')

In [49]:
# create column that will contain wether tjhe length is marked as wrong or not (from manual info in file)
all_sessions_with_wrong_lengths = all_sessions
all_sessions_with_wrong_lengths['wrong_manual'] = False

In [50]:
# iterate all lengths
for index, length in all_sessions_with_wrong_lengths.iterrows():
    
    length_datetime_string = length['datetime'].strftime("%Y-%m-%d %H:%M:%S")
    
    # iterate all wrong lengths
    for index_wrong, length_wrong in wrong_lengths.iterrows():

        # check if the iterated df's row is in the manual list of wrong lengths
        if(
            length_datetime_string == length_wrong['datetime'] and \
            length['interval'] == str(length_wrong['interval']) and \
            length['length'] == str(length_wrong['length'])
          ):
            # print('Detected wrong length at index {} : {}-{}-{}'.format(index, length['datetime'], length['interval'], length['length']))
            all_sessions_with_wrong_lengths.loc[index,'wrong_manual'] = True

all_sessions_with_wrong_lengths[all_sessions_with_wrong_lengths['wrong_manual'] == True]

KeyError: 'length'

In [None]:
# clear csv and write header in file where sessions data will be dumped
with open('lengths.csv', 'w') as f:
    f.write('datetime,interval,order_in_interval,order_in_session,style,distance,time,strokes,wrong_manual\n')

# dump dataframe into csv
all_sessions_with_wrong_lengths.to_csv('lengths.csv', header=False, index=False, mode='a')