# Data Wrangling for Strava Data

In [22]:
import pandas as pd
import numpy as np
import json
import os


## Define base constants

Let's define simple converter function to parse and convert activity json objects

In [23]:
PWD: str = "/Users/andreasliistro/Documents/Development/Projects/sport-analysis"
PATH: str = f"{PWD}/strava/data/private/"

In [172]:
def get_lat_lng_value(data: dict, key: str, index: int) -> float:
  value = data[key]
  if (len(value) == 0): return None
  if (len(value) -1 < index): return None
  return value[index]

'''
  Load activity data from JSON file and parse / convert data into
  single row DataFrame.
''' 
def parse_activity(data: dict) -> pd.DataFrame:
  SUPPORTED_TYPES = ['Ride', 'EBikeRide']

  if (data['type'] not in SUPPORTED_TYPES):
    return None

  # data properties we want to keep
  keys = ['id',
    'name',
    'distance',
    'moving_time',
    'elapsed_time',
    'total_elevation_gain',
    'type',
    'sport_type',
    'workout_type',
    'start_date',
    'start_date_local',
    'timezone',
    'utc_offset',
    'location_city',
    'location_state',
    'location_country',
    'achievement_count',
    'kudos_count',
    'comment_count',
    'athlete_count',
    'photo_count',
    'trainer',
    'commute',
    'manual',
    'private',
    'visibility',
    'flagged',
    'gear_id',
    'average_speed',
    'max_speed',
    'average_temp',
    'average_watts',
    'kilojoules',
    'device_watts',
    'has_heartrate',
    'heartrate_opt_out',
    'elev_high',
    'elev_low',
    'upload_id',
    'pr_count']
  data_dict = {key: data.get(key, None) for key in keys}

  data_dict['athlete_id'] = data['athlete']
  data_dict['map_id'] = data.get('map', {}).get('id', None)
  data_dict['start_lat'] = get_lat_lng_value(data, 'start_latlng', 0)
  data_dict['start_lng'] = get_lat_lng_value(data, 'start_latlng', 1)
  data_dict['end_lat'] = get_lat_lng_value(data, 'end_latlng', 0)
  data_dict['end_lng'] = get_lat_lng_value(data, 'end_latlng', 1)

  return pd.DataFrame([pd.Series(data_dict)])

## Load first activity for testing

In [236]:
# test for sigle activity
file_path: str = f"{PATH}raw/activities/7522108142.json"

with open(file_path, 'r') as f:
  # load json data
  data = json.load(f)

  df = parse_activity(data)
  print(df.shape)
  print(df.head())

(1, 46)
           id            name  distance  moving_time  elapsed_time  \
0  7522108142  Afternoon Ride   25571.3         4141          5393   

   total_elevation_gain  type sport_type  workout_type            start_date  \
0                 333.7  Ride       Ride             0  2022-07-24T15:10:23Z   

   ... elev_high elev_low   upload_id pr_count  \
0  ...     553.8    287.7  8020355592        0   

                               athlete_id       map_id  start_lat  start_lng  \
0  {'id': 106018162, 'resource_state': 1}  a7522108142   47.50441   8.026813   

    end_lat  end_lng  
0  47.50402  8.02637  

[1 rows x 46 columns]


# Load all activities and combine into single DataFrame

Let's iterate over all files in dictionary and build one dataframe from all data

In [175]:
directory: str = f"{PATH}raw/activities"

df_activities = pd.DataFrame()

for filename in os.listdir(directory):
  if filename.endswith(".json"):
    file_path = f"{directory}/{filename}"
    with open(file_path, 'r') as f:
      # load json data
      data = json.load(f)
      # parse data into DataFrame
      df = parse_activity(data)

      # skip if df is None
      if df is None:
        continue

      df_activities = pd.concat([df_activities, df], axis=0, ignore_index=True)

      f.close()

print(df_activities.head(), '\n')
print(df_activities.shape, '\n')
print(df_activities.describe(), '\n')
      

(107, 46) 

                 id       distance   moving_time  elapsed_time  \
count  1.070000e+02     107.000000    107.000000    107.000000   
mean   8.929347e+09   31850.266355   4843.074766   7199.168224   
std    1.096973e+09   18383.889405   2737.241287   5105.172622   
min    7.522108e+09       0.000000    848.000000   1136.000000   
25%    7.771832e+09   19157.200000   2618.500000   3001.500000   
50%    9.159410e+09   30138.100000   4629.000000   6290.000000   
75%    9.720977e+09   42874.850000   6561.000000   9805.000000   
max    1.092471e+10  102158.000000  16581.000000  24660.000000   

       total_elevation_gain   utc_offset  achievement_count  kudos_count  \
count            107.000000   107.000000         107.000000   107.000000   
mean             291.473832  5887.850467           5.981308     0.672897   
std              227.394569  2062.271613           7.522270     0.939447   
min                0.000000     0.000000           0.000000     0.000000   
25%          

  df_activities = pd.concat([df_activities, df], axis=0, ignore_index=True)


### Store data as csv data

In [178]:
# create csv from DataFrame and store in data folder
df_activities.to_csv(f"{PATH}activities.csv", index=False)

## Work with stream data

Now let's work with stream data and create individual stream.csv files for each stream download

In [261]:
def parse_stream(data: dict) -> pd.DataFrame:
  # create simple df
  df = pd.DataFrame(data)

  # move last column to be first column
  df = df[ ['time'] + [ col for col in df.columns if col != 'time' ] ]

  if ('latlng' in df.columns):
    df['lat'] = df['latlng'].apply(lambda x: x[0])
    df['lng'] = df['latlng'].apply(lambda x: x[1])

    # drop latlng column
    df = df.drop('latlng', axis=1)

  # # convert latlng in to separate columns
  # df['lat'] = df['latlng'].apply(lambda x: x[0])
  # df['lng'] = df['latlng'].apply(lambda x: x[1])

  return df

## Work with single Stream file for testing

In [241]:
# test for sigle activity
file_path: str = f"{PATH}raw/streams/7522108142.json"

with open(file_path, 'r') as f:
  # load json data
  data = json.load(f)
  dict = { obj['type']: obj['data'] for obj in data }

  df = parse_stream(dict)
  print(df.shape, '\n')
  print(df.head(), '\n')

(3626, 8) 

   time  moving                 latlng  velocity_smooth  grade_smooth  \
0     0   False   [47.50441, 8.026813]            0.000           0.8   
1     2    True   [47.50447, 8.026946]            3.026           0.6   
2     3    True  [47.504494, 8.027005]            4.011           0.5   
3     4    True  [47.504493, 8.027016]            4.309          -4.2   
4     7    True   [47.50449, 8.027079]            2.867          -4.4   

   distance  altitude  heartrate  
0       0.0     357.9         77  
1       6.1     358.0         77  
2      12.0     358.0         77  
3      17.2     358.0         77  
4      20.4     358.0         80   



## Convert all streams files

Go through all files, convert and store as csv files

In [262]:
directory: str = f"{PATH}raw/streams"

for index, row in df_activities.iterrows():
  # print(row['id'])
  id = row['id']
  file_path = f"{directory}/{id}.json"

  with open(file_path, 'r') as f:
    # load json data
    data = json.load(f)
    dict = { obj['type']: obj['data'] for obj in data }

    df = parse_stream(dict)
    df.to_csv(f"{PATH}streams/{id}.csv", index=False)

    f.close()