<a href="https://colab.research.google.com/github/cmannnn/2021_strava/blob/main/2021_strava.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# plt.style.use('ggplot')
# sns.set_style('darkgrid')

In [2]:
# mounting google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# setting the path of each interesting CSV
activities_path = "/content/drive/MyDrive/Colabby Krabby Paddy/2021_strava/2021_strava/activities.csv"
local_legends_segments_path = "/content/drive/MyDrive/Colabby Krabby Paddy/2021_strava/2021_strava/local_legend_segments.csv"
kudos_path = "/content/drive/MyDrive/Colabby Krabby Paddy/2021_strava/2021_strava/kudos.csv"

# Importing, cleaning activities_df

In [4]:
# reading in activities data
activities_df = pd.read_csv(activities_path, parse_dates = True)

In [5]:
# columns to drop from activities_df
cols_to_drop = ['Activity Description', 'Max Heart Rate', 'Relative Effort', 'Commute', 'Activity Gear', 'Filename', 'Athlete Weight', 
                'Bike Weight', 'Elapsed Time.1', 'Moving Time', 'Distance.1', 'Average Positive Grade', 'Average Negative Grade',
                'Max Cadence', 'Average Cadence', 'Max Heart Rate.1', 'Average Heart Rate', 'Max Watts', 'Average Watts', 'Max Temperature',
                'Average Temperature', 'Relative Effort.1', 'Total Work', 'Number of Runs', 'Uphill Time', 'Downhill Time', 'Other Time',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.start_time">Start Time</span>',
                'Weighted Average Power', 'Power Count', 'Prefer Perceived Exertion', 'Commute.1', 'Total Weight Lifted', 'From Upload', 'Grade Adjusted Distance',
                'Weather Observation Time', 'Weather Condition', 'Weather Temperature', 'Apparent Temperature', 'Dewpoint', 'Humidity', 'Weather Pressure',
                'Wind Speed', 'Wind Gust', 'Wind Bearing', 'Precipitation Intensity', 'Sunset Time', 'Moon Phase', 'Bike', 'Gear', 'Precipitation Probability', 
                'Precipitation Type', 'Cloud Cover', 'Weather Visibility', 'UV Index', 'Weather Ozone', 'Sunrise Time', 'Activity Name',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.type">Type</span>', 
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.jump_count">Jump Count</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.total_grit">Total Grit</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_flow">Avg Flow</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.flagged">Flagged</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.avg_elapsed_speed">Avg Elapsed Speed</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.dirt_distance">Dirt Distance</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.newly_explored_distance">Newly Explored Distance</span>',
                '<span class="translation_missing" title="translation missing: en-US.lib.export.portability_exporter.activities.horton_values.newly_explored_dirt_distance">Newly Explored Dirt Distance</span>',
                ]

In [6]:
# dropping the columns
activities_df.drop(axis=1, columns = cols_to_drop, inplace = True)

In [7]:
# checking correct columns were dropped
print(activities_df.columns)

Index(['Activity ID', 'Activity Date', 'Activity Type', 'Elapsed Time',
       'Distance', 'Max Speed', 'Average Speed', 'Elevation Gain',
       'Elevation Loss', 'Elevation Low', 'Elevation High', 'Max Grade',
       'Average Grade', 'Calories', 'Perceived Exertion',
       'Perceived Relative Effort'],
      dtype='object')


In [8]:
print(activities_df['Activity Date'].head())

0    Jun 28, 2020, 5:03:05 PM
1    Jun 29, 2020, 9:42:34 PM
2    Jun 30, 2020, 9:01:11 PM
3     Jul 1, 2020, 8:38:04 PM
4     Jul 3, 2020, 1:44:58 PM
Name: Activity Date, dtype: object


In [9]:
# changing activity date column to datetime
activities_df['Activity Date'] = pd.to_datetime(activities_df['Activity Date'])

In [10]:
# checking the data types of each column
print(activities_df.dtypes)

Activity ID                           int64
Activity Date                datetime64[ns]
Activity Type                        object
Elapsed Time                          int64
Distance                             object
Max Speed                           float64
Average Speed                       float64
Elevation Gain                      float64
Elevation Loss                      float64
Elevation Low                       float64
Elevation High                      float64
Max Grade                           float64
Average Grade                       float64
Calories                            float64
Perceived Exertion                  float64
Perceived Relative Effort           float64
dtype: object


In [11]:
# filtering activities_df to only 2021 data
activities_df = activities_df[activities_df['Activity Date'] > '2021-01-01']

In [12]:
# setting activity date as the index of activities_df
activities_df.set_index('Activity Date', inplace = True)

In [13]:
# elapsed time in activities_df was defaulted to seconds, changing it to minutes
activities_df['Elapsed Time'] = activities_df['Elapsed Time'] / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [14]:
# changing distance column to float64
activities_df['Distance'] = activities_df['Distance'].astype(np.float64)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [18]:
# changing distance from km to mi
activities_df['Distance'] = (activities_df['Distance'] / (8/5)) 

In [19]:
# changing elevation gain, elevation loss, elevation low, elevation high from meters to feet
activities_df['Elevation Gain'] = (activities_df['Elevation Gain'] * 3.28084)
activities_df['Elevation Loss'] = (activities_df['Elevation Loss'] * 3.28084)
activities_df['Elevation Low'] = (activities_df['Elevation Low'] * 3.28084)
activities_df['Elevation High'] = (activities_df['Elevation High'] * 3.28084)

# activities_df analysis

In [18]:
# total activity distance sum
activities_df['Distance'].sum()

4856.0

In [19]:
# counts of each workout
activities_df['Activity Type'].value_counts()

Run                235
Ride                16
Backcountry Ski     16
Hike                 9
Virtual Ride         5
Walk                 3
Workout              3
Alpine Ski           2
Elliptical           1
Name: Activity Type, dtype: int64

In [20]:
# sum of run data
activities_df[activities_df['Activity Type'] == 'Run'].sum()

Activity ID                                                      1317445878402
Activity Type                RunRunRunRunRunRunRunRunRunRunRunRunRunRunRunR...
Elapsed Time                                                           19935.2
Distance                                                               3693.57
Max Speed                                                              1420.92
Average Speed                                                          812.759
Elevation Gain                                                         59704.6
Elevation Loss                                                         59521.4
Elevation Low                                                          13241.4
Elevation High                                                         36786.8
Max Grade                                                              5032.08
Average Grade                                                         -2.30651
Calories                                            

In [21]:
# sum of bike data
activities_df[activities_df['Activity Type'] == 'Ride'].sum()

Activity ID                                                        90594882152
Activity Type                RideRideRideRideRideRideRideRideRideRideRideRi...
Elapsed Time                                                           2603.85
Distance                                                                 773.3
Max Speed                                                              281.672
Average Speed                                                          93.4108
Elevation Gain                                                         12170.1
Elevation Loss                                                         12034.7
Elevation Low                                                           4396.9
Elevation High                                                          7905.3
Max Grade                                                              374.962
Average Grade                                                         0.317965
Calories                                            

In [22]:
# longest run
activities_df[activities_df['Activity Type'] == 'Run'].max()

Activity ID                  6453199183
Activity Type                       Run
Elapsed Time                    464.733
Distance                          50.85
Max Speed                       21.5308
Average Speed                   4.95201
Elevation Gain                   1390.1
Elevation Loss                   1392.6
Elevation Low                    2027.8
Elevation High                   2098.6
Max Grade                            50
Average Grade                 0.0954654
Calories                           2368
Perceived Exertion                    9
Perceived Relative Effort           732
dtype: object

In [23]:
# longest ride
activities_df[activities_df['Activity Type'] == 'Ride'].max()

Activity ID                  6314894348
Activity Type                      Ride
Elapsed Time                     699.75
Distance                         139.38
Max Speed                          20.7
Average Speed                    7.8865
Elevation Gain                  1585.19
Elevation Loss                  1585.19
Elevation Low                    2054.5
Elevation High                   2197.5
Max Grade                          48.1
Average Grade                  0.235899
Calories                            NaN
Perceived Exertion                    9
Perceived Relative Effort          1012
dtype: object

In [25]:
activities_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 290 entries, 2021-01-01 19:06:13 to 2021-12-31 16:37:36
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Activity ID                290 non-null    int64  
 1   Activity Type              290 non-null    object 
 2   Elapsed Time               290 non-null    float64
 3   Distance                   290 non-null    float64
 4   Max Speed                  286 non-null    float64
 5   Average Speed              290 non-null    float64
 6   Elevation Gain             286 non-null    float64
 7   Elevation Loss             286 non-null    float64
 8   Elevation Low              286 non-null    float64
 9   Elevation High             286 non-null    float64
 10  Max Grade                  286 non-null    float64
 11  Average Grade              290 non-null    float64
 12  Calories                   101 non-null    float64
 13  Perceived Exe

# Importing, cleaning local legends segments df

In [None]:
local_legends_segments_df = pd.read_csv(local_legends_segments_path)

In [None]:
print(local_legends_segments_df)

    Segment ID                 Segment Name
0       784795             Trafton Rd Climb
1      5265111                  Sheddsville
2      6548500                      college
3      8396145                        Kings
4      8691292             Cypress St Climb
5      8691298             Abbott St Ascent
6     13421564                       Barton
7     21915885     Pawtucket Avenue (North)
8     22788662       Lower Cypress Downhill
9     23961262         Wobble up the Cobble
10    25261378  ROGUE OVERLAND - Segment #2
11    28533121               BH to Cemetery


# Importing, cleaning kudos df

In [None]:
# reading kudos into pandas
kudos_df = pd.read_csv(kudos_path)

In [None]:
# changing the kudos date col to datetime
kudos_df['Kudos Date'] = pd.to_datetime(kudos_df['Kudos Date'])

In [None]:
# filtering the kudos date to only 2021
kudos_df = kudos_df[kudos_df['Kudos Date'] > '2021-01-01']

In [None]:
# setting the index to kudos date
kudos_df.set_index('Kudos Date', inplace = True)

In [None]:
print(kudos_df.head(10))

                    Kudos Type   Parent ID
Kudos Date                                
2021-01-03 18:15:53   Activity  4557471533
2021-01-11 14:41:43   Activity  4601576163
2021-01-11 14:41:53   Activity  4601235756
2021-01-15 18:59:46   Activity  4621176515
2021-01-18 00:28:34   Activity  4632648762
2021-01-18 00:29:07   Activity  4631905956
2021-01-18 00:29:09   Activity  4632190150
2021-01-18 01:22:35   Activity  4638698280
2021-01-19 00:16:22   Activity  4644196837
2021-01-19 00:16:26   Activity  4643820879


# Downloading modified files

In [None]:
from google.colab import files

activities_df.to_csv('activities_df.csv')
local_legends_segments_df.to_csv('local_legends_segments_df.csv')
kudos_df.to_csv('kudos_df')


files.download('activities_df.csv')
files.download('local_legends_segments_df.csv')
files.download('kudos_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>