In [58]:
# import libraries
from dotenv import load_dotenv
from datetime import datetime, timezone
import psycopg2
import pandas as pd 
from psycopg2 import sql
import numpy as np 
import os 

from functions import convert_nanoseconds_to_datetime

In [26]:
# Run python-dotenv's loader before reading the settings below
load_dotenv()

# Database connection settings; each one is None when its variable is not set
db_name, db_user, db_password, db_host, db_port = (
    os.environ.get(env_key)
    for env_key in ("DB_NAME", "DB_USER", "DB_PASSWORD", "DB_HOST", "DB_PORT")
)

In [27]:
def _fetch_table(cursor, query):
    """Execute ``query`` on ``cursor`` and return the whole result set as a DataFrame.

    Column names are read from ``cursor.description`` after the query has run.
    """
    cursor.execute(query)
    rows = cursor.fetchall()
    columns = [desc[0] for desc in cursor.description]
    return pd.DataFrame(rows, columns=columns)


# Queries for the three datasets
map_my_run_query = """SELECT * FROM map_my_run_db;"""
polar_data_query = """SELECT * FROM polar_data_db;"""
watch_exercise_query = """SELECT * FROM watch_exercise_data_db;"""

conn = None
try:
    # Connect to the database
    conn = psycopg2.connect(
        dbname=db_name,
        user=db_user,
        password=db_password,
        host=db_host,
        port=db_port
    )
    print("Connected to the database successfully!")

    # The cursor context manager closes the cursor even when a query fails
    with conn.cursor() as cursor:
        # Fetch data from map_my_run_db
        map_my_run_df = _fetch_table(cursor, map_my_run_query)

        # Fetch data from polar_data_db
        polar_data_df = _fetch_table(cursor, polar_data_query)

        # Fetch data from watch_exercise_data_db
        watch_exercise_df = _fetch_table(cursor, watch_exercise_query)

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise: every later cell needs the three DataFrames, so continuing
    # would only surface as a confusing NameError further down.
    raise

finally:
    # Always release the connection, including on failure
    if conn is not None:
        conn.close()

print("Data fetched and connection closed!")



Connected to the database successfully!
Data fetched and connection closed!


In [28]:
# Report how many rows were loaded from each source
for source_label, source_frame in (
    ("Map My Run", map_my_run_df),
    ("Polar", polar_data_df),
    ("GalaxyWatch", watch_exercise_df),
):
    print(f"{source_label}: {len(source_frame)} rows")

Map My Run: 94833 rows
Polar: 35850 rows
GalaxyWatch: 23092 rows


In [29]:
# Derive a datetime column from the raw 'time' values, element by element
map_my_run_df['datetime'] = map_my_run_df['time'].map(convert_nanoseconds_to_datetime)

In [30]:
# Derive a datetime column from the raw 'real_time' values, element by element
polar_data_df['datetime'] = polar_data_df['real_time'].map(convert_nanoseconds_to_datetime)

In [31]:
# Derive a datetime column from the raw 'start_time' values, element by element
watch_exercise_df['datetime'] = watch_exercise_df['start_time'].map(convert_nanoseconds_to_datetime)

In [32]:
# Frequency of each distinct distance reading (most frequent first)
map_my_run_df.loc[:, 'distance'].value_counts()

distance
5007.5717    2
5719.5903    2
6272.4760    2
4784.5659    2
2326.4072    2
            ..
2167.1740    1
2163.7290    1
2160.2956    1
2156.8605    1
6500.9731    1
Name: count, Length: 93668, dtype: int64

In [33]:
# Look at the raw timestamp next to the distance reading
map_my_run_df.loc[:, ['time', 'distance']]

Unnamed: 0,time,distance
0,1725676267872000000,
1,1725676268304000000,
2,1725676269475000000,
3,1725676270584000000,
4,1725676271310000000,
...,...,...
94828,1727568034569000000,6491.7138
94829,1727568035658000000,6494.8334
94830,1727568036567000000,6497.9189
94831,1727568037551000000,6500.9731


In [34]:
# Order samples chronologically; rows with a missing 'time' are placed last
map_my_run_df = map_my_run_df.sort_values('time', ascending=True, na_position='last')

In [35]:
# Inspect map_my_run_df; as the last expression of the cell it is rendered as the cell output
map_my_run_df

Unnamed: 0,time,latitude,longitude,altitude,distance,activity_sport,activity_id,lap_start_time,lap_total_time_seconds,lap_distance_meters,date,elapsed_seconds,datetime
37720,1709764304882000000,,,,,Running,2024-03-06T22:31:44+00:00,2024-03-06 14:31:44,2623,8044.579572,1709683200000000000,0.000,2024-03-06 22:31:44.882
37721,1709764305072000000,,,,,Running,2024-03-06T22:31:44+00:00,2024-03-06 14:31:44,2623,8044.579572,1709683200000000000,0.190,2024-03-06 22:31:45.072
37722,1709764306506000000,,,,,Running,2024-03-06T22:31:44+00:00,2024-03-06 14:31:44,2623,8044.579572,1709683200000000000,1.624,2024-03-06 22:31:46.506
37723,1709764307072000000,,,,,Running,2024-03-06T22:31:44+00:00,2024-03-06 14:31:44,2623,8044.579572,1709683200000000000,2.190,2024-03-06 22:31:47.072
37724,1709764308072000000,,,,,Running,2024-03-06T22:31:44+00:00,2024-03-06 14:31:44,2623,8044.579572,1709683200000000000,3.190,2024-03-06 22:31:48.072
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16193,1731010399165000000,37.865282,-122.301938,3.78,7233.5375,Running,2024-11-07T19:35:17+00:00,2024-11-07 11:35:17,2284,7239.473050,1730937600000000000,2280.589,2024-11-07 20:13:19.165
16194,1731010400166000000,37.865306,-122.301950,3.85,7236.5122,Running,2024-11-07T19:35:17+00:00,2024-11-07 11:35:17,2284,7239.473050,1730937600000000000,2281.590,2024-11-07 20:13:20.166
16195,1731010401166000000,37.865329,-122.301963,3.91,7239.4750,Running,2024-11-07T19:35:17+00:00,2024-11-07 11:35:17,2284,7239.473050,1730937600000000000,2282.590,2024-11-07 20:13:21.166
48690,,37.851818,-122.296438,5.17,1928.3666,Running,2024-10-15T18:18:53+00:00,2024-10-15 11:18:53,2190,6434.559648,,,NaT


In [36]:
def convert_speed(elapsed_seconds, distance_meters):
    """Return the average speed in miles per hour for a distance covered in a time.

    The result is ``distance_meters`` converted to miles, divided by
    ``elapsed_seconds`` converted to hours, rounded to two decimal places.

    Args:
        elapsed_seconds: Time taken, in seconds.
        distance_meters: Distance covered, in meters.

    Returns:
        The speed in mph rounded to two decimals, or ``None`` when either
        input is ``None``, NaN, non-numeric, or when ``elapsed_seconds`` is
        not strictly positive.
    """
    import math

    meters_to_miles = 0.000621371
    seconds_per_hour = 3600

    if elapsed_seconds is None or distance_meters is None:
        return None

    try:
        # Convert distance from meters to miles and time from seconds to hours
        distance_miles = distance_meters * meters_to_miles
        elapsed_hours = elapsed_seconds / seconds_per_hour
        # NaN never compares <= 0, so it has to be rejected explicitly;
        # otherwise it would propagate as a NaN "speed" instead of None.
        if math.isnan(distance_miles) or math.isnan(elapsed_hours):
            return None
    except (ValueError, TypeError):
        # Non-numeric input in either argument
        return None

    if elapsed_hours <= 0:
        return None

    speed = distance_miles / elapsed_hours
    return round(speed, 2)  # Output speed rounded to two decimal places

In [37]:
# Compute a speed for every sample from its elapsed time and distance
map_my_run_df['speed'] = map_my_run_df.apply(
    lambda record: convert_speed(record['elapsed_seconds'], record['distance']),
    axis='columns',
)


In [49]:
# Number of distinct activity ids (a missing id, if present, counts as one value)
map_my_run_df['activity_id'].nunique(dropna=False)

45

In [39]:
# Anchor every sample to the earliest timestamp of its source file. 'min' is
# used rather than 'first' so the anchor does not depend on the order in which
# the rows happened to be returned by the query.
polar_data_df['first_datetime'] = polar_data_df.groupby('source_file')['datetime'].transform('min')
# Seconds elapsed since the start of the same source file
polar_data_df['elapsed_seconds'] = (polar_data_df['datetime'] - polar_data_df['first_datetime']).dt.total_seconds()


In [59]:
# Add an all-NaN 'heart_rate' column to map_my_run_df
map_my_run_df = map_my_run_df.assign(heart_rate=np.nan)


In [64]:
# Tag every row with a numeric id for its source: 0 = watch, 1 = Polar, 2 = Map My Run
for source_frame, source_id in (
    (watch_exercise_df, 0),
    (polar_data_df, 1),
    (map_my_run_df, 2),
):
    source_frame['app_id'] = source_id

In [65]:
# Preview map_my_run_df restricted to the columns used for the combined dataset
map_my_run_df.loc[
    :,
    ['activity_id', 'app_id', 'datetime', 'elapsed_seconds', 'heart_rate',
     'speed', 'distance', 'latitude', 'longitude', 'altitude'],
]

Unnamed: 0,activity_id,app_id,datetime,elapsed_seconds,heart_rate,speed,distance,latitude,longitude,altitude
37720,2024-03-06T22:31:44+00:00,2,2024-03-06 22:31:44.882,0.000,,,,,,
37721,2024-03-06T22:31:44+00:00,2,2024-03-06 22:31:45.072,0.190,,,,,,
37722,2024-03-06T22:31:44+00:00,2,2024-03-06 22:31:46.506,1.624,,,,,,
37723,2024-03-06T22:31:44+00:00,2,2024-03-06 22:31:47.072,2.190,,,,,,
37724,2024-03-06T22:31:44+00:00,2,2024-03-06 22:31:48.072,3.190,,,,,,
...,...,...,...,...,...,...,...,...,...,...
16193,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:19.165,2280.589,,7.10,7233.5375,37.865282,-122.301938,3.78
16194,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:20.166,2281.590,,7.09,7236.5122,37.865306,-122.301950,3.85
16195,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:21.166,2282.590,,7.09,7239.4750,37.865329,-122.301963,3.91
48690,2024-10-15T18:18:53+00:00,2,NaT,,,,1928.3666,37.851818,-122.296438,5.17


In [66]:
# Inspect polar_data_df; as the last expression of the cell it is rendered as the cell output
polar_data_df

Unnamed: 0,time,heart_rate,speed,original_pace,distances,real_time,activity_id,datetime,first_datetime,elapsed_seconds,app_id
0,0,116.0,0.0,00:00,0.00,1707230610000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:30,2024-02-06 14:43:30,0.0,1
1,1000000000,116.0,3.2,18:36,0.00,1707230611000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:31,2024-02-06 14:43:30,1.0,1
2,2000000000,116.0,5.9,10:13,14.85,1707230612000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:32,2024-02-06 14:43:30,2.0,1
3,3000000000,116.0,5.9,10:13,27.78,1707230613000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:33,2024-02-06 14:43:30,3.0,1
4,4000000000,116.0,7.5,07:57,42.63,1707230614000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:34,2024-02-06 14:43:30,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...
35845,1931000000000,158.0,6.8,08:47,21076.54,1706539114000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:34,2024-01-29 14:06:23,1931.0,1
35846,1932000000000,158.0,7.0,08:33,21092.27,1706539115000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:35,2024-01-29 14:06:23,1932.0,1
35847,1933000000000,158.0,7.1,08:24,21099.58,1706539116000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:36,2024-01-29 14:06:23,1933.0,1
35848,1934000000000,158.0,7.1,08:30,21115.31,1706539117000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:37,2024-01-29 14:06:23,1934.0,1


In [70]:
# Rename Polar columns to the shared names; labels not present are left untouched by rename
polar_column_renames = {
    'source_file': 'activity_id',
    'hr': 'heart_rate',
    'distances': 'distance',
}
polar_data_df = polar_data_df.rename(polar_column_renames, axis='columns')

In [71]:
# Inspect polar_data_df again to check the column names after the rename
polar_data_df

Unnamed: 0,time,heart_rate,speed,original_pace,distance,real_time,activity_id,datetime,first_datetime,elapsed_seconds,app_id
0,0,116.0,0.0,00:00,0.00,1707230610000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:30,2024-02-06 14:43:30,0.0,1
1,1000000000,116.0,3.2,18:36,0.00,1707230611000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:31,2024-02-06 14:43:30,1.0,1
2,2000000000,116.0,5.9,10:13,14.85,1707230612000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:32,2024-02-06 14:43:30,2.0,1
3,3000000000,116.0,5.9,10:13,27.78,1707230613000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:33,2024-02-06 14:43:30,3.0,1
4,4000000000,116.0,7.5,07:57,42.63,1707230614000000000,David+_Diaz+_2024-02-06_14-43-30.CSV,2024-02-06 14:43:34,2024-02-06 14:43:30,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...
35845,1931000000000,158.0,6.8,08:47,21076.54,1706539114000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:34,2024-01-29 14:06:23,1931.0,1
35846,1932000000000,158.0,7.0,08:33,21092.27,1706539115000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:35,2024-01-29 14:06:23,1932.0,1
35847,1933000000000,158.0,7.1,08:24,21099.58,1706539116000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:36,2024-01-29 14:06:23,1933.0,1
35848,1934000000000,158.0,7.1,08:30,21115.31,1706539117000000000,David+_Diaz+_2024-01-29_14-06-23.CSV,2024-01-29 14:38:37,2024-01-29 14:06:23,1934.0,1


In [73]:
# Add all-NaN GPS columns so polar_data_df exposes the same columns as map_my_run_df
for gps_column in ('latitude', 'longitude', 'altitude'):
    polar_data_df[gps_column] = np.nan

In [74]:
# Preview polar_data_df restricted to the columns used for the combined dataset
polar_data_df.loc[
    :,
    ['activity_id', 'app_id', 'datetime', 'elapsed_seconds', 'heart_rate',
     'speed', 'distance', 'latitude', 'longitude', 'altitude'],
]


Unnamed: 0,activity_id,app_id,datetime,elapsed_seconds,heart_rate,speed,distance,latitude,longitude,altitude
0,David+_Diaz+_2024-02-06_14-43-30.CSV,1,2024-02-06 14:43:30,0.0,116.0,0.0,0.00,,,
1,David+_Diaz+_2024-02-06_14-43-30.CSV,1,2024-02-06 14:43:31,1.0,116.0,3.2,0.00,,,
2,David+_Diaz+_2024-02-06_14-43-30.CSV,1,2024-02-06 14:43:32,2.0,116.0,5.9,14.85,,,
3,David+_Diaz+_2024-02-06_14-43-30.CSV,1,2024-02-06 14:43:33,3.0,116.0,5.9,27.78,,,
4,David+_Diaz+_2024-02-06_14-43-30.CSV,1,2024-02-06 14:43:34,4.0,116.0,7.5,42.63,,,
...,...,...,...,...,...,...,...,...,...,...
35845,David+_Diaz+_2024-01-29_14-06-23.CSV,1,2024-01-29 14:38:34,1931.0,158.0,6.8,21076.54,,,
35846,David+_Diaz+_2024-01-29_14-06-23.CSV,1,2024-01-29 14:38:35,1932.0,158.0,7.0,21092.27,,,
35847,David+_Diaz+_2024-01-29_14-06-23.CSV,1,2024-01-29 14:38:36,1933.0,158.0,7.1,21099.58,,,
35848,David+_Diaz+_2024-01-29_14-06-23.CSV,1,2024-01-29 14:38:37,1934.0,158.0,7.1,21115.31,,,


In [55]:
# Rename the watch frame's 'live_data' column to the shared 'activity_id' name
watch_column_renames = {'live_data': 'activity_id'}
watch_exercise_df = watch_exercise_df.rename(watch_column_renames, axis='columns')


In [76]:
# Add all-NaN GPS columns so watch_exercise_df exposes the same columns as map_my_run_df
for gps_column in ('latitude', 'longitude', 'altitude'):
    watch_exercise_df[gps_column] = np.nan

In [77]:
# Preview watch_exercise_df restricted to the columns used for the combined dataset
watch_exercise_df.loc[
    :,
    ['activity_id', 'app_id', 'datetime', 'elapsed_seconds', 'heart_rate',
     'speed', 'distance', 'latitude', 'longitude', 'altitude'],
]

Unnamed: 0,activity_id,app_id,datetime,elapsed_seconds,heart_rate,speed,distance,latitude,longitude,altitude
0,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:54:25.725,0,,1.338654,,,,
1,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:55:25.724,59,,1.145645,,,,
2,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:56:25.724,119,,1.345484,,,,
3,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:57:25.662,179,87.0,,,,,
4,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:57:25.724,179,,1.412211,,,,
...,...,...,...,...,...,...,...,...,...,...
23087,0e5da72a-2435-8b22-4a88-f81e07676fce.live_data,0,2024-11-07 16:27:50.663,988,119.0,,,,,
23088,0e5da72a-2435-8b22-4a88-f81e07676fce.live_data,0,2024-11-07 16:28:00.664,998,117.0,,,,,
23089,0e5da72a-2435-8b22-4a88-f81e07676fce.live_data,0,2024-11-07 16:28:10.666,1008,122.0,,,,,
23090,0e5da72a-2435-8b22-4a88-f81e07676fce.live_data,0,2024-11-07 16:28:20.667,1018,122.0,,,,,


In [78]:
# Columns kept from every source, in the order used for the combined frame
shared_columns = [
    'activity_id', 'app_id', 'datetime', 'elapsed_seconds', 'heart_rate',
    'speed', 'distance', 'latitude', 'longitude', 'altitude',
]

# One same-shaped slice per source
watch_exercise_df_filtered = watch_exercise_df[shared_columns]
polar_data_df_filtered = polar_data_df[shared_columns]
map_my_run_df_filtered = map_my_run_df[shared_columns]


In [79]:
# Stack the three per-source slices row-wise and renumber the index from zero
frames_to_stack = [
    watch_exercise_df_filtered,
    polar_data_df_filtered,
    map_my_run_df_filtered,
]
combined_df = pd.concat(frames_to_stack, ignore_index=True)

In [80]:
# Inspect the combined frame built from the three filtered sources
combined_df

Unnamed: 0,activity_id,app_id,datetime,elapsed_seconds,heart_rate,speed,distance,latitude,longitude,altitude
0,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:54:25.725,0.000,,1.338654,,,,
1,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:55:25.724,59.000,,1.145645,,,,
2,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:56:25.724,119.000,,1.345484,,,,
3,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:57:25.662,179.000,87.0,,,,,
4,04ddf549-cbc9-351f-9b1a-51511470d787.live_data,0,2024-04-26 20:57:25.724,179.000,,1.412211,,,,
...,...,...,...,...,...,...,...,...,...,...
153770,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:19.165,2280.589,,7.100000,7233.5375,37.865282,-122.301938,3.78
153771,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:20.166,2281.590,,7.090000,7236.5122,37.865306,-122.301950,3.85
153772,2024-11-07T19:35:17+00:00,2,2024-11-07 20:13:21.166,2282.590,,7.090000,7239.4750,37.865329,-122.301963,3.91
153773,2024-10-15T18:18:53+00:00,2,NaT,,,,1928.3666,37.851818,-122.296438,5.17
