In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import TensorDataset, DataLoader
import os 
import requests
from datetime import datetime
import pandas as pd

In [87]:
url = 'https://api.ouraring.com/v2/usercollection/daily_stress' 
params={ 
    'start_date': '2021-11-01', 
    'end_date': '2024-12-01' 
}
headers = { 
  'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
}
response = requests.request('GET', url, headers=headers, params=params) 
data = response.json()['data']

In [88]:
def get_data(api):
    url = f'https://api.ouraring.com/v2/usercollection/{api}'
    headers = { 
        'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
    }
    response = requests.request('GET', url, headers=headers, params=params) 
    data = response.json()['data']

    rows = []

    for item in data: 
        row = item.copy()

        if 'contributors' in item:
            row.update(item['contributors'])
            del row['contributors']
        
        rows.append(row)
    
    return rows

In [89]:
def group_data(data):
    for item in data:
        item['day'] = datetime.strptime(item['timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').date()

    df = pd.DataFrame(data)

    grouped = df.groupby('day').agg({'bpm': list, 'source': list, 'timestamp': list}).reset_index()

    return grouped.to_dict('records')

In [105]:
def transform_data(data):
    for item in data:
        item['day'] = datetime.strptime(item['timestamp'], '%Y-%m-%dT%H:%M:%S+00:00').date()
    df = pd.DataFrame(data)

    grouped = df.groupby('day').agg({
        'bpm': ['median', 'mean', lambda x: x.quantile(0.25), lambda x: x.quantile(0.75)],
        'source': lambda x: x.mode()[0] if len(x.mode()) > 0 else np.nan
    }).reset_index()

    grouped.columns = ['_'.join(col).strip() for col in grouped.columns.values]

    return grouped.to_dict('records')

In [164]:
def aggregate_data(apis):
    combined_df = pd.DataFrame()

    for api in apis:
        data = get_data(api)
        df = pd.DataFrame(data)
        df['day'] = pd.to_datetime(df['day']).dt.date
        df = df.set_index('day')
        df.columns = [f'{api}_{col}' for col in df.columns]
        
        if combined_df.empty:
            combined_df = df
        else:
            combined_df = combined_df.join(df, how='outer')
    
    combined_df = combined_df.reset_index()

    return combined_df

In [207]:
api = [
    'daily_sleep',
    'daily_readiness',
    'daily_stress',
    'daily_activity',
]
aggregated_data = aggregate_data(api)

In [208]:
day_summary_mapping = {
    'restored': 1,
    'normal': 0,
    'stressful': -1,
    None: 0
}
aggregated_data['daily_stress_day_summary'] = aggregated_data['daily_stress_day_summary'].map(day_summary_mapping)

In [212]:
if 'week' not in aggregated_data.columns:
    aggregated_data['week'] = pd.to_datetime(aggregated_data['day']).dt.isocalendar().week

In [213]:
aggregated_data = aggregated_data.dropna(axis=1, how='all')

In [214]:
aggregated_data.to_csv('D:\Root\projects\oura_ai\backend\aggregated_data.csv')

In [219]:
def compute_weekly_importances(df, target):
    weekly_importances = []
    weeks = df['week'].unique()
    imputer_X = SimpleImputer(strategy='mean')
    imputer_y = SimpleImputer(strategy='mean')

    for week in weeks:
        df_week = df[df['week'] == week]
        if df_week.empty:
            continue
        
        X_week = df_week.drop(columns=[target, 'day', 'daily_sleep_id', 'daily_sleep_timestamp', 'daily_readiness_id', 
        'daily_readiness_timestamp', 'daily_stress_id', 'daily_stress_timestamp', 'daily_activity_id', 'daily_activity_timestamp'], errors='ignore')
        X_week = X_week.select_dtypes(include=[float, int])
        

        if X_week.empty or X_week.shape[0] == 0 or X_week.shape[1] == 0:
            continue
        
        imputed_data = imputer_X.fit_transform(X_week)
        if imputed_data.shape[1] == len(X_week.columns):
            X_week = pd.DataFrame(imputed_data, columns=X_week.columns)
        else:
            X_week = pd.DataFrame(imputed_data, columns=[f'feature_{i}' for i in range(imputed_data.shape[1])])
        
        y_week = df_week[target]
        if y_week.empty or len(y_week) == 0:
            continue
        
        imputer_y.fit(y_week.values.reshape(-1, 1))
        y_week = pd.Series(imputer_y.transform(y_week.values.reshape(-1, 1)).flatten())
        
        X_train, X_test, y_train, y_test = train_test_split(X_week, y_week, test_size=0.2, random_state=42)
        
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        
        importances = model.feature_importances_
        feature_importance = pd.DataFrame({'feature': X_week.columns, 'importance': importances, 'week': week})
        weekly_importances.append(feature_importance)
    
    if weekly_importances:
        weekly_importances_df = pd.concat(weekly_importances, ignore_index=True)
        return weekly_importances_df
    else:
        return pd.DataFrame(columns=['feature', 'importance', 'week'])


def analyze_feature_importance(importance_df):
    weeks = importance_df['week'].unique()
    last_week = weeks.max()
    
    all_time_importance = importance_df.groupby('feature')['importance'].mean().head(10)
    
    last_week_top_10 = importance_df[importance_df['week'] == last_week].sort_values(by='importance', ascending=False).head(10)

    top_10_weekly = {week: importance_df[importance_df['week'] == week].sort_values(by='importance', ascending=False).head(10)['feature'].tolist() for week in weeks}
    
    three_week_consistent = set()
    for i in range(len(weeks) - 2):
        week1, week2, week3 = weeks[i], weeks[i+1], weeks[i+2]
        top_10_3weeks = set(top_10_weekly[week1]) & set(top_10_weekly[week2]) & set(top_10_weekly[week3])
        three_week_consistent.update(top_10_3weeks)
    
    dropped_features = [feature for feature in three_week_consistent if feature not in top_10_weekly[last_week]]
    
    results = {
        'all_time_top_10': all_time_importance.index.tolist(),
        'last_week_top_10': last_week_top_10['feature'].tolist(),
        'three_week_consistent': list(three_week_consistent),
        'dropped_features': dropped_features
    }
    
    return results

In [220]:
metrics = ['daily_activity_score', 'daily_readiness_score', 'daily_stress_day_summary', 'daily_sleep_score']

importance_dfs = {metric: compute_weekly_importances(aggregated_data, metric) for metric in metrics}

analysis_results = {metric: analyze_feature_importance(importance_df) for metric, importance_df in importance_dfs.items()}

for metric, results in analysis_results.items():
    print(f"\nMetric: {metric}")
    print("\nAll time top 10 features:")
    print(results['all_time_top_10'])
    print("\nLast week top 10 features:")
    print(results['last_week_top_10'])
    print("\nFeatures in top 10 for at least three weeks:")
    print(results['three_week_consistent'])
    print("\nFeatures that were in top 10 for three weeks but not in the last week:")
    print(results['dropped_features'])




Metric: daily_activity_score

All time top 10 features:
['daily_activity_active_calories', 'daily_activity_average_met_minutes', 'daily_activity_equivalent_walking_distance', 'daily_activity_high_activity_met_minutes', 'daily_activity_high_activity_time', 'daily_activity_inactivity_alerts', 'daily_activity_low_activity_met_minutes', 'daily_activity_low_activity_time', 'daily_activity_medium_activity_met_minutes', 'daily_activity_medium_activity_time']

Last week top 10 features:
['daily_readiness_hrv_balance', 'daily_activity_move_every_hour', 'daily_sleep_total_sleep', 'daily_readiness_previous_night', 'daily_activity_inactivity_alerts', 'daily_activity_high_activity_time', 'daily_sleep_latency', 'daily_activity_stay_active', 'daily_activity_medium_activity_time', 'daily_readiness_previous_day_activity']

Features in top 10 for at least three weeks:
['daily_stress_stress_high', 'daily_activity_steps', 'daily_sleep_total_sleep', 'daily_activity_sedentary_time', 'daily_activity_low_act