In [221]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import TensorDataset, DataLoader
import os 
import requests
from datetime import datetime
import pandas as pd
from sklearn.preprocessing import StandardScaler

## Load

In [223]:
url = 'https://api.ouraring.com/v2/usercollection/daily_stress' 
params={ 
    'start_date': '2021-11-01', 
    'end_date': '2024-12-01' 
}
headers = { 
  'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
}
response = requests.request('GET', url, headers=headers, params=params) 
data = response.json()['data']

In [224]:
def get_data(api):
    url = f'https://api.ouraring.com/v2/usercollection/{api}'
    headers = { 
        'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
    }
    response = requests.request('GET', url, headers=headers, params=params) 
    data = response.json()['data']

    rows = []

    for item in data: 
        row = item.copy()

        if 'contributors' in item:
            row.update(item['contributors'])
            del row['contributors']
        
        rows.append(row)
    
    return rows


def aggregate_data(apis):
    combined_df = pd.DataFrame()

    for api in apis:
        data = get_data(api)
        df = pd.DataFrame(data)
        df['day'] = pd.to_datetime(df['day']).dt.date
        df = df.set_index('day')
        df.columns = [f'{api}_{col}' for col in df.columns]
        
        if combined_df.empty:
            combined_df = df
        else:
            combined_df = combined_df.join(df, how='outer')
    
    combined_df = combined_df.reset_index()

    return combined_df

In [246]:
api = [
    'daily_sleep',
    'daily_readiness',
    'daily_stress',
    'daily_activity',
]
aggregated_data = aggregate_data(api)

## Normalization

In [247]:
day_summary_mapping = {
    'restored': 1,
    'normal': 0,
    'stressful': -1,
    None: 0
}
aggregated_data['daily_stress_day_summary'] = aggregated_data['daily_stress_day_summary'].map(day_summary_mapping)

aggregated_data['week'] = pd.to_datetime(aggregated_data['day']).dt.isocalendar().week

In [248]:
columns_to_drop = ['daily_sleep_id', 'daily_sleep_timestamp', 'daily_readiness_id', 
                   'daily_readiness_timestamp', 'daily_stress_id', 'daily_activity_id', 
                   'daily_activity_timestamp', 'daily_activity_met', 'daily_activity_class_5_min']

aggregated_data = aggregated_data.drop(columns=[col for col in columns_to_drop if col in aggregated_data.columns])

In [253]:
aggregated_data = aggregated_data.dropna(axis=1, how='all')
numeric_columns = aggregated_data.select_dtypes(include=['float64', 'int64'])
aggregated_data[numeric_columns.columns] = numeric_columns.fillna(numeric_columns.mean())

In [254]:
scaler = StandardScaler()
aggregated_data[numeric_columns.columns] = scaler.fit_transform(numeric_columns)

In [255]:
aggregated_data.to_csv('D:\Root\projects\oura_ai\backend\aggregated_data.csv')

## Model 

In [285]:
def compute_weekly_importances(df, target):
    weekly_importances = []
    weeks = df['week'].unique()
    imputer_X = SimpleImputer(strategy='mean')
    imputer_y = SimpleImputer(strategy='mean')

    for week in weeks:
        df_week = df[df['week'] == week]
        if df_week.empty:
            continue
        
        X_week = df_week.drop(columns=[target, 'day', 'week'], errors='ignore')
        X_week = X_week.select_dtypes(include=[float, int])
        

        if X_week.empty or X_week.shape[0] == 0 or X_week.shape[1] == 0:
            continue
        
        imputed_data = imputer_X.fit_transform(X_week)
        if imputed_data.shape[1] == len(X_week.columns):
            X_week = pd.DataFrame(imputed_data, columns=X_week.columns)
        else:
            X_week = pd.DataFrame(imputed_data, columns=[f'feature_{i}' for i in range(imputed_data.shape[1])])
        
        y_week = df_week[target]
        if y_week.empty or len(y_week) == 0:
            continue
        
        imputer_y.fit(y_week.values.reshape(-1, 1))
        y_week = pd.Series(imputer_y.transform(y_week.values.reshape(-1, 1)).flatten())
        
        X_train, X_test, y_train, y_test = train_test_split(X_week, y_week, test_size=0.2, random_state=42)
        
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        
        importances = model.feature_importances_
        feature_importance = pd.DataFrame({'feature': X_week.columns, 'importance': importances, 'week': week})
        weekly_importances.append(feature_importance)
    
    if weekly_importances:
        weekly_importances_df = pd.concat(weekly_importances, ignore_index=True)
        return weekly_importances_df
    else:
        return pd.DataFrame(columns=['feature', 'importance', 'week'])


def analyze_feature_importance(importance_df):
    weeks = importance_df['week'].unique()
    last_week = weeks.max()
    previous_week = weeks[-2] if len(weeks) > 1 else last_week

    last_month_weeks = weeks[-4:-1]
    last_3months_weeks = weeks[-12:]

    last_week_top_10 = importance_df[importance_df['week'] == last_week].sort_values(by='importance', ascending=False).head(10)
    previous_week_top_10 = importance_df[importance_df['week'] == previous_week].sort_values(by='importance', ascending=False).head(10)
    last_month_top_10 = importance_df[importance_df['week'].isin(last_month_weeks)].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10)
    last_3months_top_10 = importance_df[importance_df['week'].isin(last_3months_weeks)].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10)

    consistent_monthly_features = set.intersection(*[set(importance_df[importance_df['week'] == week].sort_values(by='importance', ascending=False).head(20)['feature']) for week in last_month_weeks])
    consistent_3months_features = set.intersection(*[set(importance_df[importance_df['week'] == week].sort_values(by='importance', ascending=False).head(20)['feature']) for week in last_3months_weeks])

    new_this_week = set(last_week_top_10['feature']) - set(previous_week_top_10['feature'])
    new_this_month = set(last_week_top_10['feature']) - set(last_month_top_10.index)

    results = {
        'last_week_top_10': last_week_top_10.set_index('feature')['importance'].to_dict(),
        'last_month_top_10': last_month_top_10.to_dict(),
        'last_3months_top_10': last_3months_top_10.to_dict(),
        'consistent_monthly_features': {feature: importance_df[importance_df['feature'] == feature]['importance'].mean() for feature in consistent_monthly_features},
        'consistent_3months_features': {feature: importance_df[importance_df['feature'] == feature]['importance'].mean() for feature in consistent_3months_features},
        'new_this_week': list(new_this_week),
        'new_this_month': list(new_this_month),
    }
    
    return results


In [288]:
metrics = ['daily_activity_score', 'daily_readiness_score', 'daily_stress_day_summary', 'daily_sleep_score']

importance_dfs = {metric: compute_weekly_importances(aggregated_data, metric) for metric in metrics}

analysis_results = {metric: analyze_feature_importance(importance_df) for metric, importance_df in importance_dfs.items()}

for metric, results in analysis_results.items():
    print(f"\nMetric: {metric}")
    print("\nLast week top 10 features:")
    print(results['last_week_top_10'])
    print("\nLast month top 10 features:")
    print(results['last_month_top_10'])
    print("\nLast 3 months top 10 features:")
    print(results['last_3months_top_10'])
    print("\nFeatures consistently in top 20 for the last month:")
    print(results['consistent_monthly_features'])
    print("\nFeatures consistently in top 20 for the last 3 months:")
    print(results['consistent_3months_features'])
    print("\nFeatures new this week (which weren't in the previous week):")
    print(results['new_this_week'])
    print("\nFeatures new this week (which weren't in the previous month):")
    print(results['new_this_month'])




Metric: daily_activity_score

Last week top 10 features:
{'daily_readiness_hrv_balance': 0.06974510817468305, 'daily_readiness_sleep_balance': 0.05687766918088922, 'daily_readiness_previous_night': 0.04942478408517465, 'daily_activity_inactivity_alerts': 0.04926234519278551, 'daily_sleep_total_sleep': 0.04710549278214464, 'daily_activity_move_every_hour': 0.04046762870264809, 'daily_activity_high_activity_time': 0.03959399142185361, 'daily_sleep_latency': 0.03807918298675497, 'daily_readiness_previous_day_activity': 0.03685145826034731, 'daily_activity_meet_daily_targets': 0.03270690546449323}

Last month top 10 features:
{'daily_activity_sedentary_time': 0.049684932082925994, 'daily_activity_stay_active': 0.047218252954109786, 'daily_activity_low_activity_time': 0.046517008703347316, 'daily_sleep_total_sleep': 0.04157621477655977, 'daily_readiness_hrv_balance': 0.04117010307002097, 'daily_sleep_deep_sleep': 0.03648043472548389, 'daily_readiness_previous_night': 0.03646536493775554, '

## Analytics / Data Mapping

In [None]:
## read metrics to do 

## descriptions for each metric to do 

## drop model to backend to do 

## celery to run everything once a day to do

## openai functions for chat and daily featured insights to do