In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import TensorDataset, DataLoader
import os 
import requests
from datetime import datetime
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import pickle
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import copy
import sklearn

In [2]:
print(f"Pandas version: {pd.__version__}")
print(f"Pickle version: {pickle.format_version}")
print(f"Numpy version: {np.__version__}")
print(f"Scikit-learn version: {sklearn.__version__}")

Pandas version: 2.1.1
Pickle version: 4.0
Numpy version: 1.24.3
Scikit-learn version: 1.3.2


## Load

In [14]:
url = 'https://api.ouraring.com/v2/usercollection/daily_stress' 
params={ 
    'start_date': '2023-01-01', 
    'end_date': '2024-12-01' 
}
headers = { 
  'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
}
response = requests.request('GET', url, headers=headers, params=params) 
data = response.json()['data']

In [15]:
def get_data(api):
    url = f'https://api.ouraring.com/v2/usercollection/{api}'
    headers = { 
        'Authorization': f'Bearer {os.getenv("OURA_TOKEN")}' 
    }
    response = requests.request('GET', url, headers=headers, params=params) 
    data = response.json()['data']

    rows = []

    for item in data: 
        row = item.copy()

        if 'contributors' in item:
            row.update(item['contributors'])
            del row['contributors']
        
        rows.append(row)
    
    return rows


def aggregate_data(apis):
    combined_df = pd.DataFrame()

    for api in apis:
        data = get_data(api)
        df = pd.DataFrame(data)
        df['day'] = pd.to_datetime(df['day']).dt.date
        df = df.set_index('day')
        df.columns = [f'{api}_{col}' for col in df.columns]
        
        if combined_df.empty:
            combined_df = df
        else:
            combined_df = combined_df.join(df, how='outer')
    
    combined_df = combined_df.reset_index()

    return combined_df

In [16]:
api = [
    'daily_sleep',
    'daily_readiness',
    'daily_stress',
    'daily_activity',
]
aggregated_data = aggregate_data(api)

## Normalization

In [18]:
day_summary_mapping = {
    'restored': 1,
    'normal': 0,
    'stressful': -1,
    None: 0
}
aggregated_data['daily_stress_day_summary'] = aggregated_data['daily_stress_day_summary'].map(day_summary_mapping)

aggregated_data['week'] = pd.to_datetime(aggregated_data['day']).dt.isocalendar().week

In [19]:
aggregated_data['week'] = pd.to_datetime(aggregated_data['day']).dt.isocalendar().week

In [20]:
columns_to_drop = ['daily_sleep_id', 'daily_sleep_timestamp', 'daily_readiness_id', 
                   'daily_readiness_timestamp', 'daily_stress_id', 'daily_activity_id', 
                   'daily_activity_timestamp', 'daily_activity_met', 'daily_activity_class_5_min']

aggregated_data = aggregated_data.drop(columns=[col for col in columns_to_drop if col in aggregated_data.columns])

In [21]:
aggregated_data = aggregated_data.dropna(axis=1, how='all')
numeric_columns = aggregated_data.select_dtypes(include=['float64', 'int64'])
aggregated_data[numeric_columns.columns] = numeric_columns.fillna(numeric_columns.mean())

In [22]:
without_normalization_df = copy.deepcopy(aggregated_data)
without_normalization_df['day'] = pd.to_datetime(without_normalization_df['day'])

In [24]:
scaler = StandardScaler()
aggregated_data[numeric_columns.columns] = scaler.fit_transform(numeric_columns)

## Model 

In [59]:
def train_model(df, target):
    weeks = df['week'].unique()
    imputer_X = SimpleImputer(strategy='mean')
    imputer_y = SimpleImputer(strategy='mean')
    models = {}

    for week in weeks:
        df_week = df[df['week'] == week]
        if df_week.empty:
            continue
        
        X_week = df_week.drop(columns=[target, 'day', 'week'], errors='ignore')
        X_week = X_week.select_dtypes(include=[float, int])
        
        if X_week.empty or X_week.shape[0] == 0 or X_week.shape[1] == 0:
            continue
        
        X_week = X_week.dropna(axis=1, how='all')
        imputed_data = imputer_X.fit_transform(X_week)
        X_week = pd.DataFrame(imputed_data, columns=X_week.columns)
        
        y_week = df_week[target]
        if y_week.empty or len(y_week) == 0:
            continue
        
        imputer_y.fit(y_week.values.reshape(-1, 1))
        y_week = pd.Series(imputer_y.transform(y_week.values.reshape(-1, 1)).flatten())
        
        X_train, X_test, y_train, y_test = train_test_split(X_week, y_week, test_size=0.2, random_state=42)
        
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        
        models[week] = {'model': model, 'X_test': X_test, 'y_test': y_test, 'X_columns': X_week.columns}
    
    return models

In [11]:
def evaluate_model(models):
    weekly_metrics = []

    for week, data in models.items():
        model = data['model']
        X_test = data['X_test']
        y_test = data['y_test']
        
        y_pred = model.predict(X_test)
        
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        
        weekly_metrics.append({'week': week, 'MSE': mse, 'MAE': mae, 'R2': r2})
    
    return weekly_metrics

In [25]:
def compute_feature_importances(models):
    weekly_importances = []

    for week, data in models.items():
        model = data['model']
        X_test = data['X_test']
        
        importances = model.feature_importances_
        feature_importance = pd.DataFrame({'feature': X_test.columns, 'importance': importances, 'week': week})
        weekly_importances.append(feature_importance)
    
    if weekly_importances:
        weekly_importances_df = pd.concat(weekly_importances, ignore_index=True)
        return weekly_importances_df
    else:
        return pd.DataFrame(columns=['feature', 'importance', 'week'])

In [26]:
def compute_feature_importances(models):
    weekly_importances = []

    if isinstance(models, dict):
        models = [models]

    for model_data in models:
        for week, data in model_data.items():
            model = data['model']
            X_test = data['X_test']
            
            importances = model.feature_importances_
            feature_importance = pd.DataFrame({'feature': X_test.columns, 'importance': importances, 'week': week})
            weekly_importances.append(feature_importance)
    
    if weekly_importances:
        weekly_importances_df = pd.concat(weekly_importances, ignore_index=True)
        return weekly_importances_df
    else:
        return pd.DataFrame(columns=['feature', 'importance', 'week'])

In [60]:
def load_models(metrics, directory):
    loaded_models = {}
    for metric in metrics:
        with open(os.path.join(directory, f'{metric}_model.pkl'), 'rb') as f:
            loaded_models[metric + "_model"] = pickle.load(f)
    return loaded_models

In [61]:
metrics = ['daily_activity_score', 'daily_readiness_score', 'daily_stress_day_summary', 'daily_sleep_score']

In [62]:
models = {}

for metric in metrics:
    models[metric] = train_model(aggregated_data, metric)

save_dir = '/Users/dmitrykorzhov/Desktop/Root/projects/oura_ai/backend/ml_models'

for metric, model in models.items():
    with open(os.path.join(save_dir, f'{metric}_model.pkl'), 'wb') as f:
        pickle.dump(model, f)

In [63]:
load_dir = '/Users/dmitrykorzhov/Desktop/Root/projects/oura_ai/backend/ml_models'

loaded_models = load_models(metrics, load_dir)

importance_dfs = {metric: compute_feature_importances(model) for metric, model in loaded_models.items()}

In [64]:
importance_dfs

{'daily_activity_score_model':                                 feature  importance  week
 0                     daily_sleep_score    0.029192    47
 1                daily_sleep_deep_sleep    0.014194    47
 2                daily_sleep_efficiency    0.035268    47
 3                   daily_sleep_latency    0.005426    47
 4                 daily_sleep_rem_sleep    0.062049    47
 ...                                 ...         ...   ...
 1498     daily_activity_move_every_hour    0.019231    26
 1499       daily_activity_recovery_time    0.000000    26
 1500         daily_activity_stay_active    0.000000    26
 1501  daily_activity_training_frequency    0.000000    26
 1502     daily_activity_training_volume    0.000000    26
 
 [1503 rows x 3 columns],
 'daily_readiness_score_model':                                 feature  importance  week
 0                     daily_sleep_score    0.031816    47
 1                daily_sleep_deep_sleep    0.029995    47
 2                daily_sl

In [65]:
directory = 'ml_models'

def load_model(metric):
    with open(os.path.join(directory, f'{metric}_model.pkl'), 'rb') as f:
        return pickle.load(f)


def row_insights_for_metric(metric):
    loaded_model = load_model(metric)
    
    feature_importances = compute_feature_importances(loaded_model)
    
    return feature_importances

In [66]:
aggregated_data.to_csv('aggregated_data.csv')

In [67]:
importance = row_insights_for_metric('daily_activity_score')

In [68]:
importance_dfs['daily_activity_score']

KeyError: 'daily_activity_score'

In [75]:
def get_current_week_data(importance_df, without_normalization_df, metric):
    weeks = importance_df['week'].unique()
    
    current_week = weeks[-1]
    
    metric = metric.replace('_model', '')

    current_week_top_10 = importance_df[importance_df['week'] == current_week].sort_values(by='importance', ascending=False).head(10)

    new_this_week = set(current_week_top_10['feature'])

    current_week_values = without_normalization_df[without_normalization_df['week'] == current_week][current_week_top_10['feature'].tolist() + ['day']].set_index('day').to_dict()

    metric_current_week_values = without_normalization_df[without_normalization_df['week'] == current_week][[metric, 'day']].set_index('day').to_dict()

    current_week_days = without_normalization_df[without_normalization_df['week'] == current_week]['day']

    current_date = pd.to_datetime('today').date()

    current_week_days = pd.to_datetime(current_week_days).dt.date

    results = {
        'new_this_week': list(new_this_week),
        'current_week_values': current_week_values,
        'current_week_top_10': current_week_top_10.set_index('feature')['importance'].to_dict(),
        'metric_current_week_values': metric_current_week_values,
    }
    
    return results


def get_last_week_data(importance_df, without_normalization_df, metric):
    weeks = importance_df['week'].unique()

    last_week = weeks[-2]
    
    metric = metric.replace('_model', '')

    last_week_top_10 = importance_df[importance_df['week'] == last_week].sort_values(by='importance', ascending=False).head(10)

    new_this_week = set(last_week_top_10['feature'])

    last_week_values = without_normalization_df[without_normalization_df['week'] == last_week][last_week_top_10['feature'].tolist() + ['day']].set_index('day').to_dict()

    metric_last_week_values = without_normalization_df[without_normalization_df['week'] == last_week][[metric, 'day']].set_index('day').to_dict()

    last_week_days = without_normalization_df[without_normalization_df['week'] == last_week]['day']

    current_date = pd.to_datetime('today').date()

    last_week_days = pd.to_datetime(last_week_days).dt.date

    results = {
        'new_this_week': list(new_this_week),
        'last_week_values': last_week_values,
        'last_week_top_10': last_week_top_10.set_index('feature')['importance'].to_dict(),
        'metric_last_week_values': metric_last_week_values,
    }
    
    return results


def get_last_month_data(importance_df, without_normalization_df):
    weeks = importance_df['week'].unique()
    last_month_weeks = weeks[-4:] if len(weeks) > 4 else weeks

    last_month_top_10 = importance_df[importance_df['week'].isin(last_month_weeks)].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10)
    previous_month_top_10 = importance_df[importance_df['week'].isin(weeks[-8:-4])].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10) if len(weeks) > 8 else last_month_top_10

    new_this_month = set(last_month_top_10.index) - set(previous_month_top_10.index)

    last_month_values = without_normalization_df[without_normalization_df['week'].isin(last_month_weeks)].drop(columns=['week', 'day']).mean().to_dict()

    consistent_monthly_features = set.intersection(*[set(importance_df[importance_df['week'] == week].sort_values(by='importance', ascending=False).head(20)['feature']) for week in last_month_weeks])

    results = {
        'new_this_month': list(new_this_month),
        'last_month_values': last_month_values,
        'last_month_top_10': last_month_top_10.to_dict(),
        'consistent_monthly_features': {feature: importance_df[importance_df['feature'] == feature]['importance'].mean() for feature in consistent_monthly_features},
    }
    
    return results

def get_last_3months_data(importance_df, without_normalization_df):
    weeks = importance_df['week'].unique()
    last_3months_weeks = weeks[-12:] if len(weeks) > 12 else weeks

    last_3months_top_10 = importance_df[importance_df['week'].isin(last_3months_weeks)].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10)
    previous_3months_top_10 = importance_df[importance_df['week'].isin(weeks[-24:-12])].groupby('feature')['importance'].mean().sort_values(ascending=False).head(10) if len(weeks) > 24 else last_3months_top_10

    new_this_3months = set(last_3months_top_10.index) - set(previous_3months_top_10.index)

    last_3months_values = without_normalization_df[without_normalization_df['week'].isin(last_3months_weeks)].drop(columns=['week', 'day']).mean().to_dict()

    consistent_3months_features = set.intersection(*[set(importance_df[importance_df['week'] == week].sort_values(by='importance', ascending=False).head(20)['feature']) for week in last_3months_weeks])

    results = {
        'new_this_3months': list(new_this_3months),
        'last_3months_values': last_3months_values,
        'last_3months_top_10': last_3months_top_10.to_dict(),
        'consistent_3months_features': {feature: importance_df[importance_df['feature'] == feature]['importance'].mean() for feature in consistent_3months_features},
    }
    
    return results

In [78]:
last_month_results = {metric: get_current_week_data(importance_df, without_normalization_df, metric) for metric, importance_df in importance_dfs.items()}

for metric, results in last_month_results.items():
    print(f"\nMetric: {metric}")
    print("\nTarget metrics values for the current week by days:")
    print(results['new_this_week'])
    print("\nNew features current week:")
    print(results['current_week_values'])
    print("\nCurrent week feature values by days for current week:")
    print(results['current_week_top_10'])
    print("\nTarget metrics values for the current week by days:")
    print(results['metric_current_week_values'])


Metric: daily_activity_score_model

Target metrics values for the current week by days:
['daily_readiness_temperature_deviation', 'daily_activity_low_activity_time', 'daily_readiness_previous_night', 'daily_stress_stress_high', 'daily_activity_target_calories', 'daily_activity_target_meters', 'daily_activity_equivalent_walking_distance', 'daily_readiness_recovery_index', 'daily_activity_meters_to_target', 'daily_sleep_total_sleep']

New features current week:
{'daily_readiness_temperature_deviation': {Timestamp('2024-06-24 00:00:00'): 0.25, Timestamp('2024-06-25 00:00:00'): -0.25, Timestamp('2024-06-26 00:00:00'): 0.26}, 'daily_stress_stress_high': {Timestamp('2024-06-24 00:00:00'): 3600, Timestamp('2024-06-25 00:00:00'): 8100, Timestamp('2024-06-26 00:00:00'): 3600}, 'daily_activity_target_calories': {Timestamp('2024-06-24 00:00:00'): 600, Timestamp('2024-06-25 00:00:00'): 600, Timestamp('2024-06-26 00:00:00'): 400}, 'daily_readiness_previous_night': {Timestamp('2024-06-24 00:00:00')

In [72]:
last_month_results = {metric: get_last_week_data(importance_df, without_normalization_df, metric) for metric, importance_df in importance_dfs.items()}

for metric, results in last_month_results.items():
    print(f"\nMetric: {metric}")
    print("\nTarget metrics values for the last week by days:")
    print(results['metric_last_week_values'])
    print("\nNew features last week:")
    print(results['new_this_week'])
    print("\nLast week feature values by days for last week:")
    print(results['last_week_values'])
    print("\nLast week top 10 features:")
    print(results['last_week_top_10'])

[47 48 49 50 51 52  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
 19 20 21 22 23 24 25 26]
New features this week: {'daily_readiness_temperature_deviation', 'daily_activity_low_activity_time', 'daily_readiness_previous_night', 'daily_stress_stress_high', 'daily_activity_target_calories', 'daily_activity_target_meters', 'daily_activity_equivalent_walking_distance', 'daily_readiness_recovery_index', 'daily_activity_meters_to_target', 'daily_sleep_total_sleep'}
[47 48 49 50 51 52  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18
 19 20 21 22 23 24 25 26]
New features this week: {'daily_activity_low_activity_time', 'daily_readiness_resting_heart_rate', 'daily_readiness_previous_day_activity', 'daily_readiness_temperature_trend_deviation', 'daily_readiness_recovery_index', 'daily_activity_target_meters', 'daily_activity_target_calories', 'daily_activity_meet_daily_targets', 'daily_activity_equivalent_walking_distance', 'daily_stress_recovery_high'}
[47 48 49 50 51 52  1  2  3  4

KeyError: 'metric_last_week_values'

In [182]:
last_month_results = {metric: get_last_month_data(importance_df, without_normalization_df) for metric, importance_df in importance_dfs.items()}

for metric, results in last_month_results.items():
    print(f"\nMetric: {metric}")
    print("\nNew features this month:")
    print(results['new_this_month'])
    print("\nLast month mean feature values:")
    print(results['last_month_values'])
    print("\nLast month top 10 features:")
    print(results['last_month_top_10'])
    print("\nFeatures consistently in top 20 for the last month:")
    print(results['consistent_monthly_features'])


Metric: daily_activity_score_model

New features this month:
['daily_activity_resting_time', 'daily_activity_low_activity_met_minutes', 'daily_activity_meters_to_target']

Last month mean feature values:
{'daily_sleep_score': 52.80424528301887, 'daily_sleep_deep_sleep': 60.93901617250674, 'daily_sleep_efficiency': 71.41071428571429, 'daily_sleep_latency': 84.88982479784366, 'daily_sleep_rem_sleep': 43.91711590296495, 'daily_sleep_restfulness': 57.23180592991913, 'daily_sleep_timing': 2.6580188679245285, 'daily_sleep_total_sleep': 51.04885444743935, 'daily_readiness_score': 71.31502695417791, 'daily_readiness_temperature_deviation': -0.009969676549865225, 'daily_readiness_temperature_trend_deviation': 0.08834631008801623, 'daily_readiness_activity_balance': 79.67115902964959, 'daily_readiness_body_temperature': 88.36421832884096, 'daily_readiness_hrv_balance': 81.28294036061027, 'daily_readiness_previous_day_activity': 74.17688679245283, 'daily_readiness_previous_night': 51.02560646900