In [1]:
import pandas as pd

In [2]:
activity = pd.read_csv('data/dailyActivity_merged.csv')
intensity = pd.read_csv('data/dailyIntensities_merged.csv')
steps = pd.read_csv('data/dailySteps_merged.csv')
sleep = pd.read_csv('data/sleepDay_merged.csv')
heartrate = pd.read_csv('data/heartrate_seconds_merged.csv')

In [3]:
activity['ActivityDate'] = pd.to_datetime(activity['ActivityDate'])
intensity['ActivityDay'] = pd.to_datetime(intensity['ActivityDay'])
steps['ActivityDay'] = pd.to_datetime(steps['ActivityDay'])
sleep['SleepDay'] = pd.to_datetime(sleep['SleepDay'], format='%m/%d/%Y %I:%M:%S %p')
heartrate['Time'] = pd.to_datetime(heartrate['Time'])
#Extract heart rate for merging
heartrate['Date'] = heartrate['Time'].dt.date

In [4]:
#average heart rate from seconds
daily_heartrate = heartrate.groupby(['Id', 'Date'])['Value'].mean().reset_index()
daily_heartrate.rename(columns={'Value': 'AverageHeartRate'}, inplace=True)
daily_heartrate['Date'] = pd.to_datetime(daily_heartrate['Date'])

In [5]:
activity_merged = activity.merge(intensity, how='left', left_on=['Id', 'ActivityDate'], right_on=['Id', 'ActivityDay'])
activity_merged = activity_merged.merge(steps, how='left', left_on=['Id', 'ActivityDate'], right_on=['Id', 'ActivityDay'])
activity_merged = activity_merged.merge(sleep, how='left', left_on=['Id', 'ActivityDate'], right_on=['Id', 'SleepDay'])
activity_merged = activity_merged.merge(daily_heartrate, how='left', left_on=['Id', 'ActivityDate'], right_on=['Id', 'Date'])
master_df = activity_merged.copy()

In [6]:
master_df.drop(columns=['ActivityDay_x', 'ActivityDay_y', 'SleepDay', 'Date'], inplace=True, errors='ignore')
master_df.rename(columns={'ActivityDate': 'Date'}, inplace=True)

print(master_df.head())

           Id       Date  TotalSteps  TotalDistance  TrackerDistance  \
0  1503960366 2016-04-12       13162           8.50             8.50   
1  1503960366 2016-04-13       10735           6.97             6.97   
2  1503960366 2016-04-14       10460           6.74             6.74   
3  1503960366 2016-04-15        9762           6.28             6.28   
4  1503960366 2016-04-16       12669           8.16             8.16   

   LoggedActivitiesDistance  VeryActiveDistance_x  ModeratelyActiveDistance_x  \
0                       0.0                  1.88                        0.55   
1                       0.0                  1.57                        0.69   
2                       0.0                  2.44                        0.40   
3                       0.0                  2.14                        1.26   
4                       0.0                  2.71                        0.41   

   LightActiveDistance_x  SedentaryActiveDistance_x  ...  VeryActiveMinutes_y  \

In [7]:
master_df.to_csv('data/clean_dataset.csv', index=False)
print("Final dataset saved as 'data/clean_dataset.csv'")

Final dataset saved as 'data/clean_dataset.csv'


In [11]:
activity.rename(columns={'ActivityDate': 'Date'}, inplace=True)
intensity.rename(columns={'ActivityDay': 'Date'}, inplace=True)
steps.rename(columns={'ActivityDay': 'Date'}, inplace=True)
sleep.rename(columns={'SleepDay': 'Date'}, inplace=True)

In [12]:
df = activity.merge(intensity, on=['Id', 'Date'], how='left') \
             .merge(steps, on=['Id', 'Date'], how='left') \
             .merge(sleep, on=['Id', 'Date'], how='left') \
             .merge(daily_heartrate, on=['Id', 'Date'], how='left')

In [13]:
df['SleepEfficiency'] = (df['TotalMinutesAsleep'] / df['TotalTimeInBed']) * 100

**Risk Labeling**

*Creating Risk score + Risk Category columns for training data*

In [15]:
print(df.columns.tolist())


['Id', 'Date', 'TotalSteps', 'TotalDistance', 'TrackerDistance', 'LoggedActivitiesDistance', 'VeryActiveDistance_x', 'ModeratelyActiveDistance_x', 'LightActiveDistance_x', 'SedentaryActiveDistance_x', 'VeryActiveMinutes_x', 'FairlyActiveMinutes_x', 'LightlyActiveMinutes_x', 'SedentaryMinutes_x', 'Calories', 'SedentaryMinutes_y', 'LightlyActiveMinutes_y', 'FairlyActiveMinutes_y', 'VeryActiveMinutes_y', 'SedentaryActiveDistance_y', 'LightActiveDistance_y', 'ModeratelyActiveDistance_y', 'VeryActiveDistance_y', 'StepTotal', 'TotalSleepRecords', 'TotalMinutesAsleep', 'TotalTimeInBed', 'AverageHeartRate', 'SleepEfficiency', 'RiskScore', 'RiskCategory']


In [16]:
import pandas as pd

df = pd.read_csv('clean_dataset.csv')

def calculate_risk(row):
    score = 0
    if row['TotalMinutesAsleep'] <= 300:
        score += 1
    if row['SleepEfficiency'] <=85:
        score += 1
    if row ['VeryActiveMinutes_x'] <=20:
        score += 1
    if row['SedentaryMinutes_x'] >= 1000:
        score += 1
    if row ['AverageHeartRate'] >= 90:
        score += 1
    return score

df['RiskScore'] = df.apply(calculate_risk, axis=1)

def categorize_risk(score):
    if score <=1:
        return 'Low Risk'
    elif score <=3:
        return 'Medium Risk'
    else:
        return 'High Risk'
    
df['RiskCategory'] = df['RiskScore'].apply(categorize_risk)

print(df['RiskCategory'].value_counts())
df.to_csv('clean_dataset.csv', index=False)
print("New labeled dataset saved as 'clean_dataset_labeled.csv'")

RiskCategory
Low Risk       504
Medium Risk    438
High Risk        1
Name: count, dtype: int64
New labeled dataset saved as 'clean_dataset_labeled.csv'
