In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import numpy as np
import pytz

# Helper Functions
def process_sleep_data(df, freq='1min', start_date='2020-09-26 00:00:00', end_date='2023-03-17 00:00:00'):
    #exclude where valud is HKCategoryValueSleepAnalysisInBed
    df = df.drop(df[df['value'] == 'HKCategoryValueSleepAnalysisInBed'].index)
    
    # Parse dates and times
    df['startDate'] = pd.to_datetime(df['startDate'])
    df['endDate'] = pd.to_datetime(df['endDate'])

    # Create the date range
    expanded_df = pd.DataFrame()
    expanded_df["date"] = pd.date_range(start_date, end_date, freq=freq, tz=pytz.FixedOffset(-240))

    # 1 if between startDate and endDate, 0 otherwise
    expanded_df["value"] = 0
    for _, row in df.iterrows():
        mask = (expanded_df['date'] >= row['startDate']) & (expanded_df['date'] <= row['endDate'])
        expanded_df.loc[mask, 'value'] = 1
        
    expanded_df = expanded_df.rename(columns={'value': 'sleep'})

    return expanded_df


def preprocess_feature_data(df, col_name, freq='1min', smoothing = 2, start_date='2020-09-26 00:00:00', end_date='2023-03-17 00:00:00'):
    df = df[(df['startDate'] >= start_date) & (df['startDate'] <= end_date)]
    
    df = pd.melt(df, id_vars=['value'], value_vars=['startDate', 'endDate'], value_name='date')
    df = df.groupby('date', as_index=False).mean(numeric_only=True)
    df = df.sort_values(by='date')
    
    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')
    df = df.resample(freq).mean()
    
    df = df.interpolate().rolling(smoothing).mean()
    df = df.fillna(method="bfill")
    
    df = df.reset_index()
    df = df.rename(columns={'date': 'date', 'value': col_name})
    
    return df

In [10]:
%%time
freq='1min'
start_date='2020-09-26 00:00:00'
end_date='2023-03-17 00:00:00'
df = pd.read_csv('data/train_detailed.csv', low_memory=False)

# Parse dates and times
df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])

# Create the date range
date_range = pd.date_range(start_date, end_date, freq=freq, tz=pytz.FixedOffset(-240))

# Convert date range and dataframe dates to numpy arrays for faster computation
date_range_arr = date_range.values.astype('datetime64[m]')
start_date_arr = df['startDate'].values.astype('datetime64[m]')
end_date_arr = df['endDate'].values.astype('datetime64[m]')

# Create a boolean mask array where each element corresponds to whether the date range
# at that index is between the start and end dates in the dataframe
mask = (date_range_arr[:,None] >= start_date_arr) & (date_range_arr[:,None] <= end_date_arr)

# Use numpy.sum to count the number of True values in each row of the mask array,
# which corresponds to the number of intervals that overlap with each dataframe row
counts = np.sum(mask, axis=1)

# Create the expanded dataframe using the date range and the counts array
expanded_df = pd.DataFrame({'date': date_range, 'value': counts})

expanded_df.head()


CPU times: user 20.1 s, sys: 2.27 s, total: 22.3 s
Wall time: 22.3 s


Unnamed: 0,date,value
0,2020-09-26 00:00:00-04:00,0
1,2020-09-26 00:01:00-04:00,0
2,2020-09-26 00:02:00-04:00,0
3,2020-09-26 00:03:00-04:00,0
4,2020-09-26 00:04:00-04:00,0


In [6]:
(expanded_df.value > 0).value_counts()


False    1072434
True      226447
Name: value, dtype: int64

In [9]:
%%time
freq='1min'
start_date='2020-09-26 00:00:00'
end_date='2023-03-17 00:00:00'
df = pd.read_csv('data/train_detailed.csv', low_memory=False)

# Parse dates and times
df['startDate'] = pd.to_datetime(df['startDate'])
df['endDate'] = pd.to_datetime(df['endDate'])

# Create the date range
expanded_df = pd.DataFrame()
expanded_df["date"] = pd.date_range(start_date, end_date, freq=freq, tz=pytz.FixedOffset(-240))

# 1 if between startDate and endDate, 0 otherwise
expanded_df["value"] = 0
for _, row in df.iterrows():
    mask = (expanded_df['date'] >= row['startDate']) & (expanded_df['date'] <= row['endDate'])
    expanded_df.loc[mask, 'value'] = 1

# Create the expanded dataframe using the date range and the counts array
expanded_df = pd.DataFrame({'date': date_range, 'value': counts})

CPU times: user 19.7 s, sys: 9.42 ms, total: 19.7 s
Wall time: 19.7 s


In [12]:
expanded_df

Unnamed: 0,date,value
0,2020-09-26 00:00:00-04:00,0
1,2020-09-26 00:01:00-04:00,0
2,2020-09-26 00:02:00-04:00,0
3,2020-09-26 00:03:00-04:00,0
4,2020-09-26 00:04:00-04:00,0
...,...,...
1298876,2023-03-16 23:56:00-04:00,0
1298877,2023-03-16 23:57:00-04:00,0
1298878,2023-03-16 23:58:00-04:00,0
1298879,2023-03-16 23:59:00-04:00,0
