In [1]:
import pandas as pd
import os

In [None]:
# for renaming the filenames in 'user_processed' folder from Unix epoch time to datetime'
from datetime import datetime

def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d')

parent_folder = './train_dataset/user21-25/user25_processed'

for subfolder in os.listdir(parent_folder):
    subfolder_path = os.path.join(parent_folder, subfolder)
    # check if it's a directory 
    if os.path.isdir(subfolder_path):
        try:
            # convert subfolder name to datetime format
            new_name = unix_to_datetime(int(subfolder))
            os.rename(subfolder_path, os.path.join(parent_folder, new_name))
            print(f"Renamed {subfolder} to {new_name}")

            csv_file = os.path.join(parent_folder, new_name, f"{subfolder}_label.csv")
            new_csv_name = os.path.join(parent_folder, new_name, f"{new_name}_label.csv")

            os.rename(csv_file, new_csv_name)
            print(f"Renamed {csv_file} to {new_csv_name}")

        except ValueError:
            print(f"Skipping {subfolder}: Not a valid Unix epoch time")
        except Exception as e:
            print(f"Error renaming {subfolder}: {e}")


In [None]:
import os
from datetime import datetime

# Function to convert Unix epoch time to datetime format
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S')

# Path to the folder containing CSV files with Unix epoch filenames
folder_path = './train_dataset/user01-06/user01_processed/2020-08-31/e4Acc'

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    # Check if the file is a CSV file
    if file_name.endswith('.csv'):
        try:
            # Extract Unix epoch time from the filename
            epoch_time = int(os.path.splitext(file_name)[0])
            
            # Convert Unix epoch time to datetime format
            new_name = unix_to_datetime(epoch_time)
            
            # Rename the file with the new datetime format
            old_path = os.path.join(folder_path, file_name)
            new_path = os.path.join(folder_path, f"{new_name}.csv")
            os.rename(old_path, new_path)
            
            print(f"Renamed {file_name} to {new_name}.csv")
        except ValueError:
            print(f"Skipping {file_name}: Not a valid Unix epoch time")
        except Exception as e:
            print(f"Error renaming {file_name}: {e}")


## User info

In [43]:
user_info = pd.read_csv("./train_dataset/user_info_2020.csv")

## Label data

In [None]:
from datetime import datetime
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S')

user_label_01_0831 = pd.read_csv("./train_dataset/user01-06/user01_processed/2020-08-31/2020-08-31_label.csv")
user_label_01_0831['ts'] = user_label_01_0831['ts'].apply(unix_to_datetime)

## Survey data

In [43]:
# survey data for user01 on 2020-08-31
user_survey = pd.read_csv("./train_dataset/user_survey_2020.csv")
user_survey['date'] = pd.to_datetime(user_survey['date'])
user_survey_01 = user_survey[user_survey['userId'] == 'user01']
user_survey_01_0831 = user_survey_01[user_survey_01['date'] == '2020-08-31']

## Sleep data

In [27]:
# sleep data for user01 on 2020-08-31
user_sleep = pd.read_csv("./train_dataset/user_sleep_2020.csv")
user_sleep['date'] = pd.to_datetime(user_sleep['date'])
user_sleep_01 = user_sleep[user_sleep['userId'] == 'user01']
user_sleep_01_0831 = user_sleep_01[user_sleep_01['date'] == '2020-08-31']

user_sleep_01_0831 = user_sleep_01_0831.copy()
user_sleep_01_0831.loc[:, 'startDt'] = pd.to_datetime(user_sleep_01_0831['startDt'], unit='s')
user_sleep_01_0831.loc[:, 'endDt'] = pd.to_datetime(user_sleep_01_0831['endDt'], unit='s')
user_sleep_01_0831.loc[:, 'lastUpdate'] = pd.to_datetime(user_sleep_01_0831['lastUpdate'], unit='s')

user_sleep_01_0831.columns

['2020-08-30 15:44:00']
Length: 1, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  user_sleep_01_0831.loc[:, 'startDt'] = pd.to_datetime(user_sleep_01_0831['startDt'], unit='s')
['2020-08-30 23:43:00']
Length: 1, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  user_sleep_01_0831.loc[:, 'endDt'] = pd.to_datetime(user_sleep_01_0831['endDt'], unit='s')
['2020-08-31 01:46:13']
Length: 1, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  user_sleep_01_0831.loc[:, 'lastUpdate'] = pd.to_datetime(user_sleep_01_0831['lastUpdate'], unit='s')


Index(['userId', 'timezone', 'date', 'startDt', 'endDt', 'lastUpdate',
       'wakeupduration', 'lightsleepduration', 'deepsleepduration',
       'wakeupcount', 'durationtosleep', 'remsleepduration',
       'durationtowakeup', 'hr_average', 'hr_min', 'hr_max', 'rr_average',
       'rr_min', 'rr_max', 'breathing_disturbances_intensity', 'snoring',
       'snoringepisodecount', 'sleep_score'],
      dtype='object')

In [13]:
import pickle
def open_pickle(filename: str):
    with open(filename, 'rb') as file:
        load = pickle.load(file)
    return load

def save_pickle(filename: str, data):
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

## Sensor Data

### Data aggregation

- Originally, for each user, date -> sensor data name -> separate csv files for each 'minute'.
- To aggregate, first produce a single file for each date's sensor data name

In [None]:
# first, add the date ('2020-08-30') and time in seconds ('36') to create a column 'timestamp' in ('%Y-%m-%d %H:%M:%S.%f') format
# then combine them for each ssensor type
import os
import pandas as pd
from datetime import datetime
from pathlib import Path
import math
from pandas.errors import EmptyDataError

# Function to convert Unix time to datetime string
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S.%f')

def calculate_magnitude(x, y, z):
    return math.sqrt(x**2 + y**2 + z**2)

# Function to reformat sensor data in a directory
def reformat_sensor_data(date_directory, data_type, output_dir):
    directory = date_directory / data_type
    output_filename = f"{data_type}_{date_directory.name}_combined.csv"
    output_file = output_dir / output_filename

    # Skip processing if the output file already exists
    if output_file.exists():
        print(f"Processed file already exists: {output_file}, skipping.")
        return None

    csv_files = list(directory.glob('*.csv'))
    
    if not csv_files:
        print(f"No CSV files found in {directory}, skipping.")
        return None
    
    dfs = []
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
        except EmptyDataError:
            print(f"EmptyDataError: {csv_file} is empty, skipping this file.")
            continue
            
        time_value = csv_file.stem
        try:
            df['time_value'] = float(time_value)
        except ValueError:
            print(f"ValueError: could not convert {time_value} to float, skipping this file.")
            continue
        dfs.append(df)
    
    if not dfs:
        print(f"No valid CSV files found in {directory}, skipping.")
        return None

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values(['time_value', 'timestamp'], ascending=[True, True])
    
    def safe_float_conversion(x):
        try:
            return float(x)
        except ValueError:
            print(f"ValueError: could not convert {x} to float, skipping this row.")
            return None
    
    combined_df['timestamp'] = combined_df['timestamp'].apply(safe_float_conversion)
    combined_df = combined_df.dropna(subset=['timestamp'])  # Drop rows where conversion failed
    
    combined_df['timestamp'] = (combined_df['time_value'] + combined_df['timestamp']).apply(unix_to_datetime)

    final_df = combined_df.drop(columns=['time_value'])
    
    if data_type in ['mAcc', 'e4Acc', 'mGyr', 'mMag']:
        # Apply calculate_magnitude function to specified columns
        magnitude_col = f"magnitude_{data_type}"
        final_df[magnitude_col] = final_df.apply(lambda row: calculate_magnitude(row['x'], row['y'], row['z']), axis=1)

    # Save the final DataFrame to a CSV file
    final_df.to_csv(output_file, index=False)
    print(f"Saved processed data to: {output_file}")
    return final_df

# Define the base directory and output directory
base_dir = Path('./train_dataset/user01-06/user05_processed')
output_dir = Path('./train_dataset/user01-06/user05_processed')

# Traverse the directory structure and process each date directory
for date_dir in base_dir.rglob('*'):
    if date_dir.is_dir():
        for data_type in ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']:
            data_type_path = date_dir / data_type
            if data_type_path.is_dir():
                print(f"Processing directory: {data_type_path}")
                final_df = reformat_sensor_data(date_dir, data_type, output_dir)

In [None]:
import os

def delete_combined_files(directory):
    files = os.listdir(directory)
    for file in files:
        if file.endswith("_combined.csv"):
            file_path = os.path.join(directory, file)
            os.remove(file_path)
            print(f"Deleted: {file_path}")

directory_path = './train_dataset/user01-06/user06_processed'
delete_combined_files(directory_path)

In [6]:
import glob
from pathlib import Path 
import pandas as pd 

def concat_csv(data_type, output_dir):
    files_to_concat = [file for file in output_dir.glob("*.csv") if data_type in file.name]
    if len(files_to_concat) <2:
        print(f"Not enough files found for {data_type}, skipping concatenation")
        return None 
    
    dfs = [pd.read_csv(file) for file in files_to_concat]
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values('timestamp')
    return combined_df 

output_dir = Path('./train_dataset/user01-06/user06_processed')
user_id = 'user06'

for data_type in ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']:
    print(f"Processing data type: {data_type}")
    concat_df = concat_csv(data_type, output_dir)
    if concat_df is not None:
        output_filename = f"{user_id}_{data_type}_combined.csv"
        output_file = output_dir / output_filename
        concat_df.to_csv(output_file, index=False)
        print(f"Concatenated data saved to: {output_file}")

Processing data type: e4Acc
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Acc_combined.csv
Processing data type: e4Bvp
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Bvp_combined.csv
Processing data type: e4Eda
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Eda_combined.csv
Processing data type: e4Hr
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Hr_combined.csv
Processing data type: e4Temp
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Temp_combined.csv
Processing data type: mAcc
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mAcc_combined.csv
Processing data type: mGps
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mGps_combined.csv
Processing data type: mGyr
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mGyr_combined.csv
Processing data type: 

In [2]:
import shutil 

def move_file(src_file_path: str, dest_dir_path: str):
    try:
        src_file = Path(src_file_path)
        dest_dir = Path(dest_dir_path)
        
        if not src_file.exists():
            print(f"Source file '{src_file}' does not exist.")
            return
        
        if not dest_dir.is_dir():
            print(f"Destination directory '{dest_dir}' is invalid.")
            return
        
        shutil.move(src_file, dest_dir)
        print(f"Moved '{src_file}' to '{dest_dir}' successfully.")
    except Exception as e:
        print(f"An error occurred: {e}")

In [5]:
def resample_data(df, unit: str):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    resampled_df = df.resample(unit).mean()
    return resampled_df


def resample_data_bvp(df, unit: str):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    df['bvp_positive'] = df['value'].apply(lambda x: x if x > 0 else None)
    df['bvp_negative'] = df['value'].apply(lambda x: x if x < 0 else None)
    
    positive_mean = df['bvp_positive'].resample(unit).mean()
    negative_mean = df['bvp_negative'].resample(unit).mean()

    resampled_df = pd.DataFrame({'bvp_positive': positive_mean, 'bvp_negative': negative_mean})

    return resampled_df

In [294]:
import os
import pandas as pd

def resample_combine(folder_path, user_ids:list, unit:str):
    # Iterate through each user ID
    for user_id in user_ids:
        combined_df = pd.DataFrame()  # Initialize an empty DataFrame for each user
        
        # Iterate through each file in the folder
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)
            if os.path.isfile(file_path):
                if user_id in filename:
                    df = pd.read_csv(file_path)
                    
                    # Apply the appropriate resampling method based on filename
                    if 'Bvp' in filename:
                        df = resample_data_bvp(df, unit)
                    else:
                        df = resample_data(df, unit)
                
                    # Concatenate the processed DataFrame to the combined DataFrame
                    combined_df = pd.concat([combined_df, df], axis=0)
        combined_filename = f'{user_id}_combined.csv'
        combined_filepath = os.path.join(output_dir, combined_filename)
        combined_df.to_csv(combined_filepath)

In [90]:
import os
import pandas as pd

def process_sensor_data(root, users, sensors, resample_freq):

    # Define dictionary for resampling frequency
    freq_dict = {'h': 'hourly', 'min': 'minutely', 'd': 'daily'}
    
    # Loop through users and sensors
    for user in users:
        for sensor in sensors:
            # Construct file path
            path = os.path.join(root, f"{user}_{sensor}_combined.csv")

            if not os.path.exists(path):
                print(f"File not found for user {user} and sensor {sensor}. Skipping...")
                continue
            
            # Read data and select columns based on sensor type
            if sensor == 'e4Acc':
                columns = ['timestamp', 'magnitude_e4Acc']
            elif sensor == 'e4Bvp':
                columns = ['timestamp', 'value']
                resample_func = resample_data_bvp  # Use specific resampling function for 'e4Bvp'
            elif sensor == 'e4Eda':
                columns = ['timestamp', 'eda']
            elif sensor == 'e4Hr':
                columns = ['timestamp', 'hr']
            elif sensor == 'e4Temp':
                columns = ['timestamp', 'temp']
            elif sensor == 'mAcc':
                columns = ['timestamp', 'magnitude_mAcc']
            elif sensor == 'mGps':
                columns = ['timestamp', 'lat', 'lon', 'accuracy']
            elif sensor == 'mGyr':
                columns = ['timestamp', 'magnitude_mGyr']
            elif sensor == 'mMag':
                columns = ['timestamp', 'magnitude_mMag']
            else:
                print(f"Unknown sensor type: {sensor}")
                continue
            
            # Read data
            df = pd.read_csv(path)[columns]
            
            # Resample data
            if resample_freq in ['h', 'min', 'd']:
                if sensor == 'e4Bvp':
                    resampled_data = resample_func(df, resample_freq)  # Use specific resampling function for 'e4Bvp'
                else:
                    resampled_data = resample_data(df, resample_freq)
                
                # Update CSV filename based on resampling frequency
                csv_filename = f"{user}_{sensor}_{freq_dict[resample_freq]}.csv"
                
                # Export resampled data to CSV
                df_path = "./train_dataset/df"
                csv_path = os.path.join(df_path, csv_filename)
                resampled_data.to_csv(csv_path)
            else:
                print("Invalid resampling frequency. Please use 'h', 'min', or 'd'.")

# Example usage
root = './train_dataset/temp'
users = ['user30']
sensors = ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']
resample_freq = 'h'  # Resampling frequency ('h', 'min', or 'd')

process_sensor_data(root, users, sensors, resample_freq)

need to re-create '_combined.csv' file for 23, 24, 25
and create daily files

In [93]:
from pathlib import Path
#root_dir = Path('.')
#root_dir = Path('./train_dataset/df')
root_dir = Path('./train_dataset/temp')
#root_dir = Path('./train_dataset/user26-30')
#dest_dir = Path('./train_dataset/df')
dest_dir = Path('./train_dataset/user26-30')
#dest_dir = Path('./train_dataset/temp')
dest_dir.mkdir(parents=True, exist_ok=True)

#for csv_file in root_dir.glob('user*.csv'):
    #move_file(str(csv_file), str(dest_dir))

for csv_file in root_dir.glob('*combined.csv'):
    move_file(str(csv_file), str(dest_dir))

Moved 'train_dataset/temp/user27_e4Temp_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user28_mAcc_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user27_e4Acc_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user26_mMag_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user29_mGyr_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user27_e4Bvp_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user30_e4Bvp_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user28_mMag_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user30_mGyr_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user30_mGps_combined.csv' to 'train_dataset/user26-30' successfully.
Moved 'train_dataset/temp/user28_e4Eda_combin

In [91]:
import os
import pandas as pd
import numpy as np

# Define the directory containing the CSV files
directory = './train_dataset/df'

# Function to find the first non-NaN value
def first_non_nan(series):
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan

# Initialize an empty list to hold dataframes
dfs = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a dataframe and append to the list
        dfs.append(pd.read_csv(file_path))


df_concat = pd.concat(dfs, ignore_index=True)
df_sorted = df_concat.sort_values('timestamp')
df_grouped = df_sorted.groupby('timestamp').apply(lambda x: x.apply(first_non_nan)).reset_index(drop=True)
df_grouped = df_grouped.reset_index()

# Display the final dataframe
df_grouped.to_csv("./train_dataset/sensor_data/hourly_remade/user30_hourly.csv", index=False)

  df_grouped = df_sorted.groupby('timestamp').apply(lambda x: x.apply(first_non_nan)).reset_index(drop=True)


In [53]:
import os
import shutil

base_dir = './train_dataset/sensor_new'

# Iterate over the files in the base directory
for filename in os.listdir(base_dir):
    # Construct full file path
    file_path = os.path.join(base_dir, filename)

    # Check if it's a file (not a directory)
    if os.path.isfile(file_path):
        # Check for 'minutely' in filename
        if 'minutely' in filename:
            target_dir = os.path.join(base_dir, 'minutely')
        # Check for 'hourly' in filename
        elif 'hourly' in filename:
            target_dir = os.path.join(base_dir, 'hourly')
        elif 'daily' in filename:
            target_dir = os.path.join(base_dir, 'daily')
        else:
            continue  # Skip files that don't match criteria

        # Create target directory if it doesn't exist
        os.makedirs(target_dir, exist_ok=True)

        # Move the file to the target directory
        shutil.move(file_path, os.path.join(target_dir, filename))

print("Files have been moved to their respective directories.")

        

Files have been moved to their respective directories.


In [3]:
from pathlib import Path
src_dir = Path('./train_dataset/user01-06')
dest_dir = Path('./train_dataset/temp')

for csv_file in src_dir.glob('*combined*.csv'):
    move_file(str(csv_file), str(dest_dir))

Moved 'train_dataset/user01-06/user05_e4Temp_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user06_e4Hr_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user01_e4Temp_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user01_mAcc_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user06_e4Temp_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user02_e4Hr_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user05_e4Eda_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user02_mGps_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user04_mGps_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user04_e4Temp_combined.csv' to 'train_dataset/temp' successfully.
Moved 'train_dataset/user01-06/user01_mMa

In [10]:
import os
import pandas as pd
import re

# Define the directory containing the CSV files
directory = './train_dataset/sensor_data/raw'

# Regular expression pattern to extract the user substring (e.g., 'user01', 'user02')
pattern = re.compile(r'user\d{2}')

# Dictionary to store lists of dataframes for each user substring
dfs_dict = {}

# Iterate over files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        match = pattern.search(filename)
        if match:
            user_substring = match.group(0)
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)
            if 'timestamp' in df.columns:
                df.set_index('timestamp', inplace=True)
            
            if user_substring not in dfs_dict:
                dfs_dict[user_substring] = []
            dfs_dict[user_substring].append(df)

# Merge dataframes for each user substring and save to new CSV files
for user_substring, dfs in dfs_dict.items():
    if dfs:
        # Concatenate dataframes on columns
        merged_df = pd.concat(dfs, axis=1)
        # Reset the index to make 'timestamp' a column again
        merged_df.reset_index(inplace=True)
        # Save the merged dataframe to a new CSV file
        merged_filename = f'sensor_{user_substring}.csv'
        merged_df.to_csv(os.path.join(directory, merged_filename), index=False)
        print(f"Merged data for {user_substring} saved to {merged_filename}")
    else:
        print(f"No CSV files found for {user_substring}")



Merged data for user01 saved to sensor_user01.csv
Merged data for user06 saved to sensor_user06.csv
Merged data for user05 saved to sensor_user05.csv
Merged data for user03 saved to sensor_user03.csv
Merged data for user04 saved to sensor_user04.csv
Merged data for user02 saved to sensor_user02.csv


## Generate dataset for regression

single instance learning

ex: predict sleep quality

- X_train: feature matrix where each row corresponds to the features extracted from a time window of the column values (like heart rate)
- y_train: labels indicating sleep quality (0 or 1)

how to handle aggregated sleep quality labels

1. Segment time series data into windows:
-- divide the continuous time series data into fixed-length windows

2. Extract features from each window:
-- for each window, extract features like mean, stdev, min, max, etc

3. Aggregate features for each person: 
-- aggregate the features from all windows for each person to create a single feature vector representing that person.

4. Construct 'X_train' and 'y_train':
-- 'X_train' will consist of the aggregated feature vector for each person.
-- 'y_train' will consist of the sleep quality labels for each person.

In [4]:
import os
os.getcwd()
os.chdir('./train_dataset')

In [15]:
import pandas as pd
df = pd.read_csv('./sensor_data/minutely/user06_minutely.csv')
df.set_index('timestamp', inplace=True)

In [24]:
# 1. Segmentation

# Parameters
window_size = '1H'  # 1 hour window
step_size = '1H'    # 1 hour step (non-overlapping)

# Load CSV file into DataFrame
df = pd.read_csv('./sensor_data/minutely/user06_minutely.csv')

# Convert 'timestamp' column to datetime and set it as index
df['timestamp'] = pd.to_datetime(df['timestamp'])
df.dropna(inplace=True)
df.set_index('timestamp', inplace=True)

# Function to split time series into windows
def split_into_windows(df, window_size, step_size):
    windows = []
    start_time = df.index.min()
    end_time = df.index.max()
    
    while start_time + pd.to_timedelta(window_size) <= end_time:
        window_end_time = start_time + pd.to_timedelta(window_size)
        window_data = df[start_time:window_end_time]
        windows.append(window_data)
        start_time += pd.to_timedelta(step_size)
    
    return windows

# Split data into windows
windows = split_into_windows(df, window_size, step_size)


  while start_time + pd.to_timedelta(window_size) <= end_time:
  window_end_time = start_time + pd.to_timedelta(window_size)
  start_time += pd.to_timedelta(step_size)


In [None]:
# 2. Feature extraction
import numpy as np
from scipy.stats import skew, kurtosis

def extract_features_from_window(window):
    heart_rate = window['hr']
    temperature = window['temp']
    blood_pressure = window['bvp_positive']
    electrodermal_activity = window['eda']
    #sleep_score = window['sleep_score']
    #sleep_duration = window['sleep_duration']


    features = [
        np.mean(heart_rate), np.std(heart_rate), np.min(heart_rate), np.max(heart_rate), skew(heart_rate), kurtosis(heart_rate),
        np.mean(temperature), np.std(temperature), np.min(temperature), np.max(temperature), skew(temperature), kurtosis(temperature),
        np.std(blood_pressure), np.min(blood_pressure), np.max(blood_pressure), skew(blood_pressure), kurtosis(blood_pressure),
        np.mean(electrodermal_activity), np.std(electrodermal_activity), np.min(electrodermal_activity), np.max(electrodermal_activity), skew(electrodermal_activity), kurtosis(electrodermal_activity)
        #np.mean(sleep_score), np.std(sleep_score), np.min(sleep_score), np.max(sleep_score), skew(sleep_score), kurtosis(sleep_score),
        #np.mean(sleep_duration), np.std(sleep_duration), np.min(sleep_duration), np.max(sleep_duration), skew(sleep_duration), kurtosis(sleep_duration)
    ]
    return features

# Extract features for each window
features = [extract_features_from_window(window) for window in windows]
X_train = np.array(features)

In [None]:
# 3. Aggregate features for each person

def aggregate_features_for_person(windows):
    all_features = [extract_features_from_window(window) for window in windows]
    aggregated_features = np.mean(all_features, axis=0)
    return aggregated_features

# Example aggregation for multiple persons
data_per_person = {
    'person1': windows_person1,
    'person2': windows_person2,
    # Add more persons
}

X_train = []
y_train = []

for person, windows in data_per_person.items():
    aggregated_features = aggregate_features_for_person(windows)
    X_train.append(aggregated_features)
    y_train.append(sleep_quality_labels[person])

X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
#4. Training

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Split data
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initialize model
model = RandomForestClassifier()

# Train model
model.fit(X_train_split, y_train_split)

# Evaluate model
accuracy = model.score(X_val_split, y_val_split)
print(f"Validation Accuracy: {accuracy}")

In [36]:
import os
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Function to resample data to ensure unique timestamps
def resample_data(df, frequency='1T'):
    df_resampled = df.resample(frequency).mean()  # Resample to a minute frequency (adjust as needed)
    df_resampled.dropna(inplace=True)  # Drop rows with NaN values
    return df_resampled

# Function to split time series into windows
def split_into_windows(df, window_size, step_size):
    windows = []
    start_time = df.index.min()
    end_time = df.index.max()
    
    while start_time + pd.to_timedelta(window_size) <= end_time:
        window_end_time = start_time + pd.to_timedelta(window_size)
        window_data = df[start_time:window_end_time]
        windows.append(window_data)
        start_time += pd.to_timedelta(step_size)
    
    return windows

# Function to extract features from window
def extract_features_from_window(window):
    heart_rate = window['hr']
    temperature = window['temp']
    blood_pressure = window['bvp_positive']
    
    # Exclude NaN values
    heart_rate = heart_rate.dropna()
    temperature = temperature.dropna()
    blood_pressure = blood_pressure.dropna()

    features = [
        np.mean(heart_rate), np.std(heart_rate), np.min(heart_rate), np.max(heart_rate), skew(heart_rate), kurtosis(heart_rate),
        np.mean(temperature), np.std(temperature), np.min(temperature), np.max(temperature), skew(temperature), kurtosis(temperature),
        np.std(blood_pressure), np.min(blood_pressure), np.max(blood_pressure), skew(blood_pressure), kurtosis(blood_pressure)
    ]
    return features

# Function to aggregate features for person
def aggregate_features_for_person(windows):
    all_features = []
    for window in windows:
        if not window.isnull().any().any():
            features = extract_features_from_window(window)
            all_features.append(features)
    if all_features:
        aggregated_features = np.mean(all_features, axis=0)
    else:
        # Handle case where all windows contain NaN values
        aggregated_features = np.nan * np.zeros((17,))
    return aggregated_features

# Directory containing CSV files
directory = './sensor_data/minutely'

# Dictionary to store aggregated features for each person
data_per_person = {}

# Iterate over CSV files in directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        
        # Load CSV file into DataFrame
        df = pd.read_csv(filepath)
        df['timestamp'] = pd.to_datetime(df['timestamp'])
        df.set_index('timestamp', inplace=True)
        
        # Resample data to ensure unique timestamps
        df_resampled = resample_data(df, frequency='1D')  # Resample to daily aggregates
        
        # Split data into windows
        windows = split_into_windows(df_resampled, window_size='1D', step_size='1D')  # Adjust window size to 1 day
        
        # Extract features for each window
        features = [extract_features_from_window(window) for window in windows]
        
        # Aggregate features for person
        aggregated_features = aggregate_features_for_person(windows)
        
        # Store aggregated features for person in dictionary
        person_name = os.path.splitext(filename)[0]  # Extract person name from filename
        data_per_person[person_name] = aggregated_features

data_per_person

  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  mean = a.mean(axis, keepdims=True)
  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  mean = a.mean(axis, keepdims=True)
  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  mean = a.mean(axis, keepdims=True)
  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  mean = a.mean(axis, keepdims=True)
  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  mean = a.mean(axis, keepdims=True)
  mean = a.mean(axis, keepdims=True)
  ret = um.true_divide(
  return 

{'user25_minutely': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan]),
 'all_users_minutely': array([ 8.16141036e+01,  1.65813113e+00,  7.99559725e+01,  8.32722347e+01,
        -2.96651057e-14, -2.00000000e+00,  3.36318333e+01,  1.86548496e-01,
         3.34452848e+01,  3.38183818e+01,  1.10335528e-13, -2.00000000e+00,
         4.39623476e+00,  5.47904846e+01,  6.35829542e+01,  2.55479815e-15,
        -2.00000000e+00]),
 'user30_minutely': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan]),
 'user12_minutely': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan]),
 'user11_minutely': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan]),
 'user26_minutely': array([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
        nan, nan, nan, nan]),
 'user21_minutely': array([nan, nan, 

In [5]:
os.getcwd()
os.chdir('./train_dataset')

In [47]:
import os
import pandas as pd
from datetime import datetime

# Function to convert Unix epoch time to datetime format
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S')

# Define the root directory
root_dir = './user26-30/user30_processed'

# Create an empty DataFrame to store concat data
concat_df = pd.DataFrame()

# Iterate through each directory
for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:
        # Check if the file ends with '_label.csv'
        if filename.endswith('_label.csv'):
            file_path = os.path.join(dirpath, filename)
            # Read the CSV file
            df = pd.read_csv(file_path)
            # Concatenate to the main DataFrame
            concat_df = pd.concat([concat_df, df], ignore_index=True)

# resample using mode
concat_df['ts'] = concat_df['ts'].apply(unix_to_datetime)
concat_df['ts'] = pd.to_datetime(concat_df['ts'])
concat_df.rename(columns={'ts':'timestamp'}, inplace=True)
concat_df.set_index('timestamp', inplace=True)
activity_mapping = {
    0: 'IN_VEHICLE',
    1: 'ON_BICYCLE',
    2: 'ON_FOOT',
    3: 'STILL',
    4: 'UNKNOWN',
    5: 'TILTING',
    7: 'WALKING',
    8: 'RUNNING'
}

concat_df['activity'] = concat_df['activity'].map(activity_mapping)
concat_df = concat_df.drop(columns={'actionOption', 'actionSub', 'actionSubOption', 
                                    'conditionSub1Option', 'conditionSub2Option'})

resampled_df = concat_df.resample('h').agg({'action':lambda x: x.dropna().mode().iloc[0] if not x.dropna().empty else None,
                                            'condition':lambda x: x.dropna().mode().iloc[0] if not x.dropna().empty else None,
                                            'place':lambda x: x.dropna().mode().iloc[0] if not x.dropna().empty else None,
                                            'emotionPositive':'mean',
                                            'emotionTension':'mean',
                                            'activity':lambda x: x.dropna().mode().iloc[0] if not x.dropna().empty else None})
resampled_df.to_csv('./user26-30/user30_labels.csv')


In [12]:
df = pd.read_csv('./user01-06/user01_labels.csv')
df['ts'] = df['ts'].apply(unix_to_datetime)
df

Unnamed: 0,ts,action,actionOption,actionSub,actionSubOption,condition,conditionSub1Option,conditionSub2Option,place,emotionPositive,emotionTension,activity
0,2020-09-13 16:10:00,recreation_media,724,,,ALONE,,,home,6,4,3
1,2020-09-13 16:11:00,recreation_media,724,,,ALONE,,,home,6,4,3
2,2020-09-13 16:12:00,recreation_media,724,,,ALONE,,,home,6,4,3
3,2020-09-13 16:13:00,recreation_media,724,,,ALONE,,,home,6,4,3
4,2020-09-13 16:14:00,recreation_media,724,,,ALONE,,,home,6,4,3
...,...,...,...,...,...,...,...,...,...,...,...,...
9855,2020-09-07 14:21:00,meal,122,meal_amount,3.0,ALONE,,,home,7,4,3
9856,2020-09-07 14:22:00,meal,122,meal_amount,3.0,ALONE,,,home,7,4,3
9857,2020-09-07 14:23:00,meal,122,meal_amount,3.0,ALONE,,,home,7,4,3
9858,2020-09-07 14:24:00,meal,122,meal_amount,3.0,ALONE,,,home,7,4,4


Validation Set

In [None]:
challenge2024_dataset_path = "./val_dataset"
print("val dataset" + "=" *20)
for file_name in os.listdir(challenge2024_dataset_path):
    if file_name.startswith('ch2024_val'):
        print(file_name) 

ch2024_val__m_ambience.parquet.gzip
ch2024_val__m_usage_stats.parquet.gzip
ch2024_val__m_acc_part_4.parquet.gzip
ch2024_val__m_acc_part_1.parquet.gzip
ch2024_val__m_activity.parquet.gzip
ch2024_val__w_heart_rate.parquet.gzip
ch2024_val__m_acc_part_3.parquet.gzip
ch2024_val__m_gps.parquet.gzip
ch2024_val__m_acc_part_2.parquet.gzip
ch2024_val__m_light.parquet.gzip
ch2024_val__w_light.parquet.gzip
ch2024_val__w_pedo.parquet.gzip


1. mACC; 스마트폰의 가속도 센서 데이터 (1초당 약 50회씩 측정)
- subject_id: 실험 참여자의 식별자
- timestamp
- x 
- y 
- z

note: ch2024_val__m_acc_part{subject_id}

(there are a total of 4 subjects)

In [None]:
filename = "ch2024_val__m_acc_part_1.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
Index: 176781170 entries, 0 to 4543437
Data columns (total 5 columns):
 #   Column      Dtype         
---  ------      -----         
 0   subject_id  int64         
 1   timestamp   datetime64[us]
 2   x           float64       
 3   y           float64       
 4   z           float64       
dtypes: datetime64[us](1), float64(3), int64(1)
memory usage: 7.9 GB


Unnamed: 0,subject_id,timestamp,x,y,z
0,1,2023-08-20 00:00:00.025,0.933201,-3.522235,9.164511
1,1,2023-08-20 00:00:00.043,0.947558,-3.522235,9.169296
2,1,2023-08-20 00:00:00.110,0.9667,-3.479164,9.164511
3,1,2023-08-20 00:00:00.131,0.947558,-3.522235,9.159725
4,1,2023-08-20 00:00:00.150,0.918844,-3.531806,9.159725


2. mActivity
- 스마트폰에서 인식된 행동 분류값. 1분마다 1회씩 기록
- subject_id: 실험 참여자의 식별자
- timestamp
- m_activity
-- 0: IN_VEHICLE
-- 1: ON_BICYCLE
-- 2: ON_FOOT
-- 3: STILL 
-- 4: UNKNOWN 
-- 5: TILTING
-- 7: WALKING
-- 8: RUNNING

In [None]:
filename = "ch2024_val__m_activity.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()
df_raw['subject_id'].unique() # 1,2,3,4

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149870 entries, 0 to 149869
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   subject_id  149870 non-null  int64         
 1   timestamp   149870 non-null  datetime64[us]
 2   m_activity  149870 non-null  object        
dtypes: datetime64[us](1), int64(1), object(1)
memory usage: 3.4+ MB


array([1, 2, 3, 4])

3. mAmbiance
- 스마트폰에서 인식된 음향 기반 레이블. 2분마다 1회씩 기록
- subject_id: 실험 참여자의 식별자
- timestamp
- ambiance_labels: 상위 10개의 레이블 및 각각의 확률 목록

In [None]:
filename = "ch2024_val__m_ambience.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74575 entries, 0 to 74574
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   subject_id       74575 non-null  int64         
 1   timestamp        74575 non-null  datetime64[us]
 2   ambience_labels  74575 non-null  object        
dtypes: datetime64[us](1), int64(1), object(1)
memory usage: 1.7+ MB


Unnamed: 0,subject_id,timestamp,ambience_labels
0,1,2023-08-20 00:00:10,"[[Speech, 0.7875364], [Narration, monologue, 0..."
1,1,2023-08-20 00:02:10,"[[Music, 0.3809659], [Singing, 0.019089445], [..."
2,1,2023-08-20 00:04:10,"[[Speech, 0.7312041], [Inside, small room, 0.0..."
3,1,2023-08-20 00:06:10,"[[Music, 0.21463676], [Speech, 0.07978396], [I..."
4,1,2023-08-20 00:08:10,"[[Speech, 0.9546498], [Narration, monologue, 0..."


4. mGps
- 스마트폰에서 산출된 GPS 좌표 정보 (단, 위도 및 경도는 상대 좌표로 변환됨). 5초 간격 (1분당 약 12회)으로 측정됨.
- subject_id: 실험 참여자의 식별자
- timestamp
- altitude
- latitude
- longitude
- speed

In [None]:
filename = "ch2024_val__m_gps.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955239 entries, 0 to 955238
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   subject_id  955239 non-null  int64         
 1   timestamp   955239 non-null  datetime64[us]
 2   altitude    955239 non-null  float64       
 3   latitude    955239 non-null  float64       
 4   longitude   955239 non-null  float64       
 5   speed       955239 non-null  float64       
dtypes: datetime64[us](1), float64(4), int64(1)
memory usage: 43.7 MB


Unnamed: 0,subject_id,timestamp,altitude,latitude,longitude,speed
0,1,2023-08-20 00:00:08,144.217651,0.016095,0.926485,0.143791
1,1,2023-08-20 00:00:13,144.217651,0.01609,0.926477,0.160771
2,1,2023-08-20 00:00:18,144.217651,0.016091,0.926478,0.006571
3,1,2023-08-20 00:00:23,144.217651,0.016091,0.926474,0.05931
4,1,2023-08-20 00:00:28,144.217651,0.016092,0.926477,0.049454


5. mLight
- 스마트폰에서 측정된 빛의 세기. 10분 간격으로 측정됨.
- subject_id: 실험 참여자의 식별자
- timestamp
- m_light: 빛의 세기

In [None]:
filename = "ch2024_val__m_light.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14906 entries, 0 to 14905
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   subject_id  14906 non-null  int64         
 1   timestamp   14906 non-null  datetime64[us]
 2   m_light     14906 non-null  float64       
dtypes: datetime64[us](1), float64(1), int64(1)
memory usage: 349.5 KB


Unnamed: 0,subject_id,timestamp,m_light
0,1,2023-08-20 00:02:00,254.0
1,1,2023-08-20 00:12:00,275.0
2,1,2023-08-20 00:22:00,261.0
3,1,2023-08-20 00:32:00,107.0
4,1,2023-08-20 00:42:00,105.0


6. mUsageStats
- 스마트폰 앱 사용량 정보. 10분 간격으로 측정됨.
- subject_id
- timestamp
- m_usage_stats: 앱 이름 및 각 사용시간에 대한 목록

In [None]:
filename = "ch2024_val__m_usage_stats.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14920 entries, 0 to 14919
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   subject_id     14920 non-null  int64         
 1   timestamp      14920 non-null  datetime64[us]
 2   m_usage_stats  14920 non-null  object        
dtypes: datetime64[us](1), int64(1), object(1)
memory usage: 349.8+ KB


Unnamed: 0,subject_id,timestamp,m_usage_stats
0,1,2023-08-20 00:00:00.012,[]
1,1,2023-08-20 00:10:00.024,[]
2,1,2023-08-20 00:20:00.010,[]
3,1,2023-08-20 00:30:00.009,[]
4,1,2023-08-20 00:40:00.010,[]


7. wHr
- 스마트워치에서 측정된 심박 데이터. 1초 간격으로 측정됨.
- subject_id
- timestamp
- heart_rate

In [None]:
filename = "ch2024_val__w_heart_rate.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130803 entries, 0 to 130802
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype         
---  ------      --------------   -----         
 0   subject_id  130803 non-null  int64         
 1   timestamp   130803 non-null  datetime64[us]
 2   heart_rate  130803 non-null  int64         
dtypes: datetime64[us](1), int64(2)
memory usage: 3.0 MB


Unnamed: 0,subject_id,timestamp,heart_rate
0,1,2023-08-20 00:00:44.572,0
1,1,2023-08-20 00:01:44.752,0
2,1,2023-08-20 00:02:44.919,0
3,1,2023-08-20 00:03:45.075,0
4,1,2023-08-20 00:04:45.248,0


8. wPedo
- 스마트워치에서 측정된 걸음수 데이터 및 관련 정보. 1분 간격으로 측정됨.
- subject_id
- timestamp
- burned_calories
- distance
- running_steps
- speed
- steps
- step_frequency
- walking_steps

In [None]:
filename = "ch2024_val__w_pedo.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103217 entries, 0 to 103216
Data columns (total 9 columns):
 #   Column           Non-Null Count   Dtype         
---  ------           --------------   -----         
 0   subject_id       103217 non-null  int64         
 1   timestamp        103217 non-null  datetime64[us]
 2   burned_calories  103217 non-null  float64       
 3   distance         103217 non-null  float64       
 4   running_steps    103217 non-null  int64         
 5   speed            103217 non-null  float64       
 6   steps            103217 non-null  int64         
 7   step_frequency   103217 non-null  float64       
 8   walking_steps    103217 non-null  int64         
dtypes: datetime64[us](1), float64(4), int64(4)
memory usage: 7.1 MB


Unnamed: 0,subject_id,timestamp,burned_calories,distance,running_steps,speed,steps,step_frequency,walking_steps
0,1,2023-08-20 00:00:00,0.0,0.0,0,0.0,0,0.0,0
1,1,2023-08-20 09:41:00,5.279053,62.480469,40,8.44,75,1.25,35
2,1,2023-08-20 09:42:00,2.160278,30.285156,0,3.405882,40,0.666667,40
3,1,2023-08-20 09:43:00,0.719971,19.941406,0,2.828571,27,0.45,27
4,1,2023-08-20 09:44:00,2.809692,42.910156,12,4.751613,56,0.933333,44


9. wLight
- 스마트워치에서의 빛의 세기. 10분 간격으로 측정됨.
- subject_id
- timestamp
- w_light

In [None]:
filename = "ch2024_val__w_light.parquet.gzip"
df_raw = pd.read_parquet(os.path.join(challenge2024_dataset_path, filename))
df_raw.info()
df_raw.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13502 entries, 0 to 13501
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   subject_id  13502 non-null  int64         
 1   timestamp   13502 non-null  datetime64[us]
 2   w_light     13502 non-null  float64       
dtypes: datetime64[us](1), float64(1), int64(1)
memory usage: 316.6 KB


Unnamed: 0,subject_id,timestamp,w_light
0,1,2023-08-20 00:02:45.280,224.0
1,1,2023-08-20 00:12:45.324,218.0
2,1,2023-08-20 00:22:45.392,224.0
3,1,2023-08-20 00:32:45.496,213.0
4,1,2023-08-20 00:42:45.574,230.0


레이블 파일

In [None]:
df = pd.read_csv("./val_label.csv")
df

Unnamed: 0,subject_id,date,Q1,Q2,Q3,S1,S2,S3,S4
0,1,2023-08-20,1,1,1,0,0,0,0
1,1,2023-08-21,1,1,1,0,0,1,0
2,1,2023-08-22,0,1,1,0,1,1,0
3,1,2023-08-23,0,1,1,0,0,1,0
4,1,2023-08-24,1,1,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...
100,4,2023-10-27,0,1,0,0,1,1,1
101,4,2023-10-28,1,1,0,1,1,1,1
102,4,2023-10-29,1,1,0,0,1,1,1
103,4,2023-10-30,0,1,0,0,0,1,1
