In [2]:
import pandas as pd
import os
from datetime import datetime
from pathlib import Path
import math
from pandas.errors import EmptyDataError
import pickle
import glob
import re

For renaming the filenames in 'user_processed' folder from Unix epoch time to datetime'

In [None]:
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d')

parent_folder = './train_dataset/user21-25/user25_processed'

for subfolder in os.listdir(parent_folder):
    subfolder_path = os.path.join(parent_folder, subfolder)
    # check if it's a directory 
    if os.path.isdir(subfolder_path):
        try:
            # convert subfolder name to datetime format
            new_name = unix_to_datetime(int(subfolder))
            os.rename(subfolder_path, os.path.join(parent_folder, new_name))
            print(f"Renamed {subfolder} to {new_name}")

            csv_file = os.path.join(parent_folder, new_name, f"{subfolder}_label.csv")
            new_csv_name = os.path.join(parent_folder, new_name, f"{new_name}_label.csv")

            os.rename(csv_file, new_csv_name)
            print(f"Renamed {csv_file} to {new_csv_name}")

        except ValueError:
            print(f"Skipping {subfolder}: Not a valid Unix epoch time")
        except Exception as e:
            print(f"Error renaming {subfolder}: {e}")


In [None]:
# Function to convert Unix epoch time to datetime format
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S')

# Path to the folder containing CSV files with Unix epoch filenames
folder_path = './train_dataset/user01-06/user01_processed/2020-08-31/e4Acc'

# Iterate through each file in the folder
for file_name in os.listdir(folder_path):
    # Check if the file is a CSV file
    if file_name.endswith('.csv'):
        try:
            # Extract Unix epoch time from the filename
            epoch_time = int(os.path.splitext(file_name)[0])
            
            # Convert Unix epoch time to datetime format
            new_name = unix_to_datetime(epoch_time)
            
            # Rename the file with the new datetime format
            old_path = os.path.join(folder_path, file_name)
            new_path = os.path.join(folder_path, f"{new_name}.csv")
            os.rename(old_path, new_path)
            
            print(f"Renamed {file_name} to {new_name}.csv")
        except ValueError:
            print(f"Skipping {file_name}: Not a valid Unix epoch time")
        except Exception as e:
            print(f"Error renaming {file_name}: {e}")


In [13]:
def open_pickle(filename: str):
    with open(filename, 'rb') as file:
        load = pickle.load(file)
    return load

def save_pickle(filename: str, data):
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

Data aggregation
- Originally, for each user, date -> sensor data name -> separate csv files for each 'minute'.
- To aggregate, first produce a single file for each date's sensor data name

In [None]:
# first, add the date ('2020-08-30') and time in seconds ('36') to create a column 'timestamp' in ('%Y-%m-%d %H:%M:%S.%f') format
# then combine them for each sensor type

# Function to convert Unix time to datetime string
def unix_to_datetime(unix_time):
    return datetime.utcfromtimestamp(unix_time).strftime('%Y-%m-%d %H:%M:%S.%f')

def calculate_magnitude(x, y, z):
    return math.sqrt(x**2 + y**2 + z**2)

# Function to reformat sensor data in a directory
def reformat_sensor_data(date_directory, data_type, output_dir):
    directory = date_directory / data_type
    output_filename = f"{data_type}_{date_directory.name}_combined.csv"
    output_file = output_dir / output_filename

    # Skip processing if the output file already exists
    if output_file.exists():
        print(f"Processed file already exists: {output_file}, skipping.")
        return None

    csv_files = list(directory.glob('*.csv'))
    
    if not csv_files:
        print(f"No CSV files found in {directory}, skipping.")
        return None
    
    dfs = []
    for csv_file in csv_files:
        try:
            df = pd.read_csv(csv_file)
        except EmptyDataError:
            print(f"EmptyDataError: {csv_file} is empty, skipping this file.")
            continue
            
        time_value = csv_file.stem
        try:
            df['time_value'] = float(time_value)
        except ValueError:
            print(f"ValueError: could not convert {time_value} to float, skipping this file.")
            continue
        dfs.append(df)
    
    if not dfs:
        print(f"No valid CSV files found in {directory}, skipping.")
        return None

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values(['time_value', 'timestamp'], ascending=[True, True])
    
    def safe_float_conversion(x):
        try:
            return float(x)
        except ValueError:
            print(f"ValueError: could not convert {x} to float, skipping this row.")
            return None
    
    combined_df['timestamp'] = combined_df['timestamp'].apply(safe_float_conversion)
    combined_df = combined_df.dropna(subset=['timestamp'])  # Drop rows where conversion failed
    
    combined_df['timestamp'] = (combined_df['time_value'] + combined_df['timestamp']).apply(unix_to_datetime)

    final_df = combined_df.drop(columns=['time_value'])
    
    if data_type in ['mAcc', 'e4Acc', 'mGyr', 'mMag']:
        # Apply calculate_magnitude function to specified columns
        magnitude_col = f"magnitude_{data_type}"
        final_df[magnitude_col] = final_df.apply(lambda row: calculate_magnitude(row['x'], row['y'], row['z']), axis=1)

    # Save the final DataFrame to a CSV file
    final_df.to_csv(output_file, index=False)
    print(f"Saved processed data to: {output_file}")
    return final_df

# Define the base directory and output directory
base_dir = Path('./train_dataset/user01-06/user05_processed')
output_dir = Path('./train_dataset/user01-06/user05_processed')

# Traverse the directory structure and process each date directory
for date_dir in base_dir.rglob('*'):
    if date_dir.is_dir():
        for data_type in ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']:
            data_type_path = date_dir / data_type
            if data_type_path.is_dir():
                print(f"Processing directory: {data_type_path}")
                final_df = reformat_sensor_data(date_dir, data_type, output_dir)

In [None]:
def delete_combined_files(directory):
    files = os.listdir(directory)
    for file in files:
        if file.endswith("_combined.csv"):
            file_path = os.path.join(directory, file)
            os.remove(file_path)
            print(f"Deleted: {file_path}")

directory_path = './train_dataset/user01-06/user06_processed'
delete_combined_files(directory_path)

In [6]:
def concat_csv(data_type, output_dir):
    files_to_concat = [file for file in output_dir.glob("*.csv") if data_type in file.name]
    if len(files_to_concat) <2:
        print(f"Not enough files found for {data_type}, skipping concatenation")
        return None 
    
    dfs = [pd.read_csv(file) for file in files_to_concat]
    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values('timestamp')
    return combined_df 

output_dir = Path('./train_dataset/user01-06/user06_processed')
user_id = 'user06'

for data_type in ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']:
    print(f"Processing data type: {data_type}")
    concat_df = concat_csv(data_type, output_dir)
    if concat_df is not None:
        output_filename = f"{user_id}_{data_type}_combined.csv"
        output_file = output_dir / output_filename
        concat_df.to_csv(output_file, index=False)
        print(f"Concatenated data saved to: {output_file}")

Processing data type: e4Acc
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Acc_combined.csv
Processing data type: e4Bvp
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Bvp_combined.csv
Processing data type: e4Eda
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Eda_combined.csv
Processing data type: e4Hr
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Hr_combined.csv
Processing data type: e4Temp
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_e4Temp_combined.csv
Processing data type: mAcc
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mAcc_combined.csv
Processing data type: mGps
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mGps_combined.csv
Processing data type: mGyr
Concatenated data saved to: train_dataset/user01-06/user06_processed/user06_mGyr_combined.csv
Processing data type: 

In [5]:
def resample_data(df, unit: str):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)
    resampled_df = df.resample(unit).mean()
    return resampled_df


def resample_data_bvp(df, unit: str):
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df.set_index('timestamp', inplace=True)

    df['bvp_positive'] = df['value'].apply(lambda x: x if x > 0 else None)
    df['bvp_negative'] = df['value'].apply(lambda x: x if x < 0 else None)
    
    positive_mean = df['bvp_positive'].resample(unit).mean()
    negative_mean = df['bvp_negative'].resample(unit).mean()

    resampled_df = pd.DataFrame({'bvp_positive': positive_mean, 'bvp_negative': negative_mean})

    return resampled_df

In [90]:
def process_sensor_data(root, users, sensors, resample_freq):

    # Define dictionary for resampling frequency
    freq_dict = {'h': 'hourly', 'min': 'minutely', 'd': 'daily'}
    
    # Loop through users and sensors
    for user in users:
        for sensor in sensors:
            # Construct file path
            path = os.path.join(root, f"{user}_{sensor}_combined.csv")

            if not os.path.exists(path):
                print(f"File not found for user {user} and sensor {sensor}. Skipping...")
                continue
            
            # Read data and select columns based on sensor type
            if sensor == 'e4Acc':
                columns = ['timestamp', 'magnitude_e4Acc']
            elif sensor == 'e4Bvp':
                columns = ['timestamp', 'value']
                resample_func = resample_data_bvp  # Use specific resampling function for 'e4Bvp'
            elif sensor == 'e4Eda':
                columns = ['timestamp', 'eda']
            elif sensor == 'e4Hr':
                columns = ['timestamp', 'hr']
            elif sensor == 'e4Temp':
                columns = ['timestamp', 'temp']
            elif sensor == 'mAcc':
                columns = ['timestamp', 'magnitude_mAcc']
            elif sensor == 'mGps':
                columns = ['timestamp', 'lat', 'lon', 'accuracy']
            elif sensor == 'mGyr':
                columns = ['timestamp', 'magnitude_mGyr']
            elif sensor == 'mMag':
                columns = ['timestamp', 'magnitude_mMag']
            else:
                print(f"Unknown sensor type: {sensor}")
                continue
            
            # Read data
            df = pd.read_csv(path)[columns]
            
            # Resample data
            if resample_freq in ['h', 'min', 'd']:
                if sensor == 'e4Bvp':
                    resampled_data = resample_func(df, resample_freq)  # Use specific resampling function for 'e4Bvp'
                else:
                    resampled_data = resample_data(df, resample_freq)
                
                # Update CSV filename based on resampling frequency
                csv_filename = f"{user}_{sensor}_{freq_dict[resample_freq]}.csv"
                
                # Export resampled data to CSV
                df_path = "./train_dataset/df"
                csv_path = os.path.join(df_path, csv_filename)
                resampled_data.to_csv(csv_path)
            else:
                print("Invalid resampling frequency. Please use 'h', 'min', or 'd'.")

# Example usage
root = './train_dataset/temp'
users = ['user30']
sensors = ['e4Acc', 'e4Bvp', 'e4Eda', 'e4Hr', 'e4Temp', 'mAcc', 'mGps', 'mGyr', 'mMag']
resample_freq = 'h'  # Resampling frequency ('h', 'min', or 'd')

process_sensor_data(root, users, sensors, resample_freq)

In [91]:
# Define the directory containing the CSV files
directory = './train_dataset/df'

# Function to find the first non-NaN value
def first_non_nan(series):
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan

# Initialize an empty list to hold dataframes
dfs = []

# Loop through all files in the directory
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        # Read the CSV file into a dataframe and append to the list
        dfs.append(pd.read_csv(file_path))


df_concat = pd.concat(dfs, ignore_index=True)
df_sorted = df_concat.sort_values('timestamp')
df_grouped = df_sorted.groupby('timestamp').apply(lambda x: x.apply(first_non_nan)).reset_index(drop=True)
df_grouped = df_grouped.reset_index()

# Display the final dataframe
df_grouped.to_csv("./train_dataset/sensor_data/hourly_remade/user30_hourly.csv", index=False)

  df_grouped = df_sorted.groupby('timestamp').apply(lambda x: x.apply(first_non_nan)).reset_index(drop=True)


In [10]:
# Define the directory containing the CSV files
directory = './train_dataset/sensor_data/raw'

# Regular expression pattern to extract the user substring (e.g., 'user01', 'user02')
pattern = re.compile(r'user\d{2}')

# Dictionary to store lists of dataframes for each user substring
dfs_dict = {}

# Iterate over files in the directory
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        match = pattern.search(filename)
        if match:
            user_substring = match.group(0)
            filepath = os.path.join(directory, filename)
            df = pd.read_csv(filepath)
            if 'timestamp' in df.columns:
                df.set_index('timestamp', inplace=True)
            
            if user_substring not in dfs_dict:
                dfs_dict[user_substring] = []
            dfs_dict[user_substring].append(df)

# Merge dataframes for each user substring and save to new CSV files
for user_substring, dfs in dfs_dict.items():
    if dfs:
        # Concatenate dataframes on columns
        merged_df = pd.concat(dfs, axis=1)
        # Reset the index to make 'timestamp' a column again
        merged_df.reset_index(inplace=True)
        # Save the merged dataframe to a new CSV file
        merged_filename = f'sensor_{user_substring}.csv'
        merged_df.to_csv(os.path.join(directory, merged_filename), index=False)
        print(f"Merged data for {user_substring} saved to {merged_filename}")
    else:
        print(f"No CSV files found for {user_substring}")



Merged data for user01 saved to sensor_user01.csv
Merged data for user06 saved to sensor_user06.csv
Merged data for user05 saved to sensor_user05.csv
Merged data for user03 saved to sensor_user03.csv
Merged data for user04 saved to sensor_user04.csv
Merged data for user02 saved to sensor_user02.csv
