In [18]:
import pandas as pd
import numpy as np
import os
from scipy.stats import skew, kurtosis, iqr, entropy
from numpy.fft import fft

# Folder locations: the labelled per-participant CSVs (input) and the
# folder that receives the windowed feature files (output)
input_folder = r'C:\Users\Gjert\Masteroppgave\Datasets\capture24\properLabels'
output_folder = r'C:\Users\Gjert\Masteroppgave\Datasets\capture24\properLabels\cleaned_window'

# Create the output folder up front; exist_ok avoids an error on re-runs
os.makedirs(name=output_folder, exist_ok=True)




In [22]:
# Read the labelled recording that the following cells split into windows
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Gjert\Masteroppgave\Datasets\capture24\properLabels\P001.csv',
)

# Show which columns were loaded
print(df.columns)


Index(['time', 'x', 'y', 'z', 'label', 'user_id'], dtype='object')


In [24]:
# If the activity column is called 'labels', rename it to 'label' so the
# later cells can rely on a single column name
if 'labels' in df.columns.tolist():
    df = df.rename(mapper={'labels': 'label'}, axis='columns')

# Show the column names after the (possible) rename
print(df.columns)


Index(['time', 'x', 'y', 'z', 'label', 'user_id'], dtype='object')


In [27]:
import pandas as pd



# Parameters
# NOTE(review): earlier comments described 150-row windows with 80% overlap
# (150 * 0.2 = 30). The values actually in use are 170 rows with a step of 30,
# i.e. consecutive windows share 140 of 170 rows (~82% overlap). Set
# window_size = 150 if the 80% figure is the intended one.
window_size = 170  # Number of rows per window
step_size = 30     # Number of rows between the starts of consecutive windows

# Each window is kept as its own DataFrame; later cells iterate over this list
windows = []

# Unique window ID counter, shared across all groups
window_id = 0

# Group by user ID and activity label so that a window never mixes data from
# different users or different activities.
# NOTE(review): groupby collects every row of a (user, label) pair, including
# rows that are not adjacent in the file, so a window may span a break between
# two separate stretches of the same activity - confirm this is acceptable.
for _, group_data in df.groupby(['user_id', 'label']):
    # range() is empty when the group has fewer than window_size rows, so
    # short groups simply contribute no windows
    for start in range(0, len(group_data) - window_size + 1, step_size):
        window = group_data.iloc[start:start + window_size].copy()
        window['window_id'] = window_id  # Same ID on every row of this window
        windows.append(window)
        window_id += 1

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list;
# fail with a message that names the actual cause instead
if not windows:
    raise ValueError(
        f"No (user_id, label) group has at least {window_size} rows; "
        "no windows were created."
    )

# Combine all windows into a single DataFrame
windowed_data = pd.concat(windows, ignore_index=True)

# Nothing is written to disk in this cell, so report what was actually built
print(f"Created {window_id} windows of {window_size} rows each (step {step_size}).")


CSV file 'P7_window.csv' created successfully.


In [28]:

import pandas as pd

# `windowed_data` is already in memory from the windowing cell.
# Every row carries the ID of the window it belongs to, so the number of
# distinct IDs is the number of windows.
num_windows = windowed_data.loc[:, 'window_id'].nunique(dropna=True)

num_windows


163236

In [29]:
# Preview the first five rows of the combined windowed data
print(windowed_data.head(n=5))


                      time         x         y         z    label  user_id  \
0  2016-05-21 08:17:58.080 -0.321666 -0.806220  0.599923  cycling        2   
1  2016-05-21 08:17:58.090 -0.116712 -1.042174  0.648134  cycling        2   
2  2016-05-21 08:17:58.100 -0.148243 -1.073634  0.632064  cycling        2   
3  2016-05-21 08:17:58.110 -0.321666 -0.790490  0.423148  cycling        2   
4  2016-05-21 08:17:58.120 -0.321666 -0.601727  0.374936  cycling        2   

   window_id  
0          0  
1          0  
2          0  
3          0  
4          0  


In [31]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, iqr, entropy
from numpy.fft import fft

def calculate_features(window):
    """Compute per-axis summary features for one window of accelerometer rows.

    For each of the columns 'x', 'y' and 'z' (output prefixes 'Ax', 'Ay',
    'Az') the following keys are produced, in this order: mean, std, mad,
    max, min, sma, energy, iqr, entropy, meanFreq, skewness, kurtosis,
    maxInds.

    Parameters
    ----------
    window : pandas.DataFrame
        Must contain the columns 'x', 'y' and 'z'. NaN and +/-inf samples
        are discarded independently for each axis.

    Returns
    -------
    dict
        Maps '<prefix>_<feature>' to its value. An axis with no valid
        samples left contributes no keys at all.

    Notes
    -----
    Degenerate inputs are given defined values instead of raising or
    producing NaN:
    * an empty or all-zero magnitude spectrum gives meanFreq = 0.0;
    * an empty spectrum (a single valid sample) gives maxInds = 0;
    * a constant signal gives skewness = kurtosis = 0.0 (both statistics are
      undefined when the variance is zero).
    """
    features = {}
    axes = {'x': 'Ax', 'y': 'Ay', 'z': 'Az'}

    for axis, prefix in axes.items():
        # Treat infinities like missing samples, then drop everything missing
        data = window[axis].replace([np.inf, -np.inf], np.nan).dropna()

        if data.empty:
            continue  # Nothing usable on this axis

        n_samples = len(data)
        mean_value = np.mean(data)

        # Time-domain features
        features[f'{prefix}_mean'] = mean_value
        features[f'{prefix}_std'] = np.std(data)
        features[f'{prefix}_mad'] = np.mean(np.abs(data - mean_value))
        features[f'{prefix}_max'] = np.max(data)
        features[f'{prefix}_min'] = np.min(data)
        features[f'{prefix}_sma'] = np.sum(np.abs(data)) / n_samples
        features[f'{prefix}_energy'] = np.sum(data ** 2) / n_samples
        features[f'{prefix}_iqr'] = iqr(data)

        # Entropy of a 10-bin histogram of the samples; the small constant
        # keeps empty bins from contributing log(0)
        hist, _ = np.histogram(data, bins=10)
        features[f'{prefix}_entropy'] = entropy(hist + 1e-6)

        # Frequency-domain features: magnitudes of the first half of the FFT.
        # NOTE(review): bin 0 (the sum of the samples) is included, so a large
        # constant offset dominates meanFreq/maxInds - confirm this is intended.
        freq_data = np.abs(fft(data))[:n_samples // 2]
        total_magnitude = np.sum(freq_data)
        if total_magnitude > 0:
            bin_indices = np.arange(len(freq_data))
            mean_freq = np.sum(freq_data * bin_indices) / total_magnitude
        else:
            # Empty spectrum (one sample) or all-zero signal: avoid 0/0
            mean_freq = 0.0
        features[f'{prefix}_meanFreq'] = mean_freq

        # Shape statistics of the time-domain samples. They are undefined for
        # a constant signal (zero variance), where scipy warns about precision
        # loss and may return NaN, so that case is pinned to 0.0.
        if data.max() == data.min():
            features[f'{prefix}_skewness'] = 0.0
            features[f'{prefix}_kurtosis'] = 0.0
        else:
            features[f'{prefix}_skewness'] = skew(data)
            features[f'{prefix}_kurtosis'] = kurtosis(data)

        # np.argmax raises ValueError on an empty array, so guard it
        features[f'{prefix}_maxInds'] = np.argmax(freq_data) if freq_data.size else 0

    return features

In [40]:
# Start from an empty list every time this cell runs. Without this the cell
# fails with NameError on a fresh kernel, and re-running it would append a
# second copy of every window's features.
features_list = []

# Process each window to calculate features
for window in windows:
    if window.empty:
        continue

    # Identifiers are read from the first row of the window
    window_id = window['window_id'].iloc[0]
    label = window['label'].iloc[0]
    user_id = window['user_id'].iloc[0]

    # Calculate features
    features = calculate_features(window)

    # Add identifiers so each feature row can be traced back to its window
    features['window_id'] = window_id
    features['label'] = label
    features['user_id'] = user_id

    features_list.append(features)

# Convert the list of feature dictionaries to a DataFrame (one row per window)
features_df = pd.DataFrame(features_list)

# Save the features to a new CSV file
output_path = r'C:\Users\Gjert\Masteroppgave\Datasets\capture24\properLabels\cleaned_window\P1.csv'
features_df.to_csv(output_path, index=False)

print(f"CSV file '{output_path}' created successfully.")

  features[f'{axes[axis]}_skewness'] = skew(data)
  features[f'{axes[axis]}_kurtosis'] = kurtosis(data)


CSV file 'C:\Users\Gjert\Masteroppgave\Datasets\capture24\properLabels\cleaned_window\all.csv' created successfully.


In [41]:
# Print all column names in the DataFrame
print(features_df.columns)


Index(['Ax_mean', 'Ax_std', 'Ax_mad', 'Ax_max', 'Ax_min', 'Ax_sma',
       'Ax_energy', 'Ax_iqr', 'Ax_entropy', 'Ax_meanFreq', 'Ax_skewness',
       'Ax_kurtosis', 'Ax_maxInds', 'Ay_mean', 'Ay_std', 'Ay_mad', 'Ay_max',
       'Ay_min', 'Ay_sma', 'Ay_energy', 'Ay_iqr', 'Ay_entropy', 'Ay_meanFreq',
       'Ay_skewness', 'Ay_kurtosis', 'Ay_maxInds', 'Az_mean', 'Az_std',
       'Az_mad', 'Az_max', 'Az_min', 'Az_sma', 'Az_energy', 'Az_iqr',
       'Az_entropy', 'Az_meanFreq', 'Az_skewness', 'Az_kurtosis', 'Az_maxInds',
       'window_id', 'label', 'user_id'],
      dtype='object')
