# Generating Features for Heavy Drinking Dataset

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

## Splitting Dataset into Timed Windows

In [2]:
df = pd.read_parquet("data/Data/BU4707_250hz.parquet")
df['magnitude'] = (df['x']**2 + df['y']**2 + df['z']**2)**(1/2)
df['XYZ_sum'] = df['x'] + df['y'] + df['z']

In [3]:
df = df[:10000]

df

Unnamed: 0,time,pid,x,y,z,TAC_class,magnitude,XYZ_sum
0,2017-05-02 15:03:24.780,BU4707,-0.0094,0.0147,0.0186,0.0,0.025503,0.0239
1,2017-05-02 15:03:24.784,BU4707,-0.0094,0.0147,0.0186,0.0,0.025503,0.0239
2,2017-05-02 15:03:24.788,BU4707,-0.0094,0.0147,0.0186,0.0,0.025503,0.0239
3,2017-05-02 15:03:24.792,BU4707,-0.0094,0.0147,0.0186,0.0,0.025503,0.0239
4,2017-05-02 15:03:24.796,BU4707,-0.0094,0.0147,0.0186,0.0,0.025503,0.0239
...,...,...,...,...,...,...,...,...
9995,2017-05-02 15:04:04.760,BU4707,0.0004,0.0075,0.0019,0.0,0.007747,0.0098
9996,2017-05-02 15:04:04.764,BU4707,0.0004,0.0075,0.0019,0.0,0.007747,0.0098
9997,2017-05-02 15:04:04.768,BU4707,0.0004,0.0075,0.0019,0.0,0.007747,0.0098
9998,2017-05-02 15:04:04.772,BU4707,0.0004,0.0075,0.0019,0.0,0.007747,0.0098


In [4]:
window='4S'
grouped = df.groupby(pd.Grouper(key='time', freq=window))

  grouped = df.groupby(pd.Grouper(key='time', freq=window))


In [4]:
def create_data_windows(pid, window):
    df = pd.read_parquet(f"data/Data/{pid}_250hz.parquet")
    df['magnitude'] = (df['x']**2 + df['y']**2 + df['z']**2)**(1/2)
    df['XYZ_sum'] = df['x'] + df['y'] + df['z']

    grouped = df.groupby(pd.Grouper(key='time', freq=window))

    return grouped

In [6]:
grouped = create_data_windows("BU4707", "4S")

  grouped = df.groupby(pd.Grouper(key='time', freq=window))


## Feature Calculation Functions

In [3]:
# descriptive statistic features

def generate_mean(df_window):
    mean_features = []
    for field in ['x', 'y', 'z', 'magnitude']:
        mean_features.append(np.mean(df_window[field]))

    return mean_features


def generate_stdev(df_window):
    std_features = []
    for field in ['x', 'y', 'z', 'magnitude']:
        std_features.append(np.std(df_window[field]))

    return std_features


def generate_median(df_window):
    median_features = []
    for field in ['x', 'y', 'z', 'magnitude']:
        median_features.append(np.median(df_window[field]))

    return median_features


def generate_zero_crossing_rate(df_window):
    zero_crossing_rate_features = []
    for field in ['x', 'y', 'z']:
        changes = np.sign(df_window[field]).diff()
        num_crossings = (changes != 0).sum()
        percent_crossings = num_crossings / len(df_window)
        zero_crossing_rate_features.append(percent_crossings)

    return zero_crossing_rate_features


def generate_max(df_window):
    max_features = []
    for field in ['x', 'y', 'z', 'magnitude']:
        max_features.append(np.max(abs(df_window[field])))

    return max_features


def generate_time_domain_entropy(df_window, num_bins=10):
    # Discretize the signal into bins
    bins = np.linspace(min(df_window['XYZ_sum']), max(df_window['XYZ_sum']), num_bins+1)
    
    # Count occurrences of values falling into each bin
    counts, _ = np.histogram(df_window['XYZ_sum'], bins=bins)
    
    # Calculate probability distribution
    probabilities = counts / len(df_window['XYZ_sum'])
    
    # Calculate entropy using Shannon entropy formula
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Add small epsilon to avoid log(0)
    
    return [entropy]

In [5]:
# features requiring fast fourier transform

def generate_frequency_domain_entropy(df_window, num_bins=10):
    # Apply the FFT to the signal
    fft_result = np.fft.fft(df_window['XYZ_sum'])
    
    # Calculate the power spectrum
    power_spectrum = np.abs(fft_result) ** 2
    
    # Discretize the power spectrum into bins
    bins = np.linspace(min(power_spectrum), max(power_spectrum), num_bins+1)
    
    # Count occurrences of values falling into each bin
    counts, _ = np.histogram(power_spectrum, bins=bins)
    
    # Calculate probability distribution
    probabilities = counts / len(power_spectrum)
    
    # Calculate entropy using Shannon entropy formula
    entropy = -np.sum(probabilities * np.log2(probabilities + 1e-10))  # Add small epsilon to avoid log(0)
    
    return [entropy]


def generate_spectral_centroid(df_window):
    # Calculating the weighted mean of frequencids
    # The index in the fft_result is the frequency, and the real component of the fft represents the amplitude of the FFT

    # Apply the FFT to the signal
    fft_result = np.fft.fft(df_window['XYZ_sum'])

    centroid = 0
    for i in range(len(fft_result)):
        centroid += i * fft_result[i].real

    return [centroid]

## Calculating Features

In [2]:
def generate_features(grouped):
    feature_df_rows = []
    col_names = ['time', 'pid', 'x_mean', 'y_mean', 'z_mean', 'mag_mean', 'x_std', 'y_std', 'z_std', 'mag_std', 
                'x_median', 'y_median', 'z_median', 'mag_median', 'x_crossing', 'y_crossing', 'z_crossing', 
                'x_max', 'y_max', 'z_max', 'mag_max', 'time_entropy', 'frequency_entropy', 'spectral_centroid',
                'TAC_class']

    # the group name is the start of the window
    for group_name, group_data in tqdm(grouped):
        # the last TAC class value in the window will be the label of the feature
        TAC_class_label = group_data['TAC_class'].iloc[-1]
        
        # converting a list of data to a dataframe
        feature_funcs = [generate_mean, generate_stdev, generate_median, generate_zero_crossing_rate, generate_max, generate_time_domain_entropy,
                        generate_frequency_domain_entropy, generate_spectral_centroid]
        data = [group_name, group_data['pid'].iloc[0]]
        for func in feature_funcs:
            data += func(group_data)
        data += [TAC_class_label]
        
        feature_row = pd.DataFrame([data], columns=col_names)
        feature_df_rows.append(feature_row)

    feature_df = pd.concat(feature_df_rows)

    return feature_df

In [10]:
BU_grouped = create_data_windows("BU4707", "4s")
BU_features_df = generate_features(BU_grouped)

100%|██████████| 5105/5105 [00:19<00:00, 265.34it/s]


In [11]:
BU_features_df

Unnamed: 0,time,pid,x_mean,y_mean,z_mean,mag_mean,x_std,y_std,z_std,mag_std,...,y_crossing,z_crossing,x_max,y_max,z_max,mag_max,time_entropy,frequency_entropy,spectral_centroid,TAC_class
0,2017-05-02 15:03:24,BU4707,0.004059,0.008182,0.016625,0.020170,0.004455,0.005005,0.004811,0.004586,...,0.016149,0.001242,0.0134,0.0198,0.0294,0.032076,2.814321,0.013782,-1609.034,0.0
0,2017-05-02 15:03:28,BU4707,0.004460,0.010343,0.015098,0.026052,0.017296,0.013844,0.013065,0.018374,...,0.009000,0.007000,0.0984,0.0876,0.0523,0.107829,1.511384,0.011408,399.500,0.0
0,2017-05-02 15:03:32,BU4707,0.001078,0.009816,0.010411,0.021312,0.010945,0.010432,0.011737,0.010867,...,0.022000,0.027000,0.0421,0.0395,0.0457,0.059516,2.856420,0.011408,8647.250,0.0
0,2017-05-02 15:03:36,BU4707,0.000497,0.009387,0.011929,0.018118,0.004815,0.007249,0.008565,0.007175,...,0.017000,0.013000,0.0162,0.0349,0.0414,0.042666,2.537039,0.011408,4493.150,0.0
0,2017-05-02 15:03:40,BU4707,0.007589,0.009644,0.013029,0.052346,0.033374,0.039780,0.038114,0.041583,...,0.038000,0.041000,0.1570,0.2261,0.1856,0.262216,1.851298,0.032219,-4581.100,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2017-05-02 20:43:24,BU4707,0.016699,-0.012569,0.016587,0.197553,0.176150,0.145658,0.146950,0.188479,...,0.044000,0.050000,1.2373,0.9929,0.6614,1.588386,1.628981,0.687452,-208108.300,1.0
0,2017-05-02 20:43:28,BU4707,0.008537,-0.010702,0.007937,0.080178,0.073954,0.066523,0.069564,0.092497,...,0.027000,0.036000,0.3571,0.2873,0.2402,0.508535,1.320611,0.354640,-2335.950,1.0
0,2017-05-02 20:43:32,BU4707,0.001893,-0.000069,-0.001685,0.031464,0.032940,0.029907,0.019735,0.037221,...,0.022000,0.034000,0.1386,0.1181,0.1023,0.198836,1.716787,0.331169,-2219.100,1.0
0,2017-05-02 20:43:36,BU4707,0.008098,-0.005247,-0.001066,0.125733,0.065064,0.074173,0.119175,0.090682,...,0.018000,0.021000,0.2006,0.2026,0.6097,0.618793,2.461837,0.401491,-3092.700,1.0


In [12]:
BU_features_df.to_parquet("BU4707_features.parquet")

In [8]:
PC_grouped = create_data_windows("PC6771", "4s")
PC_features_df = generate_features(PC_grouped)

100%|██████████| 21551/21551 [01:15<00:00, 284.12it/s]


In [None]:
PC_features_df

Unnamed: 0,time,pid,x_mean,y_mean,z_mean,mag_mean,x_std,y_std,z_std,mag_std,...,y_crossing,z_crossing,x_max,y_max,z_max,mag_max,time_entropy,frequnecy_entropy,spectral_centroid,TAC_class
0,2017-05-02 14:28:08,PC6771,-0.000234,0.004719,0.000408,0.036785,0.034206,0.024611,0.027860,0.034946,...,0.047938,0.062430,0.1805,0.1400,0.1016,0.240117,2.666996,0.257686,-2853.8055,0.0
0,2017-05-02 14:28:12,PC6771,-0.001441,0.009211,-0.000628,0.038326,0.029327,0.026813,0.033540,0.036364,...,0.033000,0.043000,0.1305,0.1608,0.1964,0.253894,1.813810,0.348589,-41121.0000,0.0
0,2017-05-02 14:28:16,PC6771,-0.000472,0.003480,-0.000157,0.021697,0.015678,0.013623,0.013843,0.012830,...,0.025000,0.044000,0.0457,0.0400,0.0499,0.057050,2.959210,0.220652,-6525.5000,0.0
0,2017-05-02 14:28:20,PC6771,-0.001158,0.003452,0.000168,0.007787,0.004132,0.003345,0.005310,0.003018,...,0.043000,0.052000,0.0109,0.0146,0.0140,0.016279,3.042663,0.249421,2219.1500,0.0
0,2017-05-02 14:28:24,PC6771,-0.000589,0.002490,0.001538,0.010073,0.006384,0.008117,0.006698,0.007678,...,0.045000,0.057000,0.0224,0.0338,0.0224,0.042184,2.022944,0.236493,3980.9000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,2017-05-03 14:24:32,PC6771,0.004141,0.001454,-0.004160,0.007920,0.004333,0.003445,0.003140,0.003788,...,0.041000,0.011000,0.0234,0.0161,0.0149,0.028412,2.110097,0.400665,-867.1500,0.0
0,2017-05-03 14:24:36,PC6771,0.004367,0.001212,-0.004231,0.006879,0.001930,0.002289,0.001990,0.002009,...,0.036000,0.003000,0.0095,0.0104,0.0122,0.016488,2.322623,0.179955,-624.0000,0.0
0,2017-05-03 14:24:40,PC6771,0.004519,0.001208,-0.004322,0.015609,0.008059,0.013852,0.008073,0.010905,...,0.041000,0.028000,0.0362,0.0550,0.0322,0.066660,2.701444,0.242337,-9402.5500,0.0
0,2017-05-03 14:24:44,PC6771,0.004740,0.001729,-0.003984,0.012082,0.007737,0.008720,0.006241,0.008379,...,0.041000,0.024000,0.0394,0.0369,0.0264,0.044899,2.483432,0.167479,22607.2500,0.0


In [9]:
PC_features_df.to_parquet("PC6771_features.parquet")