In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pprint

## ADL falls dataset
https://www.sciencedirect.com/science/article/pii/S2352340923008740#abs0001

In [2]:
# Load the dataset
df2 = pd.read_csv(
    "./combined_ADLfalls_data.csv",
    dtype={"x": "float64", "y": "float64", "z": "float64"}
)

In [3]:
# First few rows
print (df2.head(), '\n\n')

      timestamp         x         y         z sensor_type  activity_label  \
0  1.820000e+11  0.126893 -0.033519  0.067038         acc  Walking slowly   
1  1.820000e+11 -0.268151 -0.167594  0.122104         acc  Walking slowly   
2  1.830000e+11 -0.883460 -0.119710 -0.392649         acc  Walking slowly   
3  1.830000e+11 -0.253785  0.605733 -0.526724         acc  Walking slowly   
4  1.830000e+11 -0.189142  0.136469  0.165200         acc  Walking slowly   

   user_id  
0        1  
1        1  
2        1  
3        1  
4        1   




In [4]:
# Summarising structure
print(df2.info(), '\n\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2568079 entries, 0 to 2568078
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   timestamp       float64
 1   x               float64
 2   y               float64
 3   z               float64
 4   sensor_type     object 
 5   activity_label  object 
 6   user_id         int64  
dtypes: float64(4), int64(1), object(2)
memory usage: 137.2+ MB
None 




In [5]:
print(df2.head())

      timestamp         x         y         z sensor_type  activity_label  \
0  1.820000e+11  0.126893 -0.033519  0.067038         acc  Walking slowly   
1  1.820000e+11 -0.268151 -0.167594  0.122104         acc  Walking slowly   
2  1.830000e+11 -0.883460 -0.119710 -0.392649         acc  Walking slowly   
3  1.830000e+11 -0.253785  0.605733 -0.526724         acc  Walking slowly   
4  1.830000e+11 -0.189142  0.136469  0.165200         acc  Walking slowly   

   user_id  
0        1  
1        1  
2        1  
3        1  
4        1  


### Reconstruct timestamp using 20Hz

In [6]:
# Ensure timestamp is numeric
df2['timestamp'] = pd.to_numeric(df2['timestamp'], errors='coerce')

print(df2['timestamp'].diff())

0                   NaN
1          0.000000e+00
2          1.000000e+09
3          0.000000e+00
4          0.000000e+00
               ...     
2568074    0.000000e+00
2568075    0.000000e+00
2568076    0.000000e+00
2568077    0.000000e+00
2568078    0.000000e+00
Name: timestamp, Length: 2568079, dtype: float64


In [7]:
import pandas as pd

# Sort data properly
df2 = df2.sort_values(['user_id', 'activity_label', 'sensor_type', 'timestamp'])

# Compute time differences in seconds
df2['time_diff_sec'] = df2.groupby(['user_id', 'activity_label', 'sensor_type'])['timestamp'].diff() / 1e9

# Summary stats per sensor type
sampling_summary = (
    df2.groupby('sensor_type')['time_diff_sec']
       .agg(['min', 'max', 'median', 'mean'])
)

print(sampling_summary)


             min           max  median       mean
sensor_type                                      
acc          0.0  602923.93582     0.0   1.983798
acg          0.0  150460.00000     0.0   0.737863
gyro         0.0  314000.00000     0.0   1.224671
hrt          0.0  602632.00000     0.0  18.193957
mgm          0.0  599940.00000     0.0   1.686121


Timestamp dt is odd, many timestamps have no difference.
But the data source says data sampled at 20Hz. we can create timestamp then check if movement makes sense.

In [8]:
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2568079 entries, 1687530 to 1278974
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   timestamp       float64
 1   x               float64
 2   y               float64
 3   z               float64
 4   sensor_type     object 
 5   activity_label  object 
 6   user_id         int64  
 7   time_diff_sec   float64
dtypes: float64(5), int64(1), object(2)
memory usage: 176.3+ MB
None


In [9]:
import numpy as np
import pandas as pd

# Define sampling rates (Hz = samples per second)
sampling_rates = {
    'acc': 20,
    'gyro': 20,
    'acg': 20,
    'mgm': 20,
    'hrt': 1   # change name to match if you have HR sensor
}

df2["timestamp_clean"] = 0.0  # initialize

for (uid, act, sensor), g in df2.groupby(["user_id","activity_label","sensor_type"], group_keys=False):
    hz = sampling_rates.get(sensor, 20)
    n = len(g)
    df2.loc[g.index, "timestamp_clean"] = np.arange(n) / hz

# Now recompute time diffs
df2["time_diff_sec"] = (
    df2.groupby(["user_id", "activity_label", "sensor_type"])["timestamp_clean"].diff()
)

# Force to numeric
df2["timestamp_clean"] = pd.to_numeric(df2["timestamp_clean"], errors="coerce")
df2["time_diff_sec"] = pd.to_numeric(df2["time_diff_sec"], errors="coerce")


# Recalculate ROC using clean timestamps
df2[["roc_x", "roc_y", "roc_z"]] = (
    df2.groupby(["user_id", "activity_label", "sensor_type"])[["x", "y", "z"]]
       .diff()
    .div(df2["time_diff_sec"], axis=0)
)

# Drop the first row of each group (where diffs are NaN)
df2 = df2[df2.groupby(["user_id", "activity_label", "sensor_type"]).cumcount() != 0].reset_index(drop=True)





In [10]:
# Create a new column "vec_sum", initialize with NaN
df2["vec_sum"] = np.nan  

# Compute only for sensor types not equal to 'hrt'
mask = df2["sensor_type"] != "hrt"
df2.loc[mask, "vec_sum"] = np.sqrt(
    df2.loc[mask, "x"]**2 +
    df2.loc[mask, "y"]**2 +
    df2.loc[mask, "z"]**2
)

In [11]:
print(df2.head())

      timestamp         x         y         z sensor_type  \
0  3.660000e+12 -0.011971  0.081403  0.088585         acc   
1  3.660000e+12 -0.220267  0.040701 -0.090980         acc   
2  3.660000e+12  0.203507 -0.287304  0.052672         acc   
3  3.660000e+12  0.023942  0.004788  0.033519         acc   
4  3.660000e+12  0.100556 -0.110133 -0.033519         acc   

                       activity_label  user_id  time_diff_sec  \
0  Backward fall from seated position        1           0.05   
1  Backward fall from seated position        1           0.05   
2  Backward fall from seated position        1           0.05   
3  Backward fall from seated position        1           0.05   
4  Backward fall from seated position        1           0.05   

   timestamp_clean     roc_x     roc_y     roc_z   vec_sum  
0             0.05 -0.143652  0.239420  2.154782  0.120901  
1             0.10 -4.165911 -0.814029 -3.591303  0.241767  
2             0.15  8.475474 -6.560113  2.873042  0.355996 

### Form mean, standard deviation features

In [12]:
import pandas as pd

# Rename the last unnamed column (assumed to be the sensor type)
# if df2.columns[5].startswith('Unnamed'):
#     df2 = df2.rename(columns={df2.columns[5]: 'sensor_type'})

# Filter rows with expected sensor types only (optional for safety)
valid_sensors = ['gyro', 'acc', 'acg', 'mgm', 'hrt']
df2 = df2[df2['sensor_type'].isin(valid_sensors)]

# Keep only expected sensor types
# df2[['x', 'y', 'z']] = df2[['x', 'y', 'z']].apply(pd.to_numeric, errors='coerce')


# # Ensure t is numeric
# df2['t'] = pd.to_numeric(df2['t'], errors='coerce')

# # Compute time difference within each user/activity/sensor group
# df2['time_diff'] = (
#     df2.groupby(['user_id', 'activity_label', 'sensor_type'])['t'].diff()
# )

# # assuming t is in nanoseconds, so divide by 1e9
# df2['time_diff_sec'] = df2['time_diff'] / 1e9


# # Calculate rate of change within each user/activity/sensor group (dx,dy,dz)
# df2 = df2.sort_values(['user_id', 'activity_label', 'sensor_type', 'time_diff_sec'])

# df2[['roc_x', 'roc_y', 'roc_z']] = (
#     df2.groupby(['user_id', 'activity_label', 'sensor_type'])[['x', 'y', 'z']]
#         .diff()
# )

# # normalize by time difference (dt)
# df2[['roc_x', 'roc_y', 'roc_z']] = df2[['roc_x', 'roc_y', 'roc_z']].div(df2['time_diff_sec'], axis=0)

# # Drop rows where time_diff_sec <= 0 or NaN (first row of each group, or duplicate timestamps)
# df2 = df2[df2['time_diff_sec'] > 0]

# # Aggregate mean and std for x, y, z, and roc_x, roc_y, roc_z
# sensor_features = (
#     df2.groupby(['user_id', 'activity_label', 'sensor_type'])
#        [['x', 'y', 'z', 'roc_x', 'roc_y', 'roc_z']]
#        .agg(['mean', 'std'])
# )

# # Flatten multi-index column names
# sensor_features.columns = ['_'.join(col) for col in sensor_features.columns]

# # Reset index
# sensor_features = sensor_features.reset_index()


In [13]:
print(df2)

            timestamp          x         y          z sensor_type  \
0        3.660000e+12  -0.011971  0.081403   0.088585         acc   
1        3.660000e+12  -0.220267  0.040701  -0.090980         acc   
2        3.660000e+12   0.203507 -0.287304   0.052672         acc   
3        3.660000e+12   0.023942  0.004788   0.033519         acc   
4        3.660000e+12   0.100556 -0.110133  -0.033519         acc   
...               ...        ...       ...        ...         ...   
2563174  6.120000e+14  13.440000 -9.300000  24.300000         mgm   
2563175  6.120000e+14  13.740000 -8.700000  24.660000         mgm   
2563176  6.120000e+14  14.219999 -7.740000  25.080000         mgm   
2563177  6.120000e+14  14.759999 -6.600000  26.039999         mgm   
2563178  6.120000e+14  15.299999 -6.120000  25.800000         mgm   

                             activity_label  user_id  time_diff_sec  \
0        Backward fall from seated position        1           0.05   
1        Backward fall from s

In [14]:
print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2563159 entries, 0 to 2563178
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   timestamp        float64
 1   x                float64
 2   y                float64
 3   z                float64
 4   sensor_type      object 
 5   activity_label   object 
 6   user_id          int64  
 7   time_diff_sec    float64
 8   timestamp_clean  float64
 9   roc_x            float64
 10  roc_y            float64
 11  roc_z            float64
 12  vec_sum          float64
dtypes: float64(10), int64(1), object(2)
memory usage: 273.8+ MB
None


In [None]:
# print(sensor_features)

      user_id                      activity_label sensor_type      x_mean  \
0           1  Backward fall from seated position         acc   -0.110670   
1           1  Backward fall from seated position         acg    1.789186   
2           1  Backward fall from seated position        gyro    0.057254   
3           1  Backward fall from seated position         hrt  106.701754   
4           1  Backward fall from seated position         mgm    2.562600   
...       ...                                 ...         ...         ...   
4895       41                      Walking slowly         acc   -0.075795   
4896       41                      Walking slowly         acg   -9.570030   
4897       41                      Walking slowly        gyro    0.024302   
4898       41                      Walking slowly         hrt  104.421769   
4899       41                      Walking slowly         mgm   27.847767   

          x_std     y_mean      y_std     z_mean     z_std  roc_x_mean  \
0

In [None]:
# print(sensor_features.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4900 entries, 0 to 4899
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   user_id         4900 non-null   int64  
 1   activity_label  4900 non-null   object 
 2   sensor_type     4900 non-null   object 
 3   x_mean          4900 non-null   float64
 4   x_std           4887 non-null   float64
 5   y_mean          3920 non-null   float64
 6   y_std           3920 non-null   float64
 7   z_mean          3920 non-null   float64
 8   z_std           3920 non-null   float64
 9   roc_x_mean      4900 non-null   float64
 10  roc_x_std       4887 non-null   float64
 11  roc_y_mean      3920 non-null   float64
 12  roc_y_std       3920 non-null   float64
 13  roc_z_mean      3920 non-null   float64
 14  roc_z_std       3920 non-null   float64
dtypes: float64(12), int64(1), object(2)
memory usage: 574.3+ KB
None


In [None]:
# df2['rotation_magnitude'] = np.sqrt(sensor_features_named['rotation_rate_x']**2 + sensor_features_named['rotation_rate_y']**2 + sensor_features_named['rotation_rate_z']**2)
# df2['acceleration_magnitude'] = np.sqrt(sensor_features_named['user_acceleration_x']**2 + sensor_features_named['user_acceleration_y']**2 + sensor_features_named['user_acceleration_z']**2)
# sensor_features_named['rotation_magnitude_mean'] = sensor_features_named['rotation_magnitude'].mean()
# sensor_features_named['acceleration_magnitude_mean'] = sensor_features_named['acceleration_magnitude'].mean()
# sensor_features_named['rotation_magnitude_std'] = sensor_features_named['rotation_magnitude'].std()
# sensor_features_named['acceleration_magnitude_std'] = sensor_features_named['acceleration_magnitude'].std()

KeyError: 'rotation_rate_x'

### Map sensor names to features names in dryad dataset

In [15]:
import pandas as pd

# Mapping sensor types to your naming style
sensor_name_map = {
    'gyro': 'rotation_rate',
    'acg': 'user_acceleration',
    'acc': 'acceleration',
    'mgm': 'magnetometer',
    'hrt': 'heart_rate'
}

rows = []

# Group by user, activity, sensor
for (user_id, activity_label, sensor_type), group in df2.groupby(['user_id', 'activity_label', 'sensor_type']):
    sensor = sensor_name_map.get(sensor_type, sensor_type)

    row = {
        'user_id': user_id,
        'activity_label': activity_label
    }

    # Mean and std of x/y/z
    for axis in ['x', 'y', 'z', 'roc_x', 'roc_y', 'roc_z', 'vec_sum']:
        row[f'{sensor}_{axis}_mean'] = group[axis].astype(float).mean()
        row[f'{sensor}_{axis}_std'] = group[axis].astype(float).std()
        # row[f'{sensor}_{axis}'] = group[axis].astype(float).iloc[-1]  # last value # not keeping the original time-series data

    rows.append(row)

# Convert to flat dataframe
sensor_features_named = pd.DataFrame(rows)

# Merge all sensors for same user_id/activity_label
sensor_features_named = (
    sensor_features_named
    .groupby(['user_id', 'activity_label'], as_index=False)
    .first()
)


In [16]:
print(sensor_features_named.head())

   user_id                      activity_label  acceleration_x_mean  \
0        1  Backward fall from seated position            -0.110670   
1        1              Climbing down normally            -3.362136   
2        1                Climbing down slowly            -0.287586   
3        1                Climbing up normally            -0.695586   
4        1                  Climbing up slowly            -0.263609   

   acceleration_x_std  acceleration_y_mean  acceleration_y_std  \
0            1.601555             0.200556            1.235462   
1            5.816553            -0.687240            4.910136   
2            1.393648            -0.159989            1.583802   
3            3.534504            -0.084783            2.985748   
4            1.574136            -0.090240            1.991222   

   acceleration_z_mean  acceleration_z_std  acceleration_roc_x_mean  \
0            -0.205179            1.090325                -0.003715   
1            -2.194130            

In [17]:
# mean, std of all the time-based data

print(sensor_features_named.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 72 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         980 non-null    int64  
 1   activity_label                  980 non-null    object 
 2   acceleration_x_mean             980 non-null    float64
 3   acceleration_x_std              980 non-null    float64
 4   acceleration_y_mean             980 non-null    float64
 5   acceleration_y_std              980 non-null    float64
 6   acceleration_z_mean             980 non-null    float64
 7   acceleration_z_std              980 non-null    float64
 8   acceleration_roc_x_mean         980 non-null    float64
 9   acceleration_roc_x_std          980 non-null    float64
 10  acceleration_roc_y_mean         980 non-null    float64
 11  acceleration_roc_y_std          980 non-null    float64
 12  acceleration_roc_z_mean         980 

In [18]:
# original time-based data

print(df2.info())

<class 'pandas.core.frame.DataFrame'>
Index: 2563159 entries, 0 to 2563178
Data columns (total 13 columns):
 #   Column           Dtype  
---  ------           -----  
 0   timestamp        float64
 1   x                float64
 2   y                float64
 3   z                float64
 4   sensor_type      object 
 5   activity_label   object 
 6   user_id          int64  
 7   time_diff_sec    float64
 8   timestamp_clean  float64
 9   roc_x            float64
 10  roc_y            float64
 11  roc_z            float64
 12  vec_sum          float64
dtypes: float64(10), int64(1), object(2)
memory usage: 273.8+ MB
None


### Fill group_label with volunteer details

In [19]:
details = pd.read_csv("./volunteer_details.csv")
print(details.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Subject Id         41 non-null     int64  
 1   Gender             41 non-null     object 
 2   Height (cm)        41 non-null     float64
 3   Weight (KG)        41 non-null     float64
 4   Age                41 non-null     int64  
 5   Heart Rate (base)  41 non-null     int64  
 6   Health condition   41 non-null     object 
dtypes: float64(2), int64(3), object(2)
memory usage: 2.4+ KB
None


In [20]:
# Convert both ID columns to int type before merging
sensor_features_named['user_id'] = sensor_features_named['user_id'].astype(int)


# Map Health condition to health_status
# details['health_status'] = details['Health condition'].apply(
#     lambda x: 'Healthy' if x == 'No existing Health Issues' else 'Unhealthy'
# )

details['health_status'] = details['Health condition'].apply(
    lambda x: 'Unhealthy' if any(cond in str(x).lower() for cond in ['obese', 'overweight', 'arthritis']) else 'Healthy'
)

# Create age_group based on threshold
details['age_group'] = np.where(details['Age'] <= 35, 'young', 'older')


# Merge the group_label into df based on user_id
sensor_features_named = sensor_features_named.merge(details[['Subject Id', 'health_status', 'age_group']],
              left_on='user_id', right_on='Subject Id', how='left')

# Drop the temporary column
sensor_features_named = sensor_features_named.drop(columns=['Subject Id'])  # Drop Subject Id after merge
print(sensor_features_named.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 74 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         980 non-null    int64  
 1   activity_label                  980 non-null    object 
 2   acceleration_x_mean             980 non-null    float64
 3   acceleration_x_std              980 non-null    float64
 4   acceleration_y_mean             980 non-null    float64
 5   acceleration_y_std              980 non-null    float64
 6   acceleration_z_mean             980 non-null    float64
 7   acceleration_z_std              980 non-null    float64
 8   acceleration_roc_x_mean         980 non-null    float64
 9   acceleration_roc_x_std          980 non-null    float64
 10  acceleration_roc_y_mean         980 non-null    float64
 11  acceleration_roc_y_std          980 non-null    float64
 12  acceleration_roc_z_mean         980 

In [21]:
# Drop heart rate y and z

drop_cols = ['heart_rate_y','heart_rate_z','heart_rate_roc_y','heart_rate_roc_z','heart_rate_vec_sum']
columns_to_drop = [col for col in sensor_features_named.columns if any(k in col.lower() for k in drop_cols)]

sensor_features_named = sensor_features_named.drop(columns=columns_to_drop)

print(sensor_features_named.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 64 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         980 non-null    int64  
 1   activity_label                  980 non-null    object 
 2   acceleration_x_mean             980 non-null    float64
 3   acceleration_x_std              980 non-null    float64
 4   acceleration_y_mean             980 non-null    float64
 5   acceleration_y_std              980 non-null    float64
 6   acceleration_z_mean             980 non-null    float64
 7   acceleration_z_std              980 non-null    float64
 8   acceleration_roc_x_mean         980 non-null    float64
 9   acceleration_roc_x_std          980 non-null    float64
 10  acceleration_roc_y_mean         980 non-null    float64
 11  acceleration_roc_y_std          980 non-null    float64
 12  acceleration_roc_z_mean         980 

In [22]:
print(sensor_features_named['health_status'].unique())
print(sensor_features_named['age_group'].unique())

['Healthy' 'Unhealthy']
['young' 'older']


### Missing values

In [23]:
# Detecting missing values
missing_list = list(sensor_features_named.columns[sensor_features_named.isna().any()])       

# Percentage of missing values
row_count = sensor_features_named.shape[0] 
# Create list of dicts (one per column)
data = []
for col in missing_list:
    count = sensor_features_named[col].isnull().sum()
    percent = (count / row_count) * 100
    data.append({
        'Columns with missing values': col,
        'Missing count': count,
        'Missing %': percent,
        'DataType' : sensor_features_named[col].dtype
    })

# Create DataFrame from list of dicts
missing_df = pd.DataFrame(data)
print(missing_df)

  Columns with missing values  Missing count  Missing % DataType
0            heart_rate_x_std             13   1.326531  float64
1        heart_rate_roc_x_std             13   1.326531  float64


In [24]:
# Checking if missing values have a common pattern (13)

null_val = sensor_features_named[sensor_features_named['heart_rate_x_std'].isnull()]
print(null_val.shape)
for column in null_val.columns:
    print(null_val[column].value_counts(),"\n")

(13, 64)
user_id
1     1
3     1
6     1
7     1
11    1
21    1
25    1
26    1
28    1
30    1
35    1
37    1
39    1
Name: count, dtype: int64 

activity_label
Slowly sitting on chair                   2
Forward fall landing on knee              2
Climbing up normally                      2
Lying on back and getting up slowly       1
Nearly sitting on chair and getting up    1
Grabbing while falling                    1
Climbing down normally                    1
Rapidly sitting on chair                  1
Lying on bed                              1
Walking slowly                            1
Name: count, dtype: int64 

acceleration_x_mean
-0.409282    1
 0.034173    1
 0.006174    1
 0.009353    1
-1.973457    1
 0.060649    1
-0.010947    1
 0.074109    1
 0.033691    1
 0.018031    1
 0.055932    1
-0.084038    1
 0.020249    1
Name: count, dtype: int64 

acceleration_x_std
0.868738    1
0.618508    1
0.729795    1
3.993570    1
7.597158    1
0.280410    1
2.893882    1
1.267732

No common user_id or activity_label

In [25]:
import plotly.express as px
import plotly.io as pio

# Set the default renderer to open plots in your web browser ('browser') or inline ('notebook')
pio.renderers.default = 'browser' 

# Subset only columns with missing values, sorted in a certain way to view correlations
missing_list_sort = ['heart_rate_x_std', 
                     'heart_rate_roc_x_std']

missing_df = sensor_features_named[missing_list_sort]

# Filter rows with missing values
missing_rows = missing_df[missing_df.isnull().any(axis=1)]

# Create boolean DataFrame of missing values
mask = missing_rows.isnull()

# Convert to long format for Plotly heatmap
mask_long = mask.reset_index().melt(id_vars='index', var_name='Column', value_name='Is Missing')

# Plot interactive heatmap
fig = px.imshow(
    mask.values,
    labels=dict(x="Column", y="Row Index", color="Missing"),
    x=mask.columns,
    y=mask.index,
    color_continuous_scale=["#ffffff", "#636efa"],  # white for present, blue for missing
    aspect="auto"
)

fig.update_layout(title="Interactive Missing Data Heatmap", height=600)
fig.show()

the missing data in the 2 columns occur in the same rows.

### Standardize activities

In [26]:
# Target categories
# ['Other exercise' 'Hygiene' 'Work' 'Eat' 'Relax' 'Travel' 'Errands' 'Sleep' 'Socialize' 'Walk' 'Hobby' 'Sitting down' 'Climbing stairs' 'Housework' 'Dressing' 'Run' 'Fall']

# Convert to lowercase for easier matching
sensor_features_named['activity_label'] = sensor_features_named['activity_label'].astype(str).str.lower()

# # Convert to lowercase for easier matching
# target_df['activity_label'] = target_df['activity_label'].astype(str).str.lower()

# Define mapping function
def categorize_activity(label):
    if 'eat' in label or 'meal' in label or 'drink' in label:
        return 'Eat'
    elif 'errand' in label or 'shop' in label or 'store' in label or 'grocer' in label:
        return 'Errands'
    elif 'exercise' in label or  'workout' in label or 'cycle' in label or  'jump' in label:
        return 'Other exercise'
    elif 'run' in label or 'jog' in label:
        return 'Run'
    elif 'walk' in label:
        return 'Walk'
    elif 'climb' in label:
        return 'Climbing stairs'
    elif 'sit' in label:
        return 'Sitting down'
    elif 'game' in label or 'gaming' in label or 'play' in label or 'art' in label or 'tv' in label or 'read' in label or 'video' in label:
        return 'Hobby'
    elif 'clean' in label or 'chores' in label or 'housework' in label or 'dishes' in label or 'cook' in label:
        return 'Housework'
    elif 'brush' in label or 'toilet' in label or 'shower' in label or 'hygiene' in label or 'groom' in label or 'bath' in label:
        return 'Hygiene'
    elif 'dress' in label or 'makeup' in label or 'hair' in label:
        return 'Dressing'
    elif 'relax' in label or 'chill' in label or 'rest' in label or 'watch' in label:
        return 'Relax'
    elif 'sleep' in label or 'lying' in label or 'lie' in label: # or 'wake' in label or 'bed' in label or 'woke' in label:
        return 'Sleep'
    elif 'social' in label or 'talk' in label or 'phone' in label or 'message' in label or 'text' in label:
        return 'Socialize'
    elif 'travel' in label or 'bus' in label or 'drive' in label or 'commute' in label: # or 'fly' in label
        return 'Travel'
    elif 'work' in label or 'school' in label or 'lab' in label or 'homework' in label or 'report' in label or 'meeting' in label or 'class' in label or 'research' in label or 'study' in label:
        return 'Work'
    elif 'fall' in label:
        return 'Fall'
    else:
        return None # rows not in the above categories will be dropped

# Apply the categorization
sensor_features_named['activity_label'] = sensor_features_named['activity_label'].apply(categorize_activity)

# Drop rows where category is None (would have been 'Other')
sensor_features_named = sensor_features_named[sensor_features_named['activity_label'].notna()]

# # Apply the categorization
# target_df['activity_category'] = target_df['activity_label'].apply(categorize_activity)

# # Drop rows where category is None (would have been 'Other')
# target_df = target_df[target_df['activity_category'].notna()]

In [27]:
print(sensor_features_named['activity_label'].unique())

['Eat' 'Climbing stairs' 'Fall' 'Run' 'Other exercise' 'Sleep'
 'Sitting down' 'Walk']


In [28]:
print(sensor_features_named.info())

<class 'pandas.core.frame.DataFrame'>
Index: 939 entries, 0 to 979
Data columns (total 64 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   user_id                         939 non-null    int64  
 1   activity_label                  939 non-null    object 
 2   acceleration_x_mean             939 non-null    float64
 3   acceleration_x_std              939 non-null    float64
 4   acceleration_y_mean             939 non-null    float64
 5   acceleration_y_std              939 non-null    float64
 6   acceleration_z_mean             939 non-null    float64
 7   acceleration_z_std              939 non-null    float64
 8   acceleration_roc_x_mean         939 non-null    float64
 9   acceleration_roc_x_std          939 non-null    float64
 10  acceleration_roc_y_mean         939 non-null    float64
 11  acceleration_roc_y_std          939 non-null    float64
 12  acceleration_roc_z_mean         939 non-n

### Saving ADL falls dataset

In [29]:
# Save renamed ADL falls dataset
# sensor_features_named.to_csv("ADLfalls_renamed.csv", index=False)
sensor_features_named.to_csv("ADLfalls_renamed_relabelledHealth_limitedActivities.csv", index=False)


## Cleaned ADL falls dataset

In [56]:
# Load renamed ADL falls dataset
df3 = pd.read_csv("./ADLfalls_renamed.csv")

In [57]:
print(df3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 100 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   user_id                       980 non-null    int64  
 1   activity_label                980 non-null    object 
 2   acceleration_x_mean           980 non-null    float64
 3   acceleration_x_std            980 non-null    float64
 4   acceleration_x                980 non-null    float64
 5   acceleration_y_mean           980 non-null    float64
 6   acceleration_y_std            980 non-null    float64
 7   acceleration_y                980 non-null    float64
 8   acceleration_z_mean           980 non-null    float64
 9   acceleration_z_std            980 non-null    float64
 10  acceleration_z                980 non-null    float64
 11  acceleration_roc_x_mean       980 non-null    float64
 12  acceleration_roc_x_std        980 non-null    float64
 13  acce

In [55]:
# keywords of columns to keep
eng_cols = ['mean', 'std', 'label', 'id', 'health', 'age']

# Filter columns that contain any of the keywords
columns_to_keep = [col for col in df3.columns if any(k in col.lower() for k in eng_cols)]

# Create a new filtered DataFrame
filtered_df3 = df3[columns_to_keep]
print(filtered_df3.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 980 entries, 0 to 979
Data columns (total 68 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   user_id                       980 non-null    int64  
 1   activity_label                980 non-null    object 
 2   acceleration_x_mean           980 non-null    float64
 3   acceleration_x_std            980 non-null    float64
 4   acceleration_y_mean           980 non-null    float64
 5   acceleration_y_std            980 non-null    float64
 6   acceleration_z_mean           980 non-null    float64
 7   acceleration_z_std            980 non-null    float64
 8   acceleration_roc_x_mean       980 non-null    float64
 9   acceleration_roc_x_std        980 non-null    float64
 10  acceleration_roc_y_mean       980 non-null    float64
 11  acceleration_roc_y_std        980 non-null    float64
 12  acceleration_roc_z_mean       980 non-null    float64
 13  accel

## Dryad (ArWISE) dataset
https://datadryad.org/landing/show?id=doi%3A10.5061%2Fdryad.jdfn2z3nm#readme

In [None]:
df1 = pd.read_csv("./combined_filtered_withActivities.csv")
print(df1.info())

In [None]:
# keywords of columns to keep
eng_cols = ['mean', 'std', 'magnitude', 'label', 'id']

# Filter columns that contain any of the keywords
columns_to_keep = [col for col in df1.columns if any(k in col.lower() for k in eng_cols)]

# Create a new filtered DataFrame
filtered_df1 = df1[columns_to_keep]

# keywords of columns to drop
drop_cols = ['speed','course','home','day']
columns_to_drop = [col for col in filtered_df1.columns if any(k in col.lower() for k in drop_cols)]

filtered_df1 = filtered_df1.drop(columns=columns_to_drop)

print(filtered_df1.info())

In [None]:
# Detecting missing values
missing_list = list(filtered_df1.columns[filtered_df1.isna().any()])       

# Percentage of missing values
row_count = filtered_df1.shape[0] 
# Create list of dicts (one per column)
data = []
for col in missing_list:
    count = filtered_df1[col].isnull().sum()
    percent = (count / row_count) * 100
    data.append({
        'Columns with missing values': col,
        'Missing count': count,
        'Missing %': percent,
        'DataType' : filtered_df1[col].dtype
    })

# Create DataFrame from list of dicts
missing_df = pd.DataFrame(data)
print(missing_df)

In [None]:
# looks like 0.04% of dryad dataset have missing values, just drop because we have a lot of data
filtered_df1 = filtered_df1.dropna()

In [None]:
# Detecting missing values
missing_list = list(filtered_df1.columns[filtered_df1.isna().any()])       

print(missing_list)

In [None]:
print(mask.info())

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Filter only rows with 'HOA/SCD/MCI'
mask = filtered_df1['group_label'] == 'HOA/SCD/MCI'
hoa_data = filtered_df1.loc[mask]

# Pick numeric health-related features (example columns, change to your real ones)
exclude = ['activity_label', 'group_label', 'user_id']
features = hoa_data.drop(columns=exclude).select_dtypes(include='number').columns
X = hoa_data[features].dropna()

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Cluster into 2 groups: healthy vs unhealthy
# Cluster into 3 groups: HOA, SCD and MCI
kmeans = KMeans(n_clusters=3, random_state=42)
hoa_data['health_cluster'] = kmeans.fit_predict(X_scaled)

# Map cluster IDs to 'healthy'/'unhealthy' after inspecting means
cluster_means = hoa_data.groupby('health_cluster')[features].mean()
print(cluster_means)

# Example mapping after inspection
# cluster_map = {0: 'healthy', 1: 'unhealthy'}  # Adjust based on means
# hoa_data['health_status'] = hoa_data['health_cluster'].map(cluster_map)


# Merge back into original details dataframe
# filtered_df1.loc[mask, 'health_status'] = hoa_data['health_status']


In [None]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import numpy as np

# Reduce to 2D for visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Get cluster centers in original space, then transform to PCA space
cluster_centers = hoa_data.groupby('health_cluster')[features].mean().values
cluster_centers_pca = pca.transform(cluster_centers)

# Plot sampled points
sample_idx = np.random.choice(len(X_pca), size=500, replace=False)
plt.scatter(X_pca[sample_idx, 0], X_pca[sample_idx, 1],
            c=hoa_data['health_cluster'].iloc[sample_idx],
            cmap='coolwarm', alpha=0.6)

# Plot cluster means
plt.scatter(cluster_centers_pca[:, 0], cluster_centers_pca[:, 1],
            c='black', marker='X', s=200, label='Cluster Means')

plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title("KMeans Clusters with Means")
plt.legend()
plt.show()


In [None]:
# Get distances to each cluster center
distances = kmeans.transform(X_scaled)

# Calculate "margin" between distances
margin = np.abs(distances[:, 0] - distances[:, 1])

# Keep only points with a large margin (far from boundary)
threshold = np.percentile(margin, 20)  # remove lowest 20% (closest to boundary)
hoa_data_clean = hoa_data[margin > threshold]

print(f"Removed {len(hoa_data) - len(hoa_data_clean)} boundary points")


In [None]:
from sklearn.decomposition import PCA

# Features from the clean dataset
X_clean = hoa_data_clean[features].dropna()

# Scale using the SAME scaler from before
X_clean_scaled = scaler.transform(X_clean)

# PCA using the SAME pca from before (so axes match original plot)
X_clean_pca = pca.transform(X_clean_scaled)

# Plot clean clusters
# Plot sampled points
sample_idx = np.random.choice(len(X_pca), size=500, replace=False)
plt.scatter(X_pca[sample_idx, 0], X_pca[sample_idx, 1],
            c=hoa_data['health_cluster'].iloc[sample_idx],
            cmap='coolwarm', alpha=0.6)

# Plot cluster centers (same as before)
plt.scatter(cluster_centers_pca[:, 0], cluster_centers_pca[:, 1],
            c='black', marker='X', s=200, label='Cluster Means')

plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.title("KMeans Clusters (Boundary Points Removed)")
plt.legend(loc='upper right')
plt.show()



In [None]:

print(hoa_data['health_status'].unique())

## Merging datasets

In [None]:
import pandas as pd

# Assuming filtered_df1 (dryad) and filtered_df3 (ADLfalls) have the same column names
# sort=False keeps order
# if either df doesn't have same columns, leave as NaN
merged_df = pd.concat([filtered_df1, filtered_df3], axis=0, ignore_index=True, sort=False)


In [None]:
print(merged_df.info())

### Drop some features

In [None]:
# Drop acceleration and magnetometer as these are not in dryad dataset
# Drop yaw pitch roll as these are not in ADLfalls

# keywords of columns to drop
drop_kw = ['yaw','pitch','roll','magnetometer']
columns_to_drop = [col for col in merged_df.columns if any(k in col.lower() for k in drop_kw)]

drop_cols = ['acceleration_x_mean','acceleration_x_std',
             'acceleration_y_mean','acceleration_y_std',
             'acceleration_z_mean','acceleration_z_std'
            ] + columns_to_drop

merged_df = merged_df.drop(columns=drop_cols)

print(merged_df.info())

In [None]:
# Detecting missing values
missing_list = list(merged_df.columns[merged_df.isna().any()])       

# Percentage of missing values
row_count = merged_df.shape[0] 
# Create list of dicts (one per column)
data = []
for col in missing_list:
    count = merged_df[col].isnull().sum()
    percent = (count / row_count) * 100
    data.append({
        'Columns with missing values': col,
        'Missing count': count,
        'Missing %': percent,
        'DataType' : merged_df[col].dtype
    })

# Create DataFrame from list of dicts
missing_df = pd.DataFrame(data)
print(missing_df)

### Standardize and keep only certain activity categories in merged dataset

In [None]:

# Target categories
# ['Other exercise' 'Hygiene' 'Work' 'Eat' 'Relax' 'Travel' 'Errands' 'Sleep' 'Socialize' 'Walk' 'Hobby' 'Sitting down' 'Climbing stairs' 'Housework' 'Dressing' 'Run' 'Fall']

# Convert to lowercase for easier matching
merged_df['activity_label'] = merged_df['activity_label'].astype(str).str.lower()

# Define mapping function
def categorize_activity(label):
    if 'eat' in label or 'meal' in label or 'drink' in label:
        return 'Eat'
    elif 'errand' in label or 'shop' in label or 'store' in label or 'grocer' in label:
        return 'Errands'
    elif 'exercise' in label or  'workout' in label or 'cycle' in label or  'jump' in label:
        return 'Other exercise'
    elif 'run' in label or 'jog' in label:
        return 'Run'
    elif 'walk' in label:
        return 'Walk'
    elif 'climb' in label:
        return 'Climbing stairs'
    elif 'sit' in label:
        return 'Sitting down'
    elif 'game' in label or 'gaming' in label or 'play' in label or 'art' in label or 'tv' in label or 'read' in label or 'video' in label:
        return 'Hobby'
    elif 'clean' in label or 'chores' in label or 'housework' in label or 'dishes' in label or 'cook' in label:
        return 'Housework'
    elif 'brush' in label or 'toilet' in label or 'shower' in label or 'hygiene' in label or 'groom' in label or 'bath' in label:
        return 'Hygiene'
    elif 'dress' in label or 'makeup' in label or 'hair' in label:
        return 'Dressing'
    elif 'relax' in label or 'chill' in label or 'rest' in label or 'watch' in label:
        return 'Relax'
    elif 'sleep' in label or 'lying' in label or 'lie' in label: # or 'wake' in label or 'bed' in label or 'woke' in label:
        return 'Sleep'
    elif 'social' in label or 'talk' in label or 'phone' in label or 'message' in label or 'text' in label:
        return 'Socialize'
    elif 'travel' in label or 'bus' in label or 'drive' in label or 'commute' in label: # or 'fly' in label
        return 'Travel'
    elif 'work' in label or 'school' in label or 'lab' in label or 'homework' in label or 'report' in label or 'meeting' in label or 'class' in label or 'research' in label or 'study' in label:
        return 'Work'
    elif 'fall' in label:
        return 'Fall'
    else:
        return None # rows not in the above categories will be dropped

# Apply the categorization
merged_df['activity_category'] = merged_df['activity_label'].apply(categorize_activity)

# Drop rows where category is None (would have been 'Other')
merged_df = merged_df[merged_df['activity_category'].notna()]


In [None]:
print(merged_df['activity_category'].unique())
print(merged_df['group_label'].unique())

In [None]:
print(merged_df.info())

In [None]:
# Save final combined dataset

merged_df.to_csv("final_combined.csv", index=False)

## Load final combined df

In [None]:
df = pd.read_csv("./final_combined.csv")
print(df.info())

In [None]:
# Statistics
# mean, standard deviation, minimum (min), maximum (max), and quartile values
print(df.describe().T)

In [None]:
# Detecting missing values
missing_list = list(df.columns[df.isna().any()])       

# Percentage of missing values
row_count = df.shape[0] 
# Create list of dicts (one per column)
data = []
for col in missing_list:
    count = df[col].isnull().sum()
    percent = (count / row_count) * 100
    data.append({
        'Columns with missing values': col,
        'Missing count': count,
        'Missing %': percent,
        'DataType' : df[col].dtype
    })

# Create DataFrame from list of dicts
missing_df = pd.DataFrame(data)
print(missing_df)

In [None]:
print(df['group_label'].unique())

consider using ADLfalls to learn to cluster HOA vs MCI/? as healthy vs unhealthy