In [None]:
# Mount Google Drive so the dataset CSVs stored there can be read from this runtime
from google.colab import drive
import pandas as pd

# All dataset paths below live under this mount point.
drive.mount('/content/drive')

# Define the paths to your datasets in Google Drive
mhealth_file_path = '/content/drive/My Drive/Pawsey/mhealth_raw_data.csv'
pamap2_file_path = '/content/drive/My Drive/PAMAP2.csv'
# UCI HAR loading is disabled in this cell; only MHEALTH and PAMAP2 are read.
#uci_har_file_path = '/content/UCI_HAR_dataset.csv'

# Read the datasets (each CSV is loaded in full; no column or row subset is applied here)
mhealth_raw_sample = pd.read_csv(mhealth_file_path)
pamap2_sample = pd.read_csv(pamap2_file_path)
#uci_har_sample = pd.read_csv(uci_har_file_path)

# Define the common column names for accelerometer and gyroscope data.
# Each key is the unified column name; its list holds the dataset-specific
# column names that rename_columns() maps onto that key.
# The entries appear to be ordered [MHEALTH, PAMAP2, UCI HAR] -- TODO confirm
# against the raw CSV headers.
# Key order matters: selected_columns is built from common_columns.keys(),
# so it also fixes the column order of the exported CSV files.
common_columns = {
    'accelerometer_x': ['alx', 'handAcc16_1', 'tBodyAcc-mean()-X'],
    'accelerometer_y': ['aly', 'handAcc16_2', 'tBodyAcc-mean()-Y'],
    'accelerometer_z': ['alz', 'handAcc16_3', 'tBodyAcc-mean()-Z'],
    'gyroscope_x': ['glx', 'handGyro1', 'tBodyGyro-mean()-X'],
    'gyroscope_y': ['gly', 'handGyro2', 'tBodyGyro-mean()-Y'],
    'gyroscope_z': ['glz', 'handGyro3', 'tBodyGyro-mean()-Z'],
    # Entries equal to the key itself ('activity', 'subject') are identity renames.
    'activity': ['Activity', 'activityID', 'activity'],
    # 'subject' is listed twice; the duplicate entry has no effect.
    'subject': ['subject', 'subject_id', 'subject']
}

# Create a function to rename the columns in each dataset
def rename_columns(df, column_mapping):
    """Return a new DataFrame whose dataset-specific column names are unified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be renamed. It is never modified.
    column_mapping : dict
        Maps each common (target) column name to a list of source column
        names that should be renamed to it.

    Returns
    -------
    pandas.DataFrame
        A new frame. Columns that are not mentioned in ``column_mapping``
        keep their names.
    """
    # Follow every column's name through the mapping in order, exactly as a
    # sequence of one-column renames would resolve it, but on the names only.
    # Calling ``df.rename`` once per matched column rebuilt the whole frame
    # for each match, which is wasteful on multi-million-row data.
    current_names = {column: column for column in df.columns}
    for common_name, original_names in column_mapping.items():
        for original_name in original_names:
            if original_name == common_name:
                continue  # identity rename, nothing to do
            for column, name in current_names.items():
                if name == original_name:
                    current_names[column] = common_name

    # Apply all resolved renames in a single pass over the frame.
    renames = {old: new for old, new in current_names.items() if old != new}
    return df.rename(columns=renames)

# Bring both raw frames onto the shared column names
mhealth_renamed, pamap2_renamed = (
    rename_columns(raw_frame, common_columns)
    for raw_frame in (mhealth_raw_sample, pamap2_sample)
)
#uci_har_renamed = rename_columns(uci_har_sample, common_columns)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Activity IDs kept from each source dataset.
# NOTE(review): the activity names quoted below follow the published MHEALTH and
# PAMAP2 activity tables as I understand them -- confirm against the dataset READMEs.
#
# MHEALTH: this list was originally annotated "Standing still, Sitting and relaxing,
# Walking, Running, Cycling", but in the MHEALTH table those appear to be IDs
# 1, 2, 4, 11, 9. The IDs actually listed here appear to be
# 0 = null class (no activity), 1 = Standing still, 3 = Lying down, 10 = Jogging,
# 8 = Knees bending. The value counts saved later in this notebook (872550 rows for
# the class mapped from 0 versus roughly 30000 for every other class) are consistent
# with 0 being the null class.
mhealth_activities = [0, 1, 3, 10, 8]
# PAMAP2 baseline set: Standing, Sitting, Walking, Running, Cycling
pamap2_activities = [3, 2, 4, 5, 6]
# Second PAMAP2 set, used below to build the frame tagged 'real drift'. These are NOT
# standing/sitting/walking/running/cycling; per the PAMAP2 table they appear to be
# Nordic walking, Vacuum cleaning, Descending stairs, Ironing, Ascending stairs.
pamap2_activities2 = [7, 16, 13, 17, 12]
#uci_har_activities = ['STANDING', 'SITTING', 'WALKING','WALKING_UPSTAIRS','WALKING_DOWNSTAIRS']         # Standing, Sitting, Walking

# Filter datasets by common activities
def filter_by_activity(df, activity_col, activities):
    """Keep only the rows whose activity label is one of ``activities``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    activity_col : str
        Name of the column holding the activity label.
    activities : list-like
        Labels to keep. An empty collection yields an empty frame.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows, with their original
        index labels preserved.

    Raises
    ------
    KeyError
        If ``activity_col`` is not a column of ``df``.
    """
    keep_rows = df[activity_col].isin(activities)
    # Return an explicit copy: a bare boolean-mask selection is treated by
    # pandas as a slice of ``df``, so adding or overwriting columns on it
    # later triggers SettingWithCopyWarning.
    return df.loc[keep_rows].copy()

# Apply filtering to each dataset
mhealth_filtered = filter_by_activity(mhealth_renamed, 'activity', mhealth_activities)
pamap2_filtered = filter_by_activity(pamap2_renamed, 'activity', pamap2_activities)
# NOTE: despite the name, this frame is built from PAMAP2 (second activity set),
# not from the UCI HAR dataset, whose loading is commented out above.
uci_har_filtered = filter_by_activity(pamap2_renamed, 'activity', pamap2_activities2)

# Select the common columns from each dataset.
# .copy() makes every selection an independent frame: later cells add a 'drift'
# column and overwrite 'activity', and doing that on a bare column selection
# raises SettingWithCopyWarning.
selected_columns = list(common_columns.keys())
mhealth_selected = mhealth_filtered[selected_columns].copy()
pamap2_selected = pamap2_filtered[selected_columns].copy()
uci_har_selected = uci_har_filtered[selected_columns].copy()

# Save the new datasets to CSV files (raw activity IDs, no 'drift' column yet)
mhealth_selected.to_csv('/content/drive/My Drive/Pawsey/mhealth_filtered.csv', index=False)
pamap2_selected.to_csv('/content/drive/My Drive/pamap2_filtered.csv', index=False)
uci_har_selected.to_csv('/content/drive/My Drive/uci_har_filtered.csv', index=False)

In [None]:
# Start the "hidden drift" variant from a re-indexed version of the PAMAP2 selection.
# reset_index(drop=True) discards the original row labels and numbers the rows from 0,
# so a Series taken from pamap2_selected no longer aligns row-for-row with this frame
# when it is assigned into it.
pamap2_selected_hidden = pamap2_selected.reset_index(drop=True)

In [None]:
# Preview the hidden-drift frame.
# NOTE(review): the saved output below already contains a 'drift' column, which is only
# added in a later cell -- the cells appear to have been executed out of order.
pamap2_selected_hidden

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
0,2.124620,9.24108,2.77335,0.037944,0.024816,-0.016013,2,1,hidden drift
1,2.338250,9.27392,2.39156,0.040065,0.011347,0.009483,2,1,hidden drift
2,2.227290,9.23752,2.46700,0.017566,-0.017100,-0.006770,2,1,hidden drift
3,2.190040,9.12433,2.50531,-0.047912,0.007181,0.005693,2,1,hidden drift
4,2.391820,9.23971,2.85433,-0.046287,-0.000137,0.012163,2,1,hidden drift
...,...,...,...,...,...,...,...,...,...
876674,0.489770,2.82383,9.23639,-0.017195,0.006944,0.007758,5,8,hidden drift
876675,0.370253,2.70955,9.08103,-0.032333,-0.044224,0.011758,5,8,hidden drift
876676,0.644139,2.70986,9.35454,-0.004483,-0.001148,0.039415,5,8,hidden drift
876677,0.566153,2.74771,9.27627,-0.022839,0.045628,0.049042,5,8,hidden drift


In [None]:
# Tag every row of the re-indexed PAMAP2 frame as the "hidden drift" variant.
pamap2_selected_hidden['drift'] = 'hidden drift'

In [None]:

# Tag each variant with its drift type.
# assign() returns a new frame, so the label is not written into a column selection
# of the filtered data (plain item assignment there raises SettingWithCopyWarning).
# NOTE(review): uci_har_selected is derived from PAMAP2 rows, not from UCI HAR.
# NOTE(review): 'psedo drift' looks like a typo of 'pseudo drift'; it is kept
# byte-for-byte because the label is written to the exported CSV and downstream
# consumers may match on it.
uci_har_selected = uci_har_selected.assign(drift='real drift')
pamap2_selected = pamap2_selected.assign(drift='No drift')
mhealth_selected = mhealth_selected.assign(drift='psedo drift')

In [None]:
# Preview the hidden-drift frame again.
# NOTE(review): the saved output below shows the original PAMAP2 row labels (30115, ...)
# and drift == 'No drift', which does not match a reset-index frame tagged
# 'hidden drift'. It looks like stale output from out-of-order execution; restart the
# kernel and run all cells to refresh it.
pamap2_selected_hidden

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
30115,2.124620,9.24108,2.77335,0.037944,0.024816,-0.016013,2,1,No drift
30116,2.338250,9.27392,2.39156,0.040065,0.011347,0.009483,2,1,No drift
30117,2.227290,9.23752,2.46700,0.017566,-0.017100,-0.006770,2,1,No drift
30118,2.190040,9.12433,2.50531,-0.047912,0.007181,0.005693,2,1,No drift
30119,2.391820,9.23971,2.85433,-0.046287,-0.000137,0.012163,2,1,No drift
...,...,...,...,...,...,...,...,...,...
2818443,0.489770,2.82383,9.23639,-0.017195,0.006944,0.007758,5,8,No drift
2818444,0.370253,2.70955,9.08103,-0.032333,-0.044224,0.011758,5,8,No drift
2818445,0.644139,2.70986,9.35454,-0.004483,-0.001148,0.039415,5,8,No drift
2818446,0.566153,2.74771,9.27627,-0.022839,0.045628,0.049042,5,8,No drift


Dataset Type	        Description	Purpose

*   Original Dataset (O)	Original HAR dataset	Baseline signature (PAMAP2)
*   Pseudo Change (P)	   Slight change in statistical signature but no new activities (MHEALTH)
*   Real Change (R)	      Inject new activities or alter class distributions (UCI; in this notebook the frame is built from a second set of PAMAP2 activities)
*   Hidden Change (H)	 Signature stays the same, but some activities are silently altered (relabelled PAMAP2)

In [None]:
import pandas as pd

# Assuming mhealth_selected, pamap2_selected, uci_har_selected and
# pamap2_selected_hidden are the DataFrames built in the cells above and that
# their 'activity' columns still hold the raw dataset IDs. This cell is not
# idempotent: running it twice would remap already-remapped labels.

# Define the mappings from old labels to new labels (shared label space 0-4).
# NOTE(review): the MHEALTH names below are the ones this notebook assumes; per the
# MHEALTH activity table, ID 0 appears to be the null class and 3/10/8 appear to be
# Lying down / Jogging / Knees bending -- confirm against the dataset README.
label_mapping_mhealth = {
    0: 0,  # Standing still -> 0
    1: 1,  # Sitting and relaxing -> 1
    3: 2,  # Walking -> 2
    10: 3, # Running -> 3
    8: 4   # Cycling -> 4
}

# PAMAP2 baseline ("No drift") labels.
label_mapping_pamap2 = {
    3: 0,  # Standing -> 0
    2: 1,  # Sitting -> 1
    4: 2,  # Walking -> 2
    5: 3,  # Running -> 3
    6: 4   # Cycling -> 4
}

# PAMAP2 "hidden drift" labels: same raw IDs as the baseline, but the target labels
# are permuted so that the same signals carry different class labels.
label_mapping_pamap4 = {
    3: 2,  # Standing -> 2 (baseline: 0)
    2: 3,  # Sitting  -> 3 (baseline: 1)
    4: 4,  # Walking  -> 4 (baseline: 2)
    5: 1,  # Running  -> 1 (baseline: 3)
    6: 0   # Cycling  -> 0 (baseline: 4)
}

# Second PAMAP2 activity set ("real drift"): new activities placed on labels 0-4.
# Activity names per the PAMAP2 activity table -- TODO confirm.
label_mapping_pamap3 = {
    7: 0,   # Nordic walking -> 0
    16: 1,  # Vacuum cleaning -> 1
    13: 2,  # Descending stairs -> 2
    17: 3,  # Ironing -> 3
    12: 4   # Ascending stairs -> 4
}

# Apply the mappings to the activity columns.
# assign() builds a new frame instead of writing into a column selection of the
# filtered data, which is what raised SettingWithCopyWarning.
mhealth_selected = mhealth_selected.assign(
    activity=mhealth_selected['activity'].replace(label_mapping_mhealth)
)
pamap2_selected = pamap2_selected.assign(
    activity=pamap2_selected['activity'].replace(label_mapping_pamap2)
)
uci_har_selected = uci_har_selected.assign(
    activity=uci_har_selected['activity'].replace(label_mapping_pamap3)
)
# Hidden drift: remap this frame's OWN raw activity IDs. Taking the column from
# pamap2_selected was wrong twice over: that Series keeps the original row labels
# (30115, ...) while this frame is indexed from 0, so index alignment filled most
# rows with NaN; and its values had just been converted to 0-4 by the line above,
# so the raw-ID keys of label_mapping_pamap4 no longer matched them.
pamap2_selected_hidden = pamap2_selected_hidden.assign(
    activity=pamap2_selected_hidden['activity'].replace(label_mapping_pamap4)
)
assert pamap2_selected_hidden['activity'].notna().all(), "hidden-drift labels contain NaN"

# Check the results
print(mhealth_selected['activity'].value_counts())
print(pamap2_selected['activity'].value_counts())
print(uci_har_selected['activity'].value_counts())
print(pamap2_selected_hidden['activity'].value_counts())

activity
0    872550
1     30720
2     30720
3     30720
4     29337
Name: count, dtype: int64
activity
2    238761
0    189931
1    185188
4    164600
3     98199
Name: count, dtype: int64
activity
3    238690
0    188107
1    175353
4    117216
2    104944
Name: count, dtype: int64
activity
3.0    54786
1.0    51490
4.0    48683
0.0    47293
2.0    30503
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mhealth_selected['activity'] = mhealth_selected['activity'].replace(label_mapping_mhealth)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pamap2_selected['activity'] = pamap2_selected['activity'].replace(label_mapping_pamap2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  uci_har_selected['activity

In [None]:
# Preview the MHEALTH frame after relabeling.
# Note that 'subject' holds strings such as 'subject1' in the saved output below.
mhealth_selected

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
0,2.1849,-9.6967,0.63077,0.103900,-0.84053,-0.68762,0,subject1,psedo drift
1,2.3876,-9.5080,0.68389,0.085343,-0.83865,-0.68369,0,subject1,psedo drift
2,2.4086,-9.5674,0.68113,0.085343,-0.83865,-0.68369,0,subject1,psedo drift
3,2.1814,-9.4301,0.55031,0.085343,-0.83865,-0.68369,0,subject1,psedo drift
4,2.4173,-9.3889,0.71098,0.085343,-0.83865,-0.68369,0,subject1,psedo drift
...,...,...,...,...,...,...,...,...,...
1215740,1.7849,-9.8287,0.29725,-0.341370,-0.90056,-0.61493,0,subject10,psedo drift
1215741,1.8687,-9.8766,0.46236,-0.341370,-0.90056,-0.61493,0,subject10,psedo drift
1215742,1.6928,-9.9290,0.16631,-0.341370,-0.90056,-0.61493,0,subject10,psedo drift
1215743,1.5279,-9.6306,0.30458,-0.341370,-0.90056,-0.61493,0,subject10,psedo drift


In [None]:
# Preview the PAMAP2 baseline frame after relabeling (original row labels are kept).
pamap2_selected

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
30115,2.124620,9.24108,2.77335,0.037944,0.024816,-0.016013,1,1,No drift
30116,2.338250,9.27392,2.39156,0.040065,0.011347,0.009483,1,1,No drift
30117,2.227290,9.23752,2.46700,0.017566,-0.017100,-0.006770,1,1,No drift
30118,2.190040,9.12433,2.50531,-0.047912,0.007181,0.005693,1,1,No drift
30119,2.391820,9.23971,2.85433,-0.046287,-0.000137,0.012163,1,1,No drift
...,...,...,...,...,...,...,...,...,...
2818443,0.489770,2.82383,9.23639,-0.017195,0.006944,0.007758,3,8,No drift
2818444,0.370253,2.70955,9.08103,-0.032333,-0.044224,0.011758,3,8,No drift
2818445,0.644139,2.70986,9.35454,-0.004483,-0.001148,0.039415,3,8,No drift
2818446,0.566153,2.74771,9.27627,-0.022839,0.045628,0.049042,3,8,No drift


In [None]:
# Preview the frame tagged 'real drift'. Despite its name it is built from PAMAP2
# rows (second activity set), not from the UCI HAR dataset.
uci_har_selected

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
84967,2.43859,7.02674,5.74905,-0.075247,0.070647,-0.040244,3,1,real drift
84968,2.58814,6.98730,5.71293,-0.102412,-0.011661,-0.035736,3,1,real drift
84969,2.59135,6.87449,5.82882,-0.039697,-0.004252,-0.019637,3,1,real drift
84970,2.55276,6.95033,5.78950,0.015871,0.023975,-0.032205,3,1,real drift
84971,2.75106,7.14058,6.02276,0.068200,-0.031930,-0.030819,3,1,real drift
...,...,...,...,...,...,...,...,...,...
2796343,-1.68428,-8.97338,3.43203,-0.231392,-0.391747,0.180935,0,8,real drift
2796344,-1.72527,-9.04964,3.35469,-0.252115,-0.338597,0.180709,0,8,real drift
2796345,-1.53312,-8.97455,3.43429,-0.176675,-0.311570,0.172539,0,8,real drift
2796346,-1.50362,-9.01479,3.20395,-0.173602,-0.291495,0.170721,0,8,real drift


In [None]:
# Preview the hidden-drift frame after relabeling.
# NOTE(review): the saved output below shows NaN in 'activity' for the first rows and
# float labels (1.0) for the last ones; after a clean run this column should hold
# integer labels only, so treat NaN here as a sign of a misaligned assignment.
pamap2_selected_hidden

Unnamed: 0,accelerometer_x,accelerometer_y,accelerometer_z,gyroscope_x,gyroscope_y,gyroscope_z,activity,subject,drift
0,2.124620,9.24108,2.77335,0.037944,0.024816,-0.016013,,1,hidden drift
1,2.338250,9.27392,2.39156,0.040065,0.011347,0.009483,,1,hidden drift
2,2.227290,9.23752,2.46700,0.017566,-0.017100,-0.006770,,1,hidden drift
3,2.190040,9.12433,2.50531,-0.047912,0.007181,0.005693,,1,hidden drift
4,2.391820,9.23971,2.85433,-0.046287,-0.000137,0.012163,,1,hidden drift
...,...,...,...,...,...,...,...,...,...
876674,0.489770,2.82383,9.23639,-0.017195,0.006944,0.007758,1.0,8,hidden drift
876675,0.370253,2.70955,9.08103,-0.032333,-0.044224,0.011758,1.0,8,hidden drift
876676,0.644139,2.70986,9.35454,-0.004483,-0.001148,0.039415,1.0,8,hidden drift
876677,0.566153,2.74771,9.27627,-0.022839,0.045628,0.049042,1.0,8,hidden drift


In [None]:

# Combine them into a single DataFrame
# Stack the four drift variants row-wise; ignore_index=True gives the result a
# fresh 0..n-1 index instead of the per-frame row labels.
drift_frames = [mhealth_selected, pamap2_selected, uci_har_selected, pamap2_selected_hidden]
combined_df = pd.concat(drift_frames, ignore_index=True)

In [None]:
# Sanity check: number of rows contributed by each drift variant.
combined_df['drift'].value_counts()

Unnamed: 0_level_0,count
drift,Unnamed: 1_level_1
psedo drift,994047
No drift,876679
hidden drift,876679
real drift,824310


In [None]:
# Export the combined dataset to Drive; index=False keeps the row index out of the CSV.
combined_df.to_csv('/content/drive/My Drive/Early Drift Detection/Final_dataset_drift.csv', index=False)