# UCI HAR Dataset

In [8]:
import pandas as pd
from collections import defaultdict



In [11]:
import pandas as pd
from collections import Counter

# File paths
base = "./Human Activity Recognition using Smartphones/UCI HAR Dataset/"
X_train_path = base + "train/X_train.txt"
y_train_path = base + "train/y_train.txt"
subject_train_path = base + "train/subject_train.txt"
X_test_path = base + "test/X_test.txt"
y_test_path = base + "test/y_test.txt"
subject_test_path = base + "test/subject_test.txt"
features_path = base + "features.txt"
activity_labels_path = base + "activity_labels.txt"

# Load features with deduplication
features_path = "./Human Activity Recognition using Smartphones/UCI HAR Dataset/features.txt"
features_df = pd.read_csv(features_path, sep=r"\s+", header=None)
raw_features = features_df[1].tolist()

# Deduplicate feature names
counts = defaultdict(int)
features = []
for name in raw_features:
    if counts[name]:
        new_name = f"{name}_{counts[name]}"
    else:
        new_name = name
    features.append(new_name)
    counts[name] += 1

# Load datasets
X_train = pd.read_csv(X_train_path, sep=r'\s+', header=None, names=features)
y_train = pd.read_csv(y_train_path, header=None, names=["activity"])
subject_train = pd.read_csv(subject_train_path, header=None, names=["subject"])
X_test = pd.read_csv(X_test_path, sep=r'\s+', header=None, names=features)
y_test = pd.read_csv(y_test_path, header=None, names=["activity"])
subject_test = pd.read_csv(subject_test_path, header=None, names=["subject"])

# Combine
train_df = pd.concat([subject_train, y_train, X_train], axis=1)
test_df = pd.concat([subject_test, y_test, X_test], axis=1)

# Activity label mapping
activity_labels = pd.read_csv(activity_labels_path, sep=r'\s+', header=None, names=["id", "label"])
activity_map = dict(zip(activity_labels.id, activity_labels.label))
train_df["activity"] = train_df["activity"].map(activity_map)
test_df["activity"] = test_df["activity"].map(activity_map)

print(train_df.head())


   subject  activity  tBodyAcc-mean()-X  tBodyAcc-mean()-Y  tBodyAcc-mean()-Z  \
0        1  STANDING           0.288585          -0.020294          -0.132905   
1        1  STANDING           0.278419          -0.016411          -0.123520   
2        1  STANDING           0.279653          -0.019467          -0.113462   
3        1  STANDING           0.279174          -0.026201          -0.123283   
4        1  STANDING           0.276629          -0.016570          -0.115362   

   tBodyAcc-std()-X  tBodyAcc-std()-Y  tBodyAcc-std()-Z  tBodyAcc-mad()-X  \
0         -0.995279         -0.983111         -0.913526         -0.995112   
1         -0.998245         -0.975300         -0.960322         -0.998807   
2         -0.995380         -0.967187         -0.978944         -0.996520   
3         -0.996091         -0.983403         -0.990675         -0.997099   
4         -0.998139         -0.980817         -0.990482         -0.998321   

   tBodyAcc-mad()-Y  ...  fBodyBodyGyroJerkMag-mea

In [12]:
train_df

Unnamed: 0,subject,activity,tBodyAcc-mean()-X,tBodyAcc-mean()-Y,tBodyAcc-mean()-Z,tBodyAcc-std()-X,tBodyAcc-std()-Y,tBodyAcc-std()-Z,tBodyAcc-mad()-X,tBodyAcc-mad()-Y,...,fBodyBodyGyroJerkMag-meanFreq(),fBodyBodyGyroJerkMag-skewness(),fBodyBodyGyroJerkMag-kurtosis(),"angle(tBodyAccMean,gravity)","angle(tBodyAccJerkMean),gravityMean)","angle(tBodyGyroMean,gravityMean)","angle(tBodyGyroJerkMean,gravityMean)","angle(X,gravityMean)","angle(Y,gravityMean)","angle(Z,gravityMean)"
0,1,STANDING,0.288585,-0.020294,-0.132905,-0.995279,-0.983111,-0.913526,-0.995112,-0.983185,...,-0.074323,-0.298676,-0.710304,-0.112754,0.030400,-0.464761,-0.018446,-0.841247,0.179941,-0.058627
1,1,STANDING,0.278419,-0.016411,-0.123520,-0.998245,-0.975300,-0.960322,-0.998807,-0.974914,...,0.158075,-0.595051,-0.861499,0.053477,-0.007435,-0.732626,0.703511,-0.844788,0.180289,-0.054317
2,1,STANDING,0.279653,-0.019467,-0.113462,-0.995380,-0.967187,-0.978944,-0.996520,-0.963668,...,0.414503,-0.390748,-0.760104,-0.118559,0.177899,0.100699,0.808529,-0.848933,0.180637,-0.049118
3,1,STANDING,0.279174,-0.026201,-0.123283,-0.996091,-0.983403,-0.990675,-0.997099,-0.982750,...,0.404573,-0.117290,-0.482845,-0.036788,-0.012892,0.640011,-0.485366,-0.848649,0.181935,-0.047663
4,1,STANDING,0.276629,-0.016570,-0.115362,-0.998139,-0.980817,-0.990482,-0.998321,-0.979672,...,0.087753,-0.351471,-0.699205,0.123320,0.122542,0.693578,-0.615971,-0.847865,0.185151,-0.043892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,30,WALKING_UPSTAIRS,0.299665,-0.057193,-0.181233,-0.195387,0.039905,0.077078,-0.282301,0.043616,...,-0.070157,-0.588433,-0.880324,-0.190437,0.829718,0.206972,-0.425619,-0.791883,0.238604,0.049819
7348,30,WALKING_UPSTAIRS,0.273853,-0.007749,-0.147468,-0.235309,0.004816,0.059280,-0.322552,-0.029456,...,0.165259,-0.390738,-0.680744,0.064907,0.875679,-0.879033,0.400219,-0.771840,0.252676,0.050053
7349,30,WALKING_UPSTAIRS,0.273387,-0.017011,-0.045022,-0.218218,-0.103822,0.274533,-0.304515,-0.098913,...,0.195034,0.025145,-0.304029,0.052806,-0.266724,0.864404,0.701169,-0.779133,0.249145,0.040811
7350,30,WALKING_UPSTAIRS,0.289654,-0.018843,-0.158281,-0.219139,-0.111412,0.268893,-0.310487,-0.068200,...,0.013865,0.063907,-0.344314,-0.101360,0.700740,0.936674,-0.589479,-0.785181,0.246432,0.025339


# HuGADB

In [2]:
import pandas as pd
import glob
import os


In [3]:
file_path = "./HuGaDB/Data/HuGaDB_v1_bicycling_01_00.txt"

df = pd.read_csv(file_path, sep=r"\s+", engine='python', skiprows=3)


In [4]:
df.shape

(8867, 39)

In [30]:
df

Unnamed: 0,acc_rf_x,acc_rf_y,acc_rf_z,gyro_rf_x,gyro_rf_y,gyro_rf_z,acc_rs_x,acc_rs_y,acc_rs_z,gyro_rs_x,...,gyro_ls_z,acc_lt_x,acc_lt_y,acc_lt_z,gyro_lt_x,gyro_lt_y,gyro_lt_z,EMG_r,EMG_l,act
0,-14392,-96,10628,138,-388,492,-13980,-1408,-6808,-519,...,183,-11752,2228,9120,261,277,522,130,125,9
1,-11248,164,7744,169,699,429,-14044,-2608,-6992,-214,...,63,-13920,3292,8916,132,86,581,136,124,9
2,-15892,-1952,9324,-217,709,360,-13336,-3564,-7192,-326,...,69,-12756,2608,8796,174,-96,641,127,127,9
3,-13020,892,8604,-297,-62,170,-13832,-3220,-8068,-482,...,155,-13900,4236,8312,88,-63,553,122,127,9
4,-12780,-948,10744,-301,-1359,39,-13680,-2888,-6068,-889,...,-146,-12428,2596,8448,166,66,583,122,125,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8862,-22292,1944,2356,4170,7649,7617,-17820,-4640,-8456,135,...,-3608,-1508,-1188,1908,-407,-5415,2324,122,144,9
8863,-22428,5292,5204,2678,5379,5434,-20172,-2864,-6400,1496,...,-3247,-580,-2140,1988,837,-2803,1683,120,131,9
8864,-25484,-388,12520,6228,6605,6589,-22460,-4828,-4252,-2783,...,-3496,-1764,-380,3444,1672,-568,1471,128,120,9
8865,-16140,-3892,5920,5414,9979,8453,-22008,-820,-6416,1041,...,-3842,-4256,280,6032,2075,1275,1231,134,101,9


In [106]:
file_pattern = os.path.join(data_folder, "HuGaDB_v1_*.txt")

In [107]:
file_pattern

'/Users/ecekaracam/Documents/GitHub/Erdos Su25/Decoding Human Activity Erdos Summer 2025/HuGaDB/Data/HuGaDB_v1_*.txt'

In [108]:
a = glob.glob(file_pattern)[0]

In [109]:
a

'/Users/ecekaracam/Documents/GitHub/Erdos Su25/Decoding Human Activity Erdos Summer 2025/HuGaDB/Data/HuGaDB_v1_various_16_05.txt'

In [110]:
a = os.path.basename(a)

In [111]:
parts = a.replace(".txt", "").split("_")

In [112]:
parts[-2]+parts[1]

'16v1'

In [114]:
parts[3]


'16'

In [133]:
def load_hugadb_to_dataframe(data_folder):
    all_records = []
    # match every HuGaDB_v1_<activity>_<subject>_<trial>.txt
    file_pattern = os.path.join(data_folder, "HuGaDB_v1_*.txt")

    for filepath in glob.glob(file_pattern):
        basename = os.path.basename(filepath)
        parts = basename.replace(".txt", "").split("_")
        # parts = ["HuGaDB", "v1", "activity", "subject_id", "counter"]
        if parts[3] == 'str':
            activity = str(parts[2]+parts+[3]+parts[4])
        else:
            activity = parts[2]
        if activity == "various":
            # skip every file whose activity == "various"
            continue

        participant_id = int(parts[-2])
        
        # now read “filepath” as before…
        # (skiprows=3 so that the 4th line is the header row,
        #  then load all numeric rows)
        df = pd.read_csv(
            filepath,
            sep = r"\s+|\t",
            header = 0,
            skiprows = 3,
            engine = 'python'
        )
        df.insert(0, "subject", participant_id)
        df.insert(1, "activity", activity)
        all_records.append(df)

    return pd.concat(all_records, ignore_index = True) if all_records else pd.DataFrame()

In [134]:
data_folder = '/Users/ecekaracam/Documents/GitHub/Erdos Su25/Decoding Human Activity Erdos Summer 2025/HuGaDB/Data'

In [135]:
df = load_hugadb_to_dataframe(data_folder)
# df = df.sort_values(by = ['subject','activity']).reset_index(drop = True)


In [121]:
df['activity']

0         bicycling
1         bicycling
2         bicycling
3         bicycling
4         bicycling
            ...    
973878      walking
973879      walking
973880      walking
973881      walking
973882      walking
Name: activity, Length: 973883, dtype: object

In [136]:
df

Unnamed: 0,subject,activity,acc_rf_x,acc_rf_y,acc_rf_z,gyro_rf_x,gyro_rf_y,gyro_rf_z,acc_rs_x,acc_rs_y,...,gyro_ls_z,acc_lt_x,acc_lt_y,acc_lt_z,gyro_lt_x,gyro_lt_y,gyro_lt_z,EMG_r,EMG_l,act
0,6,walking,-30460,21604,19660,2132,5002,2255,-26360,-19264,...,-853,-26072,5912,17784,260,-1891,198,123,131,1
1,6,walking,-21988,3956,25252,1383,1600,2521,-15864,-18384,...,-357,-28824,-6344,20440,-548,-2469,-494,115,116,1
2,6,walking,-31924,-30064,7848,1825,610,684,-23544,-7760,...,-518,-22728,-3944,5696,903,-2106,-579,139,131,1
3,6,walking,-14096,-4760,17476,1182,27,658,-15880,-6472,...,-927,-28000,6352,-13408,2848,-1334,78,116,132,1
4,6,walking,-9992,-280,18764,751,-136,268,-13816,5536,...,-1181,-28304,9600,-10688,1586,-2191,1009,114,129,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973878,6,sitting,-8228,-5444,12636,-13,16,-22,-14976,592,...,21,-2704,4896,16184,31,-2,8,117,134,5
973879,6,sitting,-8272,-5460,12640,-13,12,-25,-14904,616,...,23,-2672,4936,16168,28,-5,5,116,134,5
973880,6,sitting,-8284,-5440,12636,-3,14,-24,-15000,432,...,23,-2648,5008,16120,38,-6,0,116,133,5
973881,6,sitting,-8228,-5496,12636,-5,11,-21,-15040,648,...,29,-2608,4968,16200,41,-9,-1,117,133,5


In [122]:
df.to_csv("hugadb_all.csv",index = False)

In [125]:
df = pd.read_csv("/Users/ecekaracam/Documents/GitHub/Erdos Su25/Decoding Human Activity Erdos Summer 2025/HuGaDB Data Frame/hugadb_all.csv"
               )