In [11]:
import pandas as pd
import os

# Directory holding the GOTOV_with_WINDOWS_<n>.csv files. The original
# machine-specific location stays the default; set the GOTOV_DATA_DIR
# environment variable to run the notebook elsewhere without editing it.
data_dir = os.environ.get(
    "GOTOV_DATA_DIR",
    r"C:\Users\Gjert\Masteroppgave\Datasets\GOTOV\Windows_User_individual",
)

# File numbers assigned to each split: 1-5 -> test, 6-35 -> train.
# (Presumably one file per user_id - confirm against the data.)
TEST_FILE_IDS = range(1, 6)
TRAIN_FILE_IDS = range(6, 36)


def load_windows(filenames):
    """Read every named CSV from ``data_dir`` and stack them into one frame.

    Args:
        filenames: iterable of file names located directly inside ``data_dir``.

    Returns:
        A single DataFrame with the rows of all files, in the order of
        ``filenames``, re-indexed 0..n-1.

    Raises:
        FileNotFoundError: propagated from ``pd.read_csv`` if a file is missing.
    """
    frames = [pd.read_csv(os.path.join(data_dir, name)) for name in filenames]
    return pd.concat(frames, ignore_index=True)


# Get filenames:
test_files = [f"GOTOV_with_WINDOWS_{i}.csv" for i in TEST_FILE_IDS]
train_files = [f"GOTOV_with_WINDOWS_{i}.csv" for i in TRAIN_FILE_IDS]

# Read and concatenate the files
df_test = load_windows(test_files)
df_train = load_windows(train_files)

# Check results
print(f"Test shape: {df_test.shape}")
print(f"Train shape: {df_train.shape}")

# Persist both splits to the current working directory (row index not written)
df_test.to_csv("test_data.csv", index=False)
df_train.to_csv("train_data.csv", index=False)


Test shape: (41810, 42)
Train shape: (203001, 42)


In [3]:
import pandas as pd
import os

# Read the two split files back from the current working directory
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train_data.csv', 'test_data.csv')
)

# Sanity check: (rows, columns) of each split
print("Train shape:", train_data.shape)
print("Test shape:", test_data.shape)

Train shape: (203001, 42)
Test shape: (41810, 42)


In [5]:
# Show the column index of each split, train first, one per line
print(train_data.columns, test_data.columns, sep="\n")


Index(['Ax_mean', 'Ax_std', 'Ax_mad', 'Ax_max', 'Ax_min', 'Ax_sma',
       'Ax_energy', 'Ax_iqr', 'Ax_entropy', 'Ax_meanFreq', 'Ax_skewness',
       'Ax_kurtosis', 'Ax_maxInds', 'Ay_mean', 'Ay_std', 'Ay_mad', 'Ay_max',
       'Ay_min', 'Ay_sma', 'Ay_energy', 'Ay_iqr', 'Ay_entropy', 'Ay_meanFreq',
       'Ay_skewness', 'Ay_kurtosis', 'Ay_maxInds', 'Az_mean', 'Az_std',
       'Az_mad', 'Az_max', 'Az_min', 'Az_sma', 'Az_energy', 'Az_iqr',
       'Az_entropy', 'Az_meanFreq', 'Az_skewness', 'Az_kurtosis', 'Az_maxInds',
       'window_id', 'label', 'user_id'],
      dtype='object')
Index(['Ax_mean', 'Ax_std', 'Ax_mad', 'Ax_max', 'Ax_min', 'Ax_sma',
       'Ax_energy', 'Ax_iqr', 'Ax_entropy', 'Ax_meanFreq', 'Ax_skewness',
       'Ax_kurtosis', 'Ax_maxInds', 'Ay_mean', 'Ay_std', 'Ay_mad', 'Ay_max',
       'Ay_min', 'Ay_sma', 'Ay_energy', 'Ay_iqr', 'Ay_entropy', 'Ay_meanFreq',
       'Ay_skewness', 'Ay_kurtosis', 'Ay_maxInds', 'Az_mean', 'Az_std',
       'Az_mad', 'Az_max', 'Az_min', 'Az_sma', 

In [7]:
# List the distinct user ids present in each split, train first
print(train_data['user_id'].unique(), test_data['user_id'].unique(), sep="\n")


[ 6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
 30 31 32 33 34 35]
[1 2 3 4 5]


In [9]:
import pandas as pd

# Share of every (user, label) group that is moved out of the test users
# and into train_2.
TRAIN_2_FRACTION = 0.05


def split_leading_fraction(frame, fraction=TRAIN_2_FRACTION):
    """Split ``frame`` per user and label into a leading slice and the rest.

    For each ``user_id`` (in order of first appearance) and each ``label`` of
    that user (in order of first appearance), the first
    ``max(1, int(fraction * group_size))`` rows - in the frame's existing row
    order - go to the first result; the remaining rows go to the second.

    Args:
        frame: DataFrame with ``user_id`` and ``label`` columns.
        fraction: share of each group taken for the leading slice.

    Returns:
        ``(head, tail)``: two DataFrames, each re-indexed 0..n-1. If ``frame``
        has no rows, both are empty frames with ``frame``'s columns.
    """
    head_parts, tail_parts = [], []

    for user in frame['user_id'].unique():
        user_rows = frame[frame['user_id'] == user]

        for label in user_rows['label'].unique():
            group = user_rows[user_rows['label'] == label]

            # At least one row per group, even when the fraction rounds to 0
            n_head = max(1, int(fraction * len(group)))

            # Collect the pieces and concatenate once at the end; growing a
            # DataFrame with pd.concat inside the loop re-copies all
            # previously collected rows on every iteration.
            head_parts.append(group.iloc[:n_head])
            tail_parts.append(group.iloc[n_head:])

    if not head_parts:
        # pd.concat raises ValueError on an empty list
        empty = frame.iloc[:0].reset_index(drop=True)
        return empty, empty.copy()

    return (
        pd.concat(head_parts, ignore_index=True),
        pd.concat(tail_parts, ignore_index=True),
    )


# Split: first 5% of every user/label group to train_2, rest to test_data_new.
# Both results come back with a clean 0..n-1 index.
train_2, test_data_new = split_leading_fraction(test_data)

# Keep `test_data` pointing at the reduced test set (as an independent copy),
# so code that evaluates on `test_data` never sees rows that are in train_2.
# NOTE: because `test_data` is rebound here, re-running this cell on its own
# would split off another 5%; re-run the cell that loads test_data.csv first.
test_data = test_data_new.copy()

# Check results
print("Train_2 shape:", train_2.shape)
print("New Test_data shape:", test_data_new.shape)


Train_2 shape: (2070, 42)
New Test_data shape: (39740, 42)


In [11]:
# Rows per (user_id, label) pair in the slice moved to train_2
print(
    "\nLabel counts per user in train_2:",
    train_2.groupby(['user_id', 'label']).size(),
    sep="\n",
)

# Rows per (user_id, label) pair left in the reduced test set
print(
    "\nLabel counts per user in new test_data:",
    test_data_new.groupby(['user_id', 'label']).size(),
    sep="\n",
)



Label counts per user in train_2:
user_id  label                   
1        cycling                     122
         jumping                       2
         lying                        47
         sitting                      78
         standing                      8
         standing_hosehold_chores     51
         vacuum_cleaning              25
         walking                     130
         walkingStairsUp               2
2        cycling                     120
         jumping                       2
         lying                        51
         sitting                      77
         standing                      8
         standing_hosehold_chores     52
         vacuum_cleaning              30
         walking                     128
         walkingStairsUp               3
3        cycling                     106
         jumping                       3
         lying                        49
         sitting                      78
         standing            

In [13]:
# Rows per label across all users in train_2
print("\nLabel counts in train_2:", train_2['label'].value_counts(), sep="\n")

# Rows per label across all users in the reduced test set
print(
    "\nLabel counts in new test_data:",
    test_data_new['label'].value_counts(),
    sep="\n",
)



Label counts in train_2:
label
walking                     644
sitting                     361
cycling                     348
standing_hosehold_chores    257
lying                       249
vacuum_cleaning             135
standing                     53
jumping                      12
walkingStairsUp              11
Name: count, dtype: int64

Label counts in new test_data:
label
walking                     12285
sitting                      6903
cycling                      6641
standing_hosehold_chores     4944
lying                        4778
vacuum_cleaning              2609
standing                     1056
jumping                       286
walkingStairsUp               238
Name: count, dtype: int64
