<a href="https://colab.research.google.com/github/eylulgokce/Met-App/blob/main/met_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MET ML

Download the dataset from "https://archive.ics.uci.edu/dataset/344/heterogeneity+activity+recognition", prepare it for modeling, train and evaluate Random Forest, Support Vector Machine, Gradient Boosting, and Neural Network models, compare their performance, select the best model, and export the selected model and the feature extraction/post-processing pipeline.

## Data preparation


download, unzip and read


In [2]:
import pandas as pd
import os

# dataset path
dataset_path = 'HHAR/Activity_Recognition_exp/Activity recognition exp/Phones_accelerometer.csv'

# Check if the dataset file already exists
if not os.path.exists(dataset_path):
    print("Dataset not found locally. Downloading and unzipping...")
    # Download the dataset
    !wget https://archive.ics.uci.edu/static/public/344/heterogeneity+activity+recognition.zip -O heterogeneity+activity+recognition.zip

    # Unzip the dataset
    !unzip heterogeneity+activity+recognition.zip

    # Create directories for unzipping nested files
    os.makedirs('HHAR/Activity_Recognition_exp', exist_ok=True)
    os.makedirs('HHAR/Still_exp', exist_ok=True)

    # Unzip the nested zip files
    !unzip 'Activity recognition exp.zip' -d HHAR/Activity_Recognition_exp
    !unzip 'Still exp.zip' -d HHAR/Still_exp
else:
    print("Dataset found locally. Skipping download and unzip.")

try:
    df_full = pd.read_csv(dataset_path)
    print("Full DataFrame loaded successfully.")

    print("\nFirst 5 rows of the full DataFrame:")
    display(df_full.head())
    print("\nFull DataFrame Info:")
    display(df_full.info())
except FileNotFoundError:
    print(f"Error: Dataset not found at {dataset_path}. Please make sure the file is in the correct directory.")
    df_full = None # Ensure df_full is None if loading fails

Dataset found locally. Skipping download and unzip.
Full DataFrame loaded successfully.

First 5 rows of the full DataFrame:


Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt
0,0,1424696633908,1424696631913248572,-5.958191,0.688065,8.135345,a,nexus4,nexus4_1,stand
1,1,1424696633909,1424696631918283972,-5.95224,0.670212,8.136536,a,nexus4,nexus4_1,stand
2,2,1424696633918,1424696631923288855,-5.995087,0.653549,8.204376,a,nexus4,nexus4_1,stand
3,3,1424696633919,1424696631928385290,-5.942718,0.676163,8.128204,a,nexus4,nexus4_1,stand
4,4,1424696633929,1424696631933420691,-5.991516,0.641647,8.135345,a,nexus4,nexus4_1,stand



Full DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13062475 entries, 0 to 13062474
Data columns (total 10 columns):
 #   Column         Dtype  
---  ------         -----  
 0   Index          int64  
 1   Arrival_Time   int64  
 2   Creation_Time  int64  
 3   x              float64
 4   y              float64
 5   z              float64
 6   User           object 
 7   Model          object 
 8   Device         object 
 9   gt             object 
dtypes: float64(3), int64(3), object(4)
memory usage: 996.6+ MB


None

In [16]:
display(df_full.head())

Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt,x_mean,y_mean,z_mean,x_var,y_var,z_var,x_std,y_std,z_std,met_class
49,49,1424696634164,1424696632160105261,-5.947479,0.733292,8.157959,a,nexus4,nexus4_1,stand,-5.936744,0.694944,8.128228,0.001139,0.002862,0.001706,0.033755,0.053497,0.041298,Sedentary
50,50,1424696634165,1424696632165140662,-5.902252,0.744003,8.097259,a,nexus4,nexus4_1,stand,-5.935625,0.696063,8.127466,0.001153,0.002909,0.001723,0.033956,0.053933,0.041515,Sedentary
51,51,1424696634165,1424696632170176062,-5.930817,0.705917,8.107971,a,nexus4,nexus4_1,stand,-5.935197,0.696777,8.126895,0.001148,0.002897,0.001729,0.033877,0.05382,0.041584,Sedentary
52,52,1424696634175,1424696632175241980,-5.904633,0.665451,8.097259,a,nexus4,nexus4_1,stand,-5.933387,0.697015,8.124753,0.00109,0.002878,0.00162,0.033018,0.053651,0.040249,Sedentary
53,53,1424696634178,1424696632180277380,-5.89154,0.664261,8.072266,a,nexus4,nexus4_1,stand,-5.932364,0.696777,8.123634,0.001123,0.002891,0.001675,0.033512,0.053772,0.040923,Sedentary


In [5]:
# Check for missing values
print("\nMissing values per column:")
display(df_full.isnull().sum())

# Handle missing values in the 'gt' column by dropping rows
df_full.dropna(subset=['gt'], inplace=True)

print("\nMissing values after dropping rows with missing 'gt':")
display(df_full.isnull().sum())

# Check data types
print("\nData types of columns:")
display(df_full.dtypes)


Missing values per column:


Unnamed: 0,0
Index,0
Arrival_Time,0
Creation_Time,0
x,0
y,0
z,0
User,0
Model,0
Device,0
gt,1783200



Missing values after dropping rows with missing 'gt':


Unnamed: 0,0
Index,0
Arrival_Time,0
Creation_Time,0
x,0
y,0
z,0
User,0
Model,0
Device,0
gt,0



Data types of columns:


Unnamed: 0,0
Index,int64
Arrival_Time,int64
Creation_Time,int64
x,float64
y,float64
z,float64
User,object
Model,object
Device,object
gt,object


In [10]:

window_size = 50

# Calculate basic statistical features (mean, variance, std dev) for each window
df_full['x_mean'] = df_full['x'].rolling(window=window_size).mean()
df_full['y_mean'] = df_full['y'].rolling(window=window_size).mean()
df_full['z_mean'] = df_full['z'].rolling(window=window_size).mean()

df_full['x_var'] = df_full['x'].rolling(window=window_size).var()
df_full['y_var'] = df_full['y'].rolling(window=window_size).var()
df_full['z_var'] = df_full['z'].rolling(window=window_size).var()

df_full['x_std'] = df_full['x'].rolling(window=window_size).std()
df_full['y_std'] = df_full['y'].rolling(window=window_size).std()
df_full['z_std'] = df_full['z'].rolling(window=window_size).std()

# Drop the first `window_size - 1` rows as they will have NaN for the rolling features
df_full.dropna(inplace=True)

print("\nDataFrame with extracted features:")
display(df_full.head())


DataFrame with extracted features:


Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt,x_mean,y_mean,z_mean,x_var,y_var,z_var,x_std,y_std,z_std
49,49,1424696634164,1424696632160105261,-5.947479,0.733292,8.157959,a,nexus4,nexus4_1,stand,-5.936744,0.694944,8.128228,0.001139,0.002862,0.001706,0.033755,0.053497,0.041298
50,50,1424696634165,1424696632165140662,-5.902252,0.744003,8.097259,a,nexus4,nexus4_1,stand,-5.935625,0.696063,8.127466,0.001153,0.002909,0.001723,0.033956,0.053933,0.041515
51,51,1424696634165,1424696632170176062,-5.930817,0.705917,8.107971,a,nexus4,nexus4_1,stand,-5.935197,0.696777,8.126895,0.001148,0.002897,0.001729,0.033877,0.05382,0.041584
52,52,1424696634175,1424696632175241980,-5.904633,0.665451,8.097259,a,nexus4,nexus4_1,stand,-5.933387,0.697015,8.124753,0.00109,0.002878,0.00162,0.033018,0.053651,0.040249
53,53,1424696634178,1424696632180277380,-5.89154,0.664261,8.072266,a,nexus4,nexus4_1,stand,-5.932364,0.696777,8.123634,0.001123,0.002891,0.001675,0.033512,0.053772,0.040923


## Adaptation of MET values


Approximate MET values:
- Sitting: 1.0-1.3 (Sedentary)
- Standing: 1.3-1.8 (Sedentary to Light)
- Walking: 2.0-5.0 (Light to Moderate)
- Biking: 3.0-8.0+ (Moderate to Vigorous)
- Stair Up: 4.0-8.0+ (Moderate to Vigorous)
- Stair Down: 3.0-6.0 (Moderate)

defining a simplified mapping to the four required MET classes using the actual labels from the dataset:
- Sedentary (< 1.5 METs)
- Light (1.5–3 METs)
- Moderate (3–6 METs)
- Vigorous (> 6 METs)

In [14]:
activity_to_met_class = {
    'sit': 'Sedentary',
    'stand': 'Sedentary',
    'walk': 'Light',
    'bike': 'Moderate',
    'stairsup': 'Moderate',
    'stairsdown': 'Moderate'
}

# Apply the mapping to met_class col
df_full['met_class'] = df_full['gt'].map(activity_to_met_class)

# Check the distribution MET classes
print("\nDistribution of MET classes:")
display(df_full['met_class'].value_counts())

print("\nDataFrame with 'met_class' column:")
display(df_full.head())


Distribution of MET classes:


Unnamed: 0_level_0,count
met_class,Unnamed: 1_level_1
Moderate,5243463
Sedentary,3843362
Light,2192401



DataFrame with 'met_class' column:


Unnamed: 0,Index,Arrival_Time,Creation_Time,x,y,z,User,Model,Device,gt,x_mean,y_mean,z_mean,x_var,y_var,z_var,x_std,y_std,z_std,met_class
49,49,1424696634164,1424696632160105261,-5.947479,0.733292,8.157959,a,nexus4,nexus4_1,stand,-5.936744,0.694944,8.128228,0.001139,0.002862,0.001706,0.033755,0.053497,0.041298,Sedentary
50,50,1424696634165,1424696632165140662,-5.902252,0.744003,8.097259,a,nexus4,nexus4_1,stand,-5.935625,0.696063,8.127466,0.001153,0.002909,0.001723,0.033956,0.053933,0.041515,Sedentary
51,51,1424696634165,1424696632170176062,-5.930817,0.705917,8.107971,a,nexus4,nexus4_1,stand,-5.935197,0.696777,8.126895,0.001148,0.002897,0.001729,0.033877,0.05382,0.041584,Sedentary
52,52,1424696634175,1424696632175241980,-5.904633,0.665451,8.097259,a,nexus4,nexus4_1,stand,-5.933387,0.697015,8.124753,0.00109,0.002878,0.00162,0.033018,0.053651,0.040249,Sedentary
53,53,1424696634178,1424696632180277380,-5.89154,0.664261,8.072266,a,nexus4,nexus4_1,stand,-5.932364,0.696777,8.123634,0.001123,0.002891,0.001675,0.033512,0.053772,0.040923,Sedentary


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Select features (X) and target variable (y)
feature_columns = ['x_mean', 'y_mean', 'z_mean', 'x_var', 'y_var', 'z_var', 'x_std', 'y_std', 'z_std']
X = df_full[feature_columns]
y = df_full['met_class']

# Encode the target variable 'met_class' to numerical representation
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
print("\nDistribution of MET classes in y_train:")
display(pd.Series(y_train).value_counts(normalize=True))
print("\nDistribution of MET classes in y_test:")
display(pd.Series(y_test).value_counts(normalize=True))

Shape of X_train: (9023380, 9)
Shape of X_test: (2255846, 9)
Shape of y_train: (9023380,)
Shape of y_test: (2255846,)

Distribution of MET classes in y_train:


Unnamed: 0,proportion
1,0.464878
2,0.340747
0,0.194375



Distribution of MET classes in y_test:


Unnamed: 0,proportion
1,0.464878
2,0.340747
0,0.194375


## ML models

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# n_estimators=10
rf_model = RandomForestClassifier(n_estimators=10, random_state=42, n_jobs=-1)

# Train the model
print("Training Random Forest model...")
rf_model.fit(X_train, y_train)
print("Training complete.")

# Make predictions
print("Making predictions on testing data...")
y_pred_rf = rf_model.predict(X_test)
print("Predictions complete.")

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, average='weighted')
recall_rf = recall_score(y_test, y_pred_rf, average='weighted')
f1_rf = f1_score(y_test, y_pred_rf, average='weighted')

print("\nRandom Forest Model Performance:")
print(f"Accuracy: {accuracy_rf:.4f}")
print(f"Precision: {precision_rf:.4f}")
print(f"Recall: {recall_rf:.4f}")
print(f"F1-score: {f1_rf:.4f}")

Training Random Forest model...
Training complete.
Making predictions on testing data...
Predictions complete.

Random Forest Model Performance:
Accuracy: 0.4019
Precision: 0.3728
Recall: 0.4019
F1-score: 0.3796


In [None]:
import joblib
import os

model_filename = 'random_forest_model.joblib'

# save model
joblib.dump(rf_model, model_filename)

print(f"Random Forest model saved to '{model_filename}'")

Random Forest model saved to 'random_forest_model.joblib'


### Support Vector Machine (SVM)



In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

svm_model = LinearSVC(random_state=42, dual=False, max_iter=1)

# Train the model
print("Training LinearSVC model...")
start_time = time.time()
svm_model.fit(X_train, y_train)
end_time = time.time()
print(f"Training complete in {end_time - start_time:.2f} seconds.")

# Make predictions
print("Making predictions on testing data...")
start_time = time.time()
y_pred_svm = svm_model.predict(X_test)
end_time = time.time()
print(f"Predictions complete in {end_time - start_time:.2f} seconds.")

# Evaluate the model
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm, average='weighted', zero_division=0)
recall_svm = recall_score(y_test, y_pred_svm, average='weighted', zero_division=0)
f1_svm = f1_score(y_test, y_pred_svm, average='weighted', zero_division=0)

print("\nLinearSVC Model Performance:")
print(f"Accuracy: {accuracy_svm:.4f}")
print(f"Precision: {precision_svm:.4f}")
print(f"Recall: {recall_svm:.4f}")
print(f"F1-score: {f1_svm:.4f}")

Training LinearSVC model...


NameError: name 'X_train' is not defined

### Gradient Boosting

### Neural Network models