In [12]:
import pandas as pd
import numpy as np

In [13]:
# Load the dementia wandering dataset from its CSV file into a DataFrame.
data = pd.read_csv(filepath_or_buffer='dataset/dementia_wandering_data.csv')

In [14]:
# Report the table size as a (row count, column count) tuple.
print((len(data.index), len(data.columns)))

(10000, 5)


In [15]:
# Preview the first five rows of the dataset.
data.head(n=5)

Unnamed: 0,timestamp,distance_from_safe_zone,heart_rate,speed,wandering_label
0,2022-07-02 12:12:18.793879376,52,112,1.264261,0
1,2021-11-15 09:37:12.583258320,315,79,2.804538,1
2,2020-09-09 22:11:07.506750672,66,84,0.848975,1
3,2021-11-23 21:00:42.124212416,0,62,0.322175,0
4,2021-10-22 14:01:11.287128712,463,107,1.520318,1


In [16]:
# Features are every column except the last one; the target is the last column.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [17]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Display the training features.
X_train

Unnamed: 0,timestamp,distance_from_safe_zone,heart_rate,speed
9254,2021-07-23 20:19:39.477947792,423,72,3.375025
1561,2021-06-16 07:05:33.753375336,148,103,2.300360
1670,2022-04-01 14:27:32.565256520,28,86,4.887102
6087,2021-07-12 17:48:52.493249320,0,66,0.323448
6669,2022-05-09 03:41:38.289828976,0,72,0.334211
...,...,...,...,...
5734,2022-08-15 11:44:13.825382528,0,64,0.175052
5191,2022-05-20 13:13:13.879387936,492,99,3.164147
5390,2022-07-04 06:17:10.423042304,41,99,2.588583
860,2023-08-24 20:01:56.651665152,175,74,4.784344


In [18]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

def add_time_features(frame):
    """Return ``frame`` with its ``timestamp`` column expanded into time features.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table. When it contains a ``timestamp`` column, that column is
        parsed with ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``timestamp`` has been dropped and two columns,
        ``day_of_week`` (``.dt.dayofweek``) and ``hour_of_day`` (``.dt.hour``),
        have been appended. The input frame is never modified. If ``frame``
        has no ``timestamp`` column (i.e. it was already converted), it is
        returned as-is, so re-running this cell does not raise ``KeyError``.
    """
    if 'timestamp' not in frame.columns:
        return frame
    stamps = pd.to_datetime(frame['timestamp'])
    # drop + assign builds a fresh frame instead of writing into the object
    # handed back by train_test_split, which avoids SettingWithCopyWarning.
    return frame.drop(columns='timestamp').assign(
        day_of_week=stamps.dt.dayofweek,
        hour_of_day=stamps.dt.hour,
    )


# Replace the raw timestamp with day_of_week / hour_of_day in the training set.
X_train = add_time_features(X_train)

# Preprocessing: standardise the numeric columns and one-hot encode the day.
preprocessor = Pipeline([
    ('column_transformer', ColumnTransformer([
        ('num', StandardScaler(), ['heart_rate', 'speed', 'distance_from_safe_zone', 'hour_of_day']),
        # handle_unknown='ignore' keeps predict() from raising if a day value
        # was absent from the training split; seen values encode as before.
        ('cat', OneHotEncoder(handle_unknown='ignore'), ['day_of_week'])
    ]))
])

# Full model: preprocessing followed by a seeded random forest.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

pipeline.fit(X_train, y_train)

# Apply the same timestamp expansion to the held-out set, then predict.
X_test = add_time_features(X_test)

predictions = pipeline.predict(X_test)

In [19]:
# Score the held-out predictions: overall accuracy plus per-class metrics.
from sklearn.metrics import accuracy_score, classification_report

report = classification_report(y_test, predictions)
accuracy = accuracy_score(y_test, predictions)

# NOTE(review): the recorded run shows 1.00 for every metric; confirm the label
# is not trivially derivable from the features before relying on this score.
print(f'Accuracy: {accuracy:.2f}')
print(f'Report: \n{report}')

Accuracy: 1.00
Report: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       968
           1       1.00      1.00      1.00      1032

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



In [20]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
import joblib

joblib.dump(value=pipeline, filename='model.pkl')

['model.pkl']

In [22]:
# Read the persisted pipeline back from disk.
model = joblib.load(filename='model.pkl')

# X_test must already carry the day_of_week / hour_of_day columns produced in
# the training cell, because the pipeline selects those columns by name.
predictions = model.predict(X_test)
print(predictions)

[1 1 0 ... 0 0 1]
