In [1]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from datetime import datetime
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils import compute_class_weight

from keras.layers import Dense, LSTM, Bidirectional, Concatenate
from keras.layers import Embedding
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, CSVLogger

In [2]:
# fix random seed for reproducibility
seed = 7
test_size = 0.2
units = 64
epochs = 10

In [3]:
main_data = pd.read_csv('sensor_readings.csv')

main_data

Unnamed: 0,activity_label,D001-CLOSED,D001-OPEN,D002-CLOSED,D002-OPEN,D004-CLOSED,D004-OPEN,M001-OFF,M001-ON,M002-OFF,...,T005-15.97 - 18.70,T005-18.70 - 21.40,T005-21.40 - 24.10,T005-24.10 - 26.80,T005-26.80 - 29.50,T005-29.50 - 32.20,T005-32.20 - 34.90,T005-34.90 - 37.60,T005-37.60 - 40.30,T005-40.30 - 43.00
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693587,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693588,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693589,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693590,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
with open("dataset_summary.json") as f:
    dataset_summary = json.load(f)

dataset_summary

{'activity_encoder': {'Sleeping': 0,
  'no_activity': 1,
  'Bed_to_Toilet': 2,
  'Meal_Preparation': 3,
  'Relax': 4,
  'Housekeeping': 5,
  'Eating': 6,
  'Wash_Dishes': 7,
  'Leave_Home': 8,
  'Enter_Home': 9,
  'Work': 10,
  'Respirate': 11},
 'activity_decoder': {'0': 'Sleeping',
  '1': 'no_activity',
  '2': 'Bed_to_Toilet',
  '3': 'Meal_Preparation',
  '4': 'Relax',
  '5': 'Housekeeping',
  '6': 'Eating',
  '7': 'Wash_Dishes',
  '8': 'Leave_Home',
  '9': 'Enter_Home',
  '10': 'Work',
  '11': 'Respirate'},
 'columns': ['sampled_time',
  'activity_label',
  'D001-CLOSED',
  'D001-OPEN',
  'D002-CLOSED',
  'D002-OPEN',
  'D004-CLOSED',
  'D004-OPEN',
  'M001-OFF',
  'M001-ON',
  'M002-OFF',
  'M002-ON',
  'M003-OFF',
  'M003-ON',
  'M004-OFF',
  'M004-ON',
  'M005-OFF',
  'M005-ON',
  'M006-OFF',
  'M006-ON',
  'M007-OFF',
  'M007-ON',
  'M008-OFF',
  'M008-ON',
  'M009-OFF',
  'M009-ON',
  'M010-OFF',
  'M010-ON',
  'M011-OFF',
  'M011-ON',
  'M012-OFF',
  'M012-ON',
  'M013-OFF',
 

In [5]:
# normalize the data
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
scaler = MinMaxScaler()

data = main_data.copy()

# Get column names for back-transforming scaled data into DataFrame
column_names = [col for col in data.columns if col not in ['activity_label',  'sampled_time']]

# Fit the scaler to the data
data[column_names] = scaler.fit_transform(data[column_names])

# data.to_csv('normalized_data.csv', index=False)

data

Unnamed: 0,activity_label,D001-CLOSED,D001-OPEN,D002-CLOSED,D002-OPEN,D004-CLOSED,D004-OPEN,M001-OFF,M001-ON,M002-OFF,...,T005-15.97 - 18.70,T005-18.70 - 21.40,T005-21.40 - 24.10,T005-24.10 - 26.80,T005-26.80 - 29.50,T005-29.50 - 32.20,T005-32.20 - 34.90,T005-34.90 - 37.60,T005-37.60 - 40.30,T005-40.30 - 43.00
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693587,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693588,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693589,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693590,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
X = data.drop(columns=['activity_label'])
y = data['activity_label']

In [7]:
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=seed)
oversampled_X, oversampled_Y = ros.fit_resample(X, y)

# Split the oversampled data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(oversampled_X, oversampled_Y, test_size=test_size, random_state=seed)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3197116, 118), (3197116,), (799280, 118), (799280,))

In [8]:
oversampled_X

Unnamed: 0,D001-CLOSED,D001-OPEN,D002-CLOSED,D002-OPEN,D004-CLOSED,D004-OPEN,M001-OFF,M001-ON,M002-OFF,M002-ON,...,T005-15.97 - 18.70,T005-18.70 - 21.40,T005-21.40 - 24.10,T005-24.10 - 26.80,T005-26.80 - 29.50,T005-29.50 - 32.20,T005-32.20 - 34.90,T005-34.90 - 37.60,T005-37.60 - 40.30,T005-40.30 - 43.00
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3996391,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996392,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3996394,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# from tsaug import AddNoise

# # Convert DataFrame to NumPy arrays
# X_np = X.to_numpy()
# y_np = y.to_numpy()

# # Oversample
# ros = RandomOverSampler(random_state=seed)
# oversampled_X, oversampled_Y = ros.fit_resample(X_np, y_np)

# # AddNoise expects 2D or 3D array (n_samples, n_features[, time_steps])
# # Here we assume (n_samples, n_features); no reshaping needed if it's 2D
# augmenter = AddNoise(scale=0.05)

# # Perform augmentation in batch
# augmented_X = augmenter.augment(oversampled_X)

# # Train-test split
# X_train, X_test, y_train, y_test = train_test_split(
#     augmented_X, oversampled_Y, test_size=test_size, random_state=seed
# )

# # Check shapes
# X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [10]:
X_train[X_train["D001-CLOSED"]>0]

Unnamed: 0,D001-CLOSED,D001-OPEN,D002-CLOSED,D002-OPEN,D004-CLOSED,D004-OPEN,M001-OFF,M001-ON,M002-OFF,M002-ON,...,T005-15.97 - 18.70,T005-18.70 - 21.40,T005-21.40 - 24.10,T005-24.10 - 26.80,T005-26.80 - 29.50,T005-29.50 - 32.20,T005-32.20 - 34.90,T005-34.90 - 37.60,T005-37.60 - 40.30,T005-40.30 - 43.00
3265698,0.5,0.0,1.0,0.0,0.5,0.0,0.333333,0.0,0.333333,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1756815,0.5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1940348,0.5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1901919,0.5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3149083,0.5,0.0,1.0,0.0,0.5,0.0,0.333333,0.0,0.333333,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3198991,0.5,0.0,1.0,0.0,0.5,0.0,0.333333,0.0,0.333333,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1839840,0.5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1782249,0.5,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3100073,0.5,0.0,1.0,0.0,0.5,0.0,0.333333,0.0,0.333333,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# # Scale training data and reformat as DataFrame with original column names
# X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=column_names)

# # Scale testing data and reformat as DataFrame
# X_test = pd.DataFrame(scaler.transform(X_test), columns=column_names)

In [12]:
# from sklearn.neighbors import KNeighborsClassifier
# model = KNeighborsClassifier(n_neighbors=len(activity_encoder), n_jobs=-1)

# # from sklearn.tree import DecisionTreeClassifier
# # model = DecisionTreeClassifier(random_state=seed)

# from sklearn.ensemble import RandomForestClassifier
# model = RandomForestClassifier(random_state=seed)

# XGBoost
from xgboost import XGBClassifier
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=seed)

# fit the model
model.fit(X_train, y_train)

# make predictions
predictions = model.predict(X_test)

confusion_matrix_values = confusion_matrix(y_test, predictions)
classification_report_values = classification_report(y_test, predictions, output_dict=True)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [13]:
# save the model
import joblib
joblib.dump(model, 'model.pkl')

['model.pkl']

In [14]:
# normalize confusion matrix
df_ = confusion_matrix_values.astype('float') / confusion_matrix_values.sum(axis=1)[:, np.newaxis]

fig = px.imshow(df_, 
                x=list(dataset_summary["activity_encoder"].keys()), 
                y=list(dataset_summary["activity_encoder"].keys()),
                color_continuous_scale='Viridis',
                title='Confusion Matrix',
                width=800,
                height=1200,
                labels=dict(x="Predicted", y="True", color="Count"))
fig.update_xaxes(side="top")
fig.show()

In [15]:
df_ = pd.DataFrame(classification_report_values).transpose()
df_.reset_index(inplace=True)
df_.columns = ['activity', 'precision', 'recall', 'f1-score', 'support']

# decode the activity labels
df_['activity'] = df_['activity'].apply(lambda x: dataset_summary["activity_decoder"][x] if x in dataset_summary["activity_decoder"] else "---- "+x)

fig = go.Figure(data=[go.Table(
    header=dict(values=list(df_.columns),
                fill_color='paleturquoise',
                align='left'),
    cells=dict(values=[df_.activity, df_['precision'], df_['recall'], df_['f1-score'], df_['support']],
               fill_color='lavender',
               align='left'))
])

fig.update_layout(
    autosize=False,
    width=900,
    height=600,
    title_text="Classification Report",
    title_x=0.5
)
fig.show()


In [16]:
raise Exception("End of the script")

Exception: End of the script

----------

### LSTM model for Time Series Forecasting

In [None]:
def get_model(input_dim, output_dim, no_activities):
    model = Sequential(name='biLSTM')
    model.add(Embedding(input_dim, output_dim, mask_zero=True))
    model.add(Bidirectional(LSTM(output_dim)))
    model.add(Dense(no_activities, activation='softmax'))
    return model

def compile_model(model):
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print(model.summary())
    return model

In [None]:
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

for train, test in kfold.split(X_train, y_train):
  print("X train shape: ", X_train.iloc[train].shape)
  print("y train shape: ", y_train.iloc[train].shape)
  
  model = get_model(X_train.shape[1], units, len(dataset_summary["activity_encoder"]))
  model = compile_model(model)
  
  current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  
  csv_logger = CSVLogger(f'logs/lstm_{current_time}.log', append=True, separator=';')
  model_checkpoint = ModelCheckpoint(f'logs/lstm_{current_time}.h5', monitor='val_loss', save_best_only=True, mode='min')
  
  print("Training model...")
  class_weight = compute_class_weight('balanced', classes=np.unique(y_train.iloc[train]), y=y_train.iloc[train])
  # Ensure input data contains valid indices for the embedding layer
  X_train_mapped = X_train.applymap(lambda x: max(0, x))  # Map negative values to 0
  X_train_mapped = X_train_mapped.clip(upper=X_train.shape[1] - 1)  # Clip values to valid range

  model.fit(
      X_train_mapped.iloc[train], 
      y_train.iloc[train],
      validation_split=0.2,
      epochs=epochs,
      batch_size=64,
      verbose=1)
  
  print("Evaluating model...")
  scores = model.evaluate(X_train.iloc[test], y_train.iloc[test],batch_size=64, verbose=1)
  print('%s: %.2f' % (model.metrics_names[1], scores[1]*100))