# Import libraries and project configuration

In [21]:
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical, plot_model

from config import (
    DATA_INPUT_PATH,
    CLASSES,
    MODEL_PATH,
    METADATA_PATH,
)

# Number of target classes: one per entry in the CLASSES mapping from config.
n_classes = len(CLASSES)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Load the .csv files into one dataframe and compute windowed RMS features

In [273]:
# Collect every CSV below the data folder. Path.rglob yields files in an
# arbitrary, filesystem-dependent order, so sort them: the row order decides
# what each trailing window below contains.
files = sorted(Path(DATA_INPUT_PATH).rglob("*.csv"))
if not files:
    raise FileNotFoundError(f"No .csv files found under {DATA_INPUT_PATH!r}")

# Read each file and stack the rows into a single frame. ignore_index=True
# gives every row a unique label instead of repeating each file's own index.
df = pd.concat((pd.read_csv(file) for file in files), axis=0, ignore_index=True)
df = df.drop_duplicates().reset_index(drop=True)
# NOTE(review): drop_duplicates also removes genuinely repeated readings, and
# the trailing window below runs across file boundaries — confirm both are
# intended for this data.

# Parameters
window_size = 80  # Number of past samples (including the current one) per window

# Every column except the label is treated as a feature
feature_columns = [col for col in df.columns if col != 'Label']

# Replace each reading with the RMS of its trailing window:
#   rms[i] = sqrt(sum(x[i-window_size+1 .. i] ** 2) / window_size)
# Rows before the start of the data count as zeros, so the divisor is always
# window_size (early rows are therefore damped towards zero).
squared = np.square(df[feature_columns].to_numpy(dtype=float))

# Prepending window_size - 1 rows of zeros gives every original row a full
# window while reproducing the zero padding described above.
zero_history = np.zeros((window_size - 1, squared.shape[1]))
padded = np.vstack((zero_history, squared))

# sliding_window_view returns a strided *view* of shape
# (n_samples, n_features, window_size); nothing of that size is allocated,
# unlike stacking one explicit window per row.
windows = np.lib.stride_tricks.sliding_window_view(padded, window_size, axis=0)
df[feature_columns] = np.sqrt(windows.mean(axis=-1))

   Sensor1  Sensor2  Sensor3  Sensor4  Sensor5  Sensor6  Sensor7  Sensor8  \
0       -1        2       -8       -1        1       -1        1        0   
1       -1       -1        3       -1        0       -1       -1        0   
2        0        8        6        1       -1       -1       -2        0   
3       -1        0       -7       -6       -1        1       -1        0   
4        2        4        8        2        0       -3        0       -1   

   Sensor9  Sensor10  Sensor11  Sensor12  Sensor13  Sensor14  Sensor15  \
0        0        -1         1         1        -1        -4         0   
1       -1         1        -5        -4         0         0        -1   
2       -1        -8       -11         2        -1         0        -1   
3       -1         0         0         2         0        -2         0   
4       -2        -7        -9         0        -1        -2        -2   

   Sensor16  Label  
0         0      0  
1        -2      0  
2        -2      0  
3       

## Scale and clean the data, then train the model

In [1]:
SEED = 42

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable when the notebook is re-run from a fresh kernel.
tf.keras.utils.set_random_seed(SEED)

# The data is not clean: keep only rows whose label is a known class. This is
# done *before* splitting so that stray labels neither break the stratified
# split nor influence the scaler / PCA statistics fitted below.
df_valid = df[df['Label'].isin(CLASSES.keys())]

X = df_valid.drop(columns=['Label'])
y = df_valid['Label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Standardise the features; statistics come from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Keep the smallest number of principal components explaining 95% of variance
pca = PCA(n_components=0.95, svd_solver='full')
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Two (Conv1D kernel=3, MaxPooling1D pool=2) stages without padding shrink a
# length-n input to ((n - 2) // 2 - 2) // 2 steps, which is < 1 for n < 10.
n_components = X_train_pca.shape[1]
min_components = 10
if n_components < min_components:
    raise ValueError(
        f"PCA kept {n_components} components, but the Conv1D stack needs at "
        f"least {min_components}"
    )

# Conv1D expects (samples, steps, channels): add an explicit channel axis
# instead of relying on Keras to adjust the input rank implicitly.
X_train_filtered = X_train_pca[..., np.newaxis]
X_test_filtered = X_test_pca[..., np.newaxis]
y_train_filtered = y_train
y_test_filtered = y_test

# one hot encode the target data
y_train_categorical = to_categorical(y_train_filtered, num_classes=len(CLASSES))
y_test_categorical = to_categorical(y_test_filtered, num_classes=len(CLASSES))

model = Sequential([
    tf.keras.Input(shape=(n_components, 1)),
    Conv1D(64, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dense(len(CLASSES), activation='softmax'),
])

# Stop once validation loss has not improved for 5 epochs and roll back to the
# best weights seen so far.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Compile the model with categorical crossentropy for multi-class classification
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the training split; 20% of it is held out for validation
model.fit(
    X_train_filtered,
    y_train_categorical,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Evaluate the model on the held-out test set
test_loss, test_acc = model.evaluate(X_test_filtered, y_test_categorical)
print(f"Test accuracy: {test_acc}")

NameError: name 'df' is not defined

In [277]:
# Save the trained network
model.save(MODEL_PATH)

# The network was trained on PCA components of the standardised features, so
# inference has to apply *both* fitted steps. Bundle the already-fitted scaler
# and PCA into one transformer: the pickle keeps its (transformer, columns)
# layout, and calling .transform() on the first element now yields exactly the
# representation the model expects.
preprocessor = Pipeline([("scaler", scaler), ("pca", pca)])

# Save the preprocessing transformer and the feature column names
with open(METADATA_PATH, 'wb') as f:
    pickle.dump((preprocessor, X_train.columns), f)

