https://github.com/dbouchabou/Fully-Convolutional-Network-Smart-Homes/tree/master

In [None]:
# Mount Google Drive inside the Colab VM so the dataset folders used below are reachable.
from google.colab import drive
# force_remount=True asks Colab to mount again even if /content/drive is already mounted.
drive.mount('/content/drive' , force_remount = True)

Mounted at /content/drive


In [None]:
import os
import numpy as np
import pandas as pd
import pickle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

### MHEALTH

In [None]:
# Switch the working directory to the MHEALTH folder so the relative pickle path below resolves.
cd "/content/drive/MyDrive/MHEALTHDATASET/"

/content/drive/MyDrive/MHEALTHDATASET


In [None]:
# Load the preprocessed MHEALTH frame from the current working directory.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you trust.
df_mhealth = pd.read_pickle('mhealth_for_lstm.pkl')

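#### Subject-wise hold-out split (subject4 = test, subject7 = validation; a single fixed fold, not a full leave-one-subject-out loop)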

In [None]:
# Hold out two subjects entirely: subject4 becomes the test set, subject7 the
# validation set, and every other subject is used for training.
subject_column = df_mhealth["subject"]
is_test_subject = subject_column == 'subject4'
is_val_subject = subject_column == 'subject7'

train_data = df_mhealth[~is_test_subject & ~is_val_subject]
test_data = df_mhealth[is_test_subject]
val_data = df_mhealth[is_val_subject]

In [None]:
# Order every split by subject, then by activity, so that rows of the same
# activity are contiguous before the segmentation step.
sort_keys = ['subject', 'activity']
train_data, test_data, val_data = [
    split.sort_values(by=sort_keys) for split in (train_data, test_data, val_data)
]

In [None]:
def segment_activities(df):
    """Split ``df`` into runs of consecutive rows that share one ``activity`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activity`` column; rows are taken in their current order.

    Returns
    -------
    list of pandas.DataFrame
        One positional slice of ``df`` per run, in order. Together the slices
        cover every row of ``df`` exactly once; an empty frame gives ``[]``.
    """
    if len(df) == 0:
        return []

    # True on each row whose activity differs from the previous row's. Row 0 is
    # always flagged because shift() puts NaN in front of it.
    is_run_start = df.activity.ne(df.activity.shift())
    starts = np.flatnonzero(is_run_start.to_numpy())

    # Every run ends where the next one starts; the last run ends at len(df).
    # The previous implementation only paired consecutive start positions and
    # therefore silently dropped the final run (and returned [] when the frame
    # held a single activity).
    stops = np.append(starts[1:], len(df))

    # iloc makes the positional (not label-based) slicing explicit.
    return [df.iloc[start:stop] for start, stop in zip(starts, stops)]

In [None]:
# Cut each split into per-activity segments (one DataFrame per contiguous run).
train_activitySequences, val_activitySequences, test_activitySequences = [
    segment_activities(split) for split in (train_data, val_data, test_data)
]

In [None]:
def generate_sentence(df2, columns_of_interest=None):
    """Render the first row of ``df2`` as a space-separated "sentence".

    Each word is ``"<column><value>"``. Only row 0 of ``df2`` is read.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Activity segment containing the requested columns.
    columns_of_interest : sequence of str, optional
        Columns to serialise, in order. Defaults to the 23 MHEALTH sensor
        channels. The label (``activity``) and the subject identifier
        (``subject``) are deliberately NOT part of the default: the sentence is
        the model input, so embedding the label in it leaks the target, and the
        subject token differs between splits by construction.

    Returns
    -------
    str
        The words joined by single spaces.
    """
    if columns_of_interest is None:
        columns_of_interest = [
            'acc_ch_x', 'acc_ch_y', 'acc_ch_z', 'ecg_sig_1', 'ecg_sig_2',
            'acc_la_x', 'acc_la_y', 'acc_la_z', 'gyr_la_x', 'gyr_la_y', 'gyr_la_z',
            'mag_la_x', 'mag_la_y', 'mag_la_z', 'acc_rw_x', 'acc_rw_y', 'acc_rw_z',
            'gyr_rw_x', 'gyr_rw_y', 'gyr_rw_z', 'mag_rw_x', 'mag_rw_y', 'mag_rw_z',
        ]

    return " ".join(
        "{}{}".format(column, df2[column].values[0]) for column in columns_of_interest
    )

In [None]:
def sequencesToSentences(activitySequences):
    """Turn activity segments into parallel lists of sentences and labels.

    For every segment the sentence comes from ``generate_sentence`` and the
    label is the ``activity`` value of the segment's first row.

    Returns
    -------
    (list, list)
        ``sentences`` and ``label_sentences``, aligned by position.
    """
    sentences, label_sentences = [], []
    for segment in activitySequences:
        sentences.append(generate_sentence(segment))
        label_sentences.append(segment.activity.values[0])
    return sentences, label_sentences

In [None]:
# One sentence and one label per activity segment, for each split.
(
    (train_sentences, train_label_sentences),
    (val_sentences, val_label_sentences),
    (test_sentences, test_label_sentences),
) = [
    sequencesToSentences(segments)
    for segments in (train_activitySequences, val_activitySequences, test_activitySequences)
]

In [None]:
# Keep every "<column><value>" word intact. Keras replaces each character of
# `filters` with the split character, and the previous filter string contained
# '-' and '_': that stripped the sign from negative readings (so -9.8 and 9.8
# became the same token), broke exponents such as 1e-05 apart, and shredded the
# column names into shared fragments. Only tab/newline are filtered now; words
# are still split on spaces.
tokenizer = Tokenizer(filters='\t\n')

# The vocabulary is fitted on all three splits so that one index space is
# shared by train, validation and test.
combined_sentences = train_sentences + val_sentences + test_sentences
tokenizer.fit_on_texts(combined_sentences)

# Tokenize train, validation, and test sets
train_indexed_sentences = tokenizer.texts_to_sequences(train_sentences)
val_indexed_sentences = tokenizer.texts_to_sequences(val_sentences)
test_indexed_sentences = tokenizer.texts_to_sequences(test_sentences)

#### sliding window data segmentation

In [None]:
def slidingWindow(sequence,winSize,step=1):
    """Yield successive windows of ``winSize`` items from ``sequence``.

    Windows start at 0, step, 2*step, ... and only full windows are produced,
    so trailing items that do not fill a window are skipped. When ``sequence``
    is shorter than ``winSize`` the whole sequence is yielded once (this also
    holds for an empty sequence, which yields one empty window).

    This is a generator: argument validation runs when iteration starts.

    Parameters
    ----------
    sequence : sliceable sequence supporting ``len``
    winSize : int
        Window length, at least 1.
    step : int, default 1
        Stride between window starts, between 1 and ``winSize``.

    Raises
    ------
    TypeError
        If ``sequence`` is not iterable or ``winSize``/``step`` are not ``int``.
    ValueError
        If ``winSize`` or ``step`` is not positive, or ``step > winSize``.
        (Both are ``Exception`` subclasses, so existing ``except Exception``
        handlers keep working.)
    """
    try:
        iter(sequence)
    except TypeError:
        raise TypeError("**ERROR** sequence must be iterable.")
    if type(winSize) is not int or type(step) is not int:
        raise TypeError("**ERROR** type(winSize) and type(step) must be int.")
    # A zero step used to surface as ZeroDivisionError and a negative one as
    # nonsensical windows; reject both explicitly.
    if winSize < 1 or step < 1:
        raise ValueError("**ERROR** winSize and step must be positive.")
    if step > winSize:
        raise ValueError("**ERROR** step must not be larger than winSize.")

    length = len(sequence)
    if winSize > length:
        yield sequence[0:length]
        return

    # Integer arithmetic: number of full windows that fit.
    numOfChunks = (length - winSize) // step + 1
    for start in range(0, numOfChunks * step, step):
        yield sequence[start:start + winSize]

In [None]:
# Window length and stride shared by all three splits.
winSize = 50
step = 1

X_train_windowed, Y_train_windowed = [], []
X_val_windowed, Y_val_windowed = [], []
X_test_windowed, Y_test_windowed = [], []

# (indexed sentences, their labels, window sink, label sink) for each split.
split_jobs = (
    (train_indexed_sentences, train_label_sentences, X_train_windowed, Y_train_windowed),
    (val_indexed_sentences, val_label_sentences, X_val_windowed, Y_val_windowed),
    (test_indexed_sentences, test_label_sentences, X_test_windowed, Y_test_windowed),
)

for split_sentences, split_labels, window_sink, label_sink in split_jobs:
    for i, s in enumerate(split_sentences):
        # Every window cut from sentence i inherits that sentence's label.
        for chunk in slidingWindow(s, winSize, step):
            window_sink.append(chunk)
            label_sink.append(split_labels[i])

In [None]:
# Pad every split to the same fixed width. Without `maxlen`, pad_sequences
# pads each call to the longest window of *that* split, so train/test/val
# could end up with different numbers of columns. The windowing step yields
# windows of at most `winSize` tokens, so nothing is truncated.
train_padded_windows = pad_sequences(X_train_windowed, maxlen=winSize)
test_padded_windows = pad_sequences(X_test_windowed, maxlen=winSize)
val_padded_windows = pad_sequences(X_val_windowed, maxlen=winSize)

# Labels as arrays, aligned row-for-row with the padded windows.
Y_train_windowed = np.array(Y_train_windowed)
Y_test_windowed = np.array(Y_test_windowed)
Y_val_windowed = np.array(Y_val_windowed)

In [None]:
# Short aliases for the padded windows (x) and their labels (y) of each split.
x_train, y_train = train_padded_windows, Y_train_windowed
x_test, y_test = test_padded_windows, Y_test_windowed
x_val, y_val = val_padded_windows, Y_val_windowed

In [None]:
from scipy.stats import entropy

# Jensen-Shannon divergence between the train and test token distributions,
# computed independently for every window position and then averaged.
js_divergences = np.zeros(x_train.shape[1])
for i in range(x_train.shape[1]):
    train_col = x_train[:, i]
    test_col = x_test[:, i]

    bins = max(len(np.unique(train_col)), len(np.unique(test_col)))
    # Derive ONE set of bin edges from the pooled values. Passing only a bin
    # count to np.histogram makes each call span its own min..max, so the two
    # histograms previously described different intervals and p[k] / q[k] were
    # not comparable.
    edges = np.histogram_bin_edges(np.concatenate([train_col, test_col]), bins=bins)
    p, _ = np.histogram(train_col, bins=edges)
    q, _ = np.histogram(test_col, bins=edges)

    # Counts -> probabilities so the mixture m weights both splits equally.
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    js_divergences[i] = 0.5 * (entropy(p, m) + entropy(q, m))

avg_js_divergence = np.mean(js_divergences)
print("Average Jensen-Shannon divergence:", avg_js_divergence)

Average Jensen-Shannon divergence: 0.17813294591044218


### PAMAP2

In [None]:
# Switch the working directory to the PAMAP2 folder so the relative pickle path below resolves.
cd "/content/drive/MyDrive/PAMAP2_Dataset"

/content/drive/MyDrive/PAMAP2_Dataset


In [None]:
# Load the preprocessed PAMAP2 frame and drop every row recorded for subject 109.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files you trust.
pamap2_path = 'pamap2_for_lstm.pkl'
data = pd.read_pickle(pamap2_path)
keep_rows = data['id'] != 109
data = data[keep_rows]

In [None]:
# Number of rows per (subject id, activity id) pair.
activity_counts = data.groupby('id')['activity_id'].value_counts()
# Same counts as a flat DataFrame; the counts land in a column named 'count'.
activity_counts_df = activity_counts.reset_index(name='count')

In [None]:
# Subject-wise hold-out: subject 107 is the test set, subject 104 the
# validation set, all remaining subjects form the training set.
is_test_subject = data["id"] == 107
is_val_subject = data["id"] == 104

test_data = data[is_test_subject]
train_data = data[~is_test_subject & ~is_val_subject]
val_data = data[is_val_subject]

# Keep the label and subject columns aside ...
train_activity_id, train_sub_id = train_data['activity_id'], train_data['id']
test_activity_id, test_sub_id = test_data['activity_id'], test_data['id']
val_activity_id, val_sub_id = val_data['activity_id'], val_data['id']

# ... and build feature-only frames without them.
non_feature_columns = ['activity_id', 'id']
train_df_without_id = train_data.drop(columns=non_feature_columns)
test_df_without_id = test_data.drop(columns=non_feature_columns)
val_df_without_id = val_data.drop(columns=non_feature_columns)

In [None]:
from sklearn.preprocessing import StandardScaler

# Only the numeric feature columns are standardised.
numerical_columns = train_df_without_id.select_dtypes(include=['number']).columns

# Fit on the training subjects only, then apply the same statistics to the
# test and validation subjects.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_df_without_id[numerical_columns])
test_scaled = scaler.transform(test_df_without_id[numerical_columns])
val_scaled = scaler.transform(val_df_without_id[numerical_columns])

# Wrap the scaled arrays back into DataFrames (original row index preserved)
# and put the label and subject columns in front.
train_scaled_df = pd.concat(
    [
        train_activity_id,
        train_sub_id,
        pd.DataFrame(train_scaled, columns=numerical_columns, index=train_df_without_id.index),
    ],
    axis=1,
)
test_scaled_df = pd.concat(
    [
        test_activity_id,
        test_sub_id,
        pd.DataFrame(test_scaled, columns=numerical_columns, index=test_df_without_id.index),
    ],
    axis=1,
)
val_scaled_df = pd.concat(
    [
        val_activity_id,
        val_sub_id,
        pd.DataFrame(val_scaled, columns=numerical_columns, index=val_df_without_id.index),
    ],
    axis=1,
)

In [None]:
from sklearn.model_selection import LeaveOneGroupOut

# Features, labels and the grouping key (subject id) for leave-one-subject-out.
groups = data['id']
y = data['activity_id']
X = data.drop(columns=['id', 'activity_id'])

logo = LeaveOneGroupOut()

# Each iteration overwrites X_train/X_test/y_train/y_test, so after the loop
# these names hold the split of the last fold only.
for train_index, test_index in logo.split(X, y, groups):
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

In [None]:
def segment_activities(df):
    """Split ``df`` into runs of consecutive rows that share one ``activity_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``activity_id`` column; rows are taken in their current order.

    Returns
    -------
    list of pandas.DataFrame
        One positional slice of ``df`` per run, in order. Together the slices
        cover every row of ``df`` exactly once; an empty frame gives ``[]``.
    """
    if len(df) == 0:
        return []

    # True on each row whose activity differs from the previous row's. Row 0 is
    # always flagged because shift() puts NaN in front of it.
    is_run_start = df.activity_id.ne(df.activity_id.shift())
    starts = np.flatnonzero(is_run_start.to_numpy())

    # Every run ends where the next one starts; the last run ends at len(df).
    # The previous implementation only paired consecutive start positions and
    # therefore silently dropped the final run (and returned [] when the frame
    # held a single activity).
    stops = np.append(starts[1:], len(df))

    # iloc makes the positional (not label-based) slicing explicit.
    return [df.iloc[start:stop] for start, stop in zip(starts, stops)]

In [None]:
# Cut each split into per-activity segments (one DataFrame per contiguous run).
# NOTE(review): this segments the unscaled train/val/test frames; the
# *_scaled_df frames built earlier are not used here — confirm that is intended.
train_activitySequences, val_activitySequences, test_activitySequences = [
    segment_activities(split) for split in (train_data, val_data, test_data)
]

#### with my columns

In [None]:
def generate_sentence(df2):
    """Render the first row of ``df2`` as a space-separated PAMAP2 sentence.

    Each word is ``"<column><value>"`` for one of the 40 sensor columns listed
    below, in that order. Only row 0 of ``df2`` is read.
    """
    feature_columns = (
        'heart_rate',
        'hand_temperature',
        'hand_3D_acceleration_16_x', 'hand_3D_acceleration_16_y', 'hand_3D_acceleration_16_z',
        'hand_3D_acceleration_6_x', 'hand_3D_acceleration_6_y', 'hand_3D_acceleration_6_z',
        'hand_3D_gyroscope_x', 'hand_3D_gyroscope_y', 'hand_3D_gyroscope_z',
        'hand_3D_magnetometer_x', 'hand_3D_magnetometer_y', 'hand_3D_magnetometer_z',
        'chest_temperature',
        'chest_3D_acceleration_16_x', 'chest_3D_acceleration_16_y', 'chest_3D_acceleration_16_z',
        'chest_3D_acceleration_6_x', 'chest_3D_acceleration_6_y', 'chest_3D_acceleration_6_z',
        'chest_3D_gyroscope_x', 'chest_3D_gyroscope_y', 'chest_3D_gyroscope_z',
        'chest_3D_magnetometer_x', 'chest_3D_magnetometer_y', 'chest_3D_magnetometer_z',
        'ankle_temperature',
        'ankle_3D_acceleration_16_x', 'ankle_3D_acceleration_16_y', 'ankle_3D_acceleration_16_z',
        'ankle_3D_acceleration_6_x', 'ankle_3D_acceleration_6_y', 'ankle_3D_acceleration_6_z',
        'ankle_3D_gyroscope_x', 'ankle_3D_gyroscope_y', 'ankle_3D_gyroscope_z',
        'ankle_3D_magnetometer_x', 'ankle_3D_magnetometer_y', 'ankle_3D_magnetometer_z',
    )

    words = []
    for name in feature_columns:
        first_value = df2[name].values[0]
        words.append("{}{}".format(name, first_value))

    return " ".join(words)

In [None]:
def sequencesToSentences(activitySequences):
    """Turn activity segments into parallel lists of sentences and labels.

    The sentence of a segment comes from ``generate_sentence``; its label is
    the ``activity_id`` of the segment's first row.
    """
    sentences = []
    label_sentences = []

    for segment in activitySequences:
        sentences.append(generate_sentence(segment))
        label_sentences.append(segment.activity_id.values[0])

    return sentences, label_sentences

In [None]:
# One sentence and one label per activity segment, for each split.
(
    (train_sentences, train_label_sentences),
    (val_sentences, val_label_sentences),
    (test_sentences, test_label_sentences),
) = [
    sequencesToSentences(segments)
    for segments in (train_activitySequences, val_activitySequences, test_activitySequences)
]

In [None]:
# Keep every "<column><value>" word intact. Keras replaces each character of
# `filters` with the split character, and the previous filter string contained
# '-' and '_': that stripped the sign from negative readings (so -9.8 and 9.8
# became the same token), broke exponents such as 1e-05 apart, and shredded the
# column names into shared fragments. Only tab/newline are filtered now; words
# are still split on spaces.
tokenizer = Tokenizer(filters='\t\n')

# The vocabulary is fitted on all three splits so that one index space is
# shared by train, validation and test.
combined_sentences = train_sentences + val_sentences + test_sentences
tokenizer.fit_on_texts(combined_sentences)

#tokenize train, validation, and test sets
train_indexed_sentences = tokenizer.texts_to_sequences(train_sentences)
val_indexed_sentences = tokenizer.texts_to_sequences(val_sentences)
test_indexed_sentences = tokenizer.texts_to_sequences(test_sentences)

In [None]:
def slidingWindow(sequence,winSize,step=1):
    """Yield successive windows of ``winSize`` items from ``sequence``.

    Windows start at 0, step, 2*step, ... and only full windows are produced,
    so trailing items that do not fill a window are skipped. When ``sequence``
    is shorter than ``winSize`` the whole sequence is yielded once (this also
    holds for an empty sequence, which yields one empty window).

    This is a generator: argument validation runs when iteration starts.

    Parameters
    ----------
    sequence : sliceable sequence supporting ``len``
    winSize : int
        Window length, at least 1.
    step : int, default 1
        Stride between window starts, between 1 and ``winSize``.

    Raises
    ------
    TypeError
        If ``sequence`` is not iterable or ``winSize``/``step`` are not ``int``.
    ValueError
        If ``winSize`` or ``step`` is not positive, or ``step > winSize``.
        (Both are ``Exception`` subclasses, so existing ``except Exception``
        handlers keep working.)
    """
    try:
        iter(sequence)
    except TypeError:
        raise TypeError("**ERROR** sequence must be iterable.")
    if type(winSize) is not int or type(step) is not int:
        raise TypeError("**ERROR** type(winSize) and type(step) must be int.")
    # A zero step used to surface as ZeroDivisionError and a negative one as
    # nonsensical windows; reject both explicitly.
    if winSize < 1 or step < 1:
        raise ValueError("**ERROR** winSize and step must be positive.")
    if step > winSize:
        raise ValueError("**ERROR** step must not be larger than winSize.")

    length = len(sequence)
    if winSize > length:
        yield sequence[0:length]
        return

    # Integer arithmetic: number of full windows that fit.
    numOfChunks = (length - winSize) // step + 1
    for start in range(0, numOfChunks * step, step):
        yield sequence[start:start + winSize]

In [None]:
# Window length and stride shared by all three splits.
winSize = 50
step = 1

X_train_windowed, Y_train_windowed = [], []
X_val_windowed, Y_val_windowed = [], []
X_test_windowed, Y_test_windowed = [], []

# (indexed sentences, their labels, window sink, label sink) for each split.
split_jobs = (
    (train_indexed_sentences, train_label_sentences, X_train_windowed, Y_train_windowed),
    (val_indexed_sentences, val_label_sentences, X_val_windowed, Y_val_windowed),
    (test_indexed_sentences, test_label_sentences, X_test_windowed, Y_test_windowed),
)

for split_sentences, split_labels, window_sink, label_sink in split_jobs:
    for i, s in enumerate(split_sentences):
        # Every window cut from sentence i inherits that sentence's label.
        for chunk in slidingWindow(s, winSize, step):
            window_sink.append(chunk)
            label_sink.append(split_labels[i])

In [None]:
# Pad every split to the same fixed width. Without `maxlen`, pad_sequences
# pads each call to the longest window of *that* split, so train/test/val
# could end up with different numbers of columns. The windowing step yields
# windows of at most `winSize` tokens, so nothing is truncated.
train_padded_windows = pad_sequences(X_train_windowed, maxlen=winSize)
test_padded_windows = pad_sequences(X_test_windowed, maxlen=winSize)
val_padded_windows = pad_sequences(X_val_windowed, maxlen=winSize)

# Labels as arrays, aligned row-for-row with the padded windows.
Y_train_windowed = np.array(Y_train_windowed)
Y_test_windowed = np.array(Y_test_windowed)
Y_val_windowed = np.array(Y_val_windowed)

In [None]:
# Short aliases for the padded windows (x) and their labels (y) of each split.
x_train, y_train = train_padded_windows, Y_train_windowed
x_test, y_test = test_padded_windows, Y_test_windowed
x_val, y_val = val_padded_windows, Y_val_windowed

In [None]:
from scipy.stats import entropy

# Jensen-Shannon divergence between the train and test token distributions,
# computed independently for every window position and then averaged.
js_divergences = np.zeros(x_train.shape[1])
for i in range(x_train.shape[1]):
    train_col = x_train[:, i]
    test_col = x_test[:, i]

    bins = max(len(np.unique(train_col)), len(np.unique(test_col)))
    # Derive ONE set of bin edges from the pooled values. Passing only a bin
    # count to np.histogram makes each call span its own min..max, so the two
    # histograms previously described different intervals and p[k] / q[k] were
    # not comparable.
    edges = np.histogram_bin_edges(np.concatenate([train_col, test_col]), bins=bins)
    p, _ = np.histogram(train_col, bins=edges)
    q, _ = np.histogram(test_col, bins=edges)

    # Counts -> probabilities so the mixture m weights both splits equally.
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    js_divergences[i] = 0.5 * (entropy(p, m) + entropy(q, m))

avg_js_divergence = np.mean(js_divergences)
print("Average Jensen-Shannon divergence:", avg_js_divergence)

Average Jensen-Shannon divergence: 0.15987888346998558


#### Split on subjects:
  subject 105 for validation
  subject 106 for test

In [None]:
# `activitySequences` is not assigned by any earlier cell of this notebook, so
# this cell raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): it is rebuilt here as the segments of the full PAMAP2 frame
# `data` — presumably what the original session held; confirm.
activitySequences = segment_activities(data)

# Stitch the segments back into one frame for the per-subject split below.
all_data = pd.concat(activitySequences)

In [None]:
unique_subject_ids = all_data['id'].unique()

train_data, val_data, test_data = [], [], []

# Subjects with a dedicated split; every other subject goes to training.
held_out_split = {105: val_data, 106: test_data}

for subject_id in unique_subject_ids:
    subject_data = all_data[all_data['id'] == subject_id]
    held_out_split.get(subject_id, train_data).append(subject_data)


In [None]:
# Collapse each split's list of per-subject frames into a single DataFrame.
train_data, val_data, test_data = [
    pd.concat(subject_frames) for subject_frames in (train_data, val_data, test_data)
]

In [None]:
# Sanity check: list the subject ids that ended up in the validation split.
# (The variable was previously called `unique_train_ids` although it reads
# `val_data`.)
unique_val_ids = val_data['id'].unique()
print(unique_val_ids)

[105]


In [None]:
def generate_sentence(df2):
    """Render one record as a space-separated PAMAP2 sentence.

    ``df2`` is indexed directly by column name (``df2[column]``), and each
    word is ``"<column><value>"`` for the 40 sensor columns below, in order.
    """
    feature_columns = (
        'heart_rate',
        'hand_temperature',
        'hand_3D_acceleration_16_x', 'hand_3D_acceleration_16_y', 'hand_3D_acceleration_16_z',
        'hand_3D_acceleration_6_x', 'hand_3D_acceleration_6_y', 'hand_3D_acceleration_6_z',
        'hand_3D_gyroscope_x', 'hand_3D_gyroscope_y', 'hand_3D_gyroscope_z',
        'hand_3D_magnetometer_x', 'hand_3D_magnetometer_y', 'hand_3D_magnetometer_z',
        'chest_temperature',
        'chest_3D_acceleration_16_x', 'chest_3D_acceleration_16_y', 'chest_3D_acceleration_16_z',
        'chest_3D_acceleration_6_x', 'chest_3D_acceleration_6_y', 'chest_3D_acceleration_6_z',
        'chest_3D_gyroscope_x', 'chest_3D_gyroscope_y', 'chest_3D_gyroscope_z',
        'chest_3D_magnetometer_x', 'chest_3D_magnetometer_y', 'chest_3D_magnetometer_z',
        'ankle_temperature',
        'ankle_3D_acceleration_16_x', 'ankle_3D_acceleration_16_y', 'ankle_3D_acceleration_16_z',
        'ankle_3D_acceleration_6_x', 'ankle_3D_acceleration_6_y', 'ankle_3D_acceleration_6_z',
        'ankle_3D_gyroscope_x', 'ankle_3D_gyroscope_y', 'ankle_3D_gyroscope_z',
        'ankle_3D_magnetometer_x', 'ankle_3D_magnetometer_y', 'ankle_3D_magnetometer_z',
    )

    return " ".join("{}{}".format(name, df2[name]) for name in feature_columns)

In [None]:
def sequencesToSentences(activitySequences):
    """Build one sentence and one label per row of a DataFrame.

    Despite the parameter name, ``activitySequences`` is iterated with
    ``iterrows()``: every row is passed to ``generate_sentence`` and its
    ``activity_id`` becomes the label.

    Returns
    -------
    (list, list)
        ``sentences`` and ``label_sentences``, aligned by position.
    """
    sentence_label_pairs = (
        (generate_sentence(row), row['activity_id'])
        for _, row in activitySequences.iterrows()
    )

    sentences, label_sentences = [], []
    for sentence, label in sentence_label_pairs:
        sentences.append(sentence)
        label_sentences.append(label)

    return sentences, label_sentences

In [None]:
# One sentence and one label per row of the training split.
train_sentences, train_label_sentences = sequencesToSentences(train_data)

In [None]:
# One sentence and one label per row of the test split.
test_sentences, test_label_sentences = sequencesToSentences(test_data)

In [None]:
# One sentence and one label per row of the validation split.
val_sentences, val_label_sentences = sequencesToSentences(val_data)

In [None]:
# Number of validation sentences produced by the previous step.
len(val_sentences)

272442

In [None]:
# Convert the training sentences and labels to arrays so np.savetxt can write them.
train_sentences_array = np.array(train_sentences)
train_label_sentences_array = np.array(train_label_sentences)

# Define the file paths (relative to the current working directory)
sentences_file = "train_sentences.txt"
labels_file = "train_label_sentences.txt"

# Save the sentences and labels to separate text files, one entry per line
# (fmt="%s" writes each element as a string).
np.savetxt(sentences_file, train_sentences_array, fmt="%s")
np.savetxt(labels_file, train_label_sentences_array, fmt="%s")

In [None]:
# Keep every "<column><value>" word intact. Keras replaces each character of
# `filters` with the split character, and the previous filter string contained
# '-' and '_': that stripped the sign from negative readings, broke exponents
# such as 1e-05 apart, and shredded the column names into shared fragments.
# Only tab/newline are filtered now; words are still split on spaces.
tokenizer1 = Tokenizer(filters='\t\n')
tokenizer1.fit_on_texts(train_sentences)
word_index1 = tokenizer1.word_index
train_indexed_sentences = tokenizer1.texts_to_sequences(train_sentences)

In [None]:
# Encode the test sentences with the tokenizer fitted on the training
# sentences. Fitting a separate Tokenizer here gave the test split its own
# word -> index mapping, so the same integer meant different words in train
# and test.
# `tokenizer2` / `word_index2` are kept as names for backward compatibility.
# Words never seen during fitting are dropped by texts_to_sequences because
# tokenizer1 defines no oov_token.
tokenizer2 = tokenizer1
word_index2 = tokenizer2.word_index
test_indexed_sentences = tokenizer2.texts_to_sequences(test_sentences)

In [None]:
# Encode the validation sentences with the tokenizer fitted on the training
# sentences. Fitting a separate Tokenizer here gave the validation split its
# own word -> index mapping, so the same integer meant different words in
# train and validation.
# `tokenizer3` / `word_index3` are kept as names for backward compatibility.
# Words never seen during fitting are dropped by texts_to_sequences because
# tokenizer1 defines no oov_token.
tokenizer3 = tokenizer1
word_index3 = tokenizer3.word_index
val_indexed_sentences = tokenizer3.texts_to_sequences(val_sentences)

#### original

In [None]:
def generate_sentence(df2):
    """Join the (sensor, value) pairs of ``df2`` into one sentence.

    Every row contributes the word ``"<sensor><value>"``; words are separated
    by single spaces and an empty frame gives an empty string.
    """
    # Parallel arrays: sensor names and the values recorded for them.
    sensors = df2.sensor.values
    values = df2.value.values

    words = ["{}{}".format(sensor, val) for sensor, val in zip(sensors, values)]
    return " ".join(words)

#### with my columns

In [None]:
def generate_sentence(df2):
    """Render the first row of ``df2`` as a space-separated PAMAP2 sentence.

    Each word is ``"<column><value>"`` for one of the 40 sensor columns listed
    below, in that order. Only row 0 of ``df2`` is read.
    """
    feature_columns = (
        'heart_rate',
        'hand_temperature',
        'hand_3D_acceleration_16_x', 'hand_3D_acceleration_16_y', 'hand_3D_acceleration_16_z',
        'hand_3D_acceleration_6_x', 'hand_3D_acceleration_6_y', 'hand_3D_acceleration_6_z',
        'hand_3D_gyroscope_x', 'hand_3D_gyroscope_y', 'hand_3D_gyroscope_z',
        'hand_3D_magnetometer_x', 'hand_3D_magnetometer_y', 'hand_3D_magnetometer_z',
        'chest_temperature',
        'chest_3D_acceleration_16_x', 'chest_3D_acceleration_16_y', 'chest_3D_acceleration_16_z',
        'chest_3D_acceleration_6_x', 'chest_3D_acceleration_6_y', 'chest_3D_acceleration_6_z',
        'chest_3D_gyroscope_x', 'chest_3D_gyroscope_y', 'chest_3D_gyroscope_z',
        'chest_3D_magnetometer_x', 'chest_3D_magnetometer_y', 'chest_3D_magnetometer_z',
        'ankle_temperature',
        'ankle_3D_acceleration_16_x', 'ankle_3D_acceleration_16_y', 'ankle_3D_acceleration_16_z',
        'ankle_3D_acceleration_6_x', 'ankle_3D_acceleration_6_y', 'ankle_3D_acceleration_6_z',
        'ankle_3D_gyroscope_x', 'ankle_3D_gyroscope_y', 'ankle_3D_gyroscope_z',
        'ankle_3D_magnetometer_x', 'ankle_3D_magnetometer_y', 'ankle_3D_magnetometer_z',
    )

    return " ".join(
        "{}{}".format(name, df2[name].values[0]) for name in feature_columns
    )

In [None]:
def sequencesToSentences(activitySequences):
    """Turn activity segments into parallel lists of sentences and labels.

    The sentence of a segment comes from ``generate_sentence``; its label is
    the ``activity_id`` of the segment's first row.
    """
    sentences, label_sentences = [], []

    for segment in activitySequences:
        sentences.append(generate_sentence(segment))
        label_sentences.append(segment.activity_id.values[0])

    return sentences, label_sentences

In [None]:
# Build one sentence and one label per activity segment.
# NOTE(review): `activitySequences` is only read here — confirm which earlier cell is expected to assign it.
sentences, label_sentences = sequencesToSentences(activitySequences)

Sentence indexing (tokenization)

In [None]:
# Keep every "<column><value>" word intact. Keras replaces each character of
# `filters` with the split character, and the previous filter string contained
# '-' and '_': that stripped the sign from negative readings, broke exponents
# such as 1e-05 apart, and shredded the column names into shared fragments.
# Only tab/newline are filtered now; words are still split on spaces.
tokenizer = Tokenizer(filters='\t\n')
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
indexed_sentences = tokenizer.texts_to_sequences(sentences)

In [None]:
# Vocabulary size: number of distinct words the tokenizer has indexed.
len(tokenizer.word_index)

4038

In [None]:
# Preview only the first few encoded sentences; displaying the whole list
# floods the notebook output.
indexed_sentences[:5]

sliding windows

In [None]:
def slidingWindow(sequence,winSize,step=1):
    """Yield successive windows of ``winSize`` items from ``sequence``.

    Windows start at 0, step, 2*step, ... and only full windows are produced,
    so trailing items that do not fill a window are skipped. When ``sequence``
    is shorter than ``winSize`` the whole sequence is yielded once (this also
    holds for an empty sequence, which yields one empty window).

    This is a generator: argument validation runs when iteration starts.

    Parameters
    ----------
    sequence : sliceable sequence supporting ``len``
    winSize : int
        Window length, at least 1.
    step : int, default 1
        Stride between window starts, between 1 and ``winSize``.

    Raises
    ------
    TypeError
        If ``sequence`` is not iterable or ``winSize``/``step`` are not ``int``.
    ValueError
        If ``winSize`` or ``step`` is not positive, or ``step > winSize``.
        (Both are ``Exception`` subclasses, so existing ``except Exception``
        handlers keep working.)
    """
    try:
        iter(sequence)
    except TypeError:
        raise TypeError("**ERROR** sequence must be iterable.")
    if type(winSize) is not int or type(step) is not int:
        raise TypeError("**ERROR** type(winSize) and type(step) must be int.")
    # A zero step used to surface as ZeroDivisionError and a negative one as
    # nonsensical windows; reject both explicitly.
    if winSize < 1 or step < 1:
        raise ValueError("**ERROR** winSize and step must be positive.")
    if step > winSize:
        raise ValueError("**ERROR** step must not be larger than winSize.")

    length = len(sequence)
    if winSize > length:
        yield sequence[0:length]
        return

    # Integer arithmetic: number of full windows that fit.
    numOfChunks = (length - winSize) // step + 1
    for start in range(0, numOfChunks * step, step):
        yield sequence[start:start + winSize]

In [None]:
# Window length and stride.
winSize = 50
step = 1

X_windowed, Y_windowed = [], []

for i, s in enumerate(indexed_sentences):
    windows = list(slidingWindow(s, winSize, step))
    X_windowed.extend(windows)
    # Every window cut from sentence i inherits that sentence's label.
    Y_windowed.extend([label_sentences[i]] * len(windows))

In [None]:
# Pad to a fixed width of `winSize`. Without `maxlen` the width depends on the
# longest window that happens to be present. Windows hold at most `winSize`
# tokens, so nothing is truncated.
padded_windows = pad_sequences(X_windowed, maxlen=winSize)

In [None]:
# Labels as an array, aligned row-for-row with padded_windows.
Y_windowed = np.array(Y_windowed)

## Save files ##
print("STEP 8: save sliding windows and labels")
# File names embed the window size, e.g. lstm_<winSize>_padded_x.npy; they are
# written to the current working directory.
np.save("{}_{}_padded_x.npy".format("lstm",winSize), padded_windows)
np.save("{}_{}_padded_y.npy".format("lstm",winSize), Y_windowed)

STEP 8: save sliding windows and labels


In [None]:
# (number of windows, window width) of the padded array.
padded_windows.shape

(14593, 50)

In [None]:
# Shape of a single padded window.
padded_windows[0].shape

(50,)

### main

In [None]:
def _load_split(tag):
    """Load the padded windows and labels saved under ``lstm_<winSize>_<tag>padded_{x,y}.npy``."""
    prefix = "{}_{}_{}padded".format("lstm", winSize, tag)
    return np.load(prefix + "_x.npy"), np.load(prefix + "_y.npy")


# File tags: "t" = train, "tt" = test, "v" = validation.
# NOTE(review): no visible cell writes these *_tpadded_* / *_ttpadded_* /
# *_vpadded_* files (the save cell writes lstm_<winSize>_padded_{x,y}.npy) —
# confirm where they are produced.
train_padded_windows, Y_train_windowed = _load_split("t")
test_padded_windows, Y_test_windowed = _load_split("tt")
val_padded_windows, Y_val_windowed = _load_split("v")

In [None]:
# Number of training labels (one per training window).
Y_train_windowed.shape

(4099,)

In [None]:
# Short aliases for the padded windows (x) and their labels (y) of each split.
x_train, y_train = train_padded_windows, Y_train_windowed
x_test, y_test = test_padded_windows, Y_test_windowed
x_val, y_val = val_padded_windows, Y_val_windowed

#### Embedding + LSTM classifier

In [None]:
pip install tensorflow

In [None]:
class LSTMModel(tf.keras.Model):
    """Token-id sequence classifier: Embedding -> LSTM -> Dropout -> LSTM -> softmax Dense.

    Parameters
    ----------
    vocab_size : int
        Input dimension of the embedding layer.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Number of units in each of the two LSTM layers.
    output_dim : int
        Number of output units of the final softmax layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        layers = tf.keras.layers
        self.embedding = layers.Embedding(vocab_size, embedding_dim)
        # First LSTM returns the full sequence so the second LSTM can consume it.
        self.lstm1 = layers.LSTM(hidden_dim, return_sequences=True)
        self.dropout = layers.Dropout(0.5)
        self.lstm2 = layers.LSTM(hidden_dim)
        self.fc = layers.Dense(output_dim, activation='softmax')

    def call(self, inputs):
        """Map a batch of token-id sequences to per-class probabilities."""
        hidden = self.embedding(inputs)
        hidden = self.lstm1(hidden)
        hidden = self.dropout(hidden)
        hidden = self.lstm2(hidden)
        return self.fc(hidden)

In [None]:
# +1 because Keras Tokenizer word indices start at 1; index 0 is the padding value.
# NOTE(review): this reads the in-kernel `tokenizer`, while x_train/x_test/x_val
# are loaded from .npy files — confirm those files were encoded with this same
# tokenizer, otherwise token ids may not match this vocabulary.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100  # size of each token embedding vector
hidden_dim = 128  # units per LSTM layer
# NOTE(review): presumably the activity ids span 0..24 — confirm against the label values.
output_dim = 25

In [None]:
# Build the classifier, then train it while evaluating on the validation
# split after every epoch.
model = LSTMModel(vocab_size, embedding_dim, hidden_dim, output_dim)
model.compile(
    optimizer='adam',
    # Targets are integer class ids, hence the sparse cross-entropy variant.
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(
    x_train,
    y_train,
    epochs=3,
    batch_size=20,
    validation_data=(x_val, y_val),
)

#### predict

In [None]:
# Per-class probabilities for every test window (one row per window).
predictions = model.predict(x_test)



In [None]:
# (number of test windows, number of classes).
predictions.shape

(1807, 25)

In [None]:
import numpy as np

# For each test window, take the index of the class with the highest probability.
class_predictions = predictions.argmax(axis=1)

In [None]:
from sklearn.metrics import accuracy_score

# Fraction of test windows whose predicted class equals the true label.
# NOTE(review): the recorded run printed ~0.056, close to chance for 25 output
# classes (1/25 = 0.04) — worth checking that labels, vocabulary and the loaded
# .npy files belong to the same preprocessing run.
accuracy = accuracy_score(y_test, class_predictions)
print("Accuracy:", accuracy)

Accuracy: 0.055623471882640586
