In [None]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# all live under this mount point. `google.colab` is only importable on Colab.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import os
import random

# Define constants
WINDOW_SIZE = 2000            # rows per window
OVERLAP_SIZE = 1000           # stride between window starts (half of WINDOW_SIZE -> 50% overlap)
NUM_WINDOWS_TO_SAMPLE = 10    # windows kept per (activity, position) file
PARTICIPANT_ID = 1
POSITIONS = ['left_pocket', 'right_pocket']
ACTIVITIES = ['Walking', 'Running', 'Standing', 'Sitting', 'Biking', 'Walking Upstairs', 'Walking Downstairs']

# Seed the stdlib RNG so random.sample() below picks the same windows on every
# Restart & Run All; without it the saved subset changes from run to run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Directory where the Excel files are stored
data_dir = '/content/drive/My Drive/Dataset New'

# Accumulates the labeled windows; re-created here so re-running the cell
# does not append to the results of a previous run.
output_data = []

# Function to get windows with 50% overlap
def get_windows(data, window_size=2000, overlap_size=1000):
    """Split ``data`` into fixed-length, possibly overlapping row windows.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc`` slicing
            (a pandas DataFrame in this notebook).
        window_size: Number of rows in each window. Must be positive.
        overlap_size: Number of rows the start position advances between
            consecutive windows, i.e. the stride. With the defaults the
            stride is half the window, giving 50% overlap. Must be positive.

    Returns:
        List of ``data.iloc[start:start + window_size]`` slices, in order of
        increasing ``start``. Only full windows are returned; a trailing
        remainder shorter than ``window_size`` is dropped. The list is empty
        when ``data`` has fewer than ``window_size`` rows. The windows are
        slices of ``data``, so copy one before adding columns to it.

    Raises:
        ValueError: If ``window_size`` or ``overlap_size`` is not positive.
            (A stride of zero or less would otherwise never terminate.)
    """
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if overlap_size <= 0:
        raise ValueError(f"overlap_size must be positive, got {overlap_size}")

    # Last start index that still leaves room for a full window; negative
    # when the data is shorter than one window, which yields an empty range.
    last_start = len(data) - window_size
    return [
        data.iloc[start:start + window_size]
        for start in range(0, last_start + 1, overlap_size)
    ]

# Iterate over activities and positions
for activity in ACTIVITIES:
    for position in POSITIONS:
        # One Excel file per (activity, position) pair
        file_path = os.path.join(data_dir, f"{activity}_{position}.xlsx")

        # Report missing files instead of skipping them silently, so a
        # missing recording is visible in the cell output.
        if not os.path.isfile(file_path):
            print(f"File not found, skipping: {file_path}")
            continue

        print(f"Processing file: {file_path}")

        # Load the Excel file and keep only the selected participant's rows
        df = pd.read_excel(file_path)
        df_participant = df[df['participant_id'] == PARTICIPANT_ID]
        print(f"Rows after filtering for participant {PARTICIPANT_ID}: {len(df_participant)}")

        # Skip if no data for the participant
        if df_participant.empty:
            print(f"No data found for participant {PARTICIPANT_ID} in {activity}_{position}. Skipping...")
            continue

        # Cut the recording into WINDOW_SIZE-row windows, stepping OVERLAP_SIZE rows
        windows = get_windows(df_participant, window_size=WINDOW_SIZE, overlap_size=OVERLAP_SIZE)

        # Recordings shorter than one window produce nothing to sample
        if not windows:
            print(f"No windows created for {activity}_{position}. Skipping...")
            continue

        # Randomly sample NUM_WINDOWS_TO_SAMPLE windows (all of them if there are not more than that)
        if len(windows) > NUM_WINDOWS_TO_SAMPLE:
            sampled_windows = random.sample(windows, NUM_WINDOWS_TO_SAMPLE)
        else:
            print(f"Not enough windows found in {activity}_{position}, selecting all available windows.")
            sampled_windows = windows

        # Label each sampled window. `assign` returns a new DataFrame, so the
        # labels are written to an independent copy rather than to a slice of
        # `df_participant` (which is what raised SettingWithCopyWarning).
        for window in sampled_windows:
            output_data.append(
                window.assign(
                    activity=activity,
                    position=position,
                    participant_id=PARTICIPANT_ID,
                )
            )

# Check if any data was collected
if output_data:
    # Stack every labeled window into one frame with a fresh 0..n-1 index,
    # then write it next to the source files.
    output_path = os.path.join(data_dir, "subset_participant1_labeled.csv")
    final_dataset = pd.concat(output_data, ignore_index=True)
    final_dataset.to_csv(output_path, index=False)
    print(f"Subset dataset with labels saved to {output_path}")
else:
    # Nothing was sampled from any file
    print("No valid data collected. Exiting...")


Processing file: /content/drive/My Drive/Dataset New/Walking_left_pocket.xlsx
Rows after filtering for participant 1: 63000
Processing file: /content/drive/My Drive/Dataset New/Walking_right_pocket.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Running_right_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Standing_left_pocket.xlsx
Rows after filtering for participant 1: 63000
Processing file: /content/drive/My Drive/Dataset New/Standing_right_pocket.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Sitting_left_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Sitting_right_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Biking_left_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Biking_right_pocket.xlsx
Rows after filtering for participant 1: 63000
Processing file: /content/drive/My Drive/Dataset New/Walking Upstairs_left_pocket.xlsx


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Walking Upstairs_right_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Walking Downstairs_left_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Processing file: /content/drive/My Drive/Dataset New/Walking Downstairs_right_pocket.xlsx
Rows after filtering for participant 1: 63000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['activity'] = activity
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['position'] = position
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  window['participant_id'] = PARTICIPANT_ID
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

Subset dataset with labels saved to /content/drive/My Drive/Dataset New/subset_participant1_labeled.csv


In [None]:
import pandas as pd

# Read the CSV without dtype specification.
# The sampling cell writes this file into the "Dataset New" folder, so read it
# from there; the previous path pointed one directory higher and only worked
# if a copy of the file had been placed there by hand.
data_path = '/content/drive/My Drive/Dataset New/subset_participant1_labeled.csv'
df = pd.read_csv(data_path, low_memory=False)

# Convert the Mz column to numeric; values that cannot be parsed become NaN
df['Mz'] = pd.to_numeric(df['Mz'], errors='coerce')

# Check if there are any NaN values after conversion
nan_count = df['Mz'].isna().sum()
print(f"Number of NaN values in magnetometer_z after conversion: {nan_count}")

# Drop rows with NaN in Mz. Reassigning (instead of inplace=True) keeps the
# cell safe to re-run and avoids mutating a frame other cells may hold.
df = df.dropna(subset=['Mz'])

# Sanity check: after to_numeric + dropna every remaining Mz entry should be a
# number, so this is expected to report zero rows.
invalid_rows = df[~df['Mz'].apply(lambda x: isinstance(x, (int, float)))]
print(f"Number of invalid rows in 'Mz': {len(invalid_rows)}")
print(invalid_rows.head())



Number of NaN values in magnetometer_z after conversion: 0
Number of invalid rows in 'Mz': 0
Empty DataFrame
Columns: [participant_id, activity_position, Ax, Ay, Az, Lx, Ly, Lz, Gx, Gy, Gz, Mx, My, Mz, normalized, activity, position]
Index: []


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch

# Load the dataset.
# The sampling cell saves the CSV inside "Dataset New", so that is the path
# read here (the old path pointed at the Drive root instead).
data_path = '/content/drive/My Drive/Dataset New/subset_participant1_labeled.csv'
df = pd.read_csv(data_path, low_memory=False)

# Convert the Mz column to numeric; values that cannot be parsed become NaN
df['Mz'] = pd.to_numeric(df['Mz'], errors='coerce')

# Check if there are any NaN values after conversion
nan_count = df['Mz'].isna().sum()
print(f"Number of NaN values in magnetometer_z after conversion: {nan_count}")

# Drop rows with NaN in Mz.
# The frame is deliberately NOT re-read from disk after this point: the old
# trailing `pd.read_csv` replaced `df` with the raw file again and silently
# threw away both the numeric conversion and the dropped rows.
df = df.dropna(subset=['Mz'])

# Sensor-reading columns: three axes (x, y, z) for each of the four
# one-letter sensor prefixes A, L, G, M, in that order.
FEATURE_COLUMNS = [f"{sensor}{axis}" for sensor in ('A', 'L', 'G', 'M') for axis in ('x', 'y', 'z')]

# Map each activity name to an integer id and store it alongside the data
label_encoder = LabelEncoder()
label_encoder.fit(df['activity'])
df['activity_encoded'] = label_encoder.transform(df['activity'])

# Window length and step (in rows) used when slicing the frame below
WINDOW_SIZE, OVERLAP_SIZE = 2000, 1000

# Build one flattened feature vector and one label per sliding window.
# NOTE(review): `df` is a concatenation of randomly sampled WINDOW_SIZE-row
# chunks, so with a step of OVERLAP_SIZE every second window spans the seam
# between two unrelated chunks (possibly different activities) — confirm this
# is intended before trusting the labels.
X, y = [], []

last_start = len(df) - WINDOW_SIZE
for start in range(0, last_start + 1, OVERLAP_SIZE):
    end = start + WINDOW_SIZE
    window = df.iloc[start:end]

    # Row-major flattening: all features of row 0, then row 1, ...
    feature_sequence = window[FEATURE_COLUMNS].to_numpy().ravel()
    # Majority vote over the rows of the window
    label = window['activity_encoded'].mode().iloc[0]

    X.append(feature_sequence)
    y.append(label)

# Stack into (num_windows, WINDOW_SIZE * num_features) and (num_windows,)
X = np.array(X)
y = np.array(y)

print(f"Feature sequence shape: {X.shape}")
print(f"Labels shape: {y.shape}")


Number of NaN values in magnetometer_z after conversion: 0
Feature sequence shape: (259, 24000)
Labels shape: (259,)


In [None]:
DOWNSAMPLE_FACTOR = 4
WINDOW_SIZE = 2000 // DOWNSAMPLE_FACTOR  # rows kept per window after downsampling

# Raw rows covered by one window before every DOWNSAMPLE_FACTOR-th row is kept
raw_span = WINDOW_SIZE * DOWNSAMPLE_FACTOR

X, y = [], []

for start in range(0, len(df) - raw_span + 1, 1000):
    end = start + raw_span
    # Strided slice: keeps rows start, start + DOWNSAMPLE_FACTOR, ...
    window = df.iloc[start:end:DOWNSAMPLE_FACTOR]

    feature_sequence = window[FEATURE_COLUMNS].to_numpy().ravel()
    label = window['activity_encoded'].mode().iloc[0]

    X.append(feature_sequence)
    y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Downsampled Feature sequence shape: {X.shape}")


Downsampled Feature sequence shape: (259, 6000)


In [None]:
import torch
import numpy as np

# Define constants
MAX_LENGTH = 512  # Typical maximum sequence length for BERT

# `X_standardized` was consumed here but never created by any cell of the
# notebook, so a fresh kernel stopped with a NameError. It is now derived from
# `X` in this cell.
# NOTE(review): the original standardization step is not in the notebook; this
# assumes a per-column z-score over all windows — confirm that matches the
# intended preprocessing.
X_array = np.asarray(X, dtype=np.float64)
column_mean = X_array.mean(axis=0)
column_std = X_array.std(axis=0)
column_std[column_std == 0] = 1.0  # constant columns: avoid 0/0 -> NaN
X_standardized = (X_array - column_mean) / column_std

# Padding or truncating sequences to fit the max length (512).
# Rows shorter than MAX_LENGTH are right-padded with zeros, longer rows keep
# only their first MAX_LENGTH values. Filling one preallocated array avoids
# building a tensor from a Python list of ndarrays, which is very slow.
# NOTE(review): the downsampled rows of `X` hold 6000 values (500 rows x 12
# features, flattened), so truncating to 512 keeps only about the first 42
# time steps of each window — confirm this loss of data is acceptable.
padded = np.zeros((len(X_standardized), MAX_LENGTH), dtype=np.float32)
for row, seq in zip(padded, X_standardized):
    kept = min(len(seq), MAX_LENGTH)
    row[:kept] = seq[:kept]
X_padded = torch.tensor(padded, dtype=torch.float32)

# Check the shape to ensure padding/truncating worked correctly
print(f"Padded input shape: {X_padded.shape}")


Padded input shape: torch.Size([259, 512])


In [None]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Mini-batch size used by the loader below
BATCH_SIZE = 16

# Integer class ids as a long tensor, paired row-by-row with the padded inputs
y_tensor = torch.tensor(y, dtype=torch.long)
dataset = TensorDataset(X_padded, y_tensor)

# Reshuffled on every pass over the data
data_loader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

print(f"DataLoader created with batch size: {BATCH_SIZE}")


DataLoader created with batch size: 16


In [None]:
import torch.nn as nn
from transformers import BertModel, BertConfig

# Define a custom BERT-based model
class TimeSeriesBERT(nn.Module):
    """BERT encoder over scalar time-series values with a linear classification head.

    Each scalar in the input sequence is projected to a ``hidden_size`` vector
    and fed to a randomly initialised ``BertModel`` through ``inputs_embeds``;
    the model's ``pooler_output`` is then mapped to class logits.

    Args:
        num_classes: Number of output classes (size of the logits vector).
        hidden_size: Width of the BERT hidden states and of the input projection.
        num_attention_heads: Attention heads per encoder layer.
        num_hidden_layers: Number of encoder layers.
        intermediate_size: Width of the feed-forward layer inside each encoder layer.

    The defaults reproduce the sizes that were previously hard-coded.

    NOTE(review): all other ``BertConfig`` fields keep their library defaults,
    including ``vocab_size``; since the model is only called with
    ``inputs_embeds`` the word-embedding table looks unused — confirm against
    the transformers documentation before shrinking it.
    """

    def __init__(self, num_classes, hidden_size=128, num_attention_heads=4,
                 num_hidden_layers=4, intermediate_size=256):
        super().__init__()
        # BERT configuration
        self.bert_config = BertConfig(
            hidden_size=hidden_size,
            num_attention_heads=num_attention_heads,
            num_hidden_layers=num_hidden_layers,
            intermediate_size=intermediate_size
        )
        # Initialize BERT model (random weights; nothing pretrained is loaded)
        self.bert = BertModel(self.bert_config)

        # Linear layer to map each scalar time-series value to BERT's hidden size
        self.linear_mapping = nn.Linear(1, hidden_size)

        # Output layer for classification
        self.classifier = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch of sequences.

        Args:
            x: Float tensor of shape (batch_size, sequence_length).

        Returns:
            Tensor of shape (batch_size, num_classes) with unnormalised logits.

        Raises:
            ValueError: If ``sequence_length`` exceeds the number of position
                embeddings configured for the encoder.
        """
        max_positions = self.bert_config.max_position_embeddings
        if x.size(1) > max_positions:
            raise ValueError(
                f"sequence length {x.size(1)} exceeds the encoder limit of {max_positions}"
            )

        # Add an extra dimension for the input (batch_size, sequence_length, 1)
        x = x.unsqueeze(-1)

        # Linear mapping to match BERT's hidden size
        x = self.linear_mapping(x)

        # BERT expects input in the format (batch_size, sequence_length, hidden_size)
        bert_output = self.bert(inputs_embeds=x)['pooler_output']

        # Classification layer
        logits = self.classifier(bert_output)
        return logits

# Define number of activity classes.
# Take it from the label encoder rather than from the labels that happen to
# occur in `y`: if some activity is never the majority label of a window, its
# id is missing from `y`, the classifier would be built too small, and any
# label id >= NUM_CLASSES makes CrossEntropyLoss fail.
NUM_CLASSES = len(label_encoder.classes_)
model = TimeSeriesBERT(num_classes=NUM_CLASSES)

# Move the model to the appropriate device (GPU when available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)


In [None]:
import torch.optim as optim
import torch.nn as nn

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5)

# Gradient clipping is intentionally not called here: clip_grad_norm_ rescales
# the gradients that exist at the moment of the call, and no backward pass has
# run yet, so calling it in this cell had no effect. It only does something
# when called inside the training loop, between loss.backward() and
# optimizer.step().

# Specify number of epochs for training
num_epochs = 10

# The former dtype casts of `inputs` / `labels` were removed: those names are
# only created by the training loop further down, so this cell raised a
# NameError on a fresh kernel. The tensors are already built as float32
# (X_padded) and long (y_tensor).



In [None]:
def initialize_weights(m):
    """Re-initialise one module in place; intended for ``model.apply``.

    * ``nn.Linear``: Xavier-uniform weight, zero bias (when a bias exists).
    * ``nn.Embedding``: Xavier-uniform weight.
    * ``nn.LayerNorm``: weight set to 1 and bias to 0 (the identity affine
      transform). Layers created without affine parameters are left untouched
      instead of raising on the missing tensors.

    Any other module type is ignored.

    Args:
        m: The module to initialise.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

    elif isinstance(m, nn.Embedding):
        nn.init.xavier_uniform_(m.weight)

    elif isinstance(m, nn.LayerNorm):
        # weight/bias are None when elementwise_affine=False
        if m.weight is not None:
            nn.init.ones_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

# Apply weight initialization across the model.
# Module.apply calls the function on every submodule, so the layers inside
# `model.bert` are re-initialised as well. The call is the cell's last
# expression, so its return value is what gets printed as the cell output.
model.apply(initialize_weights)



TimeSeriesBERT(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_

In [None]:
# Replace NaNs with the mean of each sequence
for row_idx in range(X_padded.size(0)):  # one padded sequence per row
    row = X_padded[row_idx]  # view into X_padded: writes below modify it in place
    is_nan = torch.isnan(row)
    finite_values = row[~is_nan]
    # Mean of the non-NaN entries; fall back to 0 when the whole row is NaN
    fill_value = finite_values.mean() if finite_values.numel() > 0 else 0
    row[is_nan] = fill_value


In [None]:
# Count non-finite entries left in the padded inputs
nan_total = torch.isnan(X_padded).sum().item()
inf_total = torch.isinf(X_padded).sum().item()
print(f"Number of NaN values in X_padded: {nan_total}")
print(f"Number of infinite values in X_padded: {inf_total}")


Number of NaN values in X_padded: 0
Number of infinite values in X_padded: 0


In [None]:
# Training loop
# NOTE(review): `data_loader` wraps the complete dataset; the train/test split
# is only created in later cells, so a model trained here has already seen the
# windows that end up in the test set.
MAX_GRAD_NORM = 1.0  # upper bound on the total gradient norm per step

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    total_loss = 0

    # Iterate over the DataLoader
    for inputs, labels in data_loader:
        # Move inputs and labels to the same device as the model
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Clip here, once the gradients exist and before they are applied;
        # a single call made before training never touches any gradient.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=MAX_GRAD_NORM)

        # Optimization step
        optimizer.step()

        # Accumulate the loss
        total_loss += loss.item()

    # Calculate the average loss per batch for this epoch
    avg_loss = total_loss / len(data_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")


Epoch [1/10], Loss: 2.2769
Epoch [2/10], Loss: 2.0992
Epoch [3/10], Loss: 2.0484
Epoch [4/10], Loss: 1.9876
Epoch [5/10], Loss: 1.9700
Epoch [6/10], Loss: 2.0052
Epoch [7/10], Loss: 1.9560
Epoch [8/10], Loss: 1.9687
Epoch [9/10], Loss: 1.9703
Epoch [10/10], Loss: 1.9555


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows for testing.
# `stratify` keeps the activity proportions equal in both parts, which matters
# with only a few hundred windows spread over several classes.
# NOTE(review): the windows were cut with 50% overlap, so a test window can
# share rows with a training window — confirm whether a split by recording or
# by non-overlapping segment is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_tensor,
    test_size=0.2,
    random_state=42,
    stratify=y_tensor.numpy(),
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")


Training set size: 207
Test set size: 52


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom Dataset class
class TimeSeriesDataset(Dataset):
    """Pairs each feature sequence with its label for use with a DataLoader.

    Args:
        features: Indexable collection of input sequences.
        labels: Indexable collection of labels, aligned with ``features``.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length, which
            would otherwise only surface later as an IndexError or as
            silently unused labels.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Number of (features, label) pairs."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        return self.features[idx], self.labels[idx]

# Wrap each split in a Dataset
train_dataset = TimeSeriesDataset(features=X_train, labels=y_train)
test_dataset = TimeSeriesDataset(features=X_test, labels=y_test)

# Training batches are reshuffled every epoch; test batches keep a fixed order
train_data_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_data_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

n_train_batches = len(train_data_loader)
n_test_batches = len(test_data_loader)
print(f"Train DataLoader: {n_train_batches} batches")
print(f"Test DataLoader: {n_test_batches} batches")


Train DataLoader: 13 batches
Test DataLoader: 4 batches
