In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import KFold
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler


# Disable pandas' display truncation: frames are rendered with all rows and all columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [15]:
# Load the two dataset CSV files, one DataFrame per file (paths are relative to the working directory).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('pirvision_office_dataset1.csv', 'pirvision_office_dataset2.csv')
)

In [16]:
# Stack the two frames row-wise into one frame with a fresh 0..n-1 index.
# Rows stay in file order here; nothing is shuffled at this stage.
df = pd.concat((df1, df2), ignore_index=True)

# Report the shape of each input frame followed by the shape of the merged frame.
print(*(frame.shape for frame in (df1, df2, df)))


(7651, 59) (7651, 59) (15302, 59)


In [17]:
# Distinct Temperature_F values (with their counts) among the rows whose Label is 3.
df.loc[df['Label'] == 3, 'Temperature_F'].value_counts()


Temperature_F
0    1142
Name: count, dtype: int64

In [18]:
class lstm(nn.Module):
    """LSTM sequence classifier: LSTM -> last time step -> Linear -> ReLU -> Linear.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the LSTM hidden state.
        num_layers: Number of stacked LSTM layers.
        output_size: Width of the intermediate fully connected layer (this is
            NOT the number of classes, despite the name).
        num_classes: Number of logits produced by the final layer. Defaults to
            3, the value that used to be hard-coded.
    """

    def __init__(self, input_size, hidden_size = 64, num_layers = 1, output_size = 32, num_classes = 3):
        super().__init__()
        # batch_first=True: inputs and outputs are laid out as (batch, seq_len, features).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(output_size, num_classes)

    def forward(self, x):
        """Return raw (un-normalised) class logits of shape (batch, num_classes).

        Args:
            x: Tensor of shape (batch, seq_len, input_size).
        """
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # keep only the output at the last time step
        out = self.relu(self.fc1(out))
        # No softmax here: the logits are returned as-is.
        return self.fc2(out)

In [20]:
import torch
from torch.utils.data import Dataset, DataLoader

# --- Custom Dataset class ---
class TimeSeriesDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Each row is served as a univariate sequence: the first entry of the row is
    skipped and the remaining values become the time steps.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Skip entry 0 of the row; everything after it is the sequence.
        steps = torch.tensor(self.X[idx][1:], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.long)
        # Add a trailing feature axis so the sequence has shape (seq_len, 1).
        return steps.unsqueeze(1), target

# --- Data prep ---
# Feature matrix: every column except the label and the two timestamp columns.
X = df.drop(columns=["Label", "Date", "Time"]).values

# Raw labels are remapped to contiguous class ids 0..2 (raw 3 -> 2) so they can
# index the model's 3 logits in CrossEntropyLoss. An unknown raw label raises KeyError.
label_map = {0: 0, 1: 1, 3: 2}
y_raw = df["Label"].values
y = np.array([label_map[label] for label in y_raw])
num_classes = len(label_map)

kf = KFold(n_splits=5, shuffle=True, random_state=0)

# --- Configuration ---
input_size = 1  # each time step carries a single value (see TimeSeriesDataset)
nb_epochs = 5
batch_size = 64  # Adjustable
max_folds = 1  # folds actually trained/evaluated; raise to kf.get_n_splits() for full CV
seed = 0  # torch seed (weight init + batch shuffling), re-applied at the start of every fold

criterion = nn.CrossEntropyLoss()

# NOTE(review): the saved output for fold 1 shows the network predicting class 0
# for every test sample (per-class accuracy 100% / 0% / 0%) with the loss flat
# after epoch 2. The inputs are fed unscaled and the loss is unweighted although
# the training split is heavily imbalanced (10041 / 1321 / 879). Consider scaling
# the features (MinMaxScaler is imported but unused) and/or passing class weights
# to CrossEntropyLoss.

for fold, (train_i, test_i) in enumerate(kf.split(X), 1):
    # Stop before printing the header so no empty "Fold N" section is emitted.
    if fold > max_folds:
        break
    print(f"\nFold {fold}")

    X_train, X_test = X[train_i], X[test_i]
    y_train, y_test = y[train_i], y[test_i]

    print("Train label distribution:", np.bincount(y_train))
    print("Test label distribution:", np.bincount(y_test))

    # A fresh model and optimizer per fold: reusing one model across folds would
    # evaluate on rows it was already trained on in an earlier fold.
    torch.manual_seed(seed)
    model = lstm(input_size=input_size)
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # --- Create DataLoader ---
    train_dataset = TimeSeriesDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(nb_epochs):
        total_loss = 0.0
        model.train()
        for sequences, labels in train_loader:
            output = model(sequences)  # shape: (batch_size, num_classes)
            loss = criterion(output, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per-sample even when
            # the last batch is smaller.
            total_loss += loss.item() * sequences.size(0)

        avg_loss = total_loss / len(train_dataset)
        print(f"epoch {epoch+1}, loss: {avg_loss:.4f}")

    # --- Evaluation ---
    print("testing time")
    model.eval()

    class_counts = {cls: 0 for cls in range(num_classes)}
    class_correct = {cls: 0 for cls in range(num_classes)}

    # Evaluate through the same Dataset as training so both paths build their
    # input tensors identically, and in batches rather than one row at a time.
    test_loader = DataLoader(TimeSeriesDataset(X_test, y_test), batch_size=batch_size, shuffle=False)
    with torch.no_grad():
        for sequences, labels in test_loader:
            predictions = model(sequences).argmax(dim=1)
            hits = predictions == labels
            for cls in class_counts:
                in_class = labels == cls
                class_counts[cls] += int(in_class.sum())
                class_correct[cls] += int((hits & in_class).sum())

    total = sum(class_counts.values())
    correct = sum(class_correct.values())
    accuracy = correct / total
    print(f"fold {fold} test accuracy: {accuracy:.4f}")

    print("\nPer-class accuracy:")
    for cls in class_counts:
        total_cls = class_counts[cls]
        correct_cls = class_correct[cls]
        acc_cls = correct_cls / total_cls if total_cls > 0 else 0.0
        print(f"  Class {cls}: {correct_cls}/{total_cls} correct ({acc_cls * 100:.2f}%)")



Fold 1
Train label distribution: [10041  1321   879]
Test label distribution: [2453  345  263]
epoch 1, loss: 0.6307
epoch 2, loss: 0.5927
epoch 3, loss: 0.5929
epoch 4, loss: 0.5926
epoch 5, loss: 0.5927
testing time
fold 1 test accuracy: 0.8014

Per-class accuracy:
  Class 0: 2453/2453 correct (100.00%)
  Class 1: 0/345 correct (0.00%)
  Class 2: 0/263 correct (0.00%)

Fold 2
