In [2]:
import numpy as np 
import csv 
import glob 
import os
import re
from collections import defaultdict
import torch
import h5py

In [3]:
input_path = "../data/UT_HAR_OG/input"
annotation_path = "../data/UT_HAR_OG/annotation"

In [4]:
input_files = os.listdir(input_path)
annotation_files = os.listdir(annotation_path)

def extract_keys(input_files, annotation_files):
    # Define patterns for the two naming conventions
    pattern_activity = r"input_([a-z]+_\d+_\d+_\d+)"  # Matches input_ACTIVITY_XXXX_XXXX_XX
    pattern_name = r"input_\d+_([a-z]+_[a-z]+_\d+)"    # Matches input_XXXX_NAME_ACTIVITY_XX

    # Dictionary to hold associations
    file_associations = defaultdict(lambda: {"input": None, "annotation": None})

    # Process input files
    for file in input_files:
        match_activity = re.match(pattern_activity, file)
        match_name = re.match(pattern_name, file)
        if match_activity:
            key = match_activity.group(1)
            file_associations[key]["input"] = file
        elif match_name:
            key = match_name.group(1)
            file_associations[key]["input"] = file

    # Process annotation files
    for file in annotation_files:
        # Extract keys based on the annotation naming conventions
        match_activity = re.match(r"annotation_([a-z]+_\d+_\d+_\d+)", file)
        match_name = re.match(r"annotation_([a-z]+_[a-z]+_\d+)", file)
        if match_activity:
            key = match_activity.group(1)
            file_associations[key]["annotation"] = file
        elif match_name:
            key = match_name.group(1)
            file_associations[key]["annotation"] = file

    # Convert defaultdict to regular dictionary for final output
    return dict(file_associations)

# Print associations
file_associations = extract_keys(input_files, annotation_files)
for key, value in file_associations.items():
    print(f"{key}: {value}")

pickup_170309_1240_07: {'input': 'input_pickup_170309_1240_07.csv', 'annotation': 'annotation_pickup_170309_1240_07.csv'}
sankalp_bed_1: {'input': 'input_161219_sankalp_bed_1.csv', 'annotation': 'annotation_sankalp_bed_1.csv'}
sankalp_bed_10: {'input': 'input_161219_sankalp_bed_10.csv', 'annotation': 'annotation_sankalp_bed_10.csv'}
sankalp_bed_2: {'input': 'input_161219_sankalp_bed_2.csv', 'annotation': 'annotation_sankalp_bed_2.csv'}
sankalp_bed_3: {'input': 'input_161219_sankalp_bed_3.csv', 'annotation': 'annotation_sankalp_bed_3.csv'}
sankalp_bed_4: {'input': 'input_161219_sankalp_bed_4.csv', 'annotation': 'annotation_sankalp_bed_4.csv'}
sankalp_bed_5: {'input': 'input_161219_sankalp_bed_5.csv', 'annotation': 'annotation_sankalp_bed_5.csv'}
sankalp_bed_6: {'input': 'input_161219_sankalp_bed_6.csv', 'annotation': 'annotation_sankalp_bed_6.csv'}
sankalp_bed_7: {'input': 'input_161219_sankalp_bed_7.csv', 'annotation': 'annotation_sankalp_bed_7.csv'}
sankalp_bed_8: {'input': 'input_161

In [5]:
input_files = os.listdir(input_path)
annotation_files = os.listdir(annotation_path) 
file_associations = extract_keys(input_files, annotation_files) 

len(file_associations)

557

In [6]:
from scripts.data_prep import create_dataset

data, labels = create_dataset(input_path, annotation_path)

In [7]:
data.shape

torch.Size([110793, 10, 180])

In [8]:
data[0,0]

tensor([10.6950, 12.8740, 14.3286, 14.5863, 13.9406, 13.9584, 13.6541, 14.1433,
        14.5322, 13.6877, 14.3536, 15.1551, 15.4785, 15.6999, 15.4025, 15.5814,
        16.1365, 15.9383, 15.9762, 15.7084, 15.8916, 16.2583, 16.9452, 17.8875,
        17.3135, 17.7727, 18.7844, 17.8897, 17.9145, 15.8511, 19.6886, 19.4382,
        18.4740, 16.5998, 13.4976,  9.9954,  5.6839,  4.4559,  6.0463,  9.5209,
        12.0684, 11.3604, 16.3969, 16.8864, 16.2622, 16.8962, 18.6060, 18.8925,
        19.4515, 18.6545, 18.8137, 18.4594, 18.0410, 18.4079, 16.6320, 16.3903,
        17.6095, 15.4443, 14.9548, 13.2542, 22.2011, 23.1024, 24.2402, 24.2752,
        23.8516, 24.1291, 23.6664, 24.2073, 23.9600, 23.0639, 22.8500, 22.3850,
        22.0804, 22.3259, 21.5964, 21.7315, 22.3874, 22.5784, 23.1304, 22.3360,
        22.4556, 23.1191, 23.4349, 24.2516, 22.9457, 22.6667, 23.5428, 20.9695,
        21.0038, 18.5973, 13.6499, 13.4685, 13.1772, 12.9875, 12.8131, 12.7653,
        12.8412, 12.9358, 12.9641, 13.05

In [7]:
labels.shape

torch.Size([110793])

def create_dataset(input_path, annotation_path):
    input_files = os.listdir(input_path)
    annotation_files = os.listdir(annotation_path) 
    file_associations = extract_keys(input_files, annotation_files) 

    label_encoder = {"NoActivity": 0, "bed": 1, "fall": 2, "pickup": 3, "run": 4, "sitdown": 5, "standup": 6, "walk": 7}
    #create np array that will store the attributes in the format [capture, rows, columns]
    #only store the rows that have an associated annotation different from 'NoActivity'
    #ignore 1st column from input file since it contains timestamps
    #create vector that will store the labels
    data = []
    labels = []
    idx = 0
    for capture, files in file_associations.items():
        input_file = files["input"]
        annotation_file = files["annotation"]

        if not input_file or not annotation_file:
            # Skip if either input or annotation file is missing
            continue

        # Load input and annotation files
        input_data = np.loadtxt(os.path.join(input_path, input_file), delimiter=',')
        annotation_data = np.loadtxt(os.path.join(annotation_path, annotation_file), delimiter=',', dtype=str)

        # Check if input and annotation file sizes match
        if input_data.shape[0] != annotation_data.shape[0]:
            print(f"Skipping {capture}: Mismatch in row counts between input and annotation files.")
            continue

        # Ignore the first column (timestamps) in input file
        input_data = input_data[:, 1:]

        # Iterate over rows and filter based on annotation
        capture_data = [] 
        capture_labels = []
        for i in range(annotation_data.shape[0]):
            #if annotation_data[i] != 'NoActivity':  # Assume the first column in annotation contains labels
            capture_data.append(torch.tensor(input_data[i,:],dtype=torch.float32))
            capture_labels.append(torch.tensor(label_encoder[annotation_data[i]],dtype=torch.long))
        data.append(torch.stack(capture_data)) 
        labels.append(torch.stack(capture_labels))

    # Convert data and labels to numpy arrays
    # labels = np.array(labels)

    return data, labels


In [6]:
data, labels = create_dataset(input_path, annotation_path)

#go through data and labels and cut last rows for each observation to make all of them
#the same length

def crop_equal_length(data, labels):
    min_rows = min(tensor.size(0) for tensor in data)

    cropped_data = []
    cropped_labels = []

    for tensor, label in zip(data,labels):
        cropped_tensor = tensor[:min_rows]
        cropped_data.append(cropped_tensor)
        cropped_label = label[:min_rows]
        print(label[min_rows:min_rows+2])
        cropped_labels.append(cropped_label)

    return cropped_data, cropped_labels

cropped_data, cropped_labels = crop_equal_length(data,labels)

cropped_data[3].shape

len(labels[3])

data[3].shape

def pad_to_3d(data, labels):
    max_rows = max(tensor.size(0) for tensor in data)  # Determine the maximum number of rows

    for i in range(len(data)):
        tensor = data[i]
        label = labels[i]
        padding_rows = max_rows - tensor.size(0)  # Calculate the number of rows to pad

        if padding_rows > 0:
            # Repeat the last row and append it to the tensor
            last_row = tensor[-1].unsqueeze(0)  # Extract the last row and add a batch dimension
            repeated_rows = last_row.repeat(padding_rows, *[1] * (tensor.dim() - 1))  # Repeat for padding_rows times
            data[i] = torch.cat((tensor, repeated_rows), dim=0)  # Concatenate along rows

            # Append the last label value padding_rows times
            last_label = label[-1] 
            labels[i] = torch.cat((label, last_label.repeat(padding_rows)))

    return torch.stack(data), torch.stack(labels)  # Stack data into a 3D tensor and return modified labels


padded_data, padded_labels = pad_to_3d(data,labels)

padded_data.shape

padded_labels.shape

## Motivating the Usage of CSI Ratio for Preprocessing CSI Data

### Motivation
The **Channel State Information (CSI)** contains detailed amplitude and phase data about the wireless channel. While useful, CSI data often suffers from significant challenges that limit its direct application in sensing tasks:

1. **Noisy Phase Data**:
   - CSI phase is prone to instability caused by hardware imperfections such as carrier frequency offset (CFO), sampling frequency offset (SFO), and time-varying random phase offsets.
   - These phase errors reduce the reliability of using raw CSI data.

2. **Environmental Noise**:
   - CSI amplitude is affected by environmental changes, including large-scale fading and multipath effects, making it harder to detect subtle signal variations.

3. **Device Dependency**:
   - Absolute CSI values are dependent on hardware characteristics like power amplifier gains, which may vary across devices, creating inconsistencies in results.

4. **Limited Sensing Range**:
   - Noise and large-scale variations restrict the range at which the system can detect meaningful information.

The **CSI ratio** technique is a preprocessing method that addresses these issues by normalizing the data through division operations between antennas. It effectively mitigates noise, cancels out phase offsets, and emphasizes subtle channel variations while suppressing large-scale disturbances. This makes it particularly useful for tasks requiring precise detection, such as respiration monitoring or activity recognition.

---

### How CSI Ratio is Performed

To compute the **CSI ratio**, follow these steps:

1. **Collect CSI Data**:
   - Acquire raw CSI data from multiple antennas. For $ N $ antennas, the CSI values are represented as complex numbers $ H_i $, where $ i $ ranges from $ 1 $ to $ N $.

2. **Pair Antennas**:
   - Compute ratios between pairs of antennas. For $ N $ antennas, there are $ \binom{N}{2} $ unique pairs. Each pair $ (i, j) $ generates a ratio:
   $$ 
     H_{i,j} = \frac{H_i}{H_j}
   $$

3. **Decompose the Ratio**:
   - The ratio $ H_{i,j} $ is a complex number and can be expressed as:
   $$
     H_{i,j} = A_{i,j} e^{j \Delta \phi_{i,j}}
   $$
     Where:
     - $ A_{i,j} = \frac{|H_i|}{|H_j|} $ is the amplitude ratio.
     - $ \Delta \phi_{i,j} = \phi_i - \phi_j $ is the phase difference.

4. **Cancel Noise**:
   - The division cancels out time-varying random offsets and hardware-induced phase errors because these offsets are the same across antennas in a single device. This provides a **cleaner and more stable signal**.

5. **Enhance Sensitivity**:
   - The resulting amplitude ratios and phase differences are normalized, reducing sensitivity to large-scale fading and amplifying subtle variations caused by local movements.

6. **Combine Amplitude and Phase** (Optional):
   - The orthogonal properties of amplitude and phase can be combined to enhance sensing capabilities further. For example, projecting the complex CSI ratio onto multiple axes in the complex plane extracts richer information.

---

### Benefits of CSI Ratio Preprocessing
- **Noise Suppression**: Division cancels out shared noise components and random phase offsets.
- **Enhanced Sensitivity**: Highlights subtle changes in the environment, such as human respiration.
- **Increased Range**: Allows for sensing over greater distances by stabilizing phase and reducing noise.
- **Device Independence**: Normalized ratios are less affected by hardware-dependent variations.
- **Robustness**: More resilient to multipath effects and large-scale environmental changes.

By preprocessing raw CSI data with the CSI ratio technique, researchers can significantly improve the reliability and accuracy of wireless sensing systems for diverse applications.


In [None]:
padded_data[0,:,int(padded_data[0,0,:].shape[0]/2)-1:-1].shape

In [10]:
def cvt_to_csi_ratio(data):
    proc_data = torch.zeros_like(data)
    for idx, tensor in enumerate(data):
        num_antennas = 3
        num_subcarriers = 30
        A_n = np.roll(tensor[:,:num_antennas*num_subcarriers], shift=num_subcarriers, axis=1)
        A_n[:,:num_subcarriers] = tensor[:,:num_subcarriers]
        A_d = np.roll(tensor[:,:num_antennas*num_subcarriers], shift=-num_subcarriers, axis=1)
        A_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,2*num_subcarriers:3*num_subcarriers]

        theta_n = np.roll(tensor[:,num_antennas*num_subcarriers:], shift=num_subcarriers, axis=1)
        theta_n[:,:num_subcarriers] = tensor[:,num_antennas*num_subcarriers:(num_antennas+1)*num_subcarriers]
        theta_d = np.roll(tensor[:,num_antennas*num_subcarriers:], shift=-num_subcarriers, axis=1)
        theta_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,(num_antennas+2)*num_subcarriers:(num_antennas+3)*num_subcarriers]

        proc_data[idx, :, :num_antennas*num_subcarriers] = torch.tensor(np.divide(A_n, A_d, where=A_d!=0))
        proc_data[idx, :, num_antennas*num_subcarriers:] = torch.tensor(theta_n - theta_d)

    return proc_data

In [16]:
test = cvt_to_csi_ratio(data[0].unsqueeze(dim=0))
print(data[0])
print(test)

tensor([[10.6950, 12.8740, 14.3286,  ..., 13.3531, 13.1292, 13.3986],
        [10.6160, 11.5502, 13.4209,  ..., 12.7291, 12.4604, 12.7560],
        [10.8344, 12.4670, 14.1264,  ..., 12.8233, 12.6111, 12.8932],
        ...,
        [10.9311, 12.2447, 14.0699,  ..., 13.1821, 13.0071, 13.3126],
        [11.0406, 13.0809, 14.3176,  ..., 12.8890, 12.6994, 12.9936],
        [11.7168, 12.8876, 14.1998,  ..., 13.4782, 13.2650, 13.5574]])
tensor([[[ 0.5432,  0.6623,  0.7756,  ..., -1.1332, -1.1173, -1.1283],
         [ 0.5444,  0.6090,  0.7285,  ..., -1.1999, -1.1211, -1.1314],
         [ 0.5586,  0.6387,  0.7681,  ..., -0.6725, -0.6080, -0.6547],
         ...,
         [ 0.5520,  0.6265,  0.7674,  ..., -0.7336, -0.6468, -0.6409],
         [ 0.5635,  0.6785,  0.7675,  ..., -0.6623, -0.6812, -0.6631],
         [ 0.6015,  0.6666,  0.7754,  ..., -0.6743, -0.7112, -0.6417]]])


  A_n[:,:num_subcarriers] = tensor[:,:num_subcarriers]
  A_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,2*num_subcarriers:3*num_subcarriers]
  theta_n[:,:num_subcarriers] = tensor[:,num_antennas*num_subcarriers:(num_antennas+1)*num_subcarriers]
  theta_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,(num_antennas+2)*num_subcarriers:(num_antennas+3)*num_subcarriers]


In [12]:
csi_ratio = cvt_to_csi_ratio(data)

  A_n[:,:num_subcarriers] = tensor[:,:num_subcarriers]
  A_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,2*num_subcarriers:3*num_subcarriers]
  theta_n[:,:num_subcarriers] = tensor[:,num_antennas*num_subcarriers:(num_antennas+1)*num_subcarriers]
  theta_d[:,2*num_subcarriers:3*num_subcarriers] = tensor[:,(num_antennas+2)*num_subcarriers:(num_antennas+3)*num_subcarriers]


unique_labels = sorted(set(labels))
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}
encoded_labels = [label_mapping[label] for label in labels]

print("Original Labels:", labels)
print("Encoded Labels:", encoded_labels)
print("Label Mapping:", label_mapping)

In [15]:
with h5py.File("../data/UT_HAR_OG/X.h5","w") as f:
    f.create_dataset("X", data=padded_data)
with h5py.File("../data/UT_HAR_OG/y.h5","w") as f:
    f.create_dataset("y", data=padded_labels)

In [16]:
with h5py.File("../data/UT_HAR_CSI_RATIO/X.h5","w") as f:
    f.create_dataset("X", data=csi_ratio)
with h5py.File("../data/UT_HAR_CSI_RATIO/y.h5","w") as f:
    f.create_dataset("y", data=padded_labels)