In [9]:
import os 
import scipy.io as sio
import scipy
import pickle
import pandas as pd

mat_dir_path = '/mnt/Data/engs2588/data/PhysioNet2017/raw/'  # Replace with your actual directory path
reference_file_path = '/mnt/Data/engs2588/data/PhysioNet2017/REFERENCE.csv'

# Read the reference CSV file
reference_df = pd.read_csv(reference_file_path, header=None)
reference_df.columns = ['Recording', 'Label']

# Initialize a dictionary to store the data
data_dict = {}

# List all .mat files in the directory that match the pattern A*****.mat and sort them
mat_files = sorted([f for f in os.listdir(mat_dir_path) if f.startswith('A') and f.endswith('.mat')])

# Read each .mat file and store the data in the dictionary along with the label
for mat_file in mat_files:
    file_path = os.path.join(mat_dir_path, mat_file)
    mat_data = scipy.io.loadmat(file_path)
    key = mat_file.split('.')[0]  # Remove the .mat extension to match with reference keys
    label = reference_df.loc[reference_df['Recording'] == key, 'Label'].values[0]
    data_dict[key] = {'data': mat_data['val'].flatten(), 'label': label}  # Flatten the data if needed




In [10]:
# Define the output .pkl file path
output_file_path = '../data/ECG.pkl'
# Save the data dictionary to a .pkl file
with open(output_file_path, 'wb') as pkl_file:
    pickle.dump(data_dict, pkl_file)


In [5]:
import pickle
import numpy as np
import os

# Paths
orig_path = "/mnt/Data/engs2588/project/MSc_ADH/data/ECG.pkl"
full_path = "/mnt/Data/engs2588/project/MSc_ADH/data/ECG_full.pkl"   # backup of original
subset_path = "/mnt/Data/engs2588/project/MSc_ADH/data/ECG.pkl"      # new shortened output

# 1. Rename original to ECG_full.pkl
if not os.path.exists(full_path):
    os.rename(orig_path, full_path)
    print(f"Renamed original ECG.pkl → ECG_full.pkl")
else:
    print("ECG_full.pkl exists — skipping rename.")

# ----------------------------
# 2. Load full dataset
# ----------------------------
with open(full_path, "rb") as f:
    data_full = pickle.load(f)

print("Total records in full dataset:", len(data_full))

# ----------------------------
# 3. Force all keys to be plain Python str
# ----------------------------
data_full = {str(k): v for k, v in data_full.items()}

# now keys are normal strings
keys = list(data_full.keys())
print("Sample keys (types):", [(k, type(k)) for k in keys[:5]])

# ----------------------------
# 4. Randomly select 20% of samples by index
# ----------------------------
np.random.seed(42)

n = len(keys)
sample_n = max(1, int(n * 0.2))

idx = np.arange(2000)
subset_keys = [keys[i] for i in idx]   # these are pure Python str

print("Sample Size:", len(subset_keys))
print(subset_keys[:100])   # show some keys, should print as plain 'A02500' etc.

subset_dict = {k: data_full[k] for k in subset_keys}

# ----------------------------
# 5. Save subset back to ECG.pkl
# ----------------------------
with open(subset_path, "wb") as f:
    pickle.dump(subset_dict, f)

print("Saved shortened dataset to:", subset_path)

ECG_full.pkl exists — skipping rename.
Total records in full dataset: 8528
Sample keys (types): [('A00001', <class 'str'>), ('A00002', <class 'str'>), ('A00003', <class 'str'>), ('A00004', <class 'str'>), ('A00005', <class 'str'>)]
Sample Size: 2000
['A00001', 'A00002', 'A00003', 'A00004', 'A00005', 'A00006', 'A00007', 'A00008', 'A00009', 'A00010', 'A00011', 'A00012', 'A00013', 'A00014', 'A00015', 'A00016', 'A00017', 'A00018', 'A00019', 'A00020', 'A00021', 'A00022', 'A00023', 'A00024', 'A00025', 'A00026', 'A00027', 'A00028', 'A00029', 'A00030', 'A00031', 'A00032', 'A00033', 'A00034', 'A00035', 'A00036', 'A00037', 'A00038', 'A00039', 'A00040', 'A00041', 'A00042', 'A00043', 'A00044', 'A00045', 'A00046', 'A00047', 'A00048', 'A00049', 'A00050', 'A00051', 'A00052', 'A00053', 'A00054', 'A00055', 'A00056', 'A00057', 'A00058', 'A00059', 'A00060', 'A00061', 'A00062', 'A00063', 'A00064', 'A00065', 'A00066', 'A00067', 'A00068', 'A00069', 'A00070', 'A00071', 'A00072', 'A00073', 'A00074', 'A00075',