In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [2]:
# Destination folder for the generated .npy matrices, and the source folder
# that is scanned for ECG CSV files.
numpy_output_path = "../Disease_dataset/Raw/Dataset8/NumpyData_backup/"
dataset_path = "../Disease_dataset/Raw/Dataset8/ECG/"

# Create the destination folder up front; an already existing folder is fine.
os.makedirs(numpy_output_path, exist_ok=True)

In [3]:
def create_matrix(x_axis, y_axis, filename, grid_size=30,
                  x_range=(400, 1400), y_range=(400, 1400)):
    """Rasterise paired (x, y) samples into a binary occupancy grid plus a label row.

    Every pair ``(x, y)`` lying inside the half-open window
    ``[x_min, x_max) x [y_min, y_max)`` sets one cell of a
    ``grid_size x grid_size`` matrix to 1 (row index from ``y``, column index
    from ``x``). Pairs outside the window, and pairs containing NaN, are
    ignored. One extra row is appended below the grid; its first element holds
    a label derived from ``filename`` and the rest of the row is zero.

    Parameters
    ----------
    x_axis, y_axis : array-like of numbers
        Sample coordinates, paired by position. If the lengths differ, the
        extra trailing elements of the longer input are ignored.
    filename : str
        Name used to pick the label: 0.1 if it contains "Resting",
        otherwise 0.5 if it contains "Working".
    grid_size : int, optional
        Number of cells along each axis of the grid (default 30).
    x_range, y_range : (min, max) tuple, optional
        Value window mapped onto the grid. The defaults (400, 1400) were
        tuned by the original author for the ECG data and should normally
        be left unchanged.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(grid_size + 1, grid_size)``: the occupancy
        grid followed by the label row.

    Raises
    ------
    ValueError
        If ``filename`` contains neither "Resting" nor "Working", if
        ``grid_size`` is smaller than 1, or if a range is not increasing.
    """
    # Resolve the label first so that an unrecognised filename fails with a
    # clear message instead of an UnboundLocalError further down.
    if "Resting" in filename:
        label_value = 0.1
    elif "Working" in filename:
        label_value = 0.5
    else:
        raise ValueError(
            f"Cannot derive a label from {filename!r}: "
            "expected 'Resting' or 'Working' in the filename"
        )

    x_min, x_max = x_range
    y_min, y_max = y_range
    if grid_size < 1:
        raise ValueError("grid_size must be at least 1")
    if not (x_min < x_max and y_min < y_max):
        raise ValueError("x_range and y_range must each satisfy min < max")

    x_values = np.asarray(x_axis, dtype=float)
    y_values = np.asarray(y_axis, dtype=float)

    # Pair samples by position, stopping at the shorter input.
    n_pairs = min(len(x_values), len(y_values))
    x_values = x_values[:n_pairs]
    y_values = y_values[:n_pairs]

    # Width and height of a single grid cell.
    x_step = (x_max - x_min) / grid_size
    y_step = (y_max - y_min) / grid_size

    # Keep only the pairs that fall inside the window (NaN compares False).
    in_window = (
        (x_values >= x_min) & (x_values < x_max)
        & (y_values >= y_min) & (y_values < y_max)
    )
    x_idx = ((x_values[in_window] - x_min) / x_step).astype(int)
    y_idx = ((y_values[in_window] - y_min) / y_step).astype(int)

    # Guard against float round-off pushing a value just below the upper
    # bound into a non-existent cell (or into the label row).
    x_idx = np.minimum(x_idx, grid_size - 1)
    y_idx = np.minimum(y_idx, grid_size - 1)

    # Rows 0..grid_size-1 are the occupancy grid; row grid_size is the label row.
    updated_array = np.zeros((grid_size + 1, grid_size), dtype=float)
    updated_array[y_idx, x_idx] = 1
    updated_array[grid_size, 0] = label_value
    return updated_array

In [4]:
# Turn every CSV recording in the dataset folder into a saved .npy matrix.
for filename in os.listdir(dataset_path):
    # Skip anything that is not a CSV file.
    if not filename.endswith('.csv'):
        continue

    print(f'Found CSV file: {filename}')
    file_path = os.path.join(dataset_path, filename)

    # Load the recording and keep only rows with "Time" up to 120
    # (presumably seconds, given the ms conversion below).
    ecg_data = pd.read_csv(file_path)
    ecg_data = ecg_data.loc[ecg_data["Time"] <= 120]

    # Rows flagged with Peak == 3 are used as R peaks; the differences between
    # their consecutive timestamps are the RR intervals.
    r_peaks = ecg_data.loc[ecg_data['Peak'] == 3]
    rr_intervals = r_peaks['Time'].diff().dropna().reset_index(drop=True)
    rr_intervals_ms = rr_intervals * 1000  # seconds -> milliseconds

    # Pair each interval with its successor: RR(n) on x, RR(n+1) on y.
    rr_n_ms = rr_intervals_ms.iloc[:-1]
    rr_n1_ms = rr_intervals_ms.iloc[1:]

    # Build the matrix and store it next to the other outputs as <name>.npy.
    input_data = create_matrix(rr_n_ms, rr_n1_ms, filename)
    np.save(numpy_output_path + filename.replace(".csv", ".npy"), input_data)

Found CSV file: 2024-11-05T18:29:50.354026700Z_0_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:26:31.753095900Z_0_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:27:59.489500100Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:30:01.667978800Z_0_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:24:57.114209900Z_0_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:28:00.869986300Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:29:52.889569800Z_0_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:29:58.811214400Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:29:15.715139500Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:27:39.500363500Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:28:00.187908500Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:26:17.889051600Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:29:30.677640500Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:28:41.500734900Z_1_Resting-Overlap.csv
Found CSV file: 2024-11-05T18:27:5

In [3]:
# Load one saved .npy array from disk and print its contents for inspection.
sample_path = "../Disease_dataset/Ex2_Dataset4_1050/NumpyData/2024-11-02T09:33:10.820Z_0_Resting-Normal.npy"
loaded_array = np.load(sample_path)

print(loaded_array)

[[0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0. ]
 [0.  0.

In [32]:
# NOTE: everything in this cell is commented out, so running it does nothing.
# It is a one-off random train/val split that MOVES files into train/ and val/
# subfolders. The "Moved ... files" output displayed beneath this cell appears
# to come from an earlier run when the code was still active.

# import os
# import random
# import shutil

# # Specify the folder path and the desired ratio
# folder_path = './Dataset3_Official/NumpyData_3031/'
# train_ratio = 0.8  # Example: 80% for training, 20% for validation

# # Define paths for train and val folders
# train_folder = os.path.join(folder_path, 'train')
# val_folder = os.path.join(folder_path, 'val')
# os.makedirs(train_folder, exist_ok=True)
# os.makedirs(val_folder, exist_ok=True)

# # Get all files in the folder
# all_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

# # Shuffle files randomly
# random.shuffle(all_files)

# # Calculate split index
# train_size = int(len(all_files) * train_ratio)

# # Split files into train and val
# train_files = all_files[:train_size]
# val_files = all_files[train_size:]

# # Move files to respective folders
# for file_name in train_files:
#     shutil.move(os.path.join(folder_path, file_name), os.path.join(train_folder, file_name))

# for file_name in val_files:
#     shutil.move(os.path.join(folder_path, file_name), os.path.join(val_folder, file_name))

# print(f"Moved {len(train_files)} files to {train_folder}")
# print(f"Moved {len(val_files)} files to {val_folder}")


Moved 2400 files to ./Dataset3_Official/NumpyData_3031/train
Moved 600 files to ./Dataset3_Official/NumpyData_3031/val


In [38]:
import os

def count_duplicate_files(folder1, folder2):
    """Count directory entries whose names occur in both folders.

    Entries are compared by name only (file contents are not inspected), and
    every entry returned by ``os.listdir`` is considered, so sub-directories
    are included as well.

    Prints three lines: the number of entries in ``folder2``, the number of
    entries in ``folder1`` (in that order), and a summary line with the
    number of shared names.

    Parameters
    ----------
    folder1, folder2 : str or path-like
        Directories to compare.

    Returns
    -------
    int
        Number of entry names present in both folders.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when a folder cannot be listed.
    """
    # List each directory exactly once and reuse the resulting name sets.
    files_in_folder1 = set(os.listdir(folder1))
    files_in_folder2 = set(os.listdir(folder2))

    # Entry counts; folder2 is reported before folder1.
    print(len(files_in_folder2))
    print(len(files_in_folder1))

    # Names present in both folders.
    duplicate_count = len(files_in_folder1 & files_in_folder2)

    print(f"Number of duplicate files: {duplicate_count}")
    return duplicate_count

# Compare the two backup folders by file name; the returned count is the
# cell's displayed result.
count_duplicate_files(
    "../Disease_dataset/Raw/Dataset1_Official/NumpyData_backup/",
    "../Disease_dataset/Raw/Dataset3_Official/NumpyData_backup/",
)


3000
1500
Number of duplicate files: 1500


1500

In [None]:
# Load one saved .npy array from disk and print its contents for inspection.
sample_path = "../Disease_dataset/Ex2_Dataset4_1050/NumpyData/2024-11-02T14:57:02.915Z_0_Resting-Overlap.npy"
loaded_array = np.load(sample_path)

print(loaded_array)