Importing



In [26]:
import pandas as pd
import numpy as np
from scipy.stats import zscore
import matplotlib.pyplot as plt

# Analysis options (1 = enabled, 0 = not used)
time_normalise = 1
train_on_healthy = 1

# Load the space-separated data file: no header row, the first two lines are
# skipped, and the literal text 'NaN' is read as a missing value.
data_path = 'combined_data2.data'
data = pd.read_csv(
    data_path,
    sep=' ',
    header=None,
    skiprows=2,
    na_values='NaN',
)

In [None]:
# Names of the 16 measurement columns, in file order.
# NOTE(review): "Ventialtion / VCO2" and "Expriratory Time [s]" look like typos.
# They are left untouched because these strings are the column names (and plot
# labels) that other code may look up verbatim.
compnames = [
    "Time",
    "Intensity [Watts]",
    "Heart Rate [bpm]",
    "Ventilation [l/min]",
    "VO2 [ml/min]",
    "VCO2 [ml/min]",
    "PetCO2 [mmHg]",
    "PetO2 [mmHg]",
    "VO2/kg [ml/min/kg]",
    "Ventilation / VO2",
    "Ventialtion / VCO2",
    "Respiratory Quotient",
    "Tidal Volume [l]",
    "Breathing Frequency [bpm]",
    "Inspiratory Time [s]",
    "Expriratory Time [s]",
]

# The first two file columns are named 'labels' and 'phase'; the measurement
# columns follow. Pandas raises ValueError if the column count does not match.
data.columns = ['labels', 'phase'] + compnames

# Distinct labels and phase values, in order of first appearance
ID = data['labels'].unique()
phase = data['phase'].unique()

# Find start and end of each trial from breath time
def get_start_end_indices(data, ID):
    """Return the positional start and exclusive end row of every trial.

    A trial starts on the first row where a value of the ``labels`` column
    appears and ends where the next trial starts; the last trial ends at
    ``len(data)``. The result is only meaningful when all rows of one label
    are contiguous.

    The values are row *positions* (usable with ``.iloc`` or NumPy slicing),
    so they stay correct when the frame's index is not ``0..n-1``, for
    example after rows have been dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``labels`` column.
    ID : any
        Not used; kept so that existing calls keep working.

    Returns
    -------
    tuple[list[int], list[int]]
        ``(start_test, end_test)`` with one entry per distinct label.
    """
    labels = data['labels']
    # Rows with a missing label start no trial, as with groupby's default.
    first_occurrence = labels.notna() & ~labels.duplicated()
    start_test = np.flatnonzero(first_occurrence.to_numpy()).tolist()
    end_test = start_test[1:] + [len(data)]
    return start_test, end_test

start_test, end_test = get_start_end_indices(data, ID)

# Calculate breath time, minimum and maximum breath times
breath_time = []
MIN = []
MAX = []

# The threshold line inside the loop is commented out until we have checked
# whether it is needed for this dataset.
time_values = data['Time'].to_numpy()
for i in range(len(ID)):
    start = start_test[i]
    end = end_test[i]
    # Difference between consecutive time stamps within the trial;
    # prepend=0 makes the first entry equal to the first time stamp.
    bt = np.diff(time_values[start:end], prepend=0)
    #bt[bt < -100] = np.nan
    breath_time.append(bt)
    MIN.append(np.nanmin(bt))
    MAX.append(np.nanmax(bt))

# breath_time stays a list of per-trial arrays: trials can differ in length,
# and NumPy refuses to build a rectangular array from ragged input.
MIN = np.array(MIN)
MAX = np.array(MAX)

# Replace time column with breath_time (the original values are kept in breath_data)
breath_data = data['Time'].to_numpy()
data['Time'] = np.concatenate(breath_time)

# Finding trial phases for each participant
# NOTE(review): the exercise phase is taken to be the second distinct value of
# the 'phase' column - confirm against the data file.
exercise_label = phase[1]
phase_values = data['phase'].to_numpy()

start_exercise = []
end_exercise = []

for i in range(len(ID)):
    start = start_test[i]
    end = end_test[i]
    # Offsets of the exercise rows relative to the start of this trial.
    exercise_rows = np.flatnonzero(phase_values[start:end] == exercise_label)
    if exercise_rows.size == 0:
        raise ValueError(
            f"Trial {ID[i]!r} contains no rows in phase {exercise_label!r}"
        )
    # Absolute positions; the end is exclusive so .iloc[start:end] keeps the
    # last exercise row.
    start_exercise.append(start + int(exercise_rows[0]))
    end_exercise.append(start + int(exercise_rows[-1]) + 1)


# Creating a struct for the exercise part of the trials
exercise_frames = [data.iloc[start_exercise[i]:end_exercise[i]] for i in range(len(ID))]
exercise_cell = [frame.to_numpy() for frame in exercise_frames]
labels_exercise = [np.repeat(ID[i], len(exercise_cell[i])) for i in range(len(ID))]

allLabels = np.concatenate(labels_exercise)

# Keep the named columns (including 'labels'); later code addresses them by name.
data_exercise = pd.concat(exercise_frames, ignore_index=True)

# NOTE(review): hard-coded row number - verify that it still refers to the row
# that was meant to be removed. The index is rebuilt afterwards so that row
# labels and row positions stay identical.
data_exercise = data_exercise.drop(index=22475).reset_index(drop=True)

start_test, end_test = get_start_end_indices(data_exercise, ID)

# Correct column 6 for changes in unit
### only use if necessary
#thres = data_exercise.loc[data_exercise['VCO2 [ml/min]'] < 5].index[0]
#data_exercise.loc[thres:, 'VCO2 [ml/min]'] *= 1000


NOT CHECKED YET: the code in the following cell has not been reviewed or verified.


In [None]:

# Cleaning outliers: values more than 3 standard deviations away from their
# column mean are replaced by NaN.
# NOTE(review): the columns are picked by position (3..15) - confirm that these
# are the intended measurement columns.
for col in data_exercise.columns[3:16]:
    values = data_exercise[col]
    # zscore gets a plain float array so the result type does not depend on the
    # SciPy version; nan_policy='omit' keeps one NaN from turning every z-score
    # of the column into NaN (which would disable the mask).
    z = np.abs(zscore(values.to_numpy(dtype=float), nan_policy='omit'))
    data_exercise[col] = values.mask(z > 3)

# Changes in watts
### individualize
# The intensity column is addressed by name; column position 1 holds the phase.
watts_values = data_exercise['Intensity [Watts]'].to_numpy()
dif_me = [np.diff(watts_values[start_test[i]:end_test[i]]) for i in range(len(ID))]

# Group membership is read from the labels ("Covid" in the label) instead of
# hard-coded trial numbers, so no trial falls between the two groups.
is_covid = ["Covid" in str(label) for label in ID]
dif_covid = [dif for dif, covid in zip(dif_me, is_covid) if covid]
dif_patient = [dif for dif, covid in zip(dif_me, is_covid) if not covid]

# Plot changes in watts for Covid and Control groups
plt.figure()
for dif in dif_covid:
    plt.plot(dif)
plt.title("Changes in Watts for Covid Group")
plt.show()

plt.figure()
for dif in dif_patient:
    plt.plot(dif)
plt.title("Changes in Watts for Control Group")
plt.show()

# Normalizing data based on the highest watts
watts_column = data_exercise['Intensity [Watts]']
highest_watts = watts_column.max()
# Per-trial factor: overall maximum intensity / maximum intensity of the trial
ld = [highest_watts / watts_column.iloc[start_test[i]:end_test[i]].max() for i in range(len(ID))]

# Only the measurement columns are scaled; the label and phase columns are not
# numeric and cannot be multiplied.
# NOTE(review): as before, every measurement column (including Time) is
# multiplied by the trial's factor - confirm that this is the intended
# normalisation.
measurements = data_exercise[compnames].to_numpy(dtype=float)
data_normalized = [measurements[start_test[i]:end_test[i]] * ld[i] for i in range(len(ID))]

# Creating the struct filled with NaN-s up to the longest trial
# At least 101 rows per trial as before; more if a trial is longer, so that no
# trial overflows its buffer.
rows_per_trial = max([101] + [len(trial_data) for trial_data in data_normalized])
data_cell = []
for trial_data in data_normalized:
    trial_matrix = np.full((rows_per_trial, len(compnames)), np.nan)
    trial_matrix[:len(trial_data)] = trial_data
    data_cell.append(trial_matrix)

# One named column per measurement plus the trial label of every (padded) row
data_longest = pd.DataFrame(np.vstack(data_cell), columns=compnames)
data_longest['labels'] = np.repeat(ID, rows_per_trial)

# Cleaning outliers in data_longest: values more than 3 standard deviations
# away from their column mean are replaced by NaN.
# NOTE(review): the columns are picked by position (3..15) - confirm that these
# are the intended measurement columns.
for col in data_longest.columns[3:16]:
    values = data_longest[col]
    # zscore gets a plain float array so the result type does not depend on the
    # SciPy version; nan_policy='omit' is required because the NaN padding rows
    # would otherwise turn every z-score of the column into NaN.
    z = np.abs(zscore(values.to_numpy(dtype=float), nan_policy='omit'))
    data_longest[col] = values.mask(z > 3)

# Construct breath number index: 1..n for every trial, where n is the number of
# rows between the trial's start and its exclusive end in start_test/end_test.
breath_nr = np.concatenate([np.arange(1, end_test[i] - start_test[i] + 1) for i in range(len(ID))])

# Keep the intensity separately, then remove Time and Intensity from the frame
WATTS = data_longest['Intensity [Watts]']
data_longest = data_longest.drop(columns=['Time', 'Intensity [Watts]'])

# Number of rows whose label contains "Covid"
datapoints_covid = int(data_longest['labels'].str.contains("Covid").sum())

# Find the start and end row of each trial in data_longest
start_test, end_test = get_start_end_indices(data_longest, ID)

# Plots with Watts-normalized data
# Only the numeric signal columns are averaged; the string 'labels' column
# cannot go through nanmean/nanstd.
signal_columns = [col for col in data_longest.columns if col != 'labels']
signals = data_longest[signal_columns].to_numpy(dtype=float)

# Group membership is read from the labels ("Covid" in the label) instead of
# hard-coded trial numbers, so no trial falls between the two groups.
covid_trials = [i for i, label in enumerate(ID) if "Covid" in str(label)]
control_trials = [i for i, label in enumerate(ID) if "Covid" not in str(label)]

covid_cell = [signals[start_test[i]:end_test[i]] for i in covid_trials]
control_cell = [signals[start_test[i]:end_test[i]] for i in control_trials]

# Stack the trials along a third axis: (row within trial, signal, trial)
covid_mat = np.stack(covid_cell, axis=2)
control_mat = np.stack(control_cell, axis=2)

av_covid = np.nanmean(covid_mat, axis=2)
sd_covid = np.nanstd(covid_mat, axis=2)

av_control = np.nanmean(control_mat, axis=2)
sd_control = np.nanstd(control_mat, axis=2)

# One figure per signal; the y-label is the name of the column actually
# plotted, so it stays correct after columns were dropped from data_longest.
for i, signal_name in enumerate(signal_columns):
    plt.figure()
    plt.plot(av_covid[:, i], 'b')
    plt.plot(av_control[:, i], 'r')
    plt.xlim([0, 101])
    plt.legend(["Post-Covid Group", "Control Group"], loc='lower center')
    plt.ylabel(signal_name)
    plt.xlabel("Percentage of trial [%]")
    plt.show()

# Prepare the SOM
# With train_on_healthy enabled, the rows from datapoints_covid onwards are the
# training set and the rows before it are the test set.
# NOTE(review): this assumes all Covid rows come first in data_longest - confirm.
if train_on_healthy == 1:
    covid_rows = slice(None, datapoints_covid)
    healthy_rows = slice(datapoints_covid, None)

    data_covid = data_longest.iloc[covid_rows]
    data_healthy = data_longest.iloc[healthy_rows]

    sD_train, sD_test = data_healthy, data_covid
    WATTS_train, WATTS_test = WATTS[healthy_rows], WATTS[covid_rows]
