###Fetching Dataset and Basic Preprocessing

In [1]:
from google.colab import drive
drive.mount('/content/drive/')
%cd "/content/drive/MyDrive/Colab Notebooks/OnCampus/LongCovid/regression/Data"

Mounted at /content/drive/
/content/drive/MyDrive/Colab Notebooks/OnCampus/LongCovid/regression/Data


In [25]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score
from sklearn.preprocessing import label_binarize
import numpy as np
import pandas as pd

In [3]:
f_path = "HRV_features_30seconds_HRVDelta.xlsx"
og_data = pd.read_excel(f_path, header=0)
og_data.head()

Unnamed: 0,Block,fatigue_rating,window_num_30,SubjID,Age,HRV_MeanNN,HRV_SDNN,HRV_SDANN1,HRV_SDNNI1,HRV_SDANN2,...,Delta_HRV_SampEn,Delta_HRV_ShanEn,Delta_HRV_FuzzyEn,Delta_HRV_MSEn,Delta_HRV_CMSEn,Delta_HRV_RCMSEn,Delta_HRV_CD,Delta_HRV_HFD,Delta_HRV_KFD,Delta_HRV_LZC
0,0,73,0,Cov4,38,696.800595,117.336517,,,,...,1.632571,2.422132,-2.720973,-0.847401,-0.139904,0.274653,0.912377,0.164625,2.613554,-3.341088
1,0,73,1,Cov4,38,725.609756,126.362148,,,,...,1.632571,2.422132,-2.720973,-0.847401,-0.139904,0.274653,0.912377,0.164625,2.613554,-3.341088
2,0,73,2,Cov4,38,801.649306,111.729719,,,,...,1.632571,2.422132,-2.720973,-0.847401,-0.139904,0.274653,0.912377,0.164625,2.613554,-3.341088
3,0,73,3,Cov4,38,764.802632,66.68301,,,,...,1.632571,2.422132,-2.720973,-0.847401,-0.139904,0.274653,0.912377,0.164625,2.613554,-3.341088
4,0,73,4,Cov4,38,711.509146,106.725588,,,,...,1.632571,2.422132,-2.720973,-0.847401,-0.139904,0.274653,0.912377,0.164625,2.613554,-3.341088


In [4]:
og_data['SubjID'] = og_data['SubjID'].str.replace('Cov', '')
og_data['SubjID'] = og_data['SubjID'].astype(int)

In [5]:
og_data = og_data.dropna(axis=1, how='all')
og_data = og_data[~og_data.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

In [6]:
og_data['Window_ID'] = og_data['SubjID'].map(str) + '_' + og_data['Block'].map(str) + '_' + og_data['window_num_30'].map(str)
og_data['Block_ID'] = og_data['SubjID'].map(str) + '_' + og_data['Block'].map(str)

In [7]:
block_zero = og_data['Block'] == 0
data = og_data[~block_zero]

###Decision Tree after Dropping SubjID

In [34]:
y = data['Change_from_Baseline']
X = data.drop(['fatigue_rating', 'Change_from_Baseline','Delta_fatigue'], axis=1)
X = X.drop('SubjID', axis=1)

# Define the indexes of blocks in your data based on blockid
blocks_indexes = [list(og_data[og_data['Block_ID'] == block_id].index) for block_id in og_data['Block_ID'].unique()]

In [26]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the decision tree classifier
clf = DecisionTreeRegressor(random_state=42)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)

Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0


###Parameter tuning and validation

In [21]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid to search
param_grid = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize the decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

# Perform grid search
grid_search.fit(X_train, y_train)

# Get the best parameters and the best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

# Retrain the model with the best parameters
best_clf = grid_search.best_estimator_
best_clf.fit(X_train, y_train)

# Make predictions on the testing data using the best model
y_pred = best_clf.predict(X_test)

# Calculate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate the RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

# Calculate the F1 score
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score:", f1)



Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Best Score: 0.9739541160593792
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0


Leave one subject out validation

In [22]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import accuracy_score

# Define features and target
sub_ids = og_data['SubjID'].unique()
print(sub_ids)
# Initialize Decision Tree classifier
clf = DecisionTreeClassifier()

for sub in sub_ids:
  print(sub)
  df = og_data[og_data['SubjID'] != sub]
  print(len(df))
  y = df['fatigue_rating']
  X = df.drop('fatigue_rating', axis=1)
  X = X.drop('SubjID', axis=1)
  # Split the dataset into training and testing sets
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Initialize the decision tree classifier
  clf = DecisionTreeClassifier(random_state=42)

  # Train the classifier on the training data
  clf.fit(X_train, y_train)

  # Make predictions on the testing data
  y_pred = clf.predict(X_test)
  # Calculate the accuracy of the classifier
  accuracy = accuracy_score(y_test, y_pred)
  print("Accuracy:", accuracy)

  # Calculate the RMSE (Root Mean Squared Error)
  rmse = np.sqrt(mean_squared_error(y_test, y_pred))
  print("RMSE:", rmse)

  # Calculate the F1 score
  f1 = f1_score(y_test, y_pred, average='weighted')
  print("F1 Score:", f1)


[ 4  5  7  8 13 14 19 20 23]
4
256
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
5
261
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
7
287
Accuracy: 0.9655172413793104
RMSE: 8.886254398866786
F1 Score: 0.9724137931034482
8
260
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
13
288
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
14
265
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
19
247
Accuracy: 1.0
RMSE: 0.0
F1 Score: 1.0
20
280
Accuracy: 0.9642857142857143
RMSE: 5.264435935053794
F1 Score: 0.9642857142857144
23
256
Accuracy: 0.9615384615384616
RMSE: 4.706787243316417
F1 Score: 0.9615384615384615


In [43]:
blocks_indexes = [list(X_test[X_test['Block_ID'] == block_id].index) for block_id in X_test['Block_ID'].unique()]
# Predict on test set
window_predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, window_predictions)
print("Accuracy:", accuracy)

# Create block_mapping_test
# For X_test
X_block_data_test = {}
for block_id, indexes in enumerate(blocks_indexes):
    X_block_data_test[block_id] = X_test.loc[indexes]

# For y_test
y_block_data_test = {}
for block_id, indexes in enumerate(blocks_indexes):
    y_block_data_test[block_id] = y_test.loc[indexes]

# For y_pred
y_block_data_pred = {}
for block_id, indexes in enumerate(blocks_indexes):
    y_block_data_pred[block_id] = y_test.loc[indexes]

# Aggregate window predictions to block level
block_predictions = {}
for block_id in y_block_data_test:
  print(block_id)
  accuracy = accuracy_score(y_block_data_test[block_id], y_block_data_pred[block_id])
  print("Accuracy:", accuracy)
  mse = mean_squared_error(y_block_data_test[block_id], y_block_data_pred[block_id])
  print("MSE:", mse)

Accuracy: 1.0
0
Accuracy: 1.0
MSE: 0.0
1
Accuracy: 1.0
MSE: 0.0
2
Accuracy: 1.0
MSE: 0.0
3
Accuracy: 1.0
MSE: 0.0
4
Accuracy: 1.0
MSE: 0.0
5
Accuracy: 1.0
MSE: 0.0
6
Accuracy: 1.0
MSE: 0.0
7
Accuracy: 1.0
MSE: 0.0
8
Accuracy: 1.0
MSE: 0.0
9
Accuracy: 1.0
MSE: 0.0
10
Accuracy: 1.0
MSE: 0.0
11
Accuracy: 1.0
MSE: 0.0
12
Accuracy: 1.0
MSE: 0.0
13
Accuracy: 1.0
MSE: 0.0
14
Accuracy: 1.0
MSE: 0.0
15
Accuracy: 1.0
MSE: 0.0
16
Accuracy: 1.0
MSE: 0.0
17
Accuracy: 1.0
MSE: 0.0
18
Accuracy: 1.0
MSE: 0.0
19
Accuracy: 1.0
MSE: 0.0
20
Accuracy: 1.0
MSE: 0.0
21
Accuracy: 1.0
MSE: 0.0
22
Accuracy: 1.0
MSE: 0.0
23
Accuracy: 1.0
MSE: 0.0
24
Accuracy: 1.0
MSE: 0.0
25
Accuracy: 1.0
MSE: 0.0
26
Accuracy: 1.0
MSE: 0.0
27
Accuracy: 1.0
MSE: 0.0


###Testing on Block Level

In [10]:
# Create block_mapping_train
block_mapping_train = {}
for index, row in X_train.iterrows():  # Assuming X_train is your feature DataFrame
    window_id = row['Window_ID']
    block_id = row['Block_ID']
    block_mapping_train[window_id] = block_id

# Create block_mapping_test
block_mapping_test = {}
for index, row in X_test.iterrows():  # Assuming X_test is your feature DataFrame
    window_id = row['Window_ID']
    block_id = row['Block_ID']
    block_mapping_test[window_id] = block_id

# Group the testing dataset by 'Block' and aggregate the 'Target' variable
testing_ids = X_test['Window_ID'].unique()
test_data = data[data['Window_ID'].isin(testing_ids)]
actual_block_targets_test = test_data.groupby('Window_ID')['fatigue_rating'].mean().to_dict()

# Remove identifiers from data
X_test = X_test.drop(columns=['Window_ID'])
X_test = X_test.drop(columns=['Block_ID'])
X_train = X_train.drop(columns=['Window_ID'])
X_train = X_train.drop(columns=['Block_ID'])

Mean

In [11]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Sample dataset
# Assuming X_train, y_train, X_test, y_test are already prepared
# X_train and X_test should contain window-level features
# y_train and y_test should contain window-level target variable
# block_mapping_train and block_mapping_test should contain mapping between windows and blocks

# Train Decision Tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Predict on test set
window_predictions = model.predict(X_test)

# Aggregate window predictions to block level
block_predictions_mean = {}
for i, (window_pred, block_id) in enumerate(zip(window_predictions, block_mapping_test)):
    if block_id not in block_predictions_mean:
        block_predictions_mean[block_id] = []
    block_predictions_mean[block_id].append(window_pred)

# Average window predictions within each block
for block_id, window_preds in block_predictions_mean.items():
    block_predictions_mean[block_id] = np.mean(window_preds)

# Now block_predictions contains predictions for each block
print("Predictions at the block level:", block_predictions_mean)

Predictions at the block level: {'4_5_1': 13.0, '4_2_7': 9.0, '19_2_1': 15.0, '23_4_2': 19.0, '23_5_2': 17.0, '19_5_0': 28.0, '23_1_9': 5.0, '19_1_8': 8.0, '4_3_3': 13.0, '13_6_4': 14.0, '5_3_5': 4.0, '23_1_1': 5.0, '23_3_6': 19.0, '19_4_3': 32.0, '8_1_2': -11.0, '19_3_3': 18.0, '23_4_8': 19.0, '4_4_0': 11.0, '5_6_7': 15.0, '14_2_7': 13.0, '4_6_6': 11.0, '4_4_5': 14.0, '8_5_3': 19.0, '4_6_3': 11.0, '19_6_4': 16.0, '19_3_6': 18.0, '4_3_4': 13.0, '7_3_0': 19.0, '8_3_0': 15.0, '19_2_4': 15.0, '13_6_0': 14.0, '23_2_1': 3.0, '4_5_2': 13.0, '5_5_7': 12.0, '4_4_4': 11.0, '14_2_4': 13.0, '8_3_3': 15.0, '20_2_4': -2.0, '14_5_2': 27.0, '23_5_4': 17.0, '7_6_0': 57.0, '14_1_2': 4.0, '13_6_3': 14.0, '4_4_1': 11.0, '20_2_3': -2.0, '7_1_0': 54.0, '19_4_0': 32.0, '5_2_5': 3.0, '19_5_6': 28.0}


In [12]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Convert block-level predictions and actuals to arrays
block_preds_array_mean = np.array(list(block_predictions_mean.values()))
actual_block_targets_array_mean = np.array(list(actual_block_targets_test.values()))

# Calculate Mean Squared Error (MSE) for regression tasks
mse = mean_squared_error(actual_block_targets_array_mean, block_preds_array_mean)
print("Mean Squared Error (MSE):", mse)

# Calculate Accuracy for classification tasks
accuracy = accuracy_score(actual_block_targets_array_mean, block_preds_array_mean)
print("Accuracy:", accuracy)


Mean Squared Error (MSE): 3069.061224489796
Accuracy: 0.02040816326530612


Mode

In [13]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy import stats

# Sample dataset
# Assuming X_train, y_train, X_test, y_test are already prepared
# X_train and X_test should contain window-level features
# y_train and y_test should contain window-level target variable
# block_mapping_train and block_mapping_test should contain mapping between windows and blocks

# Train Decision Tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Predict on test set
window_predictions = model.predict(X_test)

# Aggregate window predictions to block level
block_predictions_mode = {}
for i, (window_pred, block_id) in enumerate(zip(window_predictions, block_mapping_test)):
    if block_id not in block_predictions_mode:
        block_predictions_mode[block_id] = []
    block_predictions_mode[block_id].append(window_pred)

# Average window predictions within each block
for block_id, window_preds in block_predictions_mode.items():
    block_predictions_mode[block_id] = stats.mode(window_preds)[0]

# Now block_predictions contains predictions for each block
print("Predictions at the block level:", block_predictions_mode)

Predictions at the block level: {'4_5_1': 13.0, '4_2_7': 9.0, '19_2_1': 15.0, '23_4_2': 19.0, '23_5_2': 17.0, '19_5_0': 28.0, '23_1_9': 5.0, '19_1_8': 8.0, '4_3_3': 13.0, '13_6_4': 14.0, '5_3_5': 4.0, '23_1_1': 5.0, '23_3_6': 19.0, '19_4_3': 32.0, '8_1_2': -11.0, '19_3_3': 18.0, '23_4_8': 19.0, '4_4_0': 11.0, '5_6_7': 15.0, '14_2_7': 13.0, '4_6_6': 11.0, '4_4_5': 11.0, '8_5_3': 19.0, '4_6_3': 11.0, '19_6_4': 16.0, '19_3_6': 18.0, '4_3_4': 13.0, '7_3_0': 19.0, '8_3_0': 15.0, '19_2_4': 15.0, '13_6_0': 14.0, '23_2_1': 3.0, '4_5_2': 13.0, '5_5_7': 12.0, '4_4_4': 11.0, '14_2_4': 13.0, '8_3_3': 15.0, '20_2_4': -2.0, '14_5_2': 27.0, '23_5_4': 17.0, '7_6_0': 57.0, '14_1_2': 4.0, '13_6_3': 14.0, '4_4_1': 11.0, '20_2_3': -2.0, '7_1_0': 54.0, '19_4_0': 32.0, '5_2_5': 2.0, '19_5_6': 28.0}


In [14]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Convert block-level predictions and actuals to arrays
block_preds_array_mode = np.array(list(block_predictions_mode.values()))
actual_block_targets_array_mode = np.array(list(actual_block_targets_test.values()))

# Calculate Mean Squared Error (MSE) for regression tasks
mse = mean_squared_error(actual_block_targets_array_mode, block_preds_array_mode)
print("Mean Squared Error (MSE):", mse)

# Calculate Accuracy for classification tasks
accuracy = accuracy_score(actual_block_targets_array_mode, block_preds_array_mode)
print("Accuracy:", accuracy)


Mean Squared Error (MSE): 3075.061224489796
Accuracy: 0.02040816326530612


Median

In [15]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy import stats

# Sample dataset
# Assuming X_train, y_train, X_test, y_test are already prepared
# X_train and X_test should contain window-level features
# y_train and y_test should contain window-level target variable
# block_mapping_train and block_mapping_test should contain mapping between windows and blocks

# Train Decision Tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Predict on test set
window_predictions = model.predict(X_test)

# Aggregate window predictions to block level
block_predictions_median = {}
for i, (window_pred, block_id) in enumerate(zip(window_predictions, block_mapping_test)):
    if block_id not in block_predictions_median:
        block_predictions_median[block_id] = []
    block_predictions_median[block_id].append(window_pred)

# Average window predictions within each block
for block_id, window_preds in block_predictions_median.items():
    block_predictions_median[block_id] = np.median(window_preds)

# Now block_predictions contains predictions for each block
print("Predictions at the block level:", block_predictions_median)

Predictions at the block level: {'4_5_1': 13.0, '4_2_7': 9.0, '19_2_1': 15.0, '23_4_2': 19.0, '23_5_2': 17.0, '19_5_0': 28.0, '23_1_9': 5.0, '19_1_8': 8.0, '4_3_3': 13.0, '13_6_4': 14.0, '5_3_5': 4.0, '23_1_1': 5.0, '23_3_6': 19.0, '19_4_3': 32.0, '8_1_2': -11.0, '19_3_3': 18.0, '23_4_8': 19.0, '4_4_0': 11.0, '5_6_7': 15.0, '14_2_7': 13.0, '4_6_6': 11.0, '4_4_5': 11.0, '8_5_3': 19.0, '4_6_3': 11.0, '19_6_4': 16.0, '19_3_6': 18.0, '4_3_4': 13.0, '7_3_0': 14.0, '8_3_0': 15.0, '19_2_4': 15.0, '13_6_0': 14.0, '23_2_1': 3.0, '4_5_2': 13.0, '5_5_7': 12.0, '4_4_4': 11.0, '14_2_4': 13.0, '8_3_3': 19.0, '20_2_4': -2.0, '14_5_2': 27.0, '23_5_4': 17.0, '7_6_0': 57.0, '14_1_2': 4.0, '13_6_3': 14.0, '4_4_1': 11.0, '20_2_3': -2.0, '7_1_0': 54.0, '19_4_0': 32.0, '5_2_5': 2.0, '19_5_6': 28.0}


In [16]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Convert block-level predictions and actuals to arrays
block_preds_array_median = np.array(list(block_predictions_median.values()))
actual_block_targets_array_median = np.array(list(actual_block_targets_test.values()))

# Calculate Mean Squared Error (MSE) for regression tasks
mse = mean_squared_error(actual_block_targets_array_median, block_preds_array_median)
print("Mean Squared Error (MSE):", mse)

# Calculate Accuracy for classification tasks
accuracy = accuracy_score(actual_block_targets_array_median, block_preds_array_median)
print("Accuracy:", accuracy)


Mean Squared Error (MSE): 3077.4897959183672
Accuracy: 0.02040816326530612


Min

In [17]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy import stats

# Sample dataset
# Assuming X_train, y_train, X_test, y_test are already prepared
# X_train and X_test should contain window-level features
# y_train and y_test should contain window-level target variable
# block_mapping_train and block_mapping_test should contain mapping between windows and blocks

# Train Decision Tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Predict on test set
window_predictions = model.predict(X_test)

# Aggregate window predictions to block level
block_predictions_min = {}
for i, (window_pred, block_id) in enumerate(zip(window_predictions, block_mapping_test)):
    if block_id not in block_predictions_min:
        block_predictions_min[block_id] = []
    block_predictions_min[block_id].append(window_pred)

# Average window predictions within each block
for block_id, window_preds in block_predictions_min.items():
    block_predictions_min[block_id] = np.min(window_preds)

# Now block_predictions contains predictions for each block
print("Predictions at the block level:", block_predictions_min)

Predictions at the block level: {'4_5_1': 13.0, '4_2_7': 9.0, '19_2_1': 15.0, '23_4_2': 19.0, '23_5_2': 17.0, '19_5_0': 28.0, '23_1_9': 5.0, '19_1_8': 8.0, '4_3_3': 13.0, '13_6_4': 14.0, '5_3_5': 4.0, '23_1_1': 5.0, '23_3_6': 19.0, '19_4_3': 32.0, '8_1_2': -11.0, '19_3_3': 18.0, '23_4_8': 19.0, '4_4_0': 11.0, '5_6_7': 15.0, '14_2_7': 13.0, '4_6_6': 11.0, '4_4_5': 11.0, '8_5_3': 19.0, '4_6_3': 11.0, '19_6_4': 16.0, '19_3_6': 18.0, '4_3_4': 13.0, '7_3_0': 19.0, '8_3_0': 15.0, '19_2_4': 15.0, '13_6_0': 14.0, '23_2_1': 3.0, '4_5_2': 13.0, '5_5_7': 12.0, '4_4_4': 11.0, '14_2_4': 13.0, '8_3_3': 15.0, '20_2_4': -2.0, '14_5_2': 27.0, '23_5_4': 17.0, '7_6_0': 57.0, '14_1_2': 4.0, '13_6_3': 14.0, '4_4_1': 11.0, '20_2_3': -2.0, '7_1_0': 54.0, '19_4_0': 32.0, '5_2_5': 2.0, '19_5_6': 28.0}


In [18]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Convert block-level predictions and actuals to arrays
block_preds_array_min = np.array(list(block_predictions_min.values()))
actual_block_targets_array_min = np.array(list(actual_block_targets_test.values()))

# Calculate Mean Squared Error (MSE) for regression tasks
mse = mean_squared_error(actual_block_targets_array_min, block_preds_array_min)
print("Mean Squared Error (MSE):", mse)

# Calculate Accuracy for classification tasks
accuracy = accuracy_score(actual_block_targets_array_min, block_preds_array_min)
print("Accuracy:", accuracy)


Mean Squared Error (MSE): 3075.061224489796
Accuracy: 0.02040816326530612


Max

In [19]:
from sklearn.tree import DecisionTreeRegressor
import numpy as np
from scipy import stats

# Sample dataset
# Assuming X_train, y_train, X_test, y_test are already prepared
# X_train and X_test should contain window-level features
# y_train and y_test should contain window-level target variable
# block_mapping_train and block_mapping_test should contain mapping between windows and blocks

# Train Decision Tree model
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

# Predict on test set
window_predictions = model.predict(X_test)

# Aggregate window predictions to block level
block_predictions_max = {}
for i, (window_pred, block_id) in enumerate(zip(window_predictions, block_mapping_test)):
    if block_id not in block_predictions_max:
        block_predictions_max[block_id] = []
    block_predictions_max[block_id].append(window_pred)

# Average window predictions within each block
for block_id, window_preds in block_predictions_max.items():
    block_predictions_max[block_id] = np.max(window_preds)

# Now block_predictions contains predictions for each block
print("Predictions at the block level:", block_predictions_max)

Predictions at the block level: {'4_5_1': 13.0, '4_2_7': 9.0, '19_2_1': 15.0, '23_4_2': 19.0, '23_5_2': 17.0, '19_5_0': 28.0, '23_1_9': 5.0, '19_1_8': 8.0, '4_3_3': 13.0, '13_6_4': 11.0, '5_3_5': 4.0, '23_1_1': 5.0, '23_3_6': 19.0, '19_4_3': 32.0, '8_1_2': -11.0, '19_3_3': 18.0, '23_4_8': 19.0, '4_4_0': 11.0, '5_6_7': 15.0, '14_2_7': 13.0, '4_6_6': 11.0, '4_4_5': 11.0, '8_5_3': 19.0, '4_6_3': 11.0, '19_6_4': 16.0, '19_3_6': 18.0, '4_3_4': 13.0, '7_3_0': 19.0, '8_3_0': 15.0, '19_2_4': 15.0, '13_6_0': 14.0, '23_2_1': 3.0, '4_5_2': 13.0, '5_5_7': 12.0, '4_4_4': 11.0, '14_2_4': 13.0, '8_3_3': 15.0, '20_2_4': -2.0, '14_5_2': 27.0, '23_5_4': 17.0, '7_6_0': 57.0, '14_1_2': 4.0, '13_6_3': 14.0, '4_4_1': 11.0, '20_2_3': -2.0, '7_1_0': 54.0, '19_4_0': 32.0, '5_2_5': 2.0, '19_5_6': 28.0}


In [20]:
from sklearn.metrics import mean_squared_error, accuracy_score

# Convert block-level predictions and actuals to arrays
block_preds_array_max = np.array(list(block_predictions_max.values()))
actual_block_targets_array_max = np.array(list(actual_block_targets_test.values()))

# Calculate Mean Squared Error (MSE) for regression tasks
mse = mean_squared_error(actual_block_targets_array_max, block_preds_array_max)
print("Mean Squared Error (MSE):", mse)

# Calculate Accuracy for classification tasks
accuracy = accuracy_score(actual_block_targets_array_max, block_preds_array_max)
print("Accuracy:", accuracy)


Mean Squared Error (MSE): 3079.285714285714
Accuracy: 0.02040816326530612


###Information Gain Scores

In [23]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Create and fit a Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Get feature importances
feature_importances = rf.feature_importances_

# Create a DataFrame to display feature names and their importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances})

# Sort features by importance
feature_importance = feature_importance_df.sort_values(by='Importance', ascending=False)
# Print or visualize feature importances
for i in range(len(feature_importance)):
  print(feature_importance["Feature"][i], feature_importance_df["Importance"][i])

Block 0.008297606709508538
window_num_30 0.0
Age 0.13244149589210386
HRV_MeanNN 4.1833686621396124e-05
HRV_SDNN 0.0004277278030112188
HRV_RMSSD 9.568162677909383e-07
HRV_SDSD 1.6256421017476873e-06
HRV_CVNN 0.00017416089861500578
HRV_CVSD 2.030482066755612e-05
HRV_MedianNN 0.00011123690949504619
HRV_MadNN 0.0
HRV_MCVNN 1.146939207529334e-05
HRV_IQRNN 2.3947915253380286e-05
HRV_SDRMSSD 0.0
HRV_Prc20NN 5.015452739276118e-06
HRV_Prc80NN 1.5397401719585848e-06
HRV_pNN50 0.00011945439498453083
HRV_pNN20 2.639140753706991e-05
HRV_MinNN 0.0
HRV_MaxNN 2.759156413888128e-06
HRV_HTI 2.117459796984359e-06
HRV_TINN 0.0
HRV_HF 1.1622287461411652e-07
HRV_VHF 0.0
HRV_TP 5.014176731467234e-05
HRV_HFn 0.0
HRV_LnHF 1.770932682277296e-06
HRV_SD1 2.4020908705426254e-05
HRV_SD2 0.00022833564314450684
HRV_SD1SD2 1.265476899656587e-06
HRV_S 1.1339555968798319e-05
HRV_CSI 1.3293088254311556e-05
HRV_CVI 5.953634544285717e-06
HRV_CSI_Modified 0.00039427526769067564
HRV_PIP 1.1905897201884924e-06
HRV_IALS 2.6724

In [24]:
feature_importance

Unnamed: 0,Feature,Importance
146,Block_ID,0.370123
145,Window_ID,0.204878
2,Age,0.132441
75,Change_from_Baseline,0.091926
84,Delta_HRV_MCVNN,0.044863
...,...,...
47,HRV_C2a,0.000000
54,HRV_DFA_alpha1,0.000000
63,HRV_ApEn,0.000000
67,HRV_MSEn,0.000000
