In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import scipy
from statsmodels.stats.diagnostic import het_white
import statsmodels.api as sm


# Notebook-wide configuration.
hrv_middle_data_path = 'hrv-middle-data/'  # folder holding the per-participant HRV result CSVs
measurement = 'CVSD_mean'                  # summary column analysed in the cells below
output_path = 'analyse_data/'              # folder that receives the exported CSV files

# Columns of the per-participant summary table, in output order.
summary_columns = ['pcode', 'RMSSD_mean', 'CVSD_mean', 'SDNN_mean', 'RMSSD_std', 'CVSD_std', 'SDNN_std']


def summarise_hrv(data_dir, n_participants=80, min_rows=5):
    """Build one summary row of HRV statistics per participant.

    For participant codes 'P01' .. 'P<n_participants>' the file
    '<data_dir>/<code>_hrv_result.csv' is read, and the mean and standard
    deviation of its HRV_RMSSD, HRV_CVSD and HRV_SDNN columns are recorded.

    Args:
        data_dir: Directory containing the '<code>_hrv_result.csv' files.
        n_participants: Highest participant number to look for.
        min_rows: Files with fewer rows than this are skipped.

    Returns:
        A DataFrame with the columns listed in ``summary_columns`` and one row
        per participant that had a file with at least ``min_rows`` rows.

    Raises:
        KeyError: If a file that is read lacks one of the HRV_* columns.
    """
    # Rows are collected in a list and the frame is built once at the end.
    # Appending to an empty object-dtype frame with .loc leaves the resulting
    # column dtypes dependent on the pandas version; building from records
    # always yields numeric columns for the statistics.
    records = []
    for p_index in range(n_participants):
        participant_key = 'P' + str(p_index + 1).zfill(2)
        file_path = os.path.join(data_dir, participant_key + '_hrv_result.csv')

        if not os.path.isfile(file_path):
            print(file_path + ' do not exist')
            continue

        df = pd.read_csv(file_path)

        # filter out participants with fewer than `min_rows` rows of data
        if len(df) < min_rows:
            print(participant_key + ' has too less data, filtered out')
            continue

        # In the end, only CVSD_mean is used by the analysis below.
        records.append({
            'pcode': participant_key,
            'RMSSD_mean': df['HRV_RMSSD'].mean(),
            'CVSD_mean': df['HRV_CVSD'].mean(),
            'SDNN_mean': df['HRV_SDNN'].mean(),
            'RMSSD_std': df['HRV_RMSSD'].std(),
            'CVSD_std': df['HRV_CVSD'].std(),
            'SDNN_std': df['HRV_SDNN'].std(),
        })

    return pd.DataFrame(records, columns=summary_columns)


# calculate the HRV summary (including mean CVSD) for each participant
res = summarise_hrv(hrv_middle_data_path)

res

In [None]:
# Overview: order the participants from the highest to the lowest value of the
# selected HRV measure, then scatter that measure against the participant code.
res = res.sort_values(by=measurement, ascending=False)
plot = res.plot.scatter(x='pcode', y=measurement, figsize=(15, 5))

In [None]:
# Load the participant information table and display it for a quick overview.
user_info_csv = 'dataset/SubjData/UserInfo.csv'
df_subj = pd.read_csv(user_info_csv)
df_subj

In [None]:
# Trait columns that the cells below select from the merged table.
personality_traits = ['openness', 'conscientiousness', 'neuroticism', 'extraversion', 'agreeableness']

# Combine the HRV summary with the participant information on the shared
# 'pcode' key (merge's default inner join keeps only codes present in both).
df_test = res.merge(df_subj, on='pcode')

# Write the merged table out for the multivariate analysis, creating the
# output directory first when it does not exist yet.
output_dir = Path(output_path)
output_dir.mkdir(parents=True, exist_ok=True)
df_test.to_csv(f'{output_path}hrv-results.csv', index=False)

In [None]:
# For reference only: these per-trait scatter plots are not used in the end.
for trait in personality_traits:
    df_test.plot.scatter(x=trait, y=measurement)

In [None]:
# Multiple regression of the HRV measure on all five personality traits.

# Response: the selected HRV measure.
y = df_test[measurement]

# Design matrix: the trait columns plus the constant column added by statsmodels.
X = sm.add_constant(df_test[personality_traits])

# Ordinary least squares fit; `model` and `results` both name the fitted result.
results = sm.OLS(y, X).fit()
model = results

print(results.summary())

In [None]:
# Plot the residuals of the five-trait regression.
full_model_residuals = results.resid
plot = plt.plot(full_model_residuals)

In [None]:
# Normality check on the residuals of the five-trait model: print the
# Shapiro-Wilk test result, then draw a Q-Q plot.
full_model_resid = results.resid
shapiro_full = scipy.stats.shapiro(full_model_resid)
print(shapiro_full)
qq_plot = sm.qqplot(full_model_resid, line='q')

In [None]:
# Pearson correlation between each personality trait and the HRV measure.
# These are not used in the end.
hrv_values = df_test[measurement]
for trait in personality_traits:
    pearson_result = scipy.stats.pearsonr(df_test[trait], hrv_values)
    print(trait, pearson_result)

In [None]:
# Pairwise Pearson correlations among the personality traits.
df_corr = df_test[personality_traits]
trait_correlations = df_corr.corr(method='pearson')
print(trait_correlations)

# NOTE(review): index=False leaves the row labels (trait names) out of the
# file, so rows can only be identified by matching the column order - confirm
# that this is what the consumer of IV-correlation.csv expects.
trait_correlations.to_csv(f"{output_path}IV-correlation.csv", index=False)

In [None]:
# Assumption check on the five-trait model: White's test, run on the model's
# residuals and its design matrix.
white_full = het_white(results.resid, results.model.exog)

# Pair each returned value with a readable label before printing.
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
homogeneity_test_results = {label: value for label, value in zip(labels, white_full)}

print(homogeneity_test_results)

In [None]:
# Reduced multiple regression: only openness and conscientiousness as IVs,
# with the same response `y` as the five-trait model.
p2 = ['openness', 'conscientiousness']

# Design matrix for the reduced model, including the constant column.
X2 = sm.add_constant(df_test[p2])

model2 = sm.OLS(y, X2)
results2 = model2.fit()
print(results2.summary())

In [None]:
# Shapiro-Wilk normality test on the residuals of the reduced model.
reduced_model_resid = results2.resid
print(scipy.stats.shapiro(reduced_model_resid))

In [None]:
# Assumption check on the reduced model: White's test, run on the model's
# residuals and its design matrix.
white_reduced = het_white(results2.resid, results2.model.exog)

# Pair each returned value with a readable label before printing.
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
homogeneity_test_results = {name: stat for name, stat in zip(labels, white_reduced)}

print(homogeneity_test_results)

In [None]:
# Partial-effect plot of conscientiousness from the reduced model (results2).
# `plt`, `np` and `pd` come from the import cell at the top of the notebook.

coef_conscientiousness = results2.params['conscientiousness']
print(coef_conscientiousness)
std_err_conscientiousness = results2.bse['conscientiousness']
print(std_err_conscientiousness)
intercept = results2.params['const']

# Normal-approximation 95% interval for the slope itself. Kept for reference;
# it is NOT what the shaded band below shows.
lower_ci = coef_conscientiousness - 1.96 * std_err_conscientiousness
upper_ci = coef_conscientiousness + 1.96 * std_err_conscientiousness

# results2 has two predictors, so a line drawn over conscientiousness needs a
# value for openness. Leaving the openness term out plots the prediction at
# openness = 0; holding it at the sample mean is used here instead.
openness_mean = df_test['openness'].mean()

# Evaluate the model on a sorted, evenly spaced grid. fill_between connects the
# points in the order given, so passing the unsorted data column would draw a
# self-overlapping polygon rather than a band.
x_grid = np.linspace(df_test['conscientiousness'].min(), df_test['conscientiousness'].max(), 100)
exog_grid = pd.DataFrame({'const': 1.0, 'openness': openness_mean, 'conscientiousness': x_grid})
# Put the columns in the same order as the fitted parameters.
exog_grid = exog_grid[list(results2.params.index)]

# Predicted mean and its 95% confidence limits at every grid point. A band
# built from the slope interval alone ignores the uncertainty of the other
# coefficients and pivots around x = 0.
prediction = results2.get_prediction(exog_grid).summary_frame(alpha=0.05)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_test['conscientiousness'], y, label='Data points')
ax.plot(x_grid, prediction['mean'], color='red', label='Best Fit Line (openness at mean)')
ax.fill_between(x_grid, prediction['mean_ci_lower'], prediction['mean_ci_upper'], color='pink', alpha=0.4, label='95% CI')
ax.set_xlabel('Conscientiousness')
ax.set_ylabel(measurement)
ax.legend()

fig.tight_layout()
plt.show()
