In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import scipy
from matplotlib import pyplot as plt
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_white


eda_middle_data_path = 'eda-middle-data/'

measurement = 'scr_count_per_minute'
n_participants = 80  # participant files are named P01 ... P80
min_rows = 5         # participants with fewer rows than this are skipped


def remove_iqr_outliers(df, column):
    """Return the rows of ``df`` whose ``column`` value is not an IQR outlier.

    A value counts as an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, with the quartiles computed on ``df[column]`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Data of a single participant.
    column : str
        Name of the column the outlier fences are computed on.

    Returns
    -------
    pandas.DataFrame
        The non-outlier rows of ``df`` (all columns kept).
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1

    outliers = (df[column] < (q1 - 1.5 * iqr)) | (df[column] > (q3 + 1.5 * iqr))
    return df[~outliers]


# calculate mean SCR frequency (and mean max amplitude) for each participant
# Rows are collected in a list and turned into a DataFrame once at the end,
# instead of enlarging the frame row by row inside the loop.
records = []
for p_index in range(n_participants):
    participant_key = 'P' + str(p_index + 1).zfill(2)
    file_path = eda_middle_data_path + participant_key + '_eda_result.csv'

    if not os.path.isfile(file_path):
        print(file_path + ' do not exist.')
        continue

    df = pd.read_csv(file_path)

    # filter out participants with too few rows of data
    if len(df) < min_rows:
        print(file_path + ' has too less data, filtered out')
        continue

    # outliers are detected on `measurement` only; the surviving rows are used
    # for both per-participant means
    df_filtered = remove_iqr_outliers(df, measurement)

    # In the end, we only use the scr_count_per_minute.
    records.append({
        'pcode': participant_key,
        'max_amplitude': df_filtered['max_amplitude'].mean(),
        'scr_count_per_minute': df_filtered['scr_count_per_minute'].mean(),
    })

# `columns=` keeps the expected layout even when no participant file was found
res = pd.DataFrame(records, columns=['pcode', 'max_amplitude', 'scr_count_per_minute'])

res

In [None]:
# Overview of all participants, ordered from the highest to the lowest
# mean value of `measurement`.
res = res.sort_values(by=measurement, ascending=False)
plot = res.plot.scatter(x='pcode', y=measurement, figsize=(15, 5))

In [None]:
# Load the participant information table and display it for a quick overview.
user_info_csv = 'dataset/SubjData/UserInfo.csv'
df_subj = pd.read_csv(user_info_csv)
df_subj

In [None]:
personality_traits = ['openness', 'conscientiousness', 'neuroticism', 'extraversion', 'agreeableness']

# Attach the participant information to the per-participant EDA summary;
# only participants present in both tables are kept (default inner join).
df_test = res.merge(df_subj, on='pcode')

# Persist the merged table, creating the output folder when it is missing.
output_path = 'analyse_data/'
os.makedirs(output_path, exist_ok=True)
df_test.to_csv(output_path + 'eda-results.csv', index=False)

# One scatter plot per personality trait against the EDA measurement.
for trait in personality_traits:
    df_test.plot.scatter(x=trait, y=measurement)


In [None]:
# Multiple regression of the EDA measurement on all five personality traits.
# Response: per-participant mean of `measurement`.
y = df_test[measurement]

# Predictors: the five trait columns plus an intercept column.
X = sm.add_constant(df_test[personality_traits])

model = sm.OLS(y, X)
results = model.fit()
print(results.summary())

In [None]:
# Normality check of the full model's residuals: Q-Q plot and Shapiro-Wilk test.
residuals = results.resid
qq_plot = sm.qqplot(residuals, line='q')
print(scipy.stats.shapiro(residuals))

In [None]:
# Pearson correlation between each personality trait and the EDA measurement.
# These correlations are not used in the end.
for trait in personality_traits:
    correlation = scipy.stats.pearsonr(df_test[trait], df_test[measurement])
    print(trait, correlation)

In [None]:
# Homogeneity-of-variance assumption: White's test on the full model's residuals.
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
white_statistics = het_white(results.resid, results.model.exog)

# Pair every returned statistic with its label, in the order het_white returns them.
homogeneity_test_results = {label: value for label, value in zip(labels, white_statistics)}

print(homogeneity_test_results)

In [None]:
# Reduced regression model: only openness and conscientiousness as predictors
# (plus an intercept), same response `y` as the full model.
p2 = ['openness', 'conscientiousness']
X2 = sm.add_constant(df_test[p2])

model2 = sm.OLS(y, X2)
results2 = model2.fit()
print(results2.summary())

In [None]:
# Assumption checks for the reduced model (results2).
residuals2 = results2.resid

# Shapiro-Wilk test on the residuals.
print(scipy.stats.shapiro(residuals2))

# White's test on the residuals, reported with one label per returned statistic.
labels = ['Lagrange multiplier statistic', 'p-value', 'f-value', 'f p-value']
white_statistics2 = het_white(residuals2, results2.model.exog)
homogeneity_test_results = dict(zip(labels, white_statistics2))

print(homogeneity_test_results)

In [None]:
# Plot the effect of openness on the EDA measurement as estimated by the
# reduced model (results2: const + openness + conscientiousness).
coef_openness = results2.params['openness']
std_err_openness = results2.bse['openness']
intercept = results2.params['const']

# approximate 95% confidence interval of the openness slope itself
# (kept for reference; the band drawn below is for the fitted mean)
lower_ci = coef_openness - 1.96 * std_err_openness
upper_ci = coef_openness + 1.96 * std_err_openness

# Evaluate the model on an evenly spaced, sorted openness grid. The model has a
# second predictor, so conscientiousness is held at its sample mean; leaving it
# out would draw the line for conscientiousness == 0.
openness_grid = np.linspace(df_test['openness'].min(), df_test['openness'].max(), 100)
exog_grid = pd.DataFrame({
    'const': 1.0,
    'openness': openness_grid,
    'conscientiousness': df_test['conscientiousness'].mean(),
})
# use the same column order the model was fitted with
exog_grid = exog_grid[results2.model.exog_names]

# Fitted mean and its 95% confidence band, taken from the model so that the
# uncertainty of every coefficient is accounted for.
prediction = results2.get_prediction(exog_grid).summary_frame(alpha=0.05)

fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_test['openness'], y, label='Data points')
ax.plot(openness_grid, prediction['mean'].to_numpy(), color='red', label='Best Fit Line')
ax.fill_between(
    openness_grid,
    prediction['mean_ci_lower'].to_numpy(),
    prediction['mean_ci_upper'].to_numpy(),
    color='pink',
    alpha=0.4,
    label='95% CI',
)
ax.set_xlabel('Openness')
ax.set_ylabel('scr_count_per_minute')
ax.legend()

fig.tight_layout()
plt.show()
