In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from tqdm.auto import tqdm
from scipy import stats

### We use the same script as when calculating the mean changes between each participant's answers, but this time only the answers to the 'Pain VAS' question are compared, at the start and at the end of the group's time with the SOLIO device.

In [2]:
# Questionnaire answers: the first two CSV columns become the row MultiIndex
# (shown as "User ID" and "Date" in the rendered table below).
answers_csv = r"/Users/yotamhadari/Desktop/SOLIO/Group_1_SOLIO_0408.csv"
# Device-type table: its index is later read as the list of participants in each group.
device_type_csv = r"/Users/yotamhadari/Desktop/SOLIO/Device_type_group_1.csv"

df = pd.read_csv(answers_csv, index_col=[0, 1])
group_df = pd.read_csv(device_type_csv, index_col=[0])

In [3]:
# Keep only the answers given on the first and on the last questionnaire date.
answer_dates = df.index.get_level_values(1)  # second index level holds the date strings
start = df.loc[answer_dates == '10/07/2022']
end = df.loc[answer_dates == '04/08/2022']
# Stack the two snapshots: all "start" rows first, then all "end" rows.
df = pd.concat([start, end])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Pain VAS,Qualitative pain,Day to day,Lifting weights,Walking,Sitting,Standing,Sleeping,Sex life,Social life,Travel
User ID,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,10/07/2022,0,1.0,1.0,,,,,,,,
2,10/07/2022,6,3.0,2.0,,,,,,,,
3,10/07/2022,3,3.0,1.0,,,,,,,,
4,10/07/2022,3,3.0,1.0,,,,,,,,
5,10/07/2022,8,4.0,2.0,,,,,,,,
7,10/07/2022,5,3.0,2.0,,,,,,,,
9,10/07/2022,3,3.0,2.0,,,,,,,,
13,10/07/2022,6,3.0,2.0,,,,,,,,
14,10/07/2022,3,2.0,2.0,,,,,,,,
16,10/07/2022,6,3.0,2.0,,,,,,,,


We want to calculate the difference between each of a participant's answers and the previous one.

In [4]:
def create_indiviudal_user_dfs(df):
    """Split a multi-indexed answers frame into one DataFrame per user.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level identifies the user.

    Returns
    -------
    dict
        Maps each distinct first-level index value, formatted as a string,
        to the rows of ``df`` that belong to it. Keys follow the order of
        first appearance in the index.
    """
    user_ids = df.index.get_level_values(0)
    return {
        f"{user_id}": df.loc[user_ids == user_id]
        for user_id in user_ids.drop_duplicates()
    }

In [5]:
# One DataFrame per participant; the dictionary keys are the user IDs as strings.
users_dict = create_indiviudal_user_dfs(df)
# NOTE: despite its name, `user_1` holds the answers of the participant with user ID '5'.
user_1 = users_dict['5']
user_1.head(10)


def harvesting_mean_median(user_df, periods_to_check=30):
    """Summarise how a participant's answers changed, per question.

    The answers are differenced with a lag of 1, 2, 3, ... rows
    (``DataFrame.diff(periods=lag)``). For every lag, each column whose
    differences contain at least one value gets the mean and the median of
    those differences recorded.

    Parameters
    ----------
    user_df : pandas.DataFrame
        One participant's answers: one row per questionnaire, one column per
        question.
    periods_to_check : int, default 30
        Largest lag that is tried before giving up on columns that still have
        no value.

    Returns
    -------
    dict
        ``{column: {'mean': value, 'median': value}}``. Columns for which no
        lag produced a value map to ``{'mean': 'NEA', 'median': 'NEA'}``
        ("not enough answers").
    """
    user_mean_median = {}
    diff_per = 1
    # Keep widening the lag until every column has a value or the lag budget is spent.
    while len(user_mean_median) < len(user_df.columns):
        user_desc = user_df.diff(periods=diff_per).describe()
        diff_per += 1
        for column in user_df.columns:
            mean_change = user_desc.loc['mean', column]
            # A NaN mean means this lag produced no usable differences for the column.
            if pd.notna(mean_change):
                # NOTE(review): a column already filled at a smaller lag is overwritten
                # whenever a larger lag also yields data, because the loop keeps running
                # while any other column is still missing. Confirm this is intended.
                user_mean_median[column] = {
                    'mean': float(mean_change),
                    # The median is the 50th percentile row of describe(); the previous
                    # code mistakenly read the 'mean' row here.
                    'median': float(user_desc.loc['50%', column]),
                }

        if diff_per > periods_to_check:
            for column in user_df.columns:
                # Not enough answers to compute any difference for this column.
                user_mean_median.setdefault(column, {'mean': 'NEA', 'median': 'NEA'})
            break
    return user_mean_median

In [6]:
# Map each distinct value of group_df's first column to the index labels of the rows carrying it.
device_types = group_df.iloc[:, 0]
group_dict = {
    group: list(group_df.loc[device_types == group].index)
    for group in device_types.drop_duplicates()
}

group_dict
# NOTE(review): the mapping derived from the CSV above is discarded and replaced by this
# hard-coded assignment of user IDs to device groups. Confirm the two agree.
group_dict = {'real': [1, 5, 7, 9, 16, 17, 13, 14], 'sham': [2,3,4, 18, 20]}

In [7]:
# Sanity check on a single participant: mean and median change per question for `user_1`.
df1 = pd.DataFrame(harvesting_mean_median(user_1))
df1

Unnamed: 0,Pain VAS,Qualitative pain,Day to day,Lifting weights,Walking,Sitting,Standing,Sleeping,Sex life,Social life,Travel
mean,-7.0,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA
median,-7.0,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA,NEA


In [8]:
# Collect every participant's mean and median change per question, keyed by integer user ID.
# The loop variable is named `user_df` rather than `df`, so the notebook-level answers
# DataFrame is not overwritten by the last participant's frame.
mean_dict = {}
median_dict = {}
for user, user_df in users_dict.items():
    mean_median = pd.DataFrame(harvesting_mean_median(user_df))
    mean = mean_median.loc['mean']
    median = mean_median.loc['median']
    # users_dict keys are strings; cast to int so columns can later be selected
    # with the integer user IDs stored in group_dict.
    mean_dict[int(user)] = mean
    median_dict[int(user)] = median
    

In [9]:
# One column per participant (integer user ID), one row per question.
mean_df = pd.concat(mean_dict, axis=1)
median_df = pd.concat(median_dict, axis=1)

# Split the participant columns by device group, in the order listed in group_dict.
real_mean, sham_mean = (mean_df.loc[:, group_dict[name]] for name in ('real', 'sham'))

real_median, sham_median = (median_df.loc[:, group_dict[name]] for name in ('real', 'sham'))

In [10]:
# 'NEA' ("not enough answers") is a string placeholder; swap it for NaN so the
# per-question rows can be averaged numerically in the next step.
real_mean.replace('NEA', np.nan, inplace=True)
sham_mean.replace('NEA', np.nan, inplace=True)

In [11]:
# Average each question's mean change across the participants of the group.
# Only the participant columns are averaged, so re-running this cell does not fold the
# previously added 'avg mean changes' column back into its own average.
real_mean['avg mean changes'] = real_mean[group_dict['real']].mean(axis=1, numeric_only=True)
sham_mean['avg mean changes'] = sham_mean[group_dict['sham']].mean(axis=1, numeric_only=True)
# Tag every row with its device group so the two frames can be stacked and compared.
real_mean['group'] = 'real'
sham_mean['group'] = 'sham'

In [12]:
# Inspect the per-participant mean changes of the 'real' device group.
real_mean
# sham_mean  # swap with the line above to inspect the 'sham' group instead

Unnamed: 0,1,5,7,9,16,17,13,14,avg mean changes,group
Pain VAS,0.0,-7.0,-1.0,3.0,-2.0,-4.0,-1.0,-1.0,-1.625,real
Qualitative pain,,,,,,,,,,real
Day to day,,,,,,,,,,real
Lifting weights,,,,,,,,,,real
Walking,,,,,,,,,,real
Sitting,,,,,,,,,,real
Standing,,,,,,,,,,real
Sleeping,,,,,,,,,,real
Sex life,,,,,,,,,,real
Social life,,,,,,,,,,real


In [13]:
# Stack the summary columns (from 'avg mean changes' through the last column) of both groups
# and move the question names out of the index into a regular 'index' column.
joined_mean = pd.concat(
    [group_frame.loc[:, 'avg mean changes':] for group_frame in (real_mean, sham_mean)],
    axis=0,
).reset_index()

In [14]:
# Long-format summary: one row per question and device group, with that group's average mean change.
joined_mean

Unnamed: 0,index,avg mean changes,group
0,Pain VAS,-1.625,real
1,Qualitative pain,,real
2,Day to day,,real
3,Lifting weights,,real
4,Walking,,real
5,Sitting,,real
6,Standing,,real
7,Sleeping,,real
8,Sex life,,real
9,Social life,,real


In [15]:
# Grouped bars of the average mean change per question, 'real' vs 'sham' device.
# A title and readable axis labels are set so the figure can be understood on its own.
fig = px.histogram(
    joined_mean,
    x='index',
    y='avg mean changes',
    color='group',
    barmode='group',
    text_auto=True,
    width=700,
    height=400,
    title='Average mean change per question: real vs. sham device',
    labels={'index': 'Question', 'avg mean changes': 'Average mean change'},
)
fig

Using statistical tests to check for a difference between the 'sham' and 'real' device groups.

In [431]:
# Per-participant change in 'Pain VAS' for each device group.
# Rows and columns are selected by label (question name, user IDs from group_dict) instead of
# hard-coded positions, so the samples stay correct if group sizes or row order change.
# Cast to float because a row taken across mixed-type columns comes back as object dtype.
a = real_mean.loc['Pain VAS', group_dict['real']].astype(float)
b = sham_mean.loc['Pain VAS', group_dict['sham']].astype(float)

In [427]:
# Independent two-sample t-test on the 'real' vs 'sham' changes (SciPy's default
# equal-variance form). `random_state` was dropped: it only affects the permutation
# variant of the test, which is not requested here, so it had no effect on the result.
# The samples are passed as float arrays rather than object-dtype Series.
stats.ttest_ind(np.asarray(a, dtype=float), np.asarray(b, dtype=float))

Ttest_indResult(statistic=-0.5931392346786842, pvalue=0.5650836878077025)

In [426]:
# Mann-Whitney U test on the same two samples, converted to float arrays first.
real_changes = np.asarray(a, dtype=float)
sham_changes = np.asarray(b, dtype=float)
stats.mannwhitneyu(real_changes, sham_changes)

MannwhitneyuResult(statistic=20.5, pvalue=1.0)