In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import random

from matplotlib.colors import ListedColormap
from random import choices
from scipy import stats
from statsmodels.miscmodels.ordinal_model import OrderedModel
from tqdm.notebook import tqdm

In [None]:
import warnings
import copy
warnings.filterwarnings(action="ignore", category=np.VisibleDeprecationWarning)
warnings.filterwarnings(action='ignore', message='All-NaN slice encountered')
warnings.filterwarnings(action='ignore', message='Precision loss occurred in moment calculation due to catastrophic cancellation. This occurs when the data are nearly identical. Results may be unreliable.')
warnings.filterwarnings(action='ignore', message='Mean of empty slice')

In [None]:
from jupyter_utils import style, mean_std, display_test, display_group_test, scatter_annotate, show_corrtest_mask_corr
from ortogonolize_utils import draw_scatter, draw_corrected_scatter
from ortogonolize_utils import compute_coefficient, compute_ortogonolized_coefficient, compute_ortogonolized_logit

In [None]:
PATH = '/Users/galina.ryazanskaya/Downloads/thesis?/code?/'

## merge data with psychosocial data

In [None]:
# Load the psychosocial table; the first CSV column becomes the index.
df = pd.read_csv(PATH +'rus_merged_psychosocial_data.csv', index_col=0)
# Discard rows whose index value is missing.
df = df[df.index.notnull()]
# Shorten the scale column names (drops the "-1" / "-total" parts) used throughout the notebook.
df.rename(columns={'dep.severity-1': 'dep.severity',
                  'HDRS-17.score-1': 'HDRS-17',
                   'panss-1-td': 'panss_td',
                   'panss-1-total': 'panss_total', 
                   'panss-n-1-total': 'panss_neg', 
                   'panss-o-1-total': 'panss_o',
                   'panss-p-1-total': 'panss_pos', 
                   'sans-1-total': 'sans',
                   'saps-ftd-1-total': 'saps_ftd', 
                   'sops-1-total': 'sops_total', 
                   'sops-c-total': 'sops_c',
                   'sops-d-total': 'sops_d', 
                   'sops-n-total': 'sops_n', 
                   'sops-p-total': 'sops_p'
                  }, inplace=True)
# De-duplicate on the index: copy it into a temporary column, keep the first row per value,
# then remove the temporary column again.
# NOTE(review): a pre-existing column literally named 'index' would be overwritten and dropped here.
df['index'] = df.index
df.drop_duplicates(inplace=True, subset='index')
df.drop(columns=['index'], inplace=True)

In [None]:
len(df)

In [None]:
res_df = pd.read_csv(PATH + 'processed_values/new/ru_both.tsv', sep='\t', index_col=0, header=[0, 1, 2])

In [None]:
len(res_df)

In [None]:
dfi = [i.replace('-', '').replace('S', 'PD1') for i in df.index]
df.index = dfi
# res_df.index = [i.replace('S', 'PD1') for i in res_df.index]
rdfi = [i.split('_')[0].replace('S', 'PD1') for i in res_df.index]

missing_psy = set(rdfi).difference(set(dfi))
missing_psy

In [None]:
# `missing_psy` holds *normalized* participant ids (prefix before '_' with 'S' -> 'PD1'), so the
# raw res_df labels must be normalized the same way before the membership test; comparing the raw
# prefix would silently keep every 'S…' recording that has no psychosocial data.
missing_psy = [i for i in res_df.index if i.split('_')[0].replace('S', 'PD1') in missing_psy]
res_df.drop(missing_psy, inplace=True)

In [None]:
len(res_df)

In [None]:
missing_text = set(dfi).difference(set(rdfi))
len(missing_text)

In [None]:
df.drop(missing_text, inplace=True)

In [None]:
non_start_timepoint = [i for i in res_df.index if i.split('_')[-1] != '1']
res_df.drop(non_start_timepoint, inplace=True)

In [None]:
res_df.loc['PD101_1'].dropna()

In [None]:
rdfi_filtered = [i.split('_')[0].replace('S', 'PD1') for i in res_df.index]
res_df.index = rdfi_filtered

## psychosocial statistics

In [None]:
def fill_diagnosis_type(row):
    """Return the row's diagnosis type, inferring a control label when it is missing.

    An existing 'diagnosis.type' is returned unchanged. Otherwise the row is labelled
    'control_psy' when it has a 'td.severity' value and 'control' when it does not.
    """
    known_type = row['diagnosis.type']
    if pd.notna(known_type):
        return known_type
    return 'control' if pd.isna(row['td.severity']) else 'control_psy'

In [None]:
df['diagnosis.type'] = df.apply(fill_diagnosis_type, axis=1)

In [None]:
sz = df[df['diagnosis.type'] == 'sz']
dep = df[df['diagnosis.type'] == 'dep']
control = df[df['diagnosis.group'] == 'control']
control_psy = df[df['diagnosis.type'] == 'control_psy']

In [None]:
df.columns

In [None]:
len(df)

In [None]:
df['diagnosis.group'].value_counts()

In [None]:
df['diagnosis.type'].value_counts()

In [None]:
df['dep.scale'].value_counts()

In [None]:
df['td.scales'].value_counts()

In [None]:
print(df.groupby('diagnosis.type')[['diagnosis_code', 'diagnosis_eng']].value_counts().to_csv(sep='\t'))

### select target cols

In [None]:
target_cols = ['sex', 'age', 'education.years', 
               'diagnosis.group', 'diagnosis.type',
               'dep.severity', 'td.severity']

In [None]:
# panss_cols = [col for col in df.columns if col.startswith('panss')]
panss_cols = ['panss_td', 'panss_total', 'panss_neg', 'panss_pos', 'panss_o']
sans_cols = [col for col in df.columns if col.startswith('sans')]
saps_cols = [col for col in df.columns if col.startswith('saps')]
sops_cols = [col for col in df.columns if col.startswith('sops')]

In [None]:
numeric_target = ['education.years','dep.severity','td.severity'] + panss_cols

In [None]:
sz.count()[sz.count() > 0]

In [None]:
dep.count()[dep.count() > 0]

In [None]:
sz['sex'].value_counts()

In [None]:
mean_std(sz, target_cols + panss_cols)

In [None]:
mean_std(sz, target_cols + panss_cols, 'sex')

In [None]:
dep['sex'].value_counts()

In [None]:
mean_std(dep, target_cols + panss_cols)

In [None]:
dep['panss_o'].count()

In [None]:
control['sex'].value_counts()

In [None]:
mean_std(control, target_cols)

In [None]:
mean_std(control, target_cols, 'sex')

In [None]:
control_psy['sex'].value_counts()

In [None]:
mean_std(control_psy, target_cols + panss_cols)

In [None]:
mean_std(control_psy, target_cols + panss_cols, 'sex')

In [None]:
control.groupby('sex')['panss_total'].count()

In [None]:
control_psy.count()[control_psy.count() > 0]

In [None]:
df.dropna(axis=0, thresh=30).iloc[0]

### test for differences

age

In [None]:
stats.ttest_ind(control['age'], sz['age'], nan_policy='omit')

In [None]:
stats.ttest_ind(control_psy['age'], sz['age'], nan_policy='omit')

In [None]:
stats.ttest_ind(control_psy['age'], dep['age'], nan_policy='omit')

education years

In [None]:
stats.ttest_ind(control['education.years'], sz['education.years'], nan_policy='omit')

In [None]:
stats.ttest_ind(control_psy['education.years'], sz['education.years'], nan_policy='omit')

In [None]:
stats.ttest_ind(control_psy['education.years'], dep['education.years'], nan_policy='omit')

sex

In [None]:
a = 0.05

In [None]:
s_t_sex, res_t_sex = display_group_test(control, numeric_target, 'sex', stats.ttest_ind, stat_name='t', alpha=a)
style(res_t_sex)

In [None]:
s_t_sex, res_t_sex = display_group_test(control_psy, numeric_target, 'sex', stats.ttest_ind, stat_name='t', alpha=a)
style(res_t_sex)

In [None]:
s_t_sex, res_t_sex = display_group_test(sz, numeric_target, 'sex', stats.ttest_ind, stat_name='t', alpha=a)
style(res_t_sex)

### correlation between target variables

In [None]:
# display corr test

In [None]:
x, y = display_test(df, numeric_target, 'age', stats.pearsonr, stat_name='r', alpha=a)
y[y['abs_r']> 0.3].sort_values('abs_r', ascending=False)

In [None]:
x, y = display_test(df, numeric_target, 'education.years', stats.pearsonr, stat_name='r', alpha=a)
y[y['abs_r']> 0.3].sort_values('abs_r', ascending=False)

In [None]:
target_corr_res = show_corrtest_mask_corr(df[numeric_target])

## scores

In [None]:
res_df

## features of raw texts

In [None]:
raw = pd.read_csv(PATH+'rus_transcript_lex_by_task_with_dots.tsv', sep='\t', index_col=0)

In [None]:
ids_to_drop = [i for i in raw.index if i.split('_')[0] not in res_df.index]

In [None]:
raw.drop(index=ids_to_drop, inplace=True)

In [None]:
raw.index = [i.split('_')[0] for i in raw.index]

In [None]:
raw.isna().sum(axis=0).sum()

In [None]:
raw.applymap(lambda x: len(x.split('.')) if not pd.isna(x) else np.nan).mean(axis=1).hist()

## look at tasks

In [None]:
task_available = raw.applymap(lambda x: 1 if not pd.isna(x) else x)

In [None]:
len(task_available)

In [None]:
task_available['diagnosis.type'] = df['diagnosis.type']

In [None]:
task_available.count()

In [None]:
task_available.groupby('diagnosis.type').count()

In [None]:
task_available.groupby('diagnosis.type').count()[task_available.groupby('diagnosis.type').count() > 0].dropna(axis=1)

In [None]:
def drop_person(row):
    """Return True when the row has no value for any of the four picture tasks."""
    return all(pd.isna(row[task]) for task in ('adventure', 'chair', 'present', 'sportsman'))

In [None]:
ids_with_at_least_one_task = task_available[~task_available.apply(drop_person, axis=1)].index

In [None]:
df = df.loc[ids_with_at_least_one_task]

In [None]:
TASKS = ['adventure', 'chair','present','sportsman']

In [None]:
res_df = res_df.loc[ids_with_at_least_one_task, TASKS]

In [None]:
res_df[[(task, 'syntactic', 'mean_sent_len') for task in TASKS]].mean()

In [None]:
res_df[[(task, 'syntactic', 'n_sents') for task in TASKS]].mean()

In [None]:
res_df[[(task, 'lexical', 'n_words') for task in TASKS]].mean()

In [None]:
def task_data(df, task, keep_target=True, fill_synt=True):
    """Extract one task's metric columns from a frame with (task, group, metric) columns.

    Rows in which every metric of the task is missing are removed. With `fill_synt`, missing
    values in the 'syntactic' group are replaced by 0.0. With `keep_target`, the columns stored
    under the top-level 'target' key are appended for the remaining rows.
    """
    task_metrics = df[task].dropna(how='all')
    if fill_synt:
        filled_syntactic = task_metrics['syntactic'].fillna(0.0)
        task_metrics['syntactic'] = filled_syntactic
    if not keep_target:
        return task_metrics
    targets = df['target'].loc[task_metrics.index]
    return pd.concat([task_metrics, targets], axis=1)

In [None]:
def aplly_to_all_tasks(df, f, tasks=TASKS, to_df=True, *args, **kwargs):
    """Run `f` on the `task_data` subset of every task and collect the results.

    Extra positional/keyword arguments are forwarded to `f`. With `to_df`, results that are all
    Series are combined into a DataFrame with one column per task, and results that are all
    DataFrames are concatenated column-wise under a 'task' level; any other mix is returned as
    the plain {task: result} dict, as it is when `to_df` is false.
    """
    per_task = {task: f(task_data(df, task), *args, **kwargs) for task in tasks}
    if not to_df:
        return per_task
    outputs = list(per_task.values())
    if all(isinstance(out, pd.Series) for out in outputs):
        return pd.DataFrame(per_task)
    if all(isinstance(out, pd.DataFrame) for out in outputs):
        return pd.concat(outputs, keys=list(per_task.keys()), names=['task'], axis=1)
    return per_task

In [None]:
# PANSS score distributions: one row per group, one column per PANSS subscale.
# Each panel spec is (column, title, bin width, x-limits); keeping the title next to the column
# it describes prevents the "positive"/"negative" titles from being swapped relative to the data.
panss_panels = (
    ('panss_total', 'total', 5, (30, 95)),
    ('panss_neg', 'negative', 2, (7, 45)),
    ('panss_pos', 'positive', 2, (7, 30)),
)
panss_groups = ((sz, 'NAP'), (dep, 'Dep'), (control_psy, 'HC'))

fig, axes = plt.subplots(len(panss_groups), len(panss_panels), figsize=(15, 10))
fig.suptitle('PANSS')

for row_idx, (group_df, group_label) in enumerate(panss_groups):
    for col_idx, (column, title, binwidth, xlim) in enumerate(panss_panels):
        ax = axes[row_idx, col_idx]
        sns.histplot(group_df[column], ax=ax, binwidth=binwidth)
        ax.set_xlim(*xlim)
        if row_idx == 0:
            ax.set_title(title)
    # Set after plotting so the group label replaces seaborn's default y-label.
    axes[row_idx, 0].set_ylabel(group_label, rotation=0, size='large')

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('severity')


sns.histplot(sz['dep.severity'], ax=axes[0, 0])
sns.histplot(dep['dep.severity'], ax=axes[1, 0])
sns.histplot(sz['td.severity'], ax=axes[0, 1])
sns.histplot(dep['td.severity'], ax=axes[1, 1])
axes[1, 1].set_xlim(0, 3)

for ax, col in zip(axes[0], ('Dep', 'TD')):
    ax.set_title(col)

for ax, row in zip(axes[:,0], ('NAP', 'Dep')):
    ax.set_ylabel(row, rotation=0, size='large')

### merge psychosocial data to scores

In [None]:
merge_df = res_df.copy()
for col in target_cols + panss_cols:
    merge_df[('target', 'target', col)] = df[col]

In [None]:
cols_tasks = res_df['chair'].columns
cols_LM = [col for col in res_df['chair'] if col[0] == 'LM']
cols_synt = [col for col in res_df['chair'] if col[0] == 'syntactic']
cols_lex = [col for col in res_df['chair'] if col[0] == 'lexical']
cols_graph = [col for col in res_df['chair'] if col[0] == 'graph']

In [None]:
# Part-of-speech tags kept as syntactic features.
POS_to_use = ('ADJ', 'ADV', 'AUX', 'CCONJ', 'DET','NOUN', 'PRON', 'PROPN', 'SCONJ', 'VERB', 'PART')
# Upper-case second-level names among the syntactic columns.
pos = {x[1] for x in cols_synt if x[1].isupper()}

# Tags present in the data but not in the whitelist (displayed as the cell output).
pos.difference(POS_to_use)

In [None]:
pos_cols_to_drop = [(task, c[0], c[1]) for task in TASKS for c in cols_synt if c[1].isupper() and c[1] not in POS_to_use]

In [None]:
merge_df.drop(columns=pos_cols_to_drop, inplace=True)

In [None]:
cols_synt = [col for col in merge_df['chair'] if col[0] == 'syntactic']
cols_tasks = cols_synt + cols_LM + cols_lex + cols_graph

In [None]:
def corr(df, target):
    """Correlate `target` with every numeric column, using only rows where `target` is present.

    Non-numeric columns (e.g. string-valued sex / diagnosis labels) are excluded before calling
    `DataFrame.corr`, because pandas >= 2.0 raises on them instead of silently skipping them.

    Returns the `target` column of the correlation matrix (a Series indexed by column label).
    Raises KeyError if `target` itself is not numeric.
    """
    rows_with_target = df[~pd.isnull(df[target])]
    numeric = rows_with_target.select_dtypes(include=['number', 'bool'])
    return numeric.corr()[target]

In [None]:
def corr_thresh(df, target, thresh=0.3, drop_target=True):
    """Return the correlations with `target` whose absolute value is at least `thresh`.

    With `drop_target`, the entries labelled 'target' are removed from the result first.
    """
    correlations = corr(df, target)
    if drop_target:
        correlations = correlations.drop('target')
    strong_enough = correlations.abs() >= thresh
    return correlations[strong_enough]

In [None]:
def ttest(df, test_columns, group):
    """Run `display_group_test` with `stats.ttest_ind` and return its 't' and 'p' columns.

    The significance level is read from the notebook-level variable `a` at call time.
    """
    _, results = display_group_test(df, test_columns, group, stats.ttest_ind, stat_name='t', alpha=a)
    return results[['t', 'p']]

### psychosocial variables vs metrics

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'age')))

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'education.years')))

In [None]:
control = merge_df[merge_df[('target', 'target', 'diagnosis.group')] == 'control']

In [None]:
style(aplly_to_all_tasks(control, corr_thresh, target=('target', 'age'), thresh=0.4))

In [None]:
style(aplly_to_all_tasks(control, corr_thresh, target=('target', 'education.years'), thresh=0.4))

### psychiatric variables vs metrics

In [None]:
def thresh_any_task(df, col='p', tasks=TASKS, thresh=0.05):
    """Keep the rows whose `col` value is below `thresh` for at least one of `tasks`."""
    def below_for_some_task(row):
        hits = [row[task][col] < thresh for task in tasks]
        return any(hits)

    keep = df.apply(below_for_some_task, axis=1)
    return df[keep]

In [None]:
res = aplly_to_all_tasks(merge_df, ttest, test_columns=cols_tasks, group=('target', 'diagnosis.group'))
style(thresh_any_task(res, thresh=0.01))

In [None]:
def ttest_select(df, test_two_groups, colname, cols_tasks=cols_tasks):
    """Restrict `df` to the rows of two groups and return the 't' column of their group test.

    Group membership is read from the ('target', colname) column. The default `cols_tasks` is
    bound when this function is defined, and the significance level comes from the
    notebook-level variable `a` at call time.
    """
    group_column = ('target', colname)
    in_selected_groups = df[group_column].isin(test_two_groups)
    _, results = display_group_test(df[in_selected_groups], cols_tasks, group_column,
                                    test=stats.ttest_ind, stat_name='t', alpha=a)
    return results['t']

In [None]:
chair = task_data(merge_df, 'chair')

In [None]:
ttest_select(chair, ('sz', 'control_psy'), 'diagnosis.type')

In [None]:
test_two_groups = ('sz', 'control_psy')
groups = merge_df[merge_df[('target', 'target','diagnosis.type')].isin(test_two_groups)]
res = aplly_to_all_tasks(groups, ttest, test_columns=cols_tasks, group=('target', 'diagnosis.type'))
style(thresh_any_task(res, thresh=0.01))

In [None]:
test_two_groups = ('sz', 'control')
groups = merge_df[merge_df[('target', 'target','diagnosis.type')].isin(test_two_groups)]
res = aplly_to_all_tasks(groups, ttest, test_columns=cols_tasks, group=('target', 'diagnosis.type'))
style(thresh_any_task(res, thresh=0.01))

In [None]:
test_two_groups = ('dep', 'control_psy')
groups = merge_df[merge_df[('target', 'target','diagnosis.type')].isin(test_two_groups)]
res = aplly_to_all_tasks(groups, ttest, test_columns=cols_tasks, group=('target', 'diagnosis.type'))
style(thresh_any_task(res, thresh=0.01))

In [None]:
test_two_groups = ('dep', 'sz')
groups = merge_df[merge_df[('target', 'target','diagnosis.type')].isin(test_two_groups)]
res = aplly_to_all_tasks(groups, ttest, test_columns=cols_tasks, group=('target', 'diagnosis.type'))
style(thresh_any_task(res, thresh=0.01))

### Corr

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'panss_total')))

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'panss_pos')))

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'panss_neg')))

In [None]:
style(aplly_to_all_tasks(merge_df, corr_thresh, target=('target', 'panss_o')))

## Ordered Model

In [None]:
psy_td = df[~pd.isna(df['td.severity'])]

In [None]:
# psy_td was produced by boolean indexing of df; take an explicit copy before assigning a column
# so the assignment is unambiguous and does not trigger pandas' SettingWithCopyWarning.
psy_td = psy_td.copy()
# The ordinal model needs an ordered categorical outcome.
psy_td['td.severity'] = psy_td['td.severity'].astype(pd.CategoricalDtype(ordered=True))
# Keep only rows with the outcome and all three PANSS predictors present.
psy_td = psy_td.loc[psy_td[['td.severity', 'panss_total', 'panss_pos', 'panss_neg']].dropna().index]

In [None]:
mod_prob = OrderedModel(psy_td['td.severity'],
                        psy_td[['panss_total', 'panss_pos', 'panss_neg']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.summary()

In [None]:
mod_prob = OrderedModel(psy_td['td.severity'],
                        psy_td[['panss_pos', 'panss_neg', 'panss_total']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.aic, res_prob.bic

In [None]:
mod_prob = OrderedModel(psy_td['td.severity'],
                        psy_td[['panss_total', 'panss_pos']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.aic, res_prob.bic

In [None]:
mod_prob = OrderedModel(psy_td['td.severity'],
                        psy_td[['panss_total']],
                        distr='probit')

res_prob = mod_prob.fit(method='bfgs')
res_prob.aic, res_prob.bic

## Length

In [None]:
tasks_len = aplly_to_all_tasks(merge_df, corr_thresh, target=('syntactic', 'mean_sent_len'), thresh=0.5)
tasks_len.drop(index=[('syntactic', 'mean_sent_len')], inplace=True)
style(tasks_len)

In [None]:
tasks_len = aplly_to_all_tasks(merge_df, corr_thresh, target=('syntactic', 'mean_sent_len'), thresh=0.)
tasks_len.drop(index=[('syntactic', 'mean_sent_len')], inplace=True)

In [None]:
ex_df = pd.DataFrame()
for task in TASKS:
    df_ = pd.DataFrame()
    df_['metric'] = tasks_len[task]
    df_['task'] = task
    df_['index'] = [x[1] for x in df_.index]
    df_['type'] = [x[0] for x in df_.index]
    ex_df = pd.concat([ex_df, df_])

In [None]:
def catplot(data, x, y, hue, col=None, ax=None):
    """Draw a seaborn point-plot FacetGrid of `y` over `x`, coloured by `hue`, one facet per `col`.

    `ax` is accepted only for backward compatibility: `sns.catplot` is a figure-level function
    that creates its own figure and cannot draw onto existing axes, so the argument is not
    forwarded (forwarding it made seaborn warn on every call). A warning is issued only when a
    caller actually supplies axes.
    """
    if ax is not None:
        warnings.warn("catplot is figure-level and ignores `ax`; use pointplot to draw on given axes",
                      UserWarning)

    sns.set_theme(style="whitegrid")

    # Point estimates with capped error bars for each x / hue combination.
    g = sns.catplot(
        data=data, x=x, y=y, hue=hue, col=col,
        capsize=.2,  # errorbar="se",
        kind="point", height=6, aspect=.75)
    g.despine(left=True)

In [None]:
def pointplot(data, x, y, hue,  ax=None):
    """Draw a seaborn point plot of `y` over `x`, coloured by `hue`, onto `ax` (whitegrid theme)."""
    sns.set_theme(style="whitegrid")
    sns.pointplot(data=data, x=x, y=y, hue=hue,
                  capsize=.2,  # errorbar="se",
                  ax=ax)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('corr len')

pointplot(ex_df.loc['LM'], x="task", y="metric", hue="index", ax=axes[0, 0])
pointplot(ex_df.loc['syntactic'], x="task", y="metric", hue="index", ax=axes[0, 1])
pointplot(ex_df.loc['lexical'], x="task", y="metric", hue="index", ax=axes[1, 0])
pointplot(ex_df.loc['graph'], x="task", y="metric", hue="index", ax=axes[1, 1])

# for ax, col in zip(axes[0], ('total', 'positive', 'negative')):
#     ax.set_title(col)

# for ax, row in zip(axes[:,0], ('NAP', 'Dep', 'HC')):
#     ax.set_ylabel(row, rotation=0, size='large')

In [None]:
catplot(ex_df, x="task", y="metric", hue="index", col="type")

## Corr corrected for length

In [None]:
compute_coefficient(chair, ('target', 'panss_total'), ('syntactic', 'mean_sent_len'),
                    column_names=['sent len'], target_name='panss', add_sq=False)[0]

In [None]:
compute_coefficient(chair, ('target', 'panss_total'), ('syntactic', 'AUX'),
                    column_names=['sent len'], target_name='panss', add_sq=False)[0]

In [None]:
compute_ortogonolized_coefficient(chair, ('target', 'panss_total'), ('syntactic', 'mean_sent_len'), ('syntactic', 'AUX'),
                                  column_names=['sent len'], target_name='panss', add_sq=False)[0]

In [None]:
scale_cols = [('target', x) for x in ['panss_pos', 'panss_neg', 'panss_o', 'panss_total']]

#### compare

In [None]:
test_two_groups = ('sz', 'control_psy')
groups = merge_df[merge_df[('target', 'target', 'diagnosis.type')].isin(test_two_groups)]
chair_group = task_data(groups, 'chair')
chair_group[('target', 'group')] = chair_group[('target', 'diagnosis.type')].apply(lambda x: 0 if x =='control_psy' else 1)


In [None]:
a = 0.001
s_t, res_t = display_group_test(chair_group, cols_tasks, ('target', 'diagnosis.type'), stats.ttest_ind, stat_name='t', alpha=a)

In [None]:
# Pearson tests of every `chair` metric against each PANSS scale.
s_panss, r_panss = display_test(chair, cols_tasks, ('target','panss_total'), stats.pearsonr, stat_name='r', alpha=a)
s_panss_pos, r_panss_pos = display_test(chair, cols_tasks, ('target', 'panss_pos'), stats.pearsonr, stat_name='r', alpha=a)
s_panss_neg, r_panss_neg = display_test(chair, cols_tasks, ('target','panss_neg'), stats.pearsonr, stat_name='r', alpha=a)
s_panss_o, r_panss_o = display_test(chair, cols_tasks, ('target', 'panss_o'), stats.pearsonr, stat_name='r', alpha=a)

# Look each result up by its scale name so the concat keys always label the matching frame.
# (Concatenating in a fixed order while keying by `scale_cols` order mislabelled every PANSS
# block, because `scale_cols` lists pos, neg, o, total rather than total, pos, neg, o.)
corr_by_scale = {
    'panss_total': r_panss,
    'panss_pos': r_panss_pos,
    'panss_neg': r_panss_neg,
    'panss_o': r_panss_o,
}
scale_names = [s[1] for s in scale_cols]
combined_corr = pd.concat([res_t] + [corr_by_scale[name] for name in scale_names],
                          keys=['t_test'] + scale_names,
                          names=["scale"], axis=1)

In [None]:
combined_corr[('mean', 'abs_r')] = combined_corr[[(scale[1], 'abs_r') for scale in scale_cols]].mean(axis=1)
mean_abs_r = combined_corr.pop(('mean', 'abs_r'))
combined_corr.insert(0, ('mean', 'abs_r'), mean_abs_r) 

In [None]:
# combined_corr[[(scale, 'abs_r') for scale in ["saps34", "sans12", "panss", "panss_pos", "panss_neg", "panss_o"]]].mean(axis=1)
t_p_n_rank = pd.concat([combined_corr[('t_test', 'abs_t')].rank(ascending=False)] + [combined_corr[(scale, 'abs_r')].rank(ascending=False) for scale in ['panss_pos', 'panss_neg']], axis=1).mean(axis=1)
mean_rank = pd.concat([combined_corr[(scale[1], 'abs_r')].rank(ascending=False) for scale in scale_cols], axis=1).mean(axis=1)
combined_corr.insert(1, ('mean', 'rank'), mean_rank) 
combined_corr.insert(2, ('mean', 'rank_abs'), mean_rank.rank()) 
combined_corr.insert(1, ('mean', 'tpn_rank'), t_p_n_rank) 
combined_corr.insert(2, ('mean', 'tpn_rank_abs'), t_p_n_rank.rank()) 

style(combined_corr.sort_values(('mean', 'rank'))[['mean', 't_test']])

In [None]:
cols_dict = {}
for col in cols_tasks:
    cols_dict[col[1]] = compute_ortogonolized_coefficient(chair, ('target', 'panss_total'), ('syntactic', 'mean_sent_len'), col)[0]
combined_corr.insert(1, ('panss', 'r_corrected_for_mean_len'), pd.Series(cols_dict)) 

In [None]:
cols_dict = {}
for col in cols_tasks:
    cols_dict[col[1]] = compute_ortogonolized_coefficient(chair, ('target', 'panss_pos'), ('syntactic', 'mean_sent_len'), col)[0]
combined_corr.insert(1, ('panss_pos', 'r_corrected_for_mean_len'), pd.Series(cols_dict)) 

In [None]:
cols_dict = {}
for col in cols_tasks:
    cols_dict[col[1]] = compute_ortogonolized_coefficient(chair, ('target', 'panss_neg'), ('syntactic', 'mean_sent_len'), col)[0]
combined_corr.insert(1, ('panss_neg', 'r_corrected_for_mean_len'), pd.Series(cols_dict)) 

In [None]:
cols_dict = {}
for col in cols_tasks:
    cols_dict[col[1]] = compute_ortogonolized_logit(chair_group, ('target', 'group'), ('syntactic', 'mean_sent_len'), col)
combined_corr.insert(1, ('t_test', 'r_corrected_for_mean_len'), pd.Series(cols_dict)) 

In [None]:
t_p_n_corr_rank = pd.concat([combined_corr[('t_test', 'abs_t')].rank(ascending=False)] + [combined_corr[(scale, 'r_corrected_for_mean_len')].rank(ascending=False) for scale in ['panss_pos', 'panss_neg']], axis=1).mean(axis=1)
combined_corr.insert(1, ('mean', 'tpn_rank_corr'), t_p_n_corr_rank) 
combined_corr.insert(2, ('mean', 'tpn_rank_corr_abs'), t_p_n_corr_rank.rank()) 

In [None]:
# combined_corr.sort_values(('mean', 'tpn_rank_corr'), ascending=True).apply(pd.to_numeric).style.background_gradient(axis=0, cmap='Reds')

## bootstrap

In [None]:
def draw_sample_with_replacement(df, seed=None):
    """Return a bootstrap resample of `df`: len(df) rows drawn uniformly with replacement.

    Parameters
    ----------
    df : DataFrame to resample.
    seed : optional seed for the global `random` generator. Any value other than None is used,
        including 0 (a truthiness check would silently leave seed=0 unseeded and make that draw
        irreproducible). With None the current generator state is used as-is.

    Returns
    -------
    DataFrame with the sampled rows selected positionally (duplicates possible).
    """
    if seed is not None:
        random.seed(seed)
    length = len(df.index)
    idxs = choices(range(length), k=length)
    return df.iloc[idxs, :]

# Per-scale, per-metric lists of bootstrap coefficients on the `chair` task:
# `dict_scales_sapmles` collects the output of compute_ortogonolized_coefficient (called with
# ('syntactic', 'mean_sent_len') as the correction column), `dict_scales_raws` the output of
# compute_coefficient. Both helpers are project imports; their exact semantics are not visible here.
dict_scales_sapmles = {scale[1]: {metric: [] for metric in cols_tasks} for scale in scale_cols}
dict_scales_raws = {scale[1]: {metric: [] for metric in cols_tasks} for scale in scale_cols}
for i in tqdm(range(1000)):
    # One resample per iteration, seeded with the iteration number.
    sample = draw_sample_with_replacement(chair, seed=i)
    for scale in scale_cols:
        for col in cols_tasks:
            if col != ('syntactic', 'mean_sent_len'):
                r_corr = compute_ortogonolized_coefficient(sample, scale, ('syntactic', 'mean_sent_len'), col)[0]
                r_raw = compute_coefficient(sample, scale, col)[0]
                dict_scales_raws[scale[1]][col].append(r_raw)
                dict_scales_sapmles[scale[1]][col].append(r_corr)
            else:
                # The correction column cannot be corrected for itself: store a NaN placeholder
                # (only in the corrected dict; the raw dict gets no entry for it).
                dict_scales_sapmles[scale[1]][col].append(np.nan)

# Summarise the bootstrap distribution of the corrected coefficient per scale and metric.
sample_df = pd.DataFrame()
for scale in scale_cols:
    # From here on `scale` is the scale name (second tuple element), shadowing the loop tuple.
    scale = scale[1]
    scale_metric_sd = {}
    scale_metric_md = {}
    scale_metric_mn = {}
    scale_metric_cov = {}
    scale_metric_diff = {}
    scale_metric_q25 = {}
    scale_metric_q75 = {}
    for col in cols_tasks:
        if col != ('syntactic', 'mean_sent_len'):
            scale_metric_sd[col] = np.std(dict_scales_sapmles[scale][col])
            scale_metric_md[col] = np.median(dict_scales_sapmles[scale][col])
            scale_metric_mn[col] = np.mean(dict_scales_sapmles[scale][col])
            scale_metric_q25[col] = np.quantile(dict_scales_sapmles[scale][col], 0.25)
            scale_metric_q75[col] = np.quantile(dict_scales_sapmles[scale][col], 0.75)
            # 'cov' is the coefficient of variation: std / mean of the corrected coefficients.
            scale_metric_cov[col] = np.std(dict_scales_sapmles[scale][col]) / np.mean(dict_scales_sapmles[scale][col])
            # Despite the name, 'diff' is a ratio: mean corrected / mean raw coefficient.
            scale_metric_diff[col] = np.mean(dict_scales_sapmles[scale][col]) / np.mean(dict_scales_raws[scale][col])
            
    sample_df[(scale, 'md')] = pd.Series(scale_metric_md)
    sample_df[(scale, 'mn')] = pd.Series(scale_metric_mn)
    sample_df[(scale, 'sd')] = pd.Series(scale_metric_sd)
    sample_df[(scale, 'cov')] = pd.Series(scale_metric_cov)
    sample_df[(scale, 'dif')] = pd.Series(scale_metric_diff)
    sample_df[(scale, 'q25')] = pd.Series(scale_metric_q25)
    sample_df[(scale, 'q75')] = pd.Series(scale_metric_q75)

# Turn the (scale, statistic) tuple labels into a proper two-level column index.
sample_df.columns = pd.MultiIndex.from_tuples(sample_df.columns, names=('scale', 'res'))
style(sample_df)

In [None]:
def t_test(df, column, target_column, test=stats.ttest_ind, nan_policy='omit', stat_name='x', alpha=0.05,
           group_names=None):
    """Return the statistic of a two-group test on `column`, with groups taken from `target_column`.

    When `group_names` is None the two groups are the distinct non-missing values of
    `target_column` (exactly two are required). The test is called as
    test(values of group_names[1], values of group_names[0]), so the sign of the statistic is
    "second group minus first group". `stat_name` and `alpha` are accepted but not used.
    """
    if group_names is None:
        group_names = df[target_column].dropna().unique().tolist()
        assert len(group_names) == 2, 'only two group tests are supported'

    labels = df[target_column]
    second_group_values = df.loc[labels == group_names[1], column]
    first_group_values = df.loc[labels == group_names[0], column]
    statistic, _ = test(second_group_values, first_group_values, nan_policy=nan_policy)
    return statistic

In [None]:
def bootstrap(df, cols_av, scale_cols, N, col_to_correct_for=('syntactic', 'mean_sent_len'), group=None):
    """Bootstrap per-metric statistics against each scale over `N` resamples of `df`.

    Parameters
    ----------
    df : frame whose columns include the metrics in `cols_av`, the scales in `scale_cols` and,
        when `group` is given, the ('target', group) label column.
    cols_av : metric column labels to evaluate.
    scale_cols : scale column labels to evaluate the metrics against.
    N : number of resamples; resample `i` is drawn with seed `i`.
    col_to_correct_for : metric used as the correction column; it gets no 'r_control' or
        'sample_corr' entries for itself.
    group : optional name of the grouping column under 'target'. When given, a t statistic
        comparing the 'sz' and 'control_psy' groups (sz minus control_psy) is recorded.

    Returns
    -------
    dict measure -> scale -> metric -> list with one scalar per resample, for the measures
    'sample_raw' (compute_coefficient), 'sample_corr' (compute_ortogonolized_coefficient),
    'r' (Pearson r metric~scale), 'r_control' (Pearson r metric~correction column) and 't'.
    """
    measures = ('sample_corr', 'sample_raw', 'r', 't', 'r_control')
    results = {measure: {scale: {metric: [] for metric in cols_av} for scale in scale_cols}
               for measure in measures}

    for i in tqdm(range(N)):
        sample = draw_sample_with_replacement(df, seed=i)

        # These statistics do not depend on the scale, so compute them once per resample
        # instead of once per (scale, metric) pair.
        t_by_col = {}
        r_control_by_col = {}
        for col in cols_av:
            if group:
                # One scalar t per metric; storing the whole all-metric result for every metric
                # made the later per-cell median meaningless.
                t_by_col[col] = t_test(sample, col, ('target', group),
                                       group_names=('control_psy', 'sz'))
            if col != col_to_correct_for:
                paired = sample.dropna(subset=[col, col_to_correct_for])
                # Keep only r: pearsonr returns (r, p) and a median over both would mix them.
                r_control_by_col[col] = stats.pearsonr(paired[col], paired[col_to_correct_for])[0]

        for scale in scale_cols:
            for col in cols_av:
                if group:
                    results['t'][scale][col].append(t_by_col[col])

                r_raw = compute_coefficient(sample, scale, col)[0]
                results['sample_raw'][scale][col].append(r_raw)

                paired = sample.dropna(subset=[col, scale])
                results['r'][scale][col].append(stats.pearsonr(paired[col], paired[scale])[0])

                if col != col_to_correct_for:
                    results['r_control'][scale][col].append(r_control_by_col[col])
                    r_corr = compute_ortogonolized_coefficient(sample, scale, col_to_correct_for, col)[0]
                    results['sample_corr'][scale][col].append(r_corr)

    return results

In [None]:
dict_scales_sapmles = bootstrap(chair, cols_tasks, scale_cols, 10, col_to_correct_for=('syntactic', 'mean_sent_len'), group='diagnosis.type')

In [None]:
reform = {(scale[1], measure): dict_scales_sapmles[measure][scale] for scale in scale_cols for measure in dict_scales_sapmles}

In [None]:
median_bootstrap = pd.DataFrame(reform).applymap(np.nanmedian)

In [None]:
median_bootstrap.loc['LM', 'panss_pos']

In [None]:
d = aplly_to_all_tasks(merge_df, bootstrap, cols_av=cols_tasks, scale_cols=scale_cols, N=10, col_to_correct_for=('syntactic', 'mean_sent_len'), group='diagnosis.type')


In [None]:
d