In [2]:
import numpy as np
import pandas as pd
import os
from scipy.stats import spearmanr,pearsonr
from statsmodels.stats.multitest import multipletests
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Select the cognitively normal (CN) subjects that serve as the basis for the
# semi-simulated data.
data = pd.read_csv('DATA.csv')   # path of your dataset

CN_data = data[data['DIAGNOSIS']=='CN']
print('No of cognitively normal participants (CN)', len(CN_data))

# Arrange the columns: ROI columns first, followed by MMSE and the subject id.
# The list is built programmatically because a literal `...` inside a Python
# list is the Ellipsis object, which pandas rejects as a column label.
# Adapt the names here if your dataset labels its ROIs differently.
roi_columns = [f'ROI{i}' for i in range(1, 146)]   # 'ROI1' ... 'ROI145'
# .copy() detaches CN_data from `data`, so the column assignments below do not
# operate on a slice of another frame (avoids SettingWithCopyWarning).
CN_data = CN_data[roi_columns + ['MMSE', 'Subject ID']].copy()

# Normalize the data: divide every ROI value by the subject's sum over all ROIs.
CN_data['Sum'] = CN_data[roi_columns].sum(axis=1)
# After adding 'Sum' the last three columns are MMSE, Subject ID and Sum;
# everything before them is an ROI column.
numeric_columns = CN_data.columns[:-3]
CN_data[numeric_columns] = CN_data[numeric_columns].div(CN_data['Sum'], axis=0)

In [None]:
# Calculate the Spearman correlations (with the corresponding p-values) between
# each ROI and MMSE using CN_data, and select the 10 ROIs with the highest
# positive correlation to perturb.
# These perturbations represent the pseudo-patient patterns.

roi_columns_for_corr = CN_data.columns[:-3]  # consider only the ROI columns

corr_list = []
p_val_list = []

for column in roi_columns_for_corr:
    # nan_policy='omit' ignores subjects with a missing value in either series
    corr, p_val = spearmanr(CN_data['MMSE'], CN_data[column], nan_policy='omit')
    corr_list.append(corr)
    p_val_list.append(p_val)

# Adjust p-values for multiple testing (Benjamini-Hochberg FDR)
fdr_corr_p_val = multipletests(p_val_list, method='fdr_bh')[1]

# Create DataFrame with correlation results, strongest positive correlation first
correlation_results = pd.DataFrame({
    'Column': roi_columns_for_corr,
    'Correlation': corr_list,
    'p-value': p_val_list,
    'FDR Corrected p-value': fdr_corr_p_val
})
correlation_results = correlation_results.sort_values(by='Correlation', ascending=False)

# Select the 10 columns with the highest positive correlation with MMSE, along
# with the MMSE column itself, to perturb the data.
# The frame is sorted in descending order, so these are the FIRST ten rows;
# taking the last ten would pick the most negative (or NaN) correlations.
columns_to_change = correlation_results['Column'].head(10).tolist() + ['MMSE']
print(columns_to_change)

In [None]:
# Writer for the workbook that receives the perturbed ROI / MMSE results.
excel_writer2 = pd.ExcelWriter('path to file containing the ROI and MMSE perturbations results.xlsx')

# One frame per MMSE score from 30 down to 23: rows of CN_data with that score,
# re-indexed from 0 and without the helper 'Sum' column.
df_30, df_29, df_28, df_27, df_26, df_25, df_24, df_23 = (
    CN_data[CN_data['MMSE'] == mmse_score].reset_index().drop(columns=['index', 'Sum'])
    for mmse_score in range(30, 22, -1)
)

# Get the participants with MMSE=30 and perturb half of them.
df_30_sim = df_30.copy()
# Randomly chosen half that stays unperturbed ...
df_30_sim_org = df_30_sim.sample(n=len(df_30_sim) // 2, random_state=42)
# ... and the complementary half that gets perturbed (must be taken before the
# indices are reset, while both frames still share df_30_sim's labels).
df_30_sim_rest = df_30_sim.drop(df_30_sim_org.index)
df_30_sim_org = df_30_sim_org.reset_index(drop=True)
df_30_sim_rest = df_30_sim_rest.reset_index(drop=True)

# Seeded generator so the simulated perturbations are reproducible on re-run.
rng_30 = np.random.default_rng(42)

# Relative changes per subject; bounds are passed as (low, high) because
# uniform() is undefined for low > high.
vol_change_30 = rng_30.uniform(-0.97, -0.3, size=len(df_30_sim_rest))
mmse_change_30 = rng_30.uniform(-0.35, -0.17, size=len(df_30_sim_rest))

for col in columns_to_change:
    if col == 'MMSE':
        # Scale first, then truncate to an integer score. Casting the factor
        # itself to int would turn every factor (< 1) into 0.
        df_30_sim_rest[col] = (df_30_sim_rest[col] * (1 + mmse_change_30)).astype(int)
    else:
        df_30_sim_rest[col] *= (1 + vol_change_30)

# Additional ROI reduction that depends on the perturbed MMSE score:
# perturbed MMSE -> (low, high) bounds of the extra relative volume change.
extra_vol_ranges_30 = {
    24: (-0.20, -0.10),
    23: (-0.45, -0.20),
    22: (-0.70, -0.35),
    21: (-0.85, -0.55),
    20: (-0.93, -0.70),
    19: (-0.98, -0.80),
}
roi_cols_30 = [col for col in columns_to_change if col != 'MMSE']

for mmse_value, (low, high) in extra_vol_ranges_30.items():
    rows = df_30_sim_rest['MMSE'] == mmse_value   # subjects that ended up at this score
    n_rows = int(rows.sum())
    for col in roi_cols_30:
        # a fresh draw per ROI column, one value per affected subject
        extra_change = rng_30.uniform(low, high, size=n_rows)
        df_30_sim_rest.loc[rows, col] *= 1 + extra_change

# Get the participants with MMSE=29 and perturb half of them.
df_29_sim = df_29.copy()
# Randomly chosen half that stays unperturbed ...
df_29_sim_org = df_29_sim.sample(n=len(df_29_sim) // 2, random_state=43)
# ... and the complementary half that gets perturbed (must be taken before the
# indices are reset, while both frames still share df_29_sim's labels).
df_29_sim_rest = df_29_sim.drop(df_29_sim_org.index)
df_29_sim_org = df_29_sim_org.reset_index(drop=True)
df_29_sim_rest = df_29_sim_rest.reset_index(drop=True)

# Seeded generator so the simulated perturbations are reproducible on re-run.
rng_29 = np.random.default_rng(43)

# Relative changes per subject; bounds are passed as (low, high) because
# uniform() is undefined for low > high.
vol_change_29 = rng_29.uniform(-0.7, -0.1, size=len(df_29_sim_rest))
mmse_change_29 = rng_29.uniform(-0.13, -0.08, size=len(df_29_sim_rest))

for col in columns_to_change:
    if col == 'MMSE':
        # Scale first, then truncate to an integer score. Casting the factor
        # itself to int would turn every factor (< 1) into 0.
        df_29_sim_rest[col] = (df_29_sim_rest[col] * (1 + mmse_change_29)).astype(int)
    else:
        df_29_sim_rest[col] *= (1 + vol_change_29)

# Additional ROI reduction that depends on the perturbed MMSE score:
# perturbed MMSE -> (low, high) bounds of the extra relative volume change.
extra_vol_ranges_29 = {
    26: (-0.20, -0.10),
    25: (-0.45, -0.20),
}
roi_cols_29 = [col for col in columns_to_change if col != 'MMSE']

for mmse_value, (low, high) in extra_vol_ranges_29.items():
    rows = df_29_sim_rest['MMSE'] == mmse_value   # subjects that ended up at this score
    n_rows = int(rows.sum())
    for col in roi_cols_29:
        # a fresh draw per ROI column, one value per affected subject
        extra_change = rng_29.uniform(low, high, size=n_rows)
        df_29_sim_rest.loc[rows, col] *= 1 + extra_change

# Get the participants with MMSE=28 and perturb half of them.
df_28_sim = df_28.copy()
# Randomly chosen half that stays unperturbed, and the complementary half that
# gets perturbed (split before the indices are reset).
df_28_sim_org = df_28_sim.sample(n=len(df_28_sim) // 2, random_state=44)
df_28_sim_rest = df_28_sim.drop(df_28_sim_org.index)
df_28_sim_org = df_28_sim_org.reset_index(drop=True)
df_28_sim_rest = df_28_sim_rest.reset_index(drop=True)

# Seeded generator so the simulated perturbations are reproducible on re-run.
rng_28 = np.random.default_rng(44)

# Relative changes per subject; bounds are passed as (low, high) because
# uniform() is undefined for low > high.
vol_change_28 = rng_28.uniform(-0.15, -0.1, size=len(df_28_sim_rest))
mmse_change_28 = rng_28.uniform(-0.03, -0.025, size=len(df_28_sim_rest))

for col in columns_to_change:
    if col == 'MMSE':
        # Scale first, then truncate to an integer score. Casting the factor
        # itself to int would turn every factor (< 1) into 0.
        df_28_sim_rest[col] = (df_28_sim_rest[col] * (1 + mmse_change_28)).astype(int)
    else:
        df_28_sim_rest[col] *= (1 + vol_change_28)

# Combine all perturbed and non-perturbed samples into one frame
# (fresh 0..n-1 index) and export it.
sim_df = pd.concat(
    [
        df_30_sim_org, df_30_sim_rest, df_24, df_23,
        df_29_sim_org, df_29_sim_rest, df_26, df_25,
        df_28_sim_org, df_28_sim_rest, df_27,
    ],
    axis=0,
    ignore_index=True,
)
sim_df.to_excel(excel_writer2, index=False)
# to_excel() on an ExcelWriter only buffers the sheet; the workbook is written
# to disk when the writer is closed.
excel_writer2.close()