In [None]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, math
from scipy.optimize import curve_fit
from sklearn.cluster import KMeans
from scipy.stats import ttest_ind

In [None]:
# Directory holding the per-visit feature CSVs used throughout this notebook.
data_dir = '../../../datasets/ppmi/visit_feature_inputs_asof_2019Jan24_using_CMEDTM/'
# Load the totals table and the question-level table from that directory.
pd_total_df, pd_questions_df = (
    pd.read_csv(data_dir + csv_name)
    for csv_name in ('PD_totals_across_time.csv', 'PD_questions_across_time.csv')
)

### Select 20 patients to use for visualizations

In [None]:
# Fixed seed so the shuffled patient order (and hence the selection) is reproducible.
np.random.seed(82073)
patnos = pd_total_df.PATNO.unique()
np.random.shuffle(patnos)

num_selected = 20  # number of patients to keep for the visualizations
min_visits = 5     # minimum number of visits with any Part III total recorded
part3_total_cols = ['NUPDRS3_untreated', 'NUPDRS3_off', 'NUPDRS3_on']

# Collect into a list rather than a pre-allocated np.empty(20): if fewer than
# `num_selected` patients qualify, np.empty would leave uninitialised garbage
# values in the tail that later cells would treat as patient IDs. A list also
# keeps the IDs in their original dtype instead of coercing them to float.
selected = []
for patno in patnos:
    if len(selected) == num_selected:
        break
    patno_df = pd_total_df.loc[pd_total_df['PATNO']==patno]
    # keep only visits where at least one of the Part III totals is present
    patno_df = patno_df.dropna(subset=part3_total_cols, how='all')
    if len(patno_df) >= min_visits:
        selected.append(patno)
selected_patnos = np.array(selected)
selected_patnos

### Calculate subtotals across time split by treatment

In [None]:
def get_4cond_cols(cols):
    """Expand question columns into their per-treatment-condition column names.

    Columns starting with 'NP3' get a condition suffix appended; every other
    column is passed through unchanged in all four lists.

    Returns a 4-tuple of lists, in order: untreated, off, on, maob.
    """
    suffixes = ('_untreated', '_off', '_on', '_maob')
    per_condition = tuple([] for _ in suffixes)
    for name in cols:
        has_conditions = name.startswith('NP3')
        for bucket, suffix in zip(per_condition, suffixes):
            bucket.append(name + suffix if has_conditions else name)
    return per_condition

In [None]:
def get_cond_sum(df, cols, sum_col):
    """Store the row-wise sum of `cols` in `df[sum_col]` and return `df`.

    A row gets NaN when its value in the first column of `cols` is null;
    otherwise it gets the sum over `cols`. `df` is modified in place.
    """
    first_col_missing = pd.isnull(df[cols[0]])
    row_totals = df[cols].sum(axis=1)
    df[sum_col] = np.where(first_col_missing, float('NaN'), row_totals)
    return df

In [None]:
# Question-level columns that are summed into each subtotal.
tremor_cols = [
    'NP3RTALL', 'NP3RTALU', 'NP3KTRML', 'NP3PTRML', 'NP3KTRMR', 'NP3PTRMR', 'NP3RTARU',
    'NP3RTALJ', 'NP3RTARL', 'NP2TRMR', 'NP3RTCON',
]
rigidity_left_cols = ['NP3RIGLU', 'NP3RIGLL', 'NP3PRSPL', 'NP3FTAPL', 'NP3HMOVL', 'NP3LGAGL', 'NP3TTAPL']
rigidity_right_cols = ['NP3RIGRL', 'NP3RIGRU', 'NP3PRSPR', 'NP3FTAPR', 'NP3HMOVR', 'NP3LGAGR', 'NP3TTAPR']
face_cols = ['NP3SPCH', 'NP3RIGN', 'NP3BRADY', 'NP3FACXP']
gait_cols = ['NP3FRZGT', 'NP3PSTBL', 'NP3RISNG', 'NP3GAIT', 'NP3POSTR']
subtotal_cols_dict = {
    'NUPDRS_TREMOR': tremor_cols,
    'NUPDRS_RIGIDITY_LEFT': rigidity_left_cols,
    'NUPDRS_RIGIDITY_RIGHT': rigidity_right_cols,
    'NUPDRS_FACE': face_cols,
    'NUPDRS_GAIT': gait_cols,
}
# For every subtotal, add one summed column per treatment condition
# (<subtotal>_untreated, _off, _on, _maob) to the question-level table.
for subtotal, question_cols in subtotal_cols_dict.items():
    subtotal_untreated_cols, subtotal_off_cols, subtotal_on_cols, subtotal_maob_cols = \
        get_4cond_cols(question_cols)
    for cond_cols, cond_suffix in (
        (subtotal_untreated_cols, '_untreated'),
        (subtotal_off_cols, '_off'),
        (subtotal_on_cols, '_on'),
        (subtotal_maob_cols, '_maob'),
    ):
        pd_questions_df = get_cond_sum(pd_questions_df, cond_cols, subtotal + cond_suffix)

In [None]:
def get_on_off_sums(df, cols, sum_header):
    """Add `<sum_header>_untreated` and `<sum_header>_treated` columns to `df`.

    Each row's sum over `cols` goes into the `_untreated` column when
    'NP3RTALL_untreated' is non-null for that row, and into the `_treated`
    column otherwise; the other column gets NaN. `df` is modified in place
    and returned.
    """
    has_untreated_exam = ~pd.isnull(df['NP3RTALL_untreated'])
    row_totals = df[cols].sum(axis=1)
    missing = float('NaN')
    df[sum_header + '_untreated'] = np.where(has_untreated_exam, row_totals, missing)
    df[sum_header + '_treated'] = np.where(has_untreated_exam, missing, row_totals)
    return df

In [None]:
# Question columns summed into the NUPDRS_DAILYACT untreated/treated totals.
daily_activities = [
    'NP2HWRT', 'NP2FREZ', 'NP2HYGN', 'NP2EAT', 'NP2HOBB', 'NP2WALK',
    'NP2DRES', 'NP2RISE', 'NP2TURN', 'NP2SWAL', 'NP2SALV', 'NP2SPCH',
]
pd_questions_df = get_on_off_sums(
    pd_questions_df, cols=daily_activities, sum_header='NUPDRS_DAILYACT')

In [None]:
# Attach the Part III totals from the totals table to the question-level table,
# matching rows on patient and visit. validate='one_to_one' makes pandas raise
# if either side has duplicate (PATNO, EVENT_ID) keys.
# NOTE: this reassigns pd_questions_df, so the cell is meant to be run once per kernel.
visit_keys = ['PATNO', 'EVENT_ID']
part3_total_cols = ['NUPDRS3_untreated', 'NUPDRS3_on', 'NUPDRS3_off', 'NUPDRS3_maob']
pd_questions_df = pd_questions_df.merge(
    pd_total_df[visit_keys + part3_total_cols],
    on=visit_keys, how='left', validate='one_to_one')

### Fit curves to each treatment setting and make plots for each patient

In [None]:
def lin_func(x, a, b):
    """Straight line with slope `a` and intercept `b`, evaluated at `x`."""
    slope_term = a*x
    return slope_term + b

def quadratic_func(x, a, b, c):
    """Parabola a*x^2 + b*x + c evaluated at `x`."""
    squared_term = a*x**2
    linear_term = b*x
    return squared_term + linear_term + c

def piecewise_lin_func(x, a, b, c, d, e):
    """Two-segment linear function with a changepoint at `e`.

    Evaluates a*x + b where x < e and c*x + d elsewhere. The two segments are
    parameterised independently, so they are not forced to meet at `e`.

    Args:
        x: scalar or array of evaluation points.
        a, b: slope and intercept of the segment left of the changepoint.
        c, d: slope and intercept of the segment at or right of the changepoint.
        e: changepoint location.

    Returns:
        Float ndarray with the same shape as `x`.
    """
    # np.piecewise allocates its output with the dtype of x, so integer inputs
    # would silently truncate the fitted values; always evaluate in float.
    x = np.asarray(x, dtype=float)
    return np.piecewise(x, [x < e], [lambda x: a*x + b, lambda x: c*x + d])

In [None]:
# Column-name prefixes of the scores to plot (one subplot column each),
# paired position-by-position with the titles shown above the subplots.
nupdrs_col_headers = [
    'NUPDRS_TREMOR', 'NUPDRS_RIGIDITY_LEFT', 'NUPDRS_RIGIDITY_RIGHT', 'NUPDRS_FACE', 'NUPDRS_GAIT',
    'NUPDRS_DAILYACT', 'NUPDRS3',
]
nupdrs_col_labels = ['Tremor', 'Rigidity left', 'Rigidity right', 'Face', 'Gait', 'Part II', 'Part III']
plt.rcParams['font.size'] = 18
def _plot_condition_fit(axis, visits_df, value_col, color, label, func, num_params):
    """Scatter one condition's scores over time and overlay a fitted curve when possible."""
    cond_df = visits_df.dropna(subset=[value_col])
    times = cond_df.EVENT_ID_DUR.values
    values = cond_df[value_col].values
    if len(times) == 0:
        return
    axis.scatter(times, values, c=color, label=label)
    # to avoid misspecification, the number of points must be at least the
    # number of parameters before fitting
    if len(times) >= num_params:
        params, _ = curve_fit(func, times, values)
        # evaluate the fit on a 0.01-spaced grid spanning the observed times
        smooth_xs = 0.01*np.arange(100*np.min(times), 100*np.max(times))
        axis.plot(smooth_xs, func(smooth_xs, *params), color)


def make_mdsupdrs_plot(df, func_fit='linear', offset=0):
    """Plot score trajectories with one fixed functional form for five patients.

    Draws a grid with one row per patient (selected_patnos[offset] through
    selected_patnos[offset+4]) and one column per entry of nupdrs_col_headers.
    Each subplot scatters the score against EVENT_ID_DUR separately for every
    treatment condition and overlays a least-squares fit of the chosen form.

    Args:
        df: table with PATNO, EVENT_ID_DUR and the '<header>_<condition>' columns.
        func_fit: 'linear', 'piecewise_linear' or 'quadratic'.
        offset: index into selected_patnos of the first patient to plot.

    Raises:
        AssertionError: if `func_fit` is not one of the three supported names.
    """
    # fit function and its number of free parameters, keyed by name
    fit_choices = {
        'linear': (lin_func, 2),
        'piecewise_linear': (piecewise_lin_func, 5),
        'quadratic': (quadratic_func, 3),
    }
    assert func_fit in fit_choices
    func, num_params = fit_choices[func_fit]

    num_rows = 5
    num_cols = len(nupdrs_col_headers)
    fig, ax = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(4*num_cols, 4*num_rows))
    for row_idx in range(num_rows):
        patno = selected_patnos[row_idx+offset]
        patno_df = df.loc[df['PATNO']==patno]
        for col_idx, col_header in enumerate(nupdrs_col_headers):
            axis = ax[row_idx, col_idx]
            # (column suffix, colour, legend label) for each condition to draw
            if col_header != 'NUPDRS_DAILYACT':
                conditions = [('_untreated', 'b', 'untreated'), ('_off', 'g', '"off" meds'),
                              ('_on', 'r', '"on" meds'), ('_maob', 'y', 'MAO-B')]
            else:
                # this score only has untreated / treated columns
                conditions = [('_untreated', 'b', 'untreated'), ('_treated', 'r', 'treated')]
            cond_cols = [col_header + cond[0] for cond in conditions]
            # drop visits with no value under any condition, then order by time
            patno_col_df = patno_df.dropna(subset=cond_cols, how='all').sort_values(by=['EVENT_ID_DUR'])
            for suffix, color, label in conditions:
                _plot_condition_fit(axis, patno_col_df, col_header + suffix, color, label,
                                    func, num_params)
            axis.set_title(nupdrs_col_labels[col_idx])

    plt.tight_layout()
    # pyplot's legend() targets the current axes only, not every subplot
    plt.legend()
    plt.show()

In [None]:
# Linear fits (the default func_fit) for selected patients 0-4 (the default offset=0).
make_mdsupdrs_plot(pd_questions_df)

In [None]:
# Linear fits (the default func_fit) for selected patients 5-9.
make_mdsupdrs_plot(pd_questions_df, offset=5)

In [None]:
# Piecewise-linear fits for selected patients 0-4; a condition needs at least 5 points to be fitted.
make_mdsupdrs_plot(pd_questions_df, func_fit='piecewise_linear')

In [None]:
# Piecewise-linear fits for selected patients 5-9.
make_mdsupdrs_plot(pd_questions_df, func_fit='piecewise_linear', offset=5)

In [None]:
# Quadratic fits for selected patients 0-4; a condition needs at least 3 points to be fitted.
make_mdsupdrs_plot(pd_questions_df, func_fit='quadratic')

In [None]:
# Quadratic fits for selected patients 5-9.
make_mdsupdrs_plot(pd_questions_df, func_fit='quadratic', offset=5)

### Pick function using MSE + changepoint restriction for piecewise functions

In [None]:
def mse_calc(x, y, func, *params):
    """Mean squared error between the observations `y` and `func(x, *params)`."""
    predictions = func(x, *params)
    residuals = y - predictions
    return np.mean(np.square(residuals))

def pick_func(x, y):
    """Pick the best of a linear, quadratic and piecewise-linear fit by in-sample MSE.

    Candidates depend on the number of points:
      * fewer than 2 points: no fit, returns None;
      * exactly 2 points: the linear fit is returned directly;
      * 3 or more points: linear and quadratic are compared;
      * 5 or more points: piecewise linear is also considered, but only if its
        fitted changepoint lies between the second-smallest and second-largest
        time point (so each segment is supported by at least two observations).
    Ties are resolved in favour of linear, then quadratic.

    Args:
        x: time points (array-like; does not need to be sorted).
        y: observed values, same length as `x`.

    Returns:
        (func, params) for the chosen functional form, or None if len(x) < 2.
        Callers that unpack the result must guard against the None case.
    """
    if len(x) < 2:
        return None
    lin_params, _ = curve_fit(lin_func, x, y)
    if len(x) < 3:
        return lin_func, lin_params
    lin_mse = mse_calc(x, y, lin_func, *lin_params)
    quadratic_params, _ = curve_fit(quadratic_func, x, y)
    quadratic_mse = mse_calc(x, y, quadratic_func, *quadratic_params)

    # The piecewise fit is disqualified (infinite MSE) unless it passes every check below.
    piecewise_lin_params = None
    piecewise_lin_mse = float('inf')
    if len(x) >= 5:
        try:
            piecewise_lin_params, _ = curve_fit(piecewise_lin_func, x, y)
        except RuntimeError:
            # curve_fit raises RuntimeError when the optimiser does not converge;
            # treat that as "piecewise not usable" instead of aborting the selection.
            piecewise_lin_params = None
        if piecewise_lin_params is not None:
            # Compare against order statistics of x rather than x[1] / x[-2]
            # directly, so the restriction is also correct for unsorted input.
            sorted_x = sorted(x)
            changepoint = piecewise_lin_params[-1]
            if changepoint < sorted_x[1] or changepoint > sorted_x[-2]:
                piecewise_lin_mse = float('inf')
            else:
                piecewise_lin_mse = mse_calc(x, y, piecewise_lin_func, *piecewise_lin_params)

    if lin_mse <= quadratic_mse and lin_mse <= piecewise_lin_mse:
        return lin_func, lin_params
    elif quadratic_mse <= piecewise_lin_mse:
        return quadratic_func, quadratic_params
    else:
        return piecewise_lin_func, piecewise_lin_params

In [None]:
# Column-name prefixes of the scores to plot and their subplot titles
# (same values as used by make_mdsupdrs_plot).
nupdrs_col_headers = [
    'NUPDRS_TREMOR', 'NUPDRS_RIGIDITY_LEFT', 'NUPDRS_RIGIDITY_RIGHT', 'NUPDRS_FACE', 'NUPDRS_GAIT',
    'NUPDRS_DAILYACT', 'NUPDRS3',
]
nupdrs_col_labels = ['Tremor', 'Rigidity left', 'Rigidity right', 'Face', 'Gait', 'Part II', 'Part III']
plt.rcParams['font.size'] = 24
# Fixed y-axis limits for each subplot column, aligned with nupdrs_col_headers.
nupdrs_col_maxs = [15, 20, 20, 12, 12, 27, 60]
nupdrs_col_mins = [0] * len(nupdrs_col_maxs)
def _plot_condition_best_fit(axis, visits_df, value_col, color, label, linestyle=None):
    """Scatter one condition's scores over time and overlay the form chosen by pick_func."""
    cond_df = visits_df.dropna(subset=[value_col])
    times = cond_df.EVENT_ID_DUR.values
    values = cond_df[value_col].values
    if len(times) == 0:
        return
    axis.scatter(times, values, c=color, label=label)
    # pick_func returns None below 2 points, so only fit from 2 points upwards
    if len(times) >= 2:
        best_func, best_params = pick_func(times, values)
        # evaluate the fit on a 0.01-spaced grid spanning the observed times
        smooth_xs = 0.01*np.arange(100*np.min(times), 100*np.max(times))
        line_kwargs = {'linewidth': 4}
        if linestyle is not None:
            # leave matplotlib's default line style untouched when none is requested
            line_kwargs['linestyle'] = linestyle
        axis.plot(smooth_xs, best_func(smooth_xs, *best_params), color, **line_kwargs)


def make_mdsupdrs_plot_pick_func(df, offset=0):
    """Plot score trajectories for five patients using the form chosen by pick_func.

    Draws a grid with one row per patient (selected_patnos[offset] through
    selected_patnos[offset+4]) and one column per entry of nupdrs_col_headers.
    Each treatment condition is scattered against EVENT_ID_DUR and overlaid with
    the fit returned by pick_func, using a distinct colour and line style per
    condition. The y-axis of every column is fixed to the limits in
    nupdrs_col_mins / nupdrs_col_maxs. The figure is saved to a PDF whose name
    includes `offset`, then shown.

    Args:
        df: table with PATNO, EVENT_ID_DUR and the '<header>_<condition>' columns.
        offset: index into selected_patnos of the first patient to plot.
    """
    num_rows = 5
    num_cols = len(nupdrs_col_headers)
    fig, ax = plt.subplots(nrows=num_rows, ncols=num_cols, figsize=(4*num_cols, 4*num_rows))
    for row_idx in range(num_rows):
        patno = selected_patnos[row_idx+offset]
        patno_df = df.loc[df['PATNO']==patno]
        for col_idx, col_header in enumerate(nupdrs_col_headers):
            axis = ax[row_idx, col_idx]
            # (column suffix, colour, legend label, line style) per condition;
            # a None line style keeps matplotlib's default
            if col_header != 'NUPDRS_DAILYACT':
                conditions = [('_untreated', 'b', 'untreated', None),
                              ('_off', 'g', '"off" meds', 'dashdot'),
                              ('_on', 'r', '"on" meds', 'dotted'),
                              ('_maob', 'y', 'MAO-B', 'dashed')]
            else:
                # this score only has untreated / treated columns
                conditions = [('_untreated', 'b', 'untreated', None),
                              ('_treated', 'r', 'treated', 'dotted')]
            cond_cols = [col_header + cond[0] for cond in conditions]
            # drop visits with no value under any condition, then order by time
            patno_col_df = patno_df.dropna(subset=cond_cols, how='all').sort_values(by=['EVENT_ID_DUR'])
            for suffix, color, label, linestyle in conditions:
                _plot_condition_best_fit(axis, patno_col_df, col_header + suffix, color, label,
                                         linestyle)
            axis.set_title(nupdrs_col_labels[col_idx])
            axis.set_ylim([nupdrs_col_mins[col_idx], nupdrs_col_maxs[col_idx]])

    plt.tight_layout()
    # pyplot's legend() targets the current axes only, not every subplot
    plt.legend()
    plt.savefig('nupdrs_subtotals_nonlinear_offset' + str(offset) + '_asof_2019Jul31.pdf')
    plt.show()

In [None]:
# Best-fit-form plots for selected patients 0-4 (the default offset=0); also saves the figure as a PDF.
make_mdsupdrs_plot_pick_func(pd_questions_df)

In [None]:
# Best-fit-form plots for selected patients 5-9; also saves the figure as a PDF.
make_mdsupdrs_plot_pick_func(pd_questions_df, offset=5)

### Run k-means clustering on parameters and classify clusters from baseline

In [None]:
def get_patient_params(patno_df):
    """Fit every score/condition trajectory of one patient and return the parameters.

    Returns a dict: score header -> {treatment condition -> array of 7 params}.
    The 7 slots are [a1, b1, c1, a2, b2, c2, e], i.e. quadratic, linear and
    constant coefficients of a first and a second segment, plus a changepoint:
      * linear fit (a, b):              [0, a, b, 0, a, b, 0]
      * quadratic fit (a, b, c):        [a, b, c, a, b, c, 0]
      * piecewise linear (a, b, c, d, e): [0, a, b, 0, c, d, e]
    A condition with fewer than 2 observations gets all zeros.

    Args:
        patno_df: rows of a single patient, with EVENT_ID_DUR and the
            '<header>_<condition>' columns. Not modified.
    """
    # Order visits by time before fitting, as the plotting functions do:
    # pick_func's changepoint restriction compares against the second and
    # second-to-last time points, which is only meaningful for sorted times.
    patno_df = patno_df.sort_values(by=['EVENT_ID_DUR'])
    patno_param_dict = dict()
    for col_header in nupdrs_col_headers:
        if col_header == 'NUPDRS_DAILYACT':
            col_endings = ('untreated', 'treated')
        else:
            col_endings = ('untreated', 'on', 'off', 'maob')
        patno_param_dict[col_header] = dict()
        for col_ending in col_endings:
            value_col = col_header + '_' + col_ending
            patno_col_df = patno_df.dropna(subset=[value_col])
            patno_col_times = patno_col_df.EVENT_ID_DUR.values
            patno_col_values = patno_col_df[value_col].values
            if len(patno_col_times) < 2:
                # too few observations to fit anything: encode as all zeros
                patno_param_dict[col_header][col_ending] = np.zeros(7)
                continue
            _, func_params = pick_func(patno_col_times, patno_col_values)
            # the chosen functional form is identified by its parameter count
            if len(func_params) == 2: # linear
                formatted_params = np.array([0, func_params[0], func_params[1],
                                             0, func_params[0], func_params[1], 0])
            elif len(func_params) == 3: # quadratic
                formatted_params = np.array([func_params[0], func_params[1], func_params[2],
                                             func_params[0], func_params[1], func_params[2], 0])
            else: # piecewise linear
                formatted_params = np.array([0, func_params[0], func_params[1],
                                             0, func_params[2], func_params[3], func_params[4]])
            patno_param_dict[col_header][col_ending] = formatted_params
    return patno_param_dict

In [None]:
# Fit all trajectories for every patient: PATNO -> nested parameter dict.
patno_params_dict = dict()
for patno in pd_questions_df.PATNO.unique():
    print(patno)
    is_this_patient = pd_questions_df['PATNO']==patno
    patno_df = pd_questions_df.loc[is_this_patient]
    patno_params_dict[patno] = get_patient_params(patno_df)

In [None]:
# Display the fitted-parameter dict of one example patient (PATNO 3001).
patno_params_dict[3001]

In [None]:
# gather parameters into one row per patient
# list(...) / sorted(...) instead of indexing and sorting dict.keys() in place,
# so this also runs on Python 3, where keys() is a view rather than a list.
patnos = list(patno_params_dict.keys())
subtotals = sorted(patno_params_dict[patnos[0]].keys())
# one entry per (score, treatment setting), in sorted order; each owns 7 array columns
param_cols = []
for subtotal in subtotals:
    subtotal_settings = sorted(patno_params_dict[patnos[0]][subtotal].keys())
    for treatment_setting in subtotal_settings:
        param_cols.append(subtotal + '_' + treatment_setting)
patno_params_arr = np.empty((len(patnos), len(param_cols)*7))
print(patno_params_arr.shape)
for patno_idx, patno in enumerate(patnos):
    param_idx = 0
    for subtotal in subtotals:
        subtotal_settings = sorted(patno_params_dict[patnos[0]][subtotal].keys())
        for treatment_setting in subtotal_settings:
            patno_params_arr[patno_idx, param_idx:param_idx+7] = patno_params_dict[patno][subtotal][treatment_setting]
            param_idx += 7
    # every column of the row must have been filled, since np.empty does not initialise
    assert param_idx == patno_params_arr.shape[1]

In [None]:
# Fit k-means with 2 clusters and print each cluster's size.
kmeans_2clusters = KMeans(n_clusters=2, random_state=0).fit(patno_params_arr)
labels_2 = kmeans_2clusters.labels_
cluster0_idxs = np.nonzero(labels_2 == 0)[0]
# every non-zero label belongs to the other cluster
cluster1_idxs = np.nonzero(labels_2 != 0)[0]
for cluster_idxs in (cluster0_idxs, cluster1_idxs):
    print(len(cluster_idxs))

In [None]:
# Display the fitted centroids of the 2-cluster solution.
kmeans_2clusters.cluster_centers_

In [None]:
# Fit k-means with 3 clusters and print each cluster's size.
kmeans_3clusters = KMeans(n_clusters=3, random_state=0).fit(patno_params_arr)
labels_3 = kmeans_3clusters.labels_
cluster0_idxs, cluster1_idxs, cluster2_idxs = (
    np.nonzero(labels_3 == cluster_label)[0] for cluster_label in range(3)
)
for cluster_idxs in (cluster0_idxs, cluster1_idxs, cluster2_idxs):
    print(len(cluster_idxs))

In [None]:
# Fit k-means with 8 clusters and print each cluster's size.
kmeans_8clusters = KMeans(n_clusters=8, random_state=0).fit(patno_params_arr)
labels_8 = kmeans_8clusters.labels_
for idx in range(8):
    print(len(np.nonzero(labels_8 == idx)[0]))

In [None]:
# Fit k-means with 4 clusters and print each cluster's size.
kmeans_4clusters = KMeans(n_clusters=4, random_state=0).fit(patno_params_arr)
labels_4 = kmeans_4clusters.labels_
for idx in range(4):
    print(len(np.nonzero(labels_4 == idx)[0]))

In [None]:
# Fit k-means with 5 clusters and print each cluster's size.
kmeans_5clusters = KMeans(n_clusters=5, random_state=0).fit(patno_params_arr)
labels_5 = kmeans_5clusters.labels_
for idx in range(5):
    print(len(np.nonzero(labels_5 == idx)[0]))

In [None]:
# Fit k-means with 6 clusters and print each cluster's size.
kmeans_6clusters = KMeans(n_clusters=6, random_state=0).fit(patno_params_arr)
labels_6 = kmeans_6clusters.labels_
for idx in range(6):
    print(len(np.nonzero(labels_6 == idx)[0]))

In [None]:
# Fit k-means with 7 clusters and print each cluster's size.
kmeans_7clusters = KMeans(n_clusters=7, random_state=0).fit(patno_params_arr)
labels_7 = kmeans_7clusters.labels_
for idx in range(7):
    print(len(np.nonzero(labels_7 == idx)[0]))

In [None]:
# Use 7 clusters - study the 2 largest clusters
# NOTE(review): labels 0 and 2 are hard-coded; presumably they were the two
# largest clusters in the run above - re-check if the clustering changes.
# From here on "cluster 0" is k-means label 0 and "cluster 1" is k-means label 2.
seven_cluster_labels = kmeans_7clusters.labels_
cluster0_idxs = np.nonzero(seven_cluster_labels == 0)[0]
cluster1_idxs = np.nonzero(seven_cluster_labels == 2)[0]
patno_arr = np.array(patnos)
cluster0_patnos = patno_arr[cluster0_idxs]
cluster1_patnos = patno_arr[cluster1_idxs]
# fancy indexing already yields fresh arrays holding only the selected rows
cluster0_params = patno_params_arr[cluster0_idxs]
cluster1_params = patno_params_arr[cluster1_idxs]

In [None]:
# Name every column of patno_params_arr: '<score>_<setting>_<a..g>', 7 per setting.
letters = ['a','b','c','d','e','f','g']
param_7_cols = [col + '_' + letter for col in param_cols for letter in letters]
# Welch t-test (unequal variances) per parameter between the two clusters;
# print mean (std) of both clusters for every parameter with p < 0.05.
signif_param_idxs = []
for param_idx in range(patno_params_arr.shape[1]):
    cluster0_vals = cluster0_params[:,param_idx]
    cluster1_vals = cluster1_params[:,param_idx]
    _, pval = ttest_ind(cluster0_vals, cluster1_vals, equal_var=False)
    if not pval < 0.05:
        # also skips NaN p-values, which compare False
        continue
    signif_param_idxs.append(param_idx)
    print(param_7_cols[param_idx] + ': {0:.4f}'.format(np.mean(cluster0_vals))
          + ' ({0:.4f}), '.format(np.std(cluster0_vals))
          + '{0:.4f}'.format(np.mean(cluster1_vals))
          + ' ({0:.4f})'.format(np.std(cluster1_vals)))
#print(np.array(param_7_cols)[signif_param_idxs])
print(len(signif_param_idxs))

In [None]:
# Same per-parameter Welch t-test as the previous cell, with the stricter threshold p < 0.0005.
letters = ['a','b','c','d','e','f','g']
param_7_cols = [col + '_' + letter for col in param_cols for letter in letters]
signif_param_idxs = []
for param_idx in range(patno_params_arr.shape[1]):
    cluster0_vals = cluster0_params[:,param_idx]
    cluster1_vals = cluster1_params[:,param_idx]
    _, pval = ttest_ind(cluster0_vals, cluster1_vals, equal_var=False)
    if not pval < 0.0005:
        # also skips NaN p-values, which compare False
        continue
    signif_param_idxs.append(param_idx)
    print(param_7_cols[param_idx] + ': {0:.4f}'.format(np.mean(cluster0_vals))
          + ' ({0:.4f}), '.format(np.std(cluster0_vals))
          + '{0:.4f}'.format(np.mean(cluster1_vals))
          + ' ({0:.4f})'.format(np.std(cluster1_vals)))
#print(np.array(param_7_cols)[signif_param_idxs])
print(len(signif_param_idxs))

In [None]:
# get significantly different baseline features
# Build one row per patient: the static baseline table plus selected
# longitudinal measures taken at EVENT_ID_DUR == 0 and EVENT_ID_DUR == 0.125.
datadir = '../gather_PD_data/'
baseline_df = pd.read_csv(datadir + 'selected_baseline_data_using_CMEDTM.csv')
del baseline_df['ENROLL_CAT']
longitudinal_df = pd.read_csv(datadir + 'selected_longitudinal_data_using_CMEDTM.csv')
screening_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0]
baseline_longitudinal_df = longitudinal_df.loc[longitudinal_df['EVENT_ID_DUR']==0.125]
screening_longitudinal_cols = [
    'NUPDRS1', 'MOCA', 'NUPDRS2_DAILYACT', 'NUPDRS3_GAIT', 'NUPDRS3_RIGID_RIGHT',
    'NUPDRS3_FACE', 'NUPDRS3_TREMOR', 'NUPDRS3_RIGID_LEFT',
]
baseline_longitudinal_cols = [
    'SCOPA-AUT', 'HVLT_discrim_recog', 'STAI', 'HVLT_immed_recall', 'QUIP', 'EPWORTH',
    'GDSSHORT', 'HVLT_retent', 'BJLO', 'LNS', 'SEMANTIC_FLUENCY', 'REMSLEEP',
]
# Default (inner) merges: only patients present in every table are kept.
for visit_df, visit_cols in (
    (screening_longitudinal_df, screening_longitudinal_cols),
    (baseline_longitudinal_df, baseline_longitudinal_cols),
):
    baseline_df = baseline_df.merge(visit_df[['PATNO']+visit_cols], on=['PATNO'],
                                    validate='one_to_one')

In [None]:
# Split the baseline table into the rows of the two clusters under study.
in_cluster0 = baseline_df['PATNO'].isin(cluster0_patnos)
in_cluster1 = baseline_df['PATNO'].isin(cluster1_patnos)
cluster0_baseline_df = baseline_df.loc[in_cluster0]
cluster1_baseline_df = baseline_df.loc[in_cluster1]

In [None]:
# Welch t-test per baseline feature between the two clusters (threshold p < 0.0005).
# The first column of baseline_df is skipped; nulls are dropped per cluster.
signif_baseline_feats = []
for col in baseline_df.columns.values[1:]:
    cluster0_col = cluster0_baseline_df[col].dropna().values
    cluster1_col = cluster1_baseline_df[col].dropna().values
    _, pval = ttest_ind(cluster0_col, cluster1_col, equal_var=False)
    if not pval < 0.0005:
        continue
    signif_baseline_feats.append(col)
    print(col + ': {0:.4f}'.format(np.mean(cluster0_col)) + ' ({0:.4f}), '.format(np.std(cluster0_col))
          + '{0:.4f}'.format(np.mean(cluster1_col)) + ' ({0:.4f})'.format(np.std(cluster1_col)))
#print(signif_baseline_feats)

In [None]:
# Welch t-test per baseline feature between the two clusters (threshold p < 0.05).
# The first column of baseline_df is skipped; nulls are dropped per cluster.
signif_baseline_feats = []
for col in baseline_df.columns.values[1:]:
    cluster0_col = cluster0_baseline_df[col].dropna().values
    cluster1_col = cluster1_baseline_df[col].dropna().values
    _, pval = ttest_ind(cluster0_col, cluster1_col, equal_var=False)
    if not pval < 0.05:
        continue
    signif_baseline_feats.append(col)
    print(col + ': {0:.4f}'.format(np.mean(cluster0_col)) + ' ({0:.4f}), '.format(np.std(cluster0_col))
          + '{0:.4f}'.format(np.mean(cluster1_col)) + ' ({0:.4f})'.format(np.std(cluster1_col)))
#print(signif_baseline_feats)

In [None]:
# Welch t-test per baseline feature between the two clusters (threshold p < 0.005).
# The first column of baseline_df is skipped; nulls are dropped per cluster.
signif_baseline_feats = []
for col in baseline_df.columns.values[1:]:
    cluster0_col = cluster0_baseline_df[col].dropna().values
    cluster1_col = cluster1_baseline_df[col].dropna().values
    _, pval = ttest_ind(cluster0_col, cluster1_col, equal_var=False)
    if not pval < 0.005:
        continue
    signif_baseline_feats.append(col)
    print(col + ': {0:.4f}'.format(np.mean(cluster0_col)) + ' ({0:.4f}), '.format(np.std(cluster0_col))
          + '{0:.4f}'.format(np.mean(cluster1_col)) + ' ({0:.4f})'.format(np.std(cluster1_col)))
#print(signif_baseline_feats)

In [None]:
# get the proportion of patients that use each functional form
# The form is read back from the 7-slot parameter vector, checked in this order:
#   quadratic        if a != 0 (first slot)
#   piecewise linear if g != 0 (last slot, the changepoint)
#   linear           otherwise
# All-zero vectors are skipped (not counted under any form).
setting_counts = dict()
for subtotal in patno_params_dict[patnos[0]].keys():
    setting_counts[subtotal] = dict()
    for setting in patno_params_dict[patnos[0]][subtotal].keys():
        setting_counts[subtotal][setting] = {'linear': 0, 'piecewise_linear': 0, 'quadratic': 0}
for patno, subtotal_params in patno_params_dict.items():
    for subtotal, setting_params in subtotal_params.items():
        for setting, params in setting_params.items():
            if len(np.nonzero(params)[0]) == 0:
                continue
            if params[0] != 0:
                form = 'quadratic'
            elif params[-1] != 0:
                form = 'piecewise_linear'
            else:
                form = 'linear'
            setting_counts[subtotal][setting][form] += 1

In [None]:
# Show the functional-form counts per score and treatment setting, over all patients.
print(setting_counts)

In [None]:
# get proportion of patients with each functional form in each cluster
# Same form classification as for setting_counts, tallied separately for the
# two clusters under study; patients in neither cluster are ignored.
cluster0_setting_counts = dict()
cluster1_setting_counts = dict()
for subtotal in patno_params_dict[patnos[0]].keys():
    cluster0_setting_counts[subtotal] = dict()
    cluster1_setting_counts[subtotal] = dict()
    for setting in patno_params_dict[patnos[0]][subtotal].keys():
        cluster0_setting_counts[subtotal][setting] = {'linear': 0, 'piecewise_linear': 0, 'quadratic': 0}
        cluster1_setting_counts[subtotal][setting] = {'linear': 0, 'piecewise_linear': 0, 'quadratic': 0}
for patno in patno_params_dict.keys():
    # decide once per patient which tally (if any) this patient contributes to
    if patno in cluster0_patnos:
        cluster_counts = cluster0_setting_counts
    elif patno in cluster1_patnos:
        cluster_counts = cluster1_setting_counts
    else:
        continue
    for subtotal in patno_params_dict[patno].keys():
        for setting in patno_params_dict[patno][subtotal].keys():
            params = patno_params_dict[patno][subtotal][setting]
            if len(np.nonzero(params)[0]) == 0:
                continue
            if params[0] != 0:
                form = 'quadratic'
            elif params[-1] != 0:
                form = 'piecewise_linear'
            else:
                form = 'linear'
            cluster_counts[subtotal][setting][form] += 1

In [None]:
# Show the functional-form counts for patients in cluster 0.
print(cluster0_setting_counts)

In [None]:
# Show the functional-form counts for patients in cluster 1.
print(cluster1_setting_counts)

In [None]:
# Make barplots for each set of distributions
def make_barplot(setting_counts, cluster0_setting_counts, cluster1_settting_counts):
    """Bar-plot how often each functional form was chosen, overall and per cluster.

    Draws a grid with one column per score (the 'NUPDRS3' total moved to the
    end) and one row per treatment setting. Each subplot shows three groups of
    bars (linear, quadratic, piecewise linear) with the overall count and the
    counts of cluster 0 and cluster 1 side by side.

    Args:
        setting_counts: score -> setting -> {'linear', 'quadratic',
            'piecewise_linear'} -> count, over all patients.
        cluster0_setting_counts: same structure, for cluster 0 only.
        cluster1_settting_counts: same structure, for cluster 1 only. The
            misspelling is kept so existing keyword callers keep working.
    """
    # Bind the argument to the correctly spelled name; otherwise the body would
    # silently read the notebook-level global and ignore what was passed in.
    cluster1_setting_counts = cluster1_settting_counts
    plt.rcParams.update({'font.size': 14})
    # list(...) so that .index and slicing also work on Python 3 key views
    subtotals = list(setting_counts.keys())
    part3_total_idx = subtotals.index('NUPDRS3') # move this to end
    subtotals = subtotals[:part3_total_idx] + subtotals[part3_total_idx+1:] + ['NUPDRS3']
    form_order = ['linear', 'quadratic', 'piecewise_linear']  # bar-group order
    row_settings = ['untreated', 'on', 'off', 'maob']         # one subplot row each
    last_row = len(row_settings) - 1
    fig, ax = plt.subplots(nrows=len(row_settings), ncols=len(subtotals), sharey=True, sharex=True,
                           figsize=(20,10))
    x = np.array([0,1,2])
    for subtotal_idx, subtotal in enumerate(subtotals):
        if subtotal == 'NUPDRS_DAILYACT':
            # only two settings exist for this score; its lower two rows stay empty
            settings = ['untreated', 'treated']
        else:
            settings = row_settings
        for row_idx, setting in enumerate(settings):
            total_y = np.array([setting_counts[subtotal][setting][form] for form in form_order])
            cluster0_y = np.array([cluster0_setting_counts[subtotal][setting][form]
                                   for form in form_order])
            cluster1_y = np.array([cluster1_setting_counts[subtotal][setting][form]
                                   for form in form_order])
            axis = ax[row_idx,subtotal_idx]
            axis.bar(x-0.2, total_y, width=0.2, color='b', align='center', label='Total')
            axis.bar(x, cluster0_y, width=0.2, color='g', align='center', label='Cluster 0')
            axis.bar(x+0.2, cluster1_y, width=0.2, color='r', align='center', label='Cluster 1')
        if subtotal == 'NUPDRS3':
            subtotal_title = 'Part III total'
        else:
            # strip the 'NUPDRS_' prefix and prettify the rest
            subtotal_title = subtotal[7:].lower().replace('_', ' ')
        ax[0,subtotal_idx].set_title(subtotal_title)
        # Pin the ticks to the bar-group centres before labelling them, so the
        # labels cannot drift onto whatever ticks the default locator picks.
        ax[last_row,subtotal_idx].set_xticks(x)
        ax[last_row,subtotal_idx].set_xticklabels(form_order, rotation=90)
    for row_idx, setting in enumerate(row_settings):
        ax[row_idx,0].set_ylabel(setting)
    ax[last_row,len(subtotals)-1].legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Compare functional-form counts overall and between the two clusters under study.
make_barplot(setting_counts, cluster0_setting_counts, cluster1_setting_counts)

The differences between the two clusters do not seem to lie in the distribution of functional forms.<br>
The 7-parameter encoding also seems to generate more dimensions than were originally present in the data... is this only needed to handle differing numbers of data points per patient?