# Figures - Original CNN Score Comparisons
Last updated 9 April 2020

Various comparisons of the CNN scores by CERAD-like categories, Reagan scores, and pathological diagnosis as well as gray matter analysis of the scores. These were the original comparisons done before final publication.

In [None]:
%reset -f
import os
import sys
from os.path import join as oj

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import spearmanr

sys.path.append('..')  # lets `modules` be imported from the parent directory
from modules import plotting

# Where figures are written and which CSV is loaded into `df`.
save_dir = '/mnt/data/figures/'
data_path = '../CSVs/Emory_data.csv'

# Plaque classes and the matching score-column templates; the '{}' slot is
# filled with a region prefix ('tissue' or 'gm') where the columns are used.
labels = ['Cored', 'Diffuse', 'CAA']
score_columns = ['{}_' + label.lower() + '_score' for label in labels]

os.makedirs(save_dir, exist_ok=True)
df = pd.read_csv(data_path)

#### CNN vs CERAD-like scores

Recreation of the box plots shown in the Tang paper, this time incorporating the Emory cohort.

In [None]:
# Box plots of whole-tissue CNN scores grouped by CERAD-like category, one per
# plaque class. Output paths are built from `save_dir`, and the column names
# from `labels` / `score_columns`, so this cell follows the configuration cell
# instead of repeating the figure directory as a literal in every call.
# NOTE(review): x_param presumably maps category code -> tick label; confirm
# against plotting.plot_cnn_scores.
x_param = {0: 'None', 1: 'Sparse', 2: 'Moderate', 3: 'Frequent'}

for label, score_column in zip(labels, score_columns):
    plotting.plot_cnn_scores(
        df, '{}_MTG'.format(label), score_column.format('tissue'), x_param,
        ylabel='CNN scores', xlabel='CERAD-like scores',
        save_name=oj(save_dir, 'cnn_vs_cerad_{}_emory_tissue.png'.format(label.lower())))

In [None]:
# Working copy of the cohort table, displayed for inspection only. No column
# is added here; the 'Concomitant Diagnosis' column is built in the next cell.
df_concomitant = df.copy()
df_concomitant

#### Pathology Diagnosis Comparisons

In [None]:
# Collapse the three diagnosis columns into one 'Concomitant Diagnosis' label
# per case, working on a copy so `df` itself is left untouched.
df_concomitant = df.copy()
dx_columns = ['Primary Dx', 'Secondary Dx (1)', 'Secondary Dx (2)']


def _concomitant_label(row):
    """Return 'control', 'AD', 'AD+DLB', 'AD+TDP' or 'none' for one case.

    'none' marks every other combination (including AD with both DLB and
    TDP); those cases are removed from the table further down.
    """
    # 'none' and 'amyg LB' entries are ignored; poss AD / prob AD count as AD
    dxs = ['AD' if row[col] in ['poss AD', 'prob AD'] else row[col]
           for col in dx_columns
           if row[col] not in ['none', 'amyg LB']]

    if dxs == ['control']:
        return 'control'
    if 'AD' not in dxs:
        return 'none'

    has_dlb, has_tdp = 'DLB' in dxs, 'TDP' in dxs
    if has_dlb and has_tdp:
        return 'none'
    if has_dlb:
        return 'AD+DLB'
    if has_tdp:
        return 'AD+TDP'
    return 'AD'


concomitant_dx = [_concomitant_label(row) for _, row in df_concomitant.iterrows()]

# remove cases with none for concomitant
df_concomitant['Concomitant Diagnosis'] = concomitant_dx
keep = df_concomitant['Concomitant Diagnosis'] != 'none'
df_concomitant = df_concomitant[keep].reset_index(drop=True)

# plot CNN score comparisons: all whole-tissue figures first, then gray matter
x_param = {'control': 'control', 'AD': 'AD', 'AD+TDP': 'AD+TDP','AD+DLB': 'AD+DLB'}

for prefix, region in (('tissue', 'Whole Tissue'), ('gm', 'Gray Matter')):
    for label, score_column in zip(labels, score_columns):
        save_path = oj(save_dir, '{}_{}_dx_comparison.png'.format(prefix, label))
        plotting.plot_cnn_scores(
            df_concomitant, 'Concomitant Diagnosis', score_column.format(prefix), x_param,
            ylabel='{} CNN scores ({})'.format(region, label),
            xlabel='Pathological Diagnosis',
            save_name=save_path)

In [None]:
# parameters
# NOTE(review): this rebinds `save_dir` (and, below, `df`) for every later
# cell; the path uses '/mnt/Data' while the first cell uses '/mnt/data' -
# confirm the capitalisation is intended.
save_dir = '/mnt/Data/outputs/CNN_comparisons/'
os.makedirs(save_dir, exist_ok=True)

# Tag every Emory row with Group == 'emory', then append them below the Tang
# rows with a fresh 0..n-1 index.
df_emory = pd.read_csv('../CSVs/Emory_data.csv').assign(Group='emory')
df_tang = pd.read_csv('../CSVs/CNN_vs_CREAD.csv')
df = pd.concat([df_tang, df_emory], axis=0, ignore_index=True, sort=False)
df.head()

In [None]:
def cnn_vs_cerad(data, description, save_dir=None, gm=False, subgroup=None, cerad_style=None):
    """Plot CNN scores against CERAD-like categories, one figure per plaque class.

    For each of Cored, Diffuse and CAA this prints the Spearman rank-order
    correlation between the CERAD-like column and the CNN score column, then
    hands both columns to ``plotting.plot_cnn_scores``.

    :param data : Dataframe
        contains the csv data generated in this project, either Tang or the Emory file
    :param description : str
        suffix added to the figure file names to tell the figures apart
    :param save_dir : str, optional
        directory for the figures (created if missing); when None, no file
        name is passed to the plotting helper
    :param gm : bool
        use the ``gm_*_score`` columns instead of ``tissue_*_score``
    :param subgroup : list, optional
        keep only rows whose 'Group' value is in this list
    :param cerad_style : optional
        when truthy, use the ``score_cnn_gm_*`` columns (takes precedence over ``gm``)
    """
    frame = data.copy()
    if subgroup is not None:
        frame = frame[frame['Group'].isin(subgroup)].reset_index(drop=True)

    x_param = {0: 'None', 1: 'Sparse', 2: 'Moderate', 3: 'Frequent'}

    # column-name pattern for the CNN scores; the slot takes the plaque class
    if cerad_style:
        score_template = 'score_cnn_gm_{}'
    elif gm:
        score_template = 'gm_{}_score'
    else:
        score_template = 'tissue_{}_score'

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    for plaque in ('Cored', 'Diffuse', 'CAA'):
        cerad_column = plaque + '_MTG'
        cnn_column = score_template.format(plaque.lower())

        save_name = None
        if save_dir is not None:
            save_name = oj(save_dir, 'CNN_vs_CERAD_{}_{}.png'.format(plaque, description))

        r, p = spearmanr(frame[cerad_column].tolist(), frame[cnn_column].tolist())
        print('spearman rank-order: r = {}, p = {}'.format(r, p))

        plotting.plot_cnn_scores(frame, cerad_column, cnn_column, x_param,
                                 'CNN scores ({})'.format(plaque),
                                 'CERAD-like categories ({})'.format(plaque),
                                 save_name=save_name)
        

def cnn_vs_diagnosis(data, description, save_dir=None, gm=False, subgroup=None, cerad_style=None):
    """Plot CNN scores against the curated pathological diagnosis (non-AD vs AD).

    For each of Cored, Diffuse and CAA this prints the Spearman rank-order
    correlation between the diagnosis and the CNN score column, then hands
    both columns to ``plotting.plot_cnn_scores``. The diagnosis is ranked
    non-AD < AD, so a positive r means higher scores in AD cases.

    :param data : Dataframe
        contains the csv data generated in this project, either Tang or the
        Emory file; must have a 'curated_dx' column
    :param description : str
        suffix added to the figure file names to tell the figures apart
    :param save_dir : str, optional
        directory for the figures (created if missing); when None, no file
        name is passed to the plotting helper
    :param gm : bool
        use the ``gm_*_score`` columns instead of ``tissue_*_score``
    :param subgroup : list, optional
        keep only rows whose 'Group' value is in this list
    :param cerad_style : optional
        when truthy, use the ``score_cnn_gm_*`` columns (takes precedence over ``gm``)
    """
    data = data.copy()
    if subgroup is not None:
        data = data[data['Group'].isin(subgroup)].reset_index(drop=True)

    diagnosis_label = 'curated_dx'
    # Categories listed in ordinal order; the list position is the rank code.
    diagnosis_order = ['non-AD', 'AD']
    x_param = {dx: dx for dx in diagnosis_order}

    if cerad_style:
        cnn_labels = ['score_cnn_gm_cored', 'score_cnn_gm_diffuse', 'score_cnn_gm_caa']
    elif gm:
        cnn_labels = ['gm_cored_score', 'gm_diffuse_score', 'gm_caa_score']
    else:
        cnn_labels = ['tissue_cored_score', 'tissue_diffuse_score', 'tissue_caa_score']
    labels = ['Cored', 'Diffuse', 'CAA']

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    # spearmanr must not be given the raw string labels: stacked with the
    # float scores they turn the scores into strings, and the categories are
    # ranked alphabetically ('AD' before 'non-AD'). Use explicit ordinal
    # codes and leave out rows whose diagnosis is not one of the categories.
    dx_codes = data[diagnosis_label].map(
        {dx: code for code, dx in enumerate(diagnosis_order)})
    ranked = dx_codes.notna()

    for label, cnn_label in zip(labels, cnn_labels):
        save_name = None
        if save_dir is not None:
            save_name = oj(save_dir, 'CNN_vs_Diagnosis_{}_{}.png'.format(label, description))

        r, p = spearmanr(dx_codes[ranked].tolist(), data.loc[ranked, cnn_label].tolist())
        print('spearman rank-order: r = {}, p = {}'.format(r, p))

        plotting.plot_cnn_scores(data, diagnosis_label, cnn_label, x_param,
                                 'CNN scores ({})'.format(label),
                                 'Pathological Diagnosis',
                                 save_name=save_name, figsize=(7,8))
        
def cnn_vs_reagan(data, description, save_dir=None, gm=False, subgroup=None, cerad_style=None):
    """Plot CNN scores against the grouped Reagan score (no/low, intermediate, high).

    For each of Cored, Diffuse and CAA this prints the Spearman rank-order
    correlation between the grouped Reagan score and the CNN score column,
    then hands both columns to ``plotting.plot_cnn_scores``. The groups are
    ranked no/low < intermediate < high, so a positive r means higher CNN
    scores at higher Reagan groups.

    :param data : Dataframe
        contains the csv data generated in this project, either Tang or the
        Emory file; must have a 'grouped_reagen' column
    :param description : str
        suffix added to the figure file names to tell the figures apart
    :param save_dir : str, optional
        directory for the figures (created if missing); when None, no file
        name is passed to the plotting helper
    :param gm : bool
        use the ``gm_*_score`` columns instead of ``tissue_*_score``
    :param subgroup : list, optional
        keep only rows whose 'Group' value is in this list
    :param cerad_style : optional
        when truthy, use the ``score_cnn_gm_*`` columns (takes precedence over ``gm``)
    """
    data = data.copy()
    if subgroup is not None:
        data = data[data['Group'].isin(subgroup)].reset_index(drop=True)

    reagan_label = 'grouped_reagen'
    # Categories listed in ordinal order; the list position is the rank code.
    reagan_order = ['no/low', 'intermediate', 'high']
    x_param = {group: group for group in reagan_order}

    if cerad_style:
        cnn_labels = ['score_cnn_gm_cored', 'score_cnn_gm_diffuse', 'score_cnn_gm_caa']
    elif gm:
        cnn_labels = ['gm_cored_score', 'gm_diffuse_score', 'gm_caa_score']
    else:
        cnn_labels = ['tissue_cored_score', 'tissue_diffuse_score', 'tissue_caa_score']
    labels = ['Cored', 'Diffuse', 'CAA']

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

    # spearmanr must not be given the raw string labels: stacked with the
    # float scores they turn the scores into strings, and the groups are
    # ranked alphabetically ('high' < 'intermediate' < 'no/low'), which is
    # the reverse of their meaning. Use explicit ordinal codes and leave out
    # rows whose group is missing or not one of the categories.
    reagan_codes = data[reagan_label].map(
        {group: code for code, group in enumerate(reagan_order)})
    ranked = reagan_codes.notna()

    for label, cnn_label in zip(labels, cnn_labels):
        save_name = None
        if save_dir is not None:
            save_name = oj(save_dir, 'CNN_vs_Reagan_{}_{}.png'.format(label, description))

        r, p = spearmanr(reagan_codes[ranked].tolist(), data.loc[ranked, cnn_label].tolist())
        print('spearman rank-order: r = {}, p = {}'.format(r, p))

        # axis label previously read 'Regan Score' (typo)
        plotting.plot_cnn_scores(data, reagan_label, cnn_label, x_param,
                                 'CNN scores ({})'.format(label),
                                 'Reagan Score',
                                 save_name=save_name, figsize=(7,8), fontsize=24, labelsize=24)

### Emory CNN vs CERAD for Whole Tissue Scores

In [None]:
# Rows with Group == 'emory' only; gm is left at its default, so the
# tissue_*_score columns are used and figures are written under save_dir.
cnn_vs_cerad(df, 'EmoryDataset_Tissue', save_dir=save_dir, subgroup=['emory'])

### Emory CNN vs CERAD for Gray Matter Scores

In [None]:
# Same comparison as the whole-tissue run, but gm=True switches to the
# gm_*_score columns.
cnn_vs_cerad(df, 'EmoryDataset_GM', save_dir=save_dir, subgroup=['emory'], gm=True)

### Comparing CNN vs CERAD between Emory and Tang Hold-out Datasets

In [None]:
# Comparing Tang and Emory CNN scores, grouped by their CERAD-like categories
tang_df = pd.read_csv('../CSVs/CNN_vs_CREAD.csv')
emory_data = df[df['Group'] == 'emory'].reset_index(drop=True)

# One comparison per plaque class. The three calls differ only in the plaque
# name, so they are generated from a table instead of being copy-pasted; the
# list arguments are rebuilt on every iteration, as in separate calls.
for plaque, cerad_column in (('cored', 'Cored_MTG'),
                             ('diffuse', 'Diffuse_MTG'),
                             ('CAA', 'CAA_MTG')):
    plotting.compare_dfs(emory_data, tang_df, cerad_column,
                         'tissue_{}_score'.format(plaque.lower()),
                         ['Emory', 'Tang'], xticks=['None', 'Sparse', 'Moderate', 'Frequent'],
                         xlabel='CERAD-like categories ({})'.format(plaque),
                         ylabel='CNN scores ({})'.format(plaque))

### CNN vs CERAD for Combined Emory and Tang dataset (tissue only)

In [None]:
# combining the two datasets: keep rows whose Group is 'emory' or 'hold out'
# (presumably the Tang hold-out rows - confirm against the Tang CSV).
# No save_dir is given, so no output file name is passed to the plotting helper.
cnn_vs_cerad(df, '', subgroup=['emory', 'hold out'])

### CNN vs Pathologies (Any AD vs Non-AD)

In [None]:
# cases with pathological diagnosis of AD (include cases that are AD + DLB)
# add the curated dx column: 'AD' when any of the three diagnosis columns is
# AD, poss AD or prob AD, otherwise 'non-AD' (rows with no diagnosis at all
# therefore also end up as 'non-AD').
# Computed column-wise so the result is aligned on the index rather than
# relying on index labels doubling as list positions.
dx_columns = ['Primary Dx', 'Secondary Dx (1)', 'Secondary Dx (2)']
has_ad = df[dx_columns].isin(['AD', 'poss AD', 'prob AD']).any(axis=1)
df['curated_dx'] = has_ad.map({True: 'AD', False: 'non-AD'})

cnn_vs_diagnosis(df, 'EmoryDataset_Tissue', save_dir=save_dir, subgroup=['emory'])
cnn_vs_diagnosis(df, 'EmoryDataset_GM', save_dir=save_dir, subgroup=['emory'], gm=True)

### CNN vs Reagan for Tissue Scores

In [None]:
# group no and low together; every other value (including missing ones) is
# carried over unchanged. Done column-wise so the result is aligned on the
# index rather than relying on index labels doubling as list positions.
df['grouped_reagen'] = df['Reagan'].mask(df['Reagan'].isin(['no', 'low']), 'no/low')

cnn_vs_reagan(df, 'EmoryDataset_Tissue', save_dir=save_dir, subgroup=['emory'])


### CNN vs Reagan for Gray Matter Scores

In [None]:
# Relies on the 'grouped_reagen' column added to df in the previous code cell;
# gm=True switches to the gm_*_score columns.
cnn_vs_reagan(df, 'EmoryDataset_GM', save_dir=save_dir, subgroup=['emory'], gm=True)

### Gray Matter vs Tissue Differences

In [None]:
# Per-case CNN scores for the Emory rows: whole tissue (solid) against gray
# matter only (dashed), one colour per plaque class. Within each class the
# cases are ordered by ascending tissue score, so the x position is a rank
# within that class, not a case identifier.
emory_scores = df[df['Group'] == 'emory'].reset_index(drop=True)

# A single figure; the earlier extra plt.figure(figsize=(12,10)) call only
# produced an unused empty figure.
fig, ax = plt.subplots(figsize=(12, 12))

for plaque, color in (('Cored', 'b'), ('Diffuse', 'r'), ('CAA', 'g')):
    tissue_col = 'tissue_{}_score'.format(plaque.lower())
    gm_col = 'gm_{}_score'.format(plaque.lower())

    # Sort the (tissue, gm) pairs together so each case keeps its own GM
    # value; sort_values also copes with an empty table and puts missing
    # scores last instead of ordering them unpredictably.
    ordered = emory_scores.sort_values([tissue_col, gm_col])

    # .tolist() drops the index so the lines are drawn against 0..n-1
    ax.plot(ordered[tissue_col].tolist(), color=color,
            label='Tissue {}'.format(plaque))
    ax.plot(ordered[gm_col].tolist(), color=color, linestyle='--',
            label='GM {}'.format(plaque))

ax.legend(loc='upper left', fontsize=22)
ax.set_ylabel('CNN Scores', fontsize=24)
ax.set_xlabel('Cases', fontsize=24)
ax.tick_params(axis='both', labelsize=22)
fig.savefig(oj(save_dir, 'gm_increase.png'), bbox_inches='tight', dpi=300)
plt.show()