In [65]:
## DATA AND METRICS
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
from imblearn.metrics import specificity_score


## FS
import os

In [66]:
EVAL_METRIC_PATH = "../experiments/metrics_evaluation"

# Helpers

In [67]:
ABNORMAL_TYPES = ['normal', 'mtl_atrophy', 'wmh', 'other_atrophy']
DEMENTIA_TYPES = {
    0: "no_dementia",
    1: "other_dementia",
    2: "AD"
}

def get_count_values(data: pd.DataFrame, column_name: str, is_ascending: bool = False):
  counts = data[column_name].value_counts(ascending=is_ascending)
  percentage = (data[column_name].value_counts(normalize=True, ascending=is_ascending) * 100).round(2)
  return pd.concat([counts, percentage], axis=1)

class EvalMetric:
  def __init__(self, labels:pd.Series, scores:pd.Series, predictions:pd.Series):
    self.labels = labels
    self.scores = scores
    # self.predictions = (scores >= threshold).astype(int)
    self.predictions = predictions

  def get_accuracy(self) -> float:
    return accuracy_score(self.labels, self.predictions)

  def get_precision(self) -> float:
    return precision_score(self.labels, self.predictions)

  def get_recall(self) -> float:
    # This metric is also sensitivity
    return recall_score(self.labels, self.predictions)

  def get_f1_score(self) -> float:
    return f1_score(self.labels, self.predictions)

  def get_specificity(self) -> float:
    return specificity_score(self.labels, self.predictions)

  def get_auc_score(self) -> float:
    return roc_auc_score(self.labels, self.scores)

  def get_overall_result(self) -> dict:
    return {
        'precision': self.get_precision(),
        'recall': self.get_recall(),
        'f1_score': self.get_f1_score(),
        'specificity': self.get_specificity(),
        'auc': self.get_auc_score(),
        'accuracy': self.get_accuracy()
    }

def get_evaluation(data, label_col:str, score_col_prefix:str, label_list:list) -> dict:
  result_dict = dict()
  clone_data = data.copy()
  for label_value in label_list:
    clone_data[f'is_{label_value}'] = clone_data[label_col].map(lambda val: label_value in val).astype(int)
    result_dict[label_value] = EvalMetric(labels=clone_data[f'is_{label_value}'], scores=clone_data[f'{score_col_prefix}_{label_value}'], predictions=clone_data[f'is_predicted_{label_value}']).get_overall_result()
  return result_dict

def get_result_paths(model_path:str):
    files = os.listdir(model_path)
    train_result_path = [file for file in files if file.startswith("train_result__")][0]
    test_result_path = [file for file in files if file.startswith("test_result__")][0]
    public_result_path = [file for file in files if file.startswith("public_result__")][0]
    return train_result_path, test_result_path, public_result_path


# ConVIRT performance

In [68]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "convirt")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/convirt/train_result__convirt_baseline.csv',
 '../experiments/metrics_evaluation/convirt/test_result__convirt_baseline.csv',
 '../experiments/metrics_evaluation/convirt/public_result__convirt_baseline.csv')

## On MINDSet

In [69]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [70]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.275
	- precision: 0.06875
	- recall: 0.25
	- f1_score: 0.10784313725490197
	- specificity: 0.75
	- auc: 0.5868258738432652


TEST DATA
Overall result:
	- Accuracy: 0.26
	- precision: 0.065
	- recall: 0.25
	- f1_score: 0.10317460317460317
	- specificity: 0.75
	- auc: 0.5579641387682085




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [71]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.0
	recall: 0.0
	f1_score: 0.0
	specificity: 1.0
	auc: 0.5
	accuracy: 0.525
TEST DATA
AD
	precision: 0.0
	recall: 0.0
	f1_score: 0.0
	specificity: 1.0
	auc: 0.5
	accuracy: 0.48


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [72]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.8083333333333333
	- precision: 0.8083333333333333
	- recall: 1.0
	- f1_score: 0.8940092165898618
	- specificity: 0.0
	- auc: 0.6163155535634245
	- accuracy: 0.8083333333333333
----
TEST DATA
	- Accuracy: 0.8
	- precision: 0.8
	- recall: 1.0
	- f1_score: 0.8888888888888888
	- specificity: 0.0
	- auc: 0.6174999999999999
	- accuracy: 0.8
----


In [73]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 0,
 'normal_mtl_atrophy': 0,
 'normal_wmh': 0,
 'normal_other_atrophy': 10,
 'mtl_atrophy_normal': 0,
 'mtl_atrophy_mtl_atrophy': 0,
 'mtl_atrophy_wmh': 0,
 'mtl_atrophy_other_atrophy': 25,
 'wmh_normal': 0,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 0,
 'wmh_other_atrophy': 7,
 'other_atrophy_normal': 0,
 'other_atrophy_mtl_atrophy': 0,
 'other_atrophy_wmh': 0,
 'other_atrophy_other_atrophy': 13}

In [74]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 0 10]
 [ 0 40]]
[[24  0]
 [26  0]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [75]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5046875,
 'recall': 1.0,
 'f1_score': 0.6708203530633438,
 'specificity': np.float64(0.0),
 'auc': np.float64(0.5309109199050697),
 'accuracy': 0.5046875}

In [76]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[  0, 634],
       [  0, 646]])

# MedCLIP performance

In [82]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "med-clip")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/med-clip/train_result__medclip_baseline.csv',
 '../experiments/metrics_evaluation/med-clip/test_result__medclip_baseline.csv',
 '../experiments/metrics_evaluation/med-clip/public_result__medclip_baseline.csv')

## On MINDSet

In [83]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [84]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.18333333333333332
	- precision: 0.21515506671474938
	- recall: 0.17724344898257943
	- f1_score: 0.13447740415506532
	- specificity: 0.7341551324742573
	- auc: 0.43535413916434085


TEST DATA
Overall result:
	- Accuracy: 0.24
	- precision: 0.36142533936651583
	- recall: 0.2296153846153846
	- f1_score: 0.18233618233618232
	- specificity: 0.7550518541797612
	- auc: 0.5197942081488593




In [85]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.75
	recall: 0.05263157894736842
	f1_score: 0.09836065573770492
	specificity: 0.9841269841269841
	auc: 0.519910888331941
	accuracy: 0.5416666666666666
TEST DATA
AD
	precision: 0.0
	recall: 0.0
	f1_score: 0.0
	specificity: 0.9166666666666666
	auc: 0.4575320512820512
	accuracy: 0.44


In [86]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.5
	- precision: 0.7534246575342466
	- recall: 0.5670103092783505
	- f1_score: 0.6470588235294118
	- specificity: 0.21739130434782608
	- auc: 0.6718960107575078
	- accuracy: 0.5
----
TEST DATA
	- Accuracy: 0.58
	- precision: 0.7878787878787878
	- recall: 0.65
	- f1_score: 0.7123287671232876
	- specificity: 0.3
	- auc: 0.5625
	- accuracy: 0.58
----


In [87]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 3,
 'normal_mtl_atrophy': 0,
 'normal_wmh': 3,
 'normal_other_atrophy': 4,
 'mtl_atrophy_normal': 7,
 'mtl_atrophy_mtl_atrophy': 2,
 'mtl_atrophy_wmh': 2,
 'mtl_atrophy_other_atrophy': 14,
 'wmh_normal': 3,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 0,
 'wmh_other_atrophy': 4,
 'other_atrophy_normal': 5,
 'other_atrophy_mtl_atrophy': 0,
 'other_atrophy_wmh': 1,
 'other_atrophy_other_atrophy': 7}

In [88]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 3  7]
 [14 26]]
[[22  2]
 [26  0]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [89]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5136116152450091,
 'recall': 0.43808049535603716,
 'f1_score': 0.4728487886382623,
 'specificity': np.float64(0.5772870662460567),
 'auc': np.float64(0.5115378793057983),
 'accuracy': 0.50703125}

In [90]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[366, 268],
       [363, 283]])

# BiomedCLIP performance

In [91]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "biomed-clip")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/biomed-clip/train_result__biomedclip_baseline.csv',
 '../experiments/metrics_evaluation/biomed-clip/test_result__biomedclip_baseline.csv',
 '../experiments/metrics_evaluation/biomed-clip/public_result__biomedclip_baseline.csv')

## On MINDSet

In [92]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [93]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.26666666666666666
	- precision: 0.2358333333333333
	- recall: 0.4246894409937888
	- f1_score: 0.2750210260723297
	- specificity: 0.7869577903131686
	- auc: 0.7362349193359565


TEST DATA
Overall result:
	- Accuracy: 0.26
	- precision: 0.25961538461538464
	- recall: 0.37857142857142856
	- f1_score: 0.2739403453689168
	- specificity: 0.8004360465116279
	- auc: 0.7400925054047147




  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [94]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.0
	recall: 0.0
	f1_score: 0.0
	specificity: 1.0
	auc: 0.4979114452798663
	accuracy: 0.525
TEST DATA
AD
	precision: 0.0
	recall: 0.0
	f1_score: 0.0
	specificity: 1.0
	auc: 0.5352564102564101
	accuracy: 0.48


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [95]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.325
	- precision: 0.9444444444444444
	- recall: 0.17525773195876287
	- f1_score: 0.2956521739130435
	- specificity: 0.9565217391304348
	- auc: 0.630658897355446
	- accuracy: 0.325
----
TEST DATA
	- Accuracy: 0.3
	- precision: 0.7777777777777778
	- recall: 0.175
	- f1_score: 0.2857142857142857
	- specificity: 0.8
	- auc: 0.5700000000000001
	- accuracy: 0.3
----


In [96]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 8,
 'normal_mtl_atrophy': 0,
 'normal_wmh': 0,
 'normal_other_atrophy': 0,
 'mtl_atrophy_normal': 22,
 'mtl_atrophy_mtl_atrophy': 0,
 'mtl_atrophy_wmh': 3,
 'mtl_atrophy_other_atrophy': 0,
 'wmh_normal': 1,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 5,
 'wmh_other_atrophy': 0,
 'other_atrophy_normal': 10,
 'other_atrophy_mtl_atrophy': 0,
 'other_atrophy_wmh': 1,
 'other_atrophy_other_atrophy': 0}

In [97]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 8  2]
 [33  7]]
[[24  0]
 [26  0]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [98]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.3333333333333333,
 'recall': 0.0015479876160990713,
 'f1_score': 0.0030816640986132513,
 'specificity': np.float64(0.9968454258675079),
 'auc': np.float64(0.5022169917277886),
 'accuracy': 0.49453125}

In [100]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[632,   2],
       [645,   1]])

# FT-01 performance
(Finetuning on image description only)

In [103]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "vista-ft-01-description-only")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/vista-ft-01-description-only/train_result__ft_description_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-01-description-only/test_result__ft_description_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-01-description-only/public_result__ft_description_only__1e-5.csv')

## On MINDSet

In [104]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [105]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.5333333333333333
	- precision: 0.5923102981029811
	- recall: 0.5612099253403602
	- f1_score: 0.4975108225108225
	- specificity: 0.8375490885350768
	- auc: 0.7951454504071442


TEST DATA
Overall result:
	- Accuracy: 0.48
	- precision: 0.5358552631578948
	- recall: 0.5019230769230769
	- f1_score: 0.45066598300956734
	- specificity: 0.8234420175989943
	- auc: 0.7563830077841707




In [106]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.5
	recall: 0.47368421052631576
	f1_score: 0.4864864864864865
	specificity: 0.5714285714285714
	auc: 0.5827067669172933
	accuracy: 0.525
TEST DATA
AD
	precision: 0.4375
	recall: 0.2692307692307692
	f1_score: 0.3333333333333333
	specificity: 0.625
	auc: 0.4655448717948718
	accuracy: 0.44


In [107]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.8666666666666667
	- precision: 0.8785046728971962
	- recall: 0.9690721649484536
	- f1_score: 0.9215686274509803
	- specificity: 0.43478260869565216
	- auc: 0.7216494845360825
	- accuracy: 0.8666666666666667
----
TEST DATA
	- Accuracy: 0.84
	- precision: 0.8478260869565217
	- recall: 0.975
	- f1_score: 0.9069767441860465
	- specificity: 0.3
	- auc: 0.63375
	- accuracy: 0.84
----


In [108]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 3,
 'normal_mtl_atrophy': 2,
 'normal_wmh': 3,
 'normal_other_atrophy': 2,
 'mtl_atrophy_normal': 0,
 'mtl_atrophy_mtl_atrophy': 10,
 'mtl_atrophy_wmh': 10,
 'mtl_atrophy_other_atrophy': 4,
 'wmh_normal': 0,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 7,
 'wmh_other_atrophy': 0,
 'other_atrophy_normal': 1,
 'other_atrophy_mtl_atrophy': 5,
 'other_atrophy_wmh': 3,
 'other_atrophy_other_atrophy': 4}

In [109]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 3  7]
 [ 1 39]]
[[15  9]
 [19  7]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [110]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5577639751552795,
 'recall': 0.695046439628483,
 'f1_score': 0.6188835286009648,
 'specificity': np.float64(0.4384858044164038),
 'auc': np.float64(0.5916706546473811),
 'accuracy': 0.56796875}

In [111]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[278, 356],
       [197, 449]])

# FT-02 performance
(Finetuning on abnormality only)

**FORMAT**

"Summary: This is an MRI image of \<abnormalities\>.

In [113]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "vista-ft-02-abnormality-only")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/vista-ft-02-abnormality-only/train_result__ft_abnormality_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-02-abnormality-only/test_result__ft_abnormality_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-02-abnormality-only/public_result__ft_abnormality_only__1e-5.csv')

## On MINDSet

In [114]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [115]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.925
	- precision: 0.9295600233100233
	- recall: 0.9089628165715122
	- f1_score: 0.9160704075798416
	- specificity: 0.970596498491749
	- auc: 0.9513343340028702


TEST DATA
Overall result:
	- Accuracy: 0.74
	- precision: 0.7402777777777778
	- recall: 0.7230769230769232
	- f1_score: 0.7205495570321151
	- specificity: 0.9053315524827152
	- auc: 0.8711999879127785




In [116]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.75
	recall: 0.6842105263157895
	f1_score: 0.7155963302752294
	specificity: 0.7936507936507936
	auc: 0.7942077415761626
	accuracy: 0.7416666666666667
TEST DATA
AD
	precision: 0.6666666666666666
	recall: 0.46153846153846156
	f1_score: 0.5454545454545454
	specificity: 0.75
	auc: 0.6722756410256411
	accuracy: 0.6


In [117]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.9666666666666667
	- precision: 0.9603960396039604
	- recall: 1.0
	- f1_score: 0.9797979797979798
	- specificity: 0.8260869565217391
	- auc: 0.9094576423128642
	- accuracy: 0.9666666666666667
----
TEST DATA
	- Accuracy: 0.88
	- precision: 0.9047619047619048
	- recall: 0.95
	- f1_score: 0.926829268292683
	- specificity: 0.6
	- auc: 0.8237500000000001
	- accuracy: 0.88
----


In [118]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 6,
 'normal_mtl_atrophy': 1,
 'normal_wmh': 1,
 'normal_other_atrophy': 2,
 'mtl_atrophy_normal': 2,
 'mtl_atrophy_mtl_atrophy': 15,
 'mtl_atrophy_wmh': 4,
 'mtl_atrophy_other_atrophy': 4,
 'wmh_normal': 0,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 7,
 'wmh_other_atrophy': 0,
 'other_atrophy_normal': 0,
 'other_atrophy_mtl_atrophy': 3,
 'other_atrophy_wmh': 1,
 'other_atrophy_other_atrophy': 9}

In [119]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 6  4]
 [ 2 38]]
[[18  6]
 [14 12]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [120]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5106719367588933,
 'recall': 1.0,
 'f1_score': 0.6760858189429618,
 'specificity': np.float64(0.02365930599369085),
 'auc': np.float64(0.539598206873651),
 'accuracy': 0.51640625}

In [122]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[ 15, 619],
       [  0, 646]])

# FT-03 performance
(Finetuning on summary only)

**FORMAT**

"Summary: This is an MRI image of \<abnormalities\>, which suggests \<dementia_type\>".

In [123]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "vista-ft-03-summary-only")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/vista-ft-03-summary-only/train_result__ft_summary_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-03-summary-only/test_result__ft_summary_only__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-03-summary-only/public_result__ft_summary_only__1e-5.csv')

## On MINDSet

In [124]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [125]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.9083333333333333
	- precision: 0.9029928254826834
	- recall: 0.8879190455277413
	- f1_score: 0.8913304715067656
	- specificity: 0.9656606883380825
	- auc: 0.9631909054367607


TEST DATA
Overall result:
	- Accuracy: 0.7
	- precision: 0.7278693528693528
	- recall: 0.6696153846153846
	- f1_score: 0.6835284280936454
	- specificity: 0.8915815524827152
	- auc: 0.8536694785572693




In [126]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.7407407407407407
	recall: 0.7017543859649122
	f1_score: 0.7207207207207207
	specificity: 0.7777777777777778
	auc: 0.7912837649679755
	accuracy: 0.7416666666666667
TEST DATA
AD
	precision: 0.5714285714285714
	recall: 0.46153846153846156
	f1_score: 0.5106382978723404
	specificity: 0.625
	auc: 0.592948717948718
	accuracy: 0.54


In [127]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.9583333333333334
	- precision: 0.96
	- recall: 0.9896907216494846
	- f1_score: 0.9746192893401016
	- specificity: 0.8260869565217391
	- auc: 0.925369789332138
	- accuracy: 0.9583333333333334
----
TEST DATA
	- Accuracy: 0.88
	- precision: 0.8863636363636364
	- recall: 0.975
	- f1_score: 0.9285714285714286
	- specificity: 0.5
	- auc: 0.77
	- accuracy: 0.88
----


In [128]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 5,
 'normal_mtl_atrophy': 2,
 'normal_wmh': 1,
 'normal_other_atrophy': 2,
 'mtl_atrophy_normal': 1,
 'mtl_atrophy_mtl_atrophy': 16,
 'mtl_atrophy_wmh': 4,
 'mtl_atrophy_other_atrophy': 4,
 'wmh_normal': 0,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 7,
 'wmh_other_atrophy': 0,
 'other_atrophy_normal': 0,
 'other_atrophy_mtl_atrophy': 4,
 'other_atrophy_wmh': 1,
 'other_atrophy_other_atrophy': 7}

In [129]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 5  5]
 [ 1 39]]
[[15  9]
 [14 12]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [130]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5492957746478874,
 'recall': 0.9659442724458205,
 'f1_score': 0.7003367003367004,
 'specificity': np.float64(0.19242902208201892),
 'auc': np.float64(0.6353256145559669),
 'accuracy': 0.5828125}

In [132]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[122, 512],
       [ 22, 624]])

# FT-04 performance
(Finetuning on summary and full caption)

**FORMAT**

"Summary: This is an MRI image of \<abnormalities\>, which suggests \<dementia_type\>"\n\<full_caption\>.

In [133]:
MODEL_PATH = os.path.join(EVAL_METRIC_PATH, "vista-ft-04-summary-and-description")
TRAIN_PATH, TEST_PATH, PUBLIC_PATH = [os.path.join(MODEL_PATH, file_path) for file_path in get_result_paths(MODEL_PATH)]
TRAIN_PATH, TEST_PATH, PUBLIC_PATH

('../experiments/metrics_evaluation/vista-ft-04-summary-and-description/train_result__ft_summary_and_description__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-04-summary-and-description/test_result__ft_summary_and_description__lr_1e-5.csv',
 '../experiments/metrics_evaluation/vista-ft-04-summary-and-description/public_result__ft_summary_and_description__1e-5.csv')

## On MINDSet

In [134]:
train_data_with_result = pd.read_csv(TRAIN_PATH)
test_data_with_result = pd.read_csv(TEST_PATH)
train_data_with_result.shape, test_data_with_result.shape

((120, 31), (50, 31))

In [135]:
print('ABNORMALITY RETRIEVAL')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  result = get_evaluation(data, label_col='abnormal_type', score_col_prefix='sim_score', label_list=ABNORMAL_TYPES)
  overall_result = dict()
  for abnormal_type in result.keys():
    # print(abnormal_type)
    for metric, value in result[abnormal_type].items():
      # print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)
    # print('----')
  print('Overall result:')
  print(f'\t- Accuracy: {data["is_correct_abnormality"].mean()}')
  for metric, values in overall_result.items():
    if metric != 'accuracy':
      print(f'\t- {metric}: {np.mean(values)}')
  print('\n=========\n')

ABNORMALITY RETRIEVAL
TRAINING DATA
Overall result:
	- Accuracy: 0.6083333333333333
	- precision: 0.6635468697968698
	- recall: 0.6209010393793003
	- f1_score: 0.581006877283473
	- specificity: 0.8668663338575271
	- auc: 0.7930606272639757


TEST DATA
Overall result:
	- Accuracy: 0.46
	- precision: 0.4390522875816993
	- recall: 0.46769230769230774
	- f1_score: 0.40858879492600425
	- specificity: 0.8125699245757385
	- auc: 0.7273850626118068




In [136]:
print('AD PREDICTION\n****\n')
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  prob_columns = [c for c in data.columns if 'prob' in c]
  baseline_result = get_evaluation(data, label_col='label_code', score_col_prefix='prob', label_list=DEMENTIA_TYPES.values())
  overall_result = dict()
  for dementia_type in baseline_result.keys():
    if dementia_type != 'AD': continue
    print(dementia_type)
    for metric, value in baseline_result[dementia_type].items():
      print(f'\t{metric}: {value}')
      if metric not in overall_result.keys():
        overall_result[metric] = []
      overall_result[metric].append(value)

AD PREDICTION
****

TRAINING DATA
AD
	precision: 0.5535714285714286
	recall: 0.543859649122807
	f1_score: 0.5486725663716814
	specificity: 0.6031746031746031
	auc: 0.6016429963798384
	accuracy: 0.575
TEST DATA
AD
	precision: 0.5
	recall: 0.34615384615384615
	f1_score: 0.4090909090909091
	specificity: 0.625
	auc: 0.49919871794871795
	accuracy: 0.48


In [137]:
# Dementia Prediction
for dataset, data in zip(['TRAINING DATA', 'TEST DATA'], [train_data_with_result, test_data_with_result]):
  print(dataset)
  print(f'\t- Accuracy: {data["is_correct_dementia"].mean()}')
  diagnosis_result = EvalMetric(data['is_dementia'], data['predicted_score'], data['is_predicted_dementia']).get_overall_result()
  for metric, value in diagnosis_result.items():
    print(f'\t- {metric}: {value}')
  print('----')

TRAINING DATA
	- Accuracy: 0.8666666666666667
	- precision: 0.8785046728971962
	- recall: 0.9690721649484536
	- f1_score: 0.9215686274509803
	- specificity: 0.43478260869565216
	- auc: 0.5804571940833707
	- accuracy: 0.8666666666666667
----
TEST DATA
	- Accuracy: 0.78
	- precision: 0.8222222222222222
	- recall: 0.925
	- f1_score: 0.8705882352941177
	- specificity: 0.2
	- auc: 0.53125
	- accuracy: 0.78
----


In [138]:
# ABNORMALITY CONFUSION MATRIX
cfs_matrix = dict()
for ab_type in ABNORMAL_TYPES:
  for ab_type_2 in ABNORMAL_TYPES:
     cfs_matrix[f'{ab_type}_{ab_type_2}']= test_data_with_result[(test_data_with_result[f'is_{ab_type}'] == 1)&(test_data_with_result[f'is_predicted_{ab_type_2}'])].shape[0]
cfs_matrix

{'normal_normal': 2,
 'normal_mtl_atrophy': 3,
 'normal_wmh': 3,
 'normal_other_atrophy': 2,
 'mtl_atrophy_normal': 1,
 'mtl_atrophy_mtl_atrophy': 11,
 'mtl_atrophy_wmh': 8,
 'mtl_atrophy_other_atrophy': 4,
 'wmh_normal': 0,
 'wmh_mtl_atrophy': 0,
 'wmh_wmh': 7,
 'wmh_other_atrophy': 0,
 'other_atrophy_normal': 2,
 'other_atrophy_mtl_atrophy': 5,
 'other_atrophy_wmh': 3,
 'other_atrophy_other_atrophy': 3}

In [139]:
# DEMENTIA CONFUSION MATRIX
print(confusion_matrix(test_data_with_result['is_dementia'], test_data_with_result['is_predicted_dementia'], labels=[0, 1]))

# AD CONFUSION MATRIX
test_data_with_result['is_AD'] = (test_data_with_result['label'] == 2).astype(int)
print(confusion_matrix(test_data_with_result['is_AD'], test_data_with_result['is_predicted_AD'], labels=[0, 1]))

[[ 2  8]
 [ 3 37]]
[[15  9]
 [17  9]]


## On public dataset
https://huggingface.co/datasets/Falah/Alzheimer_MRI

In [140]:
test_public_with_result = pd.read_csv(PUBLIC_PATH)
public_result = EvalMetric(test_public_with_result['is_dementia'], test_public_with_result['predicted_score'], test_public_with_result['is_predicted_dementia'])
public_result.get_overall_result()

{'precision': 0.5293056807935077,
 'recall': 0.9086687306501547,
 'f1_score': 0.6689458689458689,
 'specificity': np.float64(0.17665615141955837),
 'auc': np.float64(0.5643293844185524),
 'accuracy': 0.54609375}

In [141]:
confusion_matrix(test_public_with_result['is_dementia'], test_public_with_result['is_predicted_dementia'])

array([[112, 522],
       [ 59, 587]])