# 12/2/19

Extracting the confusion matrix for the drain detector trained on the small validation set partition.

In [1]:
import os
import os.path as osp
import json
from functools import partial
os.chdir('/lfs/1/gangus/repositories/pytorch-classification/drain_detector')

import torch
import torch.nn as nn
import sklearn.metrics as skl
import numpy as np
import pandas as pd
from tqdm import tqdm

import emmental
from emmental import Meta
from emmental.data import EmmentalDataLoader
from emmental.learner import EmmentalLearner
from emmental.model import EmmentalModel
from emmental.scorer import Scorer
from emmental.task import EmmentalTask

from dataset import DrainDetectionDataset
import modules
from util import ce_loss, output

In [2]:
# Directory of the trained experiment; its config.json is re-read in the setup cell.
experiment_dir = 'experiments/drain_detection/cxr_seg/pretrain_chexnet/1'

# Emmental log directory of the training run; the best checkpoint is loaded from here.
# Both paths are relative, so they resolve against the working directory set by
# os.chdir(...) in the imports cell.
emmental_dir = (
    'experiments/drain_detection/cxr_seg/pretrain_chexnet/_emmental_logs/2019_11_18/09_09_48/64ab5a7b'
)

In [5]:
# Initialise Emmental before Meta is configured or any Emmental object is built.
emmental.init()

# Key used to pick the dataset / dataloader settings out of the experiment config.
split = 'valid'

# Re-use the configuration that was saved alongside the trained experiment so the
# model is rebuilt with the same encoder / decoder classes and arguments.
config_path = osp.join(experiment_dir, 'config.json')
with open(config_path, 'r') as f:
    config = json.load(f)

task_to_label_dict = config['task_to_label_dict']
task_to_cardinality_dict = config['task_to_cardinality_dict']
dataset_configs = config['dataset_configs']
dataloader_configs = config['dataloader_configs']
encoder_class, encoder_args = config['encoder_class'], config['encoder_args']
decoder_class, decoder_args = config['decoder_class'], config['decoder_args']

# Checkpoint written by the training run, plus the device settings for this session.
best_checkpoint_path = osp.join(
    emmental_dir, 'best_model_drain_drain-detection-dataset_valid_roc_auc.pth'
)
model_config = {
    'model_path': best_checkpoint_path,
    'device': 0,
    'dataparallel': True,
}

# Hand Meta a shallow copy so later edits to `model_config` do not leak into it.
Meta.update_config(config={'model_config': dict(model_config)})

# Dataset over the all.csv table, using the image directory / transforms / cxr_only
# settings that the experiment config stores for `split`.
split_dataset_args = dataset_configs[split]['args']
ds = DrainDetectionDataset(
    df_path='/lfs/1/gangus/repositories/pytorch-classification/drain_detector/data/chexnet/by-patient-id/split/all.csv',
    images_dir=split_dataset_args['images_dir'],
    split=split,
    transforms=split_dataset_args['transforms'],
    cxr_only=split_dataset_args['cxr_only'],
)

dl = EmmentalDataLoader(
    task_to_label_dict=task_to_label_dict,
    dataset=ds,
    split=split,
    **dataloader_configs[split],
)

# A single encoder instance is created once and placed in every task's module pool.
encoder_module = getattr(modules, encoder_class)(**encoder_args)


def _build_task(task_name):
    """Create the EmmentalTask for `task_name`.

    The task feeds the `image` input through the shared `encoder_module` and then
    through a decoder created for this task, whose first argument is the task's
    cardinality from `task_to_cardinality_dict`.
    """
    decoder_name = f'decoder_module_{task_name}'
    decoder_module = getattr(modules, decoder_class)(
        task_to_cardinality_dict[task_name], **decoder_args
    )
    module_pool = nn.ModuleDict(
        {
            'encoder_module': encoder_module,
            decoder_name: decoder_module,
        }
    )
    task_flow = [
        # Step 1: encoder consumes the `image` entry of the batch input.
        {'name': 'encoder_module', 'module': 'encoder_module', 'inputs': [('_input_', 'image')]},
        # Step 2: decoder consumes output 0 of the encoder step.
        {'name': decoder_name, 'module': decoder_name, 'inputs': [('encoder_module', 0)]},
    ]
    return EmmentalTask(
        name=task_name,
        module_pool=module_pool,
        task_flow=task_flow,
        loss_func=partial(ce_loss, task_name),
        output_func=partial(output, task_name),
        scorer=Scorer(metrics=['accuracy', 'roc_auc', 'precision', 'recall', 'f1']),
    )


# One task per entry of task_to_label_dict, then the model that bundles them.
tasks = [_build_task(task_name) for task_name in task_to_label_dict]
model = EmmentalModel(name='drain-detection-model', tasks=tasks)

[2019-12-05 12:17:32,535][INFO] emmental.meta:110 - Logging was already initialized to use /tmp/2019_12_05/12_12_31/b4e82ffe.  To configure logging manually, call emmental.init_logging before initialiting Meta.
[2019-12-05 12:17:32,582][INFO] emmental.meta:60 - Loading Emmental default config from /lfs/1/gangus/repositories/pytorch-classification/.emmental/src/emmental/emmental-default-config.yaml.
[2019-12-05 12:17:32,584][INFO] emmental.meta:160 - Updating Emmental config from user provided config.
[2019-12-05 12:17:33,201][INFO] emmental.data:52 - Auto generate uids for dataset drain-detection-dataset under _uids_.
[2019-12-05 12:17:36,647][INFO] root:62 - Loaded 606/606 pretrained parameters
[2019-12-05 12:17:36,655][INFO] emmental.task:48 - Created task: drain
[2019-12-05 12:17:36,679][INFO] emmental.model:71 - Moving model to GPU (cuda:0).
[2019-12-05 12:17:36,684][INFO] emmental.model:57 - Created emmental model drain-detection-model that contains task {'drain'}.
[2019-12-05 12:

In [6]:
# Restore the trained weights, but only when a checkpoint path has been configured
# (an empty / None `model_path` leaves the freshly built model untouched).
checkpoint_path = Meta.config["model_config"]["model_path"]
if checkpoint_path:
    model.load(checkpoint_path)

[2019-12-05 12:17:42,487][INFO] emmental.model:518 - [drain-detection-model] Model loaded from experiments/drain_detection/cxr_seg/pretrain_chexnet/_emmental_logs/2019_11_18/09_09_48/64ab5a7b/best_model_drain_drain-detection-dataset_valid_roc_auc.pth
[2019-12-05 12:17:42,489][INFO] emmental.model:71 - Moving model to GPU (cuda:0).


In [7]:
# Run the loaded model over the whole dataloader and keep the per-example outputs.
# Later cells read res['probs']['drain'] and res['uids']['drain'] from this result.
# This is the slow step of the notebook: the recorded run took about 16.5 minutes
# for 5606 progress-bar steps.
res = model.predict(dl, return_preds=True)

100%|██████████| 5606/5606 [16:30<00:00,  5.66it/s]


In [8]:
# Probabilities predicted for the 'drain' task, in the order returned by predict()
# (not dataset order -- see the uid sort in the next cell).
y_prob = res['probs']['drain']

# Keep a copy on disk (np.save appends '.npy') so the long prediction run
# does not have to be repeated.
y_prob_array = np.array(y_prob)
np.save('y_prob_all_chexnet', y_prob_array)

In [32]:
def _uid_ordinal(uid):
    """Return the integer that follows the first underscore of a uid,
    e.g. 'drain-detection-dataset_12' -> 12."""
    return int(uid[uid.index('_') + 1:])


# Pair each prediction's position in `res` with its uid, then order the pairs by the
# numeric part of the uid so that they can be walked in dataset order.
res_idxs = list(enumerate(res['uids']['drain']))
res_idxs.sort(key=lambda pair: _uid_ordinal(pair[1]))
res_idxs[:5]

[(89542, 'drain-detection-dataset_0'),
 (101884, 'drain-detection-dataset_1'),
 (75126, 'drain-detection-dataset_2'),
 (25333, 'drain-detection-dataset_3'),
 (2905, 'drain-detection-dataset_4')]

In [46]:
# Map every image id to its prediction.
#
# `res_idxs` holds (position in `res`, uid) pairs sorted by the numeric part of the
# uid, so the idx-th pair is expected to belong to row idx of the dataset. That
# expectation is checked explicitly: if a prediction were missing or duplicated, a
# purely positional pairing would silently attach probabilities to the wrong images.
dataset_image_ids = ds.X_dict['image_ids']
dataset_uids = ds.X_dict["_uids_"]

if len(res_idxs) != len(dataset_uids):
    raise ValueError(
        f'Got {len(res_idxs)} predictions for {len(dataset_uids)} dataset rows; '
        'cannot align predictions with image ids.'
    )

res_image_ids = {}
for idx, (res_idx, uid) in enumerate(res_idxs):
    if uid != dataset_uids[idx]:
        raise ValueError(
            f'Prediction uid {uid!r} does not match dataset uid '
            f'{dataset_uids[idx]!r} at row {idx}.'
        )
    res_image_ids[dataset_image_ids[idx]] = {
        'uid': uid,                # identical to ds.X_dict["_uids_"][idx] (checked above)
        'res_idx': res_idx,        # position of this example inside `res`
        'y_prob': y_prob[res_idx]  # predicted probabilities for this image
    }

### 2.) Getting labels for the entire CheXNet dataset

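Here we create a copy of nih_labels.csv, add a dummy drain column (for compatibility with DrainDetectionDataset), then run the model. The cell below then writes the model's predictions into the labels table: `drain_weak` is the predicted probability at index 1 of each output, and `drain` is that probability thresholded at 0.5.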

In [90]:
# Start from the original NIH label table and look up a prediction for every image.
labels_df = pd.read_csv('/dfs/scratch1/senwu/mmtl/emmental-tutorials/chexnet/data/nih_labels.csv')
image_ids = list(labels_df['Image Index'])

# Entry 1 of each prediction's `y_prob` is used as the drain score; a KeyError here
# means an image in the label table has no prediction in `res_image_ids`.
drain_scores = [res_image_ids[image_id]['y_prob'][1] for image_id in image_ids]
drain_preds = {
    'drain': [int(score > 0.5) for score in drain_scores],  # hard label at a 0.5 cut-off
    'drain_weak': drain_scores,                             # raw score kept as a weak label
}

labels_df['drain'] = drain_preds['drain']
labels_df['drain_weak'] = drain_preds['drain_weak']

# NOTE(review): the index is written too (pandas default), so the saved file gains an
# unnamed leading column when it is read back -- confirm downstream readers expect that.
labels_df.to_csv('/lfs/1/gangus/repositories/pytorch-classification/drain_detector/data/chexnet/by-patient-id/split/all_v2.csv')

Unnamed: 0,Image Index,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,Cardiomegaly,Emphysema,Effusion,Hernia,...,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation,fold,drain_weak,drain
0,00000001_000.png,0,1,058Y,M,PA,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.090974,0
1,00000001_001.png,1,1,058Y,M,PA,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.003500,0
2,00000001_002.png,2,1,058Y,M,PA,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.835209,1
3,00000002_000.png,0,2,081Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.026913,0
4,00000003_000.png,0,3,081Y,F,PA,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.013373,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,1,30801,039Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,train,0.999795,1
112116,00030802_000.png,0,30802,029Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,val,0.700383,1
112117,00030803_000.png,0,30803,042Y,F,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.001643,0
112118,00030804_000.png,0,30804,030Y,F,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train,0.091752,0


In [83]:
# Load the train.csv split and key it by image file name, so it can be lined up
# with the predictions by 'Image Index'.
train_csv_path = '/lfs/1/gangus/repositories/pytorch-classification/drain_detector/data/chexnet/by-patient-id/split/train.csv'
train_table = pd.read_csv(train_csv_path)
train_df = train_table.set_index('Image Index')

In [84]:
# Display the frame as the cell output to inspect its columns, in particular the
# `drain` column that is compared against the model's predictions below.
train_df

Unnamed: 0_level_0,Unnamed: 0,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,Cardiomegaly,Emphysema,Effusion,Hernia,...,Nodule,Atelectasis,Pneumothorax,Pleural_Thickening,Pneumonia,Fibrosis,Edema,Consolidation,fold,drain
Image Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00000118_002.png,57,2,118,073Y,M,PA,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00000118_003.png,58,3,118,073Y,M,PA,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00000118_004.png,59,4,118,073Y,M,PA,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00000118_005.png,60,5,118,073Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00000118_009.png,64,9,118,073Y,M,AP,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00030637_010.png,11177,10,30637,049Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00030637_011.png,11178,11,30637,049Y,M,PA,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00030637_012.png,11179,12,30637,049Y,M,PA,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,1
00030679_002.png,11200,2,30679,063Y,M,PA,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,val,0


In [88]:
# Row-by-row check of the thresholded predictions against the `drain` labels in train_df.
# `labels_df` is read with a default integer index and keeps 'Image Index' as a column,
# while `train_df` is indexed by image file name. Index by 'Image Index' here so the
# lookup works on a fresh Restart & Run All; .loc[rows, column] also avoids chained indexing.
labels_df.set_index('Image Index').loc[train_df.index, 'drain'] == train_df['drain']

Image Index
00000118_002.png    True
00000118_003.png    True
00000118_004.png    True
00000118_005.png    True
00000118_009.png    True
                    ... 
00030637_010.png    True
00030637_011.png    True
00030637_012.png    True
00030679_002.png    True
00030679_003.png    True
Name: drain, Length: 404, dtype: bool

NameError: name 'asdf' is not defined