In [1]:
import torch
from torchvision import transforms
import torch.nn.functional as F
from torch.utils.data import DataLoader

from datasets.mpr_dataset import MPR_Dataset,MPR_Dataset_H5
from tqdm import tqdm_notebook as tqdm
import yaml
import re
import os
import cv2
import copy
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN
from imblearn.over_sampling import (SMOTE, BorderlineSMOTE, SVMSMOTE, SMOTENC,
                                    KMeansSMOTE)
from imblearn.over_sampling import RandomOverSampler
from imblearn.base import BaseSampler
from imblearn.ensemble import BalancedRandomForestClassifier

from models import ShuffleNetv2

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import xgboost as xgb
from sklearn import svm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

import lightgbm as lgb
from imblearn.datasets import make_imbalance

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks

In [2]:
def convert_string_to_np_array(inp_str):
    numeric_const_pattern = '[-+]? (?: (?: \d* \. \d+ ) | (?: \d+ \.? ) )(?: [Ee] [+-]? \d+ ) ?'
    rx = re.compile(numeric_const_pattern, re.VERBOSE)

    return np.array([float(x) for x in rx.findall(inp_str)])

# Data Loading

In [3]:
train_df = pd.read_csv('ensemble/xgboost_train.csv')
train_df.drop(['Unnamed: 0', 'index'], axis=1, inplace=True)
train_df['PRED'] = train_df['PRED'].apply(convert_string_to_np_array)
test_df = pd.read_csv('ensemble/xgboost_test.csv')
test_df.drop(['Unnamed: 0', 'index'], axis=1, inplace=True)
test_df['PRED'] = test_df['PRED'].apply(convert_string_to_np_array)

# Aggregate the datatable

In [4]:
def get_data(grouped_by_patients):
    
    X = list()
    y = list()
    
    for patient in tqdm(grouped_by_patients['PATIENT'].unique()):
        cur_patient = grouped_by_patients.get_group(patient[0])
        grouped_by_section = cur_patient.groupby('ARTERY_SECTION')        

        for section in grouped_by_section['ARTERY_SECTION'].unique():
            all_images_per_section = grouped_by_section.get_group(section[0])

            train_data = np.stack(all_images_per_section['PRED'], axis=0)
            label = all_images_per_section['LABEL'].iloc[0]

            if len(train_data) ==50:
                X.append(train_data.ravel())
                y.append(label)  
    return np.stack(X), np.array(y)

In [5]:
X_train, y_train = get_data(train_df.groupby('PATIENT'))
X_test, y_test = get_data(test_df.groupby('PATIENT'))

HBox(children=(IntProgress(value=0, max=708), HTML(value='')))




HBox(children=(IntProgress(value=0, max=105), HTML(value='')))




# Train

# LIB FOR IMBALANCED LEARNING
https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/

In [6]:
# model = xgb.XGBClassifier()
# model = LinearDiscriminantAnalysis()
# model = KNeighborsClassifier()
# model = DecisionTreeClassifier()

# 85 79 81
# model = xgb.XGBClassifier(learning_rate=0.001, n_estimators=3, max_depth=2,
#                     min_child_weight=2, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
#                     nthread=4, scale_pos_weight=1, seed=32)

# 85 81 82
# model = xgb.XGBClassifier(learning_rate=0.0001, n_estimators=100, max_depth=4,
#                     min_child_weight=2,  subsample=0.6, colsample_bytree=1.0,
#                     nthread=4, scale_pos_weight=1, seed=42)

# 69 71 70 / 82 81 81 
# model = BalancedRandomForestClassifier(max_depth=3)  # {0: 1, 1: 0.5, 2:0.4}

In [7]:
sampling_strategy = {0: 2346, 1:670*3, 2:226*10}
ros = RandomOverSampler(sampling_strategy=sampling_strategy)
X_res, y_res = ros.fit_resample(X_train, y_train)

unique, counts = np.unique(y_res, return_counts=True)
dict(zip(unique, counts))

{0: 2346, 1: 2010, 2: 2260}

In [8]:
# Train
model.fit(X_res, y_res)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0,
              learning_rate=1e-05, max_delta_step=0, max_depth=4,
              min_child_weight=2, missing=None, n_estimators=100, n_jobs=1,
              nthread=4, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              silent=None, subsample=0.6, verbosity=1)

In [10]:
# LightGBM params: https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst

param = {}
param['application'] = 'multiclass'
param['num_class'] = '3'
param['learning_rate'] = 0.01
param['feature_fraction'] = 0.4
param['bagging_fraction'] = 0.4
param['bagging_freq'] = 1
param['max_depth'] = 5
param['num_threads'] = 4
param["verbose"] = 0
param["metric"] = ''

lgb_train = lgb.Dataset(X_res, label=y_res)
lgb_val = lgb.Dataset(X_test, label=y_test)

model = lgb.train(param, lgb_train, 1000, valid_sets=[lgb_train, lgb_val], early_stopping_rounds=10,  verbose_eval=0)

In [11]:
# For LightGBM

y_pred = model.predict(X_train)
predictions = [round(np.argmax(value)) for value in y_pred]
accuracy = accuracy_score(y_train, predictions)
print('TRAIN: ')
print(classification_report(predictions, y_train))

y_pred = model.predict(X_test)
predictions = [round(np.argmax(value)) for value in y_pred]
print('TEST:')
print(classification_report(predictions, y_test))

TRAIN: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2339
           1       1.00      0.99      1.00       676
           2       1.00      1.00      1.00       227

    accuracy                           1.00      3242
   macro avg       1.00      1.00      1.00      3242
weighted avg       1.00      1.00      1.00      3242

TEST:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89       367
           1       0.52      0.69      0.59        84
           2       0.55      0.60      0.58        35

    accuracy                           0.81       486
   macro avg       0.67      0.71      0.69       486
weighted avg       0.83      0.81      0.82       486



In [53]:
y_pred = model.predict(X_train)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_train, predictions)
print('TRAIN: ')
print(classification_report(predictions, y_train))

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
print('TEST:')
print(classification_report(predictions, y_test))

TRAIN: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2351
           1       0.99      1.00      0.99       663
           2       1.00      0.99      1.00       228

    accuracy                           1.00      3242
   macro avg       0.99      0.99      0.99      3242
weighted avg       1.00      1.00      1.00      3242

TEST:
              precision    recall  f1-score   support

           0       0.95      0.84      0.89       380
           1       0.46      0.72      0.57        72
           2       0.55      0.62      0.58        34

    accuracy                           0.81       486
   macro avg       0.66      0.73      0.68       486
weighted avg       0.85      0.81      0.82       486



In [11]:
y_pred = model.predict(X_res)
predictions = [round(value) for value in y_pred]
accuracy = accuracy_score(y_res, predictions)
print('TRAIN: ')
print(classification_report(predictions, y_res))

y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
print('TEST:')
print(classification_report(predictions, y_test))

TRAIN: 
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2330
           1       1.00      0.99      1.00      2021
           2       1.00      1.00      1.00      2265

    accuracy                           1.00      6616
   macro avg       1.00      1.00      1.00      6616
weighted avg       1.00      1.00      1.00      6616

TEST:
              precision    recall  f1-score   support

           0       0.92      0.84      0.88       367
           1       0.52      0.67      0.58        87
           2       0.55      0.66      0.60        32

    accuracy                           0.80       486
   macro avg       0.66      0.72      0.69       486
weighted avg       0.83      0.80      0.81       486



# Predict on test

In [12]:
grouped_by_patients = test_df.groupby('PATIENT')

predicts = []
indexes = []

for patient in tqdm(grouped_by_patients['PATIENT'].unique()):
    cur_patient = grouped_by_patients.get_group(patient[0])
    grouped_by_section = cur_patient.groupby('ARTERY_SECTION')        
    
    for section in grouped_by_section['ARTERY_SECTION'].unique():
        all_images_per_section = grouped_by_section.get_group(section[0])
        
        test_data = np.stack(all_images_per_section['PRED'], axis=0)
        
        # Predict
        test_data = np.expand_dims(test_data.ravel(), axis=0)
        pred = model.predict(test_data)
        predicts += list(pred)*50
        indexes += list(all_images_per_section.index)
        
test_df['PRED'] = pd.Series(data=predicts, index=indexes)

HBox(children=(IntProgress(value=0, max=105), HTML(value='')))




In [15]:
def calculate_metrics(col_section, col_ids, col_preds, col_labels):
    """
    Calculate final auc and f1 metrics on three levels: per patient, per section and per artery
    :return: {dict} each metric as a key and its calculated metric as a value
    """
    assert len(col_section) == len(col_ids) == len(col_preds) == len(col_labels)

    metrics = {'ACC_section': 0, 'ACC_patient': 0, 'ACC_artery': 0, 'F1_section': 0, 'F1_patient': 0, 'F1_artery': 0}
    dict_artery = {'LAD': ['D-1', 'D-2', 'LAD', 'D-3', '2D-2', 'D-1Original', 'LADOriginal', 'D-4'],
                   'LCX': ['LCX', 'OM-2', 'OM-1', 'OM-3', 'OM', 'LCX-PLB', 'LCX-PDA', 'PLV_LCX', 'PDA_LCX'],
                   'RCA': ['RCA', 'RCA-PLB', 'RCA-PDA', 'PLV_RCA']}

    df = pd.concat([col_ids, col_section, col_preds, col_labels], axis=1)
    df = df.rename(columns={col_section.name: 'section', col_ids.name: 'patient', col_preds.name:
        'preds', col_labels.name: 'labels'})
    df['artery'] = df['section'].apply(lambda x: [k for k in dict_artery.keys() if x in dict_artery[k]][0])
    
#     df['preds'] = df['preds'].apply(lambda x: np.argmax(x))   # for lightgbm
    
    # SECTION
    section_labels = df[['preds', 'labels', 'section', 'artery', 'patient']].groupby(['patient', 'section']).agg(
        lambda x: max(x))
    preds_section = df[['preds', 'labels', 'section', 'artery', 'patient']].groupby(['patient', 'section']).agg(
        lambda x: x.value_counts().index[0])
    acc = accuracy_score(preds_section['preds'], section_labels['labels'])
    f1 = f1_score(preds_section['preds'], section_labels['labels'], average='weighted')
    metrics['ACC_section'], metrics['F1_section'] = acc, f1

    # ARTERY
    sect = section_labels.reset_index()
    artery_labels = sect.groupby(['patient', 'artery']).agg(lambda x: max(x))['labels']
    preds_artery = preds_section.reset_index().groupby(['patient', 'artery']).agg(lambda x: max(x))['preds']  
    acc = accuracy_score(preds_artery, artery_labels)
    f1 = f1_score(preds_artery, artery_labels, average='weighted')
    metrics['ACC_artery'], metrics['F1_artery'] = acc, f1

    # PATIENT
    art = artery_labels.reset_index()
    patient_labels = art.groupby(['patient']).agg(lambda x: max(x))['labels']
    preds_patient = preds_artery.reset_index().groupby(['patient']).agg(lambda x: max(x))['preds'] 
    acc = accuracy_score(preds_patient, patient_labels)
    f1 = f1_score(preds_patient, patient_labels, average='weighted')
    metrics['ACC_patient'], metrics['F1_patient'] = acc, f1

    return metrics

In [16]:
calculate_metrics(test_df['ARTERY_SECTION'], test_df['PATIENT'], test_df['PRED'], test_df['LABEL'])

{'ACC_artery': 0.8063380281690141,
 'ACC_patient': 0.7428571428571429,
 'ACC_section': 0.8065843621399177,
 'F1_artery': 0.8143026727304132,
 'F1_patient': 0.7441044471644227,
 'F1_section': 0.816159563736268}

In [17]:
calculate_metrics(test_df['ARTERY_SECTION'], test_df['PATIENT'], test_df['PRED'], test_df['LABEL'])

{'ACC_artery': 0.8133802816901409,
 'ACC_patient': 0.7619047619047619,
 'ACC_section': 0.7962962962962963,
 'F1_artery': 0.8171297491246654,
 'F1_patient': 0.7618417990938829,
 'F1_section': 0.800808481586288}

In [107]:
calculate_metrics(test_df['ARTERY_SECTION'], test_df['PATIENT'], test_df['PRED'], test_df['LABEL'])

['LAD' 'RCA' 'LCX']


{'ACC_artery': 0.8063380281690141,
 'ACC_patient': 0.7619047619047619,
 'ACC_section': 0.8065843621399177,
 'F1_artery': 0.8208258119764523,
 'F1_patient': 0.7686028257456828,
 'F1_section': 0.8212599536266535}

In [None]:
test_df.to_csv('WIMLsubmitted_predictions.csv', index=False)