<a href="https://colab.research.google.com/github/gowun/BladderCancer_AMC/blob/master/Notebooks/classifier_check_20211001.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
from google.colab import drive

import pandas as pd
import numpy as np

# Show full cell contents in DataFrame output. ``None`` is the documented
# "no limit" value; the legacy ``-1`` has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# Colab-specific setup: authenticate the user, then mount Google Drive.
auth.authenticate_user()
drive.mount('/content/gdrive')
# Project folder on the mounted Drive; every load/save below is relative to it.
home_path = '/content/gdrive/My Drive/BladderCancer_AMC/'

In [None]:
# Extra packages installed on the Colab runtime (unpinned, so the latest
# release is fetched on each run). Neither is imported directly in this
# notebook; presumably the ModelingTools modules need them -- TODO confirm.
!pip install lifelines
!pip install shap

In [None]:
# Clone the project repository into the working directory so that its
# ModelingTools package can be imported by the following cell.
!git clone https://github.com/gowun/BladderCancer_AMC.git

In [None]:
from BladderCancer_AMC.ModelingTools import utils as ut
from BladderCancer_AMC.ModelingTools import clustering as cl
from BladderCancer_AMC.ModelingTools import tree_modeling as tm
from BladderCancer_AMC.ModelingTools import linear_modeling as lm
from BladderCancer_AMC.ModelingTools import figure as fe

In [None]:
# Cohort names; position k corresponds to the k-th frame in each
# datasets[<normalizer>] list.
data_labels = ['MDA_MVAC', 'MDA_DDMVAC', 'Meta_Datasets', 'AMC']
# Both inputs are pickles stored in the Drive project folder (pickle files
# should only ever be loaded from a trusted location).
classifiers = ut.load_data(f'{home_path}intersect_classifiers.pkl', 'pickle')
datasets = ut.load_data(f'{home_path}scaled_datasets_3mths_20211001.pkl', 'pickle')

In [None]:
def random_oversampling(idx_list, n, seed=1234):
  """Draw ``n`` entries from ``idx_list`` uniformly at random, with replacement.

  NumPy's global RNG is re-seeded with ``seed`` on every call, so the same
  ``idx_list`` / ``n`` / ``seed`` combination always yields the same draw.
  Be aware that this resets the global random state as a side effect.

  Args:
    idx_list: 1-D sequence of candidate values (row positions in this notebook).
    n: number of draws; may exceed ``len(idx_list)`` since sampling is with
      replacement.
    seed: value passed to ``np.random.seed``; the default reproduces the
      previously hard-coded seed.

  Returns:
    ``numpy.ndarray`` of length ``n`` containing the sampled entries.
  """
  np.random.seed(seed)
  return np.random.choice(idx_list, n)

# Number of rows every cohort is resampled to.
MAX_ROW = 100000
# One array of resampled row positions per cohort label. Frame lengths are
# taken from the 'power' variant of the datasets.
over_idx_dict = {
  data_labels[pos]: random_oversampling(list(range(len(frame))), MAX_ROW)
  for pos, frame in enumerate(datasets['power'])
}

In [None]:
%%time
topK = 10
### Simple random forest for variable selection when a classifier includes more than topK variables
### The same runs are scored per normalizer so the best one can be selected afterwards
methods = ['power', 'standard', 'rankgauss']
sum_score = {method: 0 for method in methods}
columns = {method: dict() for method in methods}
for method in methods:
  for cls_name, features in classifiers.items():
    if len(features) <= topK:
      continue  # short signatures need no selection
    for pos, label in enumerate(data_labels):
      frame = datasets[method][pos]
      picks = over_idx_dict[label]
      X = frame[features].iloc[picks]
      y = np.array(frame['response'])[picks]
      sample_leaf = round(MAX_ROW / len(frame) * 3/2)
      result = tm.random_forest_with_performance([X, y], 50, 3, sample_leaf)
      scores = result['performance']
      # Running score per normalizer: AUC + PRAUC - |R2|.
      sum_score[method] = sum_score[method] + scores['AUC'] + scores['PRAUC'] - abs(scores['R2'])
      # Keep the first topK rows of the importance table (presumably sorted by
      # importance inside tree_modeling -- TODO confirm).
      columns[method][f'{cls_name}_{label}'] = result['feature importance']['feature'].values[:topK]

In [None]:
# Accumulated AUC + PRAUC - |R2| for each normalization method.
sum_score

In [None]:
#BEST_NOR = methods[np.argmax(list(sum_score.values()))]
#print(BEST_NOR)
# The normalizer is pinned by hand; the two commented lines above would pick
# the one with the highest sum_score instead.
BEST_NOR = 'standard'

# Union feature set per classifier: for long signatures, merge the per-cohort
# selections; short signatures are kept whole.
final_columns = {}
for cls, feats in classifiers.items():
  if len(feats) > topK:
    # Look up the exact '<classifier>_<cohort>' keys. Prefix matching with
    # startswith() would also pull in selections of any other classifier whose
    # name merely begins with this one.
    merged = set()
    for label in data_labels:
      merged.update(columns[BEST_NOR][cls + '_' + label])
    final_columns[cls] = sorted(merged)
  else:
    final_columns[cls] = sorted(feats)

# Pad every column with '' to a common length so they fit one DataFrame.
max_len = max(map(len, final_columns.values()), default=0)
final_csv = dict()
for cls, feats in final_columns.items():
  print(cls, len(classifiers[cls]), len(feats))
  final_csv[cls] = list(feats) + [''] * (max_len - len(feats))
ut.save_data(pd.DataFrame(final_csv), home_path + 'final_classifiers.csv', 'csv')

# Per-cohort feature sets: for long signatures use the features selected on
# that cohort; short signatures are kept whole. One CSV is written per cohort.
topK_columns = {}
for l in data_labels:
  print(l)
  topK_columns[l] = {}
  for cls, feats in classifiers.items():
    # Same threshold as the selection step (topK) rather than a literal 10.
    if len(feats) <= topK:
      topK_columns[l][cls] = sorted(feats)
    else:
      topK_columns[l][cls] = columns[BEST_NOR][cls + '_' + l]
  # Pad every column with '' to a common length so they fit one DataFrame.
  max_len = max(map(len, topK_columns[l].values()), default=0)
  topK_csv = {
    cls: list(feats) + [''] * (max_len - len(feats))
    for cls, feats in topK_columns[l].items()
  }
  # File name keeps its historical 'top10_' prefix so existing outputs still match.
  ut.save_data(pd.DataFrame(topK_csv), home_path + f'top10_{l}_classifiers.csv', 'csv')

In [None]:
from itertools import permutations
# Every ordered (train, validation) pair of distinct cohort positions.
orders = list(permutations(range(len(data_labels)), 2))
orders

In [None]:
from scipy.stats import ttest_ind
from itertools import chain

def confirm_by_ttest(arr1, arr2, pvalue=0.05):
  """Compare two samples with an unequal-variance (Welch) t-test.

  Args:
    arr1: first sample of values.
    arr2: second sample of values.
    pvalue: significance threshold.

  Returns:
    Tuple ``(differ, outcome)``. ``differ`` is ``True`` when the test's p-value
    is less than or equal to ``pvalue`` and ``False`` otherwise (including a
    NaN p-value); ``outcome`` is the object returned by ``ttest_ind``.
  """
  outcome = ttest_ind(arr1, arr2, equal_var=False)
  return bool(outcome.pvalue <= pvalue), outcome

def modeling_with_various_features(X_tr, y_tr, X_val, y_val, fts_dict, md_mode, Xy_ts_sets=None):
  """Fit one model per candidate feature set and screen each with t-tests.

  For every entry of ``fts_dict`` a model is trained on the training data
  restricted to that entry's columns, then applied to the validation data and
  to each extra test set. A feature set is added to ``best_classifiers`` when
  all four t-test checks on its predicted probabilities pass (see the inline
  comments in the loop).

  Args:
    X_tr: training features, indexed as ``X_tr[columns]``.
    y_tr: training labels; compared against 0 and 1.
    X_val: validation features, indexed the same way.
    y_val: validation labels; compared against 0 and 1.
    fts_dict: mapping of feature-set name to the column names to use.
    md_mode: ``'logistic'``, ``'decision'`` or ``'random'``; selects the
      ``lm`` / ``tm`` training helper.
    Xy_ts_sets: optional iterable of ``(X, y)`` pairs that are only scored.

  Returns:
    Tuple ``(result, comp)``. ``result`` is a dict with keys ``'models'``
    (helper output per feature set), ``'scores_tr_val'`` (list of probability
    arrays: train, validation, then one per test set), ``'ttest_vals'`` (the
    four boolean checks) and ``'best_classifiers'`` (names passing all checks).
    ``comp`` is a DataFrame of metrics for the passing feature sets, or
    ``None`` when none passed.

  Raises:
    ValueError: if ``md_mode`` is not one of the supported values.
  """
  supported_modes = ('logistic', 'decision', 'random')
  if md_mode not in supported_modes:
    # Fail early and clearly; an unknown mode used to surface only as an
    # unbound-variable error inside the loop.
    raise ValueError(f'md_mode must be one of {supported_modes}, got {md_mode!r}')

  # Materialise once so a one-shot iterable can be reused for every feature set.
  extra_sets = [] if Xy_ts_sets is None else list(Xy_ts_sets)
  metrics = ['AUC', 'PRAUC', 'R2']
  result = {'models': dict(), 'scores_tr_val': dict(), 'ttest_vals': dict(), 'best_classifiers': []}
  perf_by_name = dict()
  y_tr_arr = np.array(y_tr)
  y_val_arr = np.array(y_val)

  for k, filtered in fts_dict.items():
    if md_mode == 'logistic':
      fitted = lm.logiReg_model_with_performance([X_tr[filtered], y_tr], 10, class_weight='balanced')
    elif md_mode == 'decision':
      fitted = tm.tree_model_with_performance([X_tr[filtered], y_tr], 3, 3, class_weight='balanced')
    else:  # 'random'
      fitted = tm.random_forest_with_performance([X_tr[filtered], y_tr], 50, 3, 3)
    model = fitted['model']

    prob_tr = model.predict_proba(X_tr[filtered])[:, 1]
    pred_val = model.predict(X_val[filtered])
    prob_val = model.predict_proba(X_val[filtered])[:, 1]

    # Score every extra test set: keep the probabilities and the metrics.
    ts_probs = []
    ts_perf = []
    for X_ts, y_ts in extra_sets:
      pred_ts = model.predict(X_ts[filtered])
      prob_ts = model.predict_proba(X_ts[filtered])[:, 1]
      ts_probs.append(prob_ts)
      ts_perf.append(tm.compute_performance(y_ts, pred_ts, prob_ts))
    result['scores_tr_val'][k] = [prob_tr, prob_val] + ts_probs

    ### Score validation
    # Split the predicted probabilities by true label (0 and 1).
    div_tr = [prob_tr[y_tr_arr == label] for label in range(2)]
    div_val = [prob_val[y_val_arr == label] for label in range(2)]
    # 1. Within one dataset, is the label-0 vs. label-1 (NR vs. R) difference
    #    significant? Checked on train and on validation.
    # 2. Are the label-0 scores of train and validation similar?
    # 3. Are the label-1 scores of train and validation similar?
    result['ttest_vals'][k] = [
      confirm_by_ttest(div_tr[0], div_tr[1])[0],
      confirm_by_ttest(div_val[0], div_val[1])[0],
      not confirm_by_ttest(div_tr[0], div_val[0])[0],
      not confirm_by_ttest(div_tr[1], div_val[1])[0],
    ]
    if all(result['ttest_vals'][k]):
      result['best_classifiers'].append(k)
    result['models'][k] = fitted

    val_perf = tm.compute_performance(y_val, pred_val, prob_val)
    # Row layout: for each metric -> train, validation, then each test set.
    row = []
    for met in metrics:
      row += [fitted['performance'][met], val_perf[met]] + [p[met] for p in ts_perf]
    perf_by_name[k] = row
    print(k, row)

  if not result['best_classifiers']:
    return result, None

  # Column names follow the row layout, e.g. tr_AUC, val_AUC, ts0_AUC, ...
  splits = ['tr', 'val'] + [f'ts{j}' for j in range(len(extra_sets))]
  cols = [f'{split}_{met}' for met in metrics for split in splits]
  comp = pd.DataFrame(
    [perf_by_name[name] for name in result['best_classifiers']],
    columns=cols,
    index=result['best_classifiers'],
  )
  print(comp)
  return result, comp

In [None]:
def validated_models(orders, datasets, best_nor, col_dict, data_labels, over_idx_dict, mode):
  """Train and screen models for every (train, validation) cohort pair.

  For each pair in ``orders`` the first cohort (oversampled through
  ``over_idx_dict``) is the training set, the second is the validation set and
  all remaining cohorts are extra test sets. Pairs that yield at least one
  "best classifier" are kept, and the kept results are saved through
  ``ut.save_data`` under the notebook-level ``home_path``.

  Args:
    orders: iterable of ``(train_pos, validation_pos)`` index pairs.
    datasets: mapping of normalizer name to a list of cohort frames, each
      having a ``'response'`` column.
    best_nor: key of ``datasets`` to use.
    col_dict: feature sets. For ``'union'`` it is passed on as is; for
      ``'topK'`` it is first indexed by the training cohort's label.
    data_labels: cohort labels, positionally aligned with the cohort frames.
    over_idx_dict: mapping of cohort label to resampled row positions.
    mode: ``'<algorithm>_<feature mode>'``, e.g. ``'logistic_union'``. The
      feature mode must be ``'union'`` or ``'topK'``.

  Returns:
    ``(models, comp)``: ``models`` maps ``'<train>-><validation>'`` to the
    output of ``modeling_with_various_features``; ``comp`` is the row-wise
    concatenation of the per-pair metric tables with the pair prepended to each
    index entry. ``(None, None)`` when no pair produced a best classifier.

  Raises:
    ValueError: if the feature mode is not ``'union'`` or ``'topK'`` (or if
      ``mode`` does not split into exactly two parts).
  """
  mode_al, mode_cls = mode.split('_')
  if mode_cls not in ('union', 'topK'):
    # An unknown feature mode used to leave ``cols`` undefined in the loop.
    raise ValueError(f"feature mode must be 'union' or 'topK', got {mode_cls!r}")

  # Derive the cohort positions from the labels instead of assuming four.
  all_positions = set(range(len(data_labels)))
  models = dict()
  for o1, o2 in orders:
    cohorts = datasets[best_nor]
    tr_picks = over_idx_dict[data_labels[o1]]
    X_tr = cohorts[o1].iloc[tr_picks]
    y_tr = np.array(cohorts[o1]['response'])[tr_picks]
    X_val, y_val = cohorts[o2], np.array(cohorts[o2]['response'])
    ts_idxs = sorted(all_positions - {o1, o2})
    Xy_ts_sets = [[cohorts[i], np.array(cohorts[i]['response'])] for i in ts_idxs]
    cols = col_dict if mode_cls == 'union' else col_dict[data_labels[o1]]
    total = modeling_with_various_features(X_tr, y_tr, X_val, y_val, cols, mode_al, Xy_ts_sets)
    print(o1, o2, len(total[0]['best_classifiers']), ts_idxs)
    if len(total[0]['best_classifiers']) > 0:
      models['->'.join([data_labels[o1], data_labels[o2]])] = total

  if not models:
    return None, None

  # Use the same 'pickle' format tag as every other pickle load/save in this
  # notebook; this call alone used 'pkl'.
  ut.save_data(models, home_path + f'{mode_cls}_{mode_al}.pkl', 'pickle')
  tables = []
  for key in models.keys():
    table = models[key][1].copy()
    table.index = [key + ',' + name for name in table.index]
    tables.append(table)
  comp = pd.concat(tables)
  print(comp)
  return models, comp

In [None]:
%%time
# Logistic-regression models for every (train, validation) cohort pair: once
# with the union feature sets, once with the per-cohort top-K feature sets.
log_union, log_union_comp = validated_models(orders, datasets, BEST_NOR, final_columns, data_labels, over_idx_dict, 'logistic_union')
log_topK, log_topK_comp = validated_models(orders, datasets, BEST_NOR, topK_columns, data_labels, over_idx_dict, 'logistic_topK')

In [None]:
# Persist the resampling indices and both logistic-regression results to Drive.
ut.save_data(over_idx_dict, home_path + 'over_idx_dict_20211001.pkl', 'pickle')
ut.save_data([log_union, log_union_comp], home_path + 'log_union_info_20211001.pkl', 'pickle')
ut.save_data([log_topK, log_topK_comp], home_path + 'log_topK_info_20211001.pkl', 'pickle')

In [None]:
# Metrics of the logistic / union models that passed screening (None if none did).
log_union_comp

In [None]:
# Metrics of the logistic / top-K models that passed screening (None if none did).
log_topK_comp

In [None]:
%%time
# Decision-tree models for every (train, validation) cohort pair: once with the
# union feature sets, once with the per-cohort top-K feature sets.
dt_union, dt_union_comp = validated_models(orders, datasets, BEST_NOR, final_columns, data_labels, over_idx_dict, 'decision_union')
dt_topK, dt_topK_comp = validated_models(orders, datasets, BEST_NOR, topK_columns, data_labels, over_idx_dict, 'decision_topK')

In [None]:
# Metrics of the decision-tree / union models that passed screening (None if none did).
dt_union_comp

In [None]:
# Metrics of the decision-tree / top-K models that passed screening (None if none did).
dt_topK_comp

In [None]:
%%time
# Random-forest models for every (train, validation) cohort pair: once with the
# union feature sets, once with the per-cohort top-K feature sets.
rf_union, rf_union_comp = validated_models(orders, datasets, BEST_NOR, final_columns, data_labels, over_idx_dict, 'random_union')
rf_topK, rf_topK_comp = validated_models(orders, datasets, BEST_NOR, topK_columns, data_labels, over_idx_dict, 'random_topK')

In [None]:
# Metrics of the random-forest / union models that passed screening (None if none did).
rf_union_comp

In [None]:
# Metrics of the random-forest / top-K models that passed screening (None if none did).
rf_topK_comp

In [None]:
def draw_box_plots(tag, cls, model, cols, datasets, data_labels):
  """Box-plot one model's predicted probabilities per cohort and response.

  Args:
    tag: ``'<first cohort>-><second cohort>'``, optionally followed by
      ``',<anything>'``; only the two cohort labels are used.
    cls: classifier name, used in the plot title only.
    model: fitted estimator exposing ``predict_proba``.
    cols: feature columns passed to the model.
    datasets: list of cohort frames aligned with ``data_labels``, each with a
      ``'response'`` column.
    data_labels: cohort labels.

  The two cohorts named in ``tag`` are plotted first with the role tags 'M' and
  'V' (presumably the modelling and validation cohorts -- TODO confirm); all
  remaining cohorts follow as 'T0', 'T1', ... Rows whose response equals 1.0
  are labelled '..._R', all others '..._NR'.
  """
  tags = tag.split('->')
  tags = [tags[0]] + tags[1].split(',')
  idxs = [data_labels.index(name) for name in tags[:2]]
  # Remaining cohorts are derived from data_labels rather than assuming four.
  r_idxs = sorted(set(range(len(data_labels))) - set(idxs))
  roles = ['M', 'V'] + [f'T{k}' for k in range(len(r_idxs))]

  scores = []
  labels = []
  for i, j in enumerate(idxs + r_idxs):
    yy = np.array(datasets[j]['response'])
    scores.append(model.predict_proba(datasets[j][cols])[:, 1])
    prefix = f'{i}.{roles[i]}_{data_labels[j]}'
    labels.append(np.where(yy == 1.0, prefix + '_R', prefix + '_NR'))

  fe.plot_box(np.concatenate(scores), 'y', np.concatenate(labels), tag + ' + ' + cls)

def draw_all(models, datasets, data_labels):
  """Draw a box plot for every best classifier of every cohort pair.

  Args:
    models: mapping of ``'<train>-><validation>'`` to a pair whose first item
      is a dict with ``'best_classifiers'`` and ``'models'``; may be ``None``
      or empty, in which case nothing is drawn.
    datasets: list of cohort frames, forwarded to ``draw_box_plots``.
    data_labels: cohort labels, forwarded to ``draw_box_plots``.
  """
  if not models:
    # validated_models returns None when no model passed the screening.
    return
  for tag, entry in models.items():
    info = entry[0]
    for name in info['best_classifiers']:
      fitted = info['models'][name]
      draw_box_plots(tag, name, fitted['model'], list(fitted['columns']), datasets, data_labels)

In [None]:
# Box plots for the selected logistic / union models.
draw_all(log_union, datasets[BEST_NOR], data_labels)

In [None]:
# Box plots for the selected logistic / top-K models.
draw_all(log_topK, datasets[BEST_NOR], data_labels)

In [None]:
# Box plots for the selected decision-tree / union models.
draw_all(dt_union, datasets[BEST_NOR], data_labels)