<a href="https://colab.research.google.com/github/gowun/BladderCancer_AMC/blob/master/Notebooks/classifier_check.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
from google.colab import drive

import pandas as pd
import numpy as np

# Show full cell contents instead of truncating long strings.
# `None` means "no limit"; the old `-1` spelling was deprecated in pandas 1.0
# and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Authenticate the Colab user and mount Google Drive; all project files are
# read from / written to `home_path`.
auth.authenticate_user()
drive.mount('/content/gdrive')
home_path = '/content/gdrive/My Drive/BladderCancer_AMC/'

In [None]:
!pip install lifelines
!pip install shap

In [None]:
!git clone https://github.com/gowun/BladderCancer_AMC.git

In [None]:
from BladderCancer_AMC.ModelingTools import utils as ut
from BladderCancer_AMC.ModelingTools import clustering as cl
from BladderCancer_AMC.ModelingTools import tree_modeling as tm
from BladderCancer_AMC.ModelingTools import linear_modeling as lm
from BladderCancer_AMC.ModelingTools import figure as fe

In [None]:
# Cohort names; position i here corresponds to the i-th frame inside each
# entry of `datasets` (the cells below pair them up by index).
data_labels = ['MDA_MVAC', 'MDA_DDMVAC', 'Meta_Datasets', 'AMC']

# Both inputs are pickles stored under the mounted Drive folder.
classifier_path = f'{home_path}intersect_classifiers.pkl'
dataset_path = f'{home_path}scaled_datasets_3mths.pkl'
classifiers = ut.load_data(classifier_path, 'pickle')
datasets = ut.load_data(dataset_path, 'pickle')

In [None]:
def random_oversampling(idx_list, n):
  """Draw `n` entries from `idx_list` uniformly at random, with replacement.

  The global NumPy RNG is re-seeded with 1234 on every call, so the same
  (`idx_list`, `n`) always yields the same draw and the global RNG state
  after the call is always the same.

  Args:
    idx_list: candidate values (row positions in this notebook).
    n: number of values to draw.

  Returns:
    1-D numpy array of length `n` holding the drawn values.
  """
  np.random.seed(1234)
  drawn = np.random.choice(idx_list, size=n)
  return drawn

# Every cohort is oversampled (with replacement) to the same number of rows.
MAX_ROW = 100000

# Cohort label -> MAX_ROW row positions drawn from that cohort.
# The 'power' frames are only used for their lengths here; the positions are
# later applied to the frames of every normalization method.
over_idx_dict = {
  data_labels[pos]: random_oversampling(list(range(len(frame))), MAX_ROW)
  for pos, frame in enumerate(datasets['power'])
}

In [None]:
%%time
topK = 10
### Simple random forest for variable selection when a classifier includes more than topK variables
### And.. select the best normalizer
methods = ['power', 'standard', 'rankgauss']
sum_score = dict()   # method -> summed performance over all fitted forests
columns = dict()     # method -> {'<classifier>_<cohort>': first topK features}
for m in methods:
  sum_score[m] = 0
  columns[m] = dict()
  for cls_name, cls_vars in classifiers.items():
    # Classifiers that already have topK or fewer variables need no selection.
    if len(cls_vars) <= topK:
      continue
    for i, label in enumerate(data_labels):
      frame = datasets[m][i]
      picks = over_idx_dict[label]
      X = frame[cls_vars].iloc[picks]
      y = np.array(frame['response'])[picks]
      # Scales with the oversampling ratio (MAX_ROW / original row count).
      # Presumably a minimum-samples-per-leaf setting; confirm in tree_modeling.
      sample_leaf = round(MAX_ROW / len(frame) * 3/2)
      result = tm.random_forest_with_performance([X, y], 50, 3, sample_leaf)
      sum_score[m] += sum(result['performance'].values())
      # Assumes the 'feature importance' table is already sorted best-first; TODO confirm.
      columns[m]['_'.join([cls_name, label])] = result['feature importance']['feature'].values[:topK]

In [None]:
# Summed random-forest performance per normalization method; the next cell picks the largest.
sum_score

In [None]:
# Pick the normalization method with the highest summed score (first one wins ties).
# The scores are looked up by method name so the result does not depend on dict order.
BEST_NOR = methods[np.argmax([sum_score[m] for m in methods])]
print(BEST_NOR)

# Start from the original classifiers and replace only those that went through
# feature selection (more than topK variables) with the union of their
# per-cohort top features.
final_columns = classifiers.copy()
for cls_name, cls_vars in classifiers.items():
  if len(cls_vars) > topK:
    merged = []
    for label in data_labels:
      # Exact key lookup: a prefix match would also pick up another classifier
      # whose name merely starts with `cls_name`.
      merged.extend(columns[BEST_NOR]['_'.join([cls_name, label])])
    # Order-preserving de-duplication keeps the column order identical across
    # kernel restarts (a plain set() has no stable order for strings).
    final_columns[cls_name] = list(dict.fromkeys(merged))

# Pad every feature list with '' to a common length so that they fit as
# columns of one rectangular table.
max_len = max((len(cols) for cols in final_columns.values()), default=0)
final_csv = dict()
for cls_name, cls_vars in final_columns.items():
  print(cls_name, len(classifiers[cls_name]), len(cls_vars))
  final_csv[cls_name] = list(cls_vars) + [''] * (max_len - len(cls_vars))

In [None]:
# Write the padded feature table (one column per classifier) to the Drive folder as CSV.
ut.save_data(pd.DataFrame(final_csv), home_path + 'final_classifiers.csv', 'csv')

In [None]:
from itertools import permutations
# Every ordered pair of distinct cohort positions: (train position, validation position).
orders = list(permutations(range(len(data_labels)), 2))
orders

In [None]:
from scipy.stats import ttest_ind

def confirm_by_ttest(arr1, arr2, pvalue=0.05):
  """Run Welch's t-test (unequal variances) on two samples.

  Args:
    arr1: first sample.
    arr2: second sample.
    pvalue: significance threshold.

  Returns:
    (differ, test_result): `differ` is True when the test's p-value is at or
    below `pvalue`, otherwise False (including when the p-value is NaN);
    `test_result` is the object returned by `scipy.stats.ttest_ind`.
  """
  test_result = ttest_ind(arr1, arr2, equal_var=False)
  # bool() keeps the flag a plain Python bool rather than a numpy bool.
  differ = bool(test_result.pvalue <= pvalue)
  return differ, test_result

def modeling_with_various_features(X_tr, y_tr, X_ts, y_ts, fts_dict, md_mode):
  """Fit one model per feature set on the training data and check it on validation data.

  For every entry of `fts_dict` a model of type `md_mode` is fitted on the
  training columns, both data sets are scored, and four Welch t-test checks
  are run on the predicted class-1 probabilities. A feature set is "best"
  when all four checks pass.

  Args:
    X_tr: training frame; indexed with each feature list of `fts_dict`.
    y_tr: training labels (0/1), row-aligned with `X_tr`.
    X_ts: validation frame, indexed the same way as `X_tr`.
    y_ts: validation labels (0/1), row-aligned with `X_ts`.
    fts_dict: mapping of classifier name -> list of feature columns.
    md_mode: 'logistic', 'decision' or 'random'.

  Returns:
    (result, comp) where `result` is a dict with keys
      'models'           name -> dict returned by the modeling helper,
      'scores_tr_val'    name -> [train probabilities, validation probabilities],
      'ttest_vals'       name -> list of the four boolean checks,
      'best_classifiers' names for which all four checks are True,
    and `comp` is a DataFrame with training and validation performance side
    by side for the best classifiers, or None when there are none.

  Raises:
    ValueError: if `md_mode` is not one of the supported modes.
  """
  supported_modes = ('logistic', 'decision', 'random')
  if md_mode not in supported_modes:
    # Fail early: otherwise the loop would hit an undefined (or stale) model.
    raise ValueError(f'md_mode must be one of {supported_modes}, got {md_mode!r}')

  result = dict()
  result['models'] = dict()
  result['scores_tr_val'] = dict()
  result['ttest_vals'] = dict()
  result['best_classifiers'] = []

  # Label arrays are loop-invariant; convert them once.
  y_tr_arr = np.array(y_tr)
  y_ts_arr = np.array(y_ts)

  perf_tr = dict()
  perf_ts = dict()
  for k, filtered in fts_dict.items():

    # The numeric arguments are hyper-parameters forwarded as-is; their meaning
    # is defined by the ModelingTools helpers, which are not visible here.
    if md_mode == 'logistic':
      tmp = lm.logiReg_model_with_performance([X_tr[filtered], y_tr], 10, class_weight='balanced')
    elif md_mode == 'decision':
      tmp = tm.tree_model_with_performance([X_tr[filtered], y_tr], 3, 3, class_weight='balanced')
    else:  # 'random'
      tmp = tm.random_forest_with_performance([X_tr[filtered], y_tr], 50, 3, 3)

    prob_tr = tmp['model'].predict_proba(X_tr[filtered])[:, 1]
    pred_ts = tmp['model'].predict(X_ts[filtered])
    prob_ts = tmp['model'].predict_proba(X_ts[filtered])[:, 1]

    result['scores_tr_val'][k] = [prob_tr, prob_ts]

    ### Score validation
    # Split the scores by true label: position 0 -> label 0, position 1 -> label 1.
    div_tr = [prob_tr[y_tr_arr == i] for i in range(2)]
    div_ts = [prob_ts[y_ts_arr == i] for i in range(2)]
    # 1. Within the training data, do the two classes' scores differ significantly?
    # 2. Within the validation data, do the two classes' scores differ significantly?
    # 3. Are label-0 scores similar (no significant difference) between training and validation?
    # 4. Are label-1 scores similar (no significant difference) between training and validation?
    result['ttest_vals'][k] = [
      confirm_by_ttest(div_tr[0], div_tr[1])[0],
      confirm_by_ttest(div_ts[0], div_ts[1])[0],
      not confirm_by_ttest(div_tr[0], div_ts[0])[0],
      not confirm_by_ttest(div_tr[1], div_ts[1])[0],
    ]
    if sum(result['ttest_vals'][k]) == 4:
      result['best_classifiers'].append(k)

    result['models'][k] = tmp
    perf_tr[k] = tmp['performance']
    perf_ts[k] = tm.compute_performance(y_ts, pred_ts, prob_ts)
    print(k)

  best = result['best_classifiers']
  if len(best) > 0:
    compare_tr = pd.DataFrame([perf_tr[c] for c in best], index=best)
    compare_ts = pd.DataFrame([perf_ts[c] for c in best], index=best)
    # `axis` must be passed by keyword: positional use is rejected by pandas 2.
    # Training columns come first, validation columns second.
    comp = pd.concat([compare_tr, compare_ts], axis=1)
    print(comp)
  else:
    comp = None
  return result, comp

In [None]:
%%time
## logistic
# Train on the oversampled rows of one cohort, validate on another cohort as-is.
# Only pairs with at least one classifier that passed every t-test check are kept.
logistic = dict()
for o1, o2 in orders:
  train_label, valid_label = data_labels[o1], data_labels[o2]
  train_frame = datasets[BEST_NOR][o1]
  valid_frame = datasets[BEST_NOR][o2]
  picks = over_idx_dict[train_label]
  X_tr = train_frame.iloc[picks]
  y_tr = np.array(train_frame['response'])[picks]
  X_ts = valid_frame
  y_ts = np.array(valid_frame['response'])
  total = modeling_with_various_features(X_tr, y_tr, X_ts, y_ts, final_columns, 'logistic')
  print(o1, o2)
  if total[0]['best_classifiers']:
    logistic[f'{train_label}->{valid_label}'] = total

In [None]:
# 'train->validation' pairs for which at least one logistic model passed all t-test checks.
logistic.keys()

In [None]:
# Training vs. validation performance of the passing classifiers for this pair.
# NOTE(review): raises KeyError if this pair was not kept by the fitting cell above.
logistic['Meta_Datasets->MDA_MVAC'][1]

In [None]:
# Training vs. validation performance of the passing classifiers for this pair.
# NOTE(review): raises KeyError if this pair was not kept by the fitting cell above.
logistic['Meta_Datasets->MDA_DDMVAC'][1]

In [None]:
# Training vs. validation performance of the passing classifiers for this pair.
# NOTE(review): raises KeyError if this pair was not kept by the fitting cell above.
logistic['Meta_Datasets->AMC'][1]

In [None]:
%%time
## decision tree
# Same procedure as for the logistic models: oversampled training cohort,
# untouched validation cohort, keep only pairs with a passing classifier.
dt = dict()
for o1, o2 in orders:
  picks = over_idx_dict[data_labels[o1]]
  train_frame, valid_frame = datasets[BEST_NOR][o1], datasets[BEST_NOR][o2]
  X_tr, y_tr = train_frame.iloc[picks], np.array(train_frame['response'])[picks]
  X_ts, y_ts = valid_frame, np.array(valid_frame['response'])
  total = modeling_with_various_features(X_tr, y_tr, X_ts, y_ts, final_columns, 'decision')
  if total[0]['best_classifiers']:
    pair_key = '->'.join((data_labels[o1], data_labels[o2]))
    dt[pair_key] = total

In [None]:
# 'train->validation' pairs for which at least one decision tree passed all t-test checks.
dt.keys()

In [None]:
# Training vs. validation performance of the passing decision trees for this pair.
# NOTE(review): raises KeyError if this pair was not kept by the fitting cell above.
dt['MDA_MVAC->AMC'][1]

In [None]:
%%time
## random forest
# Same procedure again with random forests; pairs without any passing
# classifier are dropped.
rf = dict()
for o1, o2 in orders:
  source = datasets[BEST_NOR][o1]
  target = datasets[BEST_NOR][o2]
  sampled_rows = over_idx_dict[data_labels[o1]]
  X_tr = source.iloc[sampled_rows]
  y_tr = np.array(source['response'])[sampled_rows]
  X_ts = target
  y_ts = np.array(target['response'])
  total = modeling_with_various_features(X_tr, y_tr, X_ts, y_ts, final_columns, 'random')
  if len(total[0]['best_classifiers']) == 0:
    continue
  rf['->'.join([data_labels[o1], data_labels[o2]])] = total

In [None]:
# 'train->validation' pairs for which at least one random forest passed all t-test checks.
rf.keys()

In [None]:
# Merge the decision-tree results into `logistic` in place; the plotting cells
# below iterate over this merged dict.
# NOTE(review): a pair key present in both dicts is overwritten by the
# decision-tree entry, so the logistic result for that pair is lost, and from
# here on `logistic` no longer holds only logistic models. Confirm this is intended.
logistic.update(dt)

In [None]:
# Pair keys after merging the decision-tree results into `logistic`.
logistic.keys()

In [None]:
##dt['MDA_MVAC->AMC'][0]['scores_tr_val']
def draw_box_plots(tag, score_dict, best_cls_names):
  """Draw one box plot of predicted scores per selected classifier.

  Scores are grouped by a label built from role (M_ = cohort the model was
  fitted on, V_ = validation cohort), cohort name, and response (R when
  `response == 1.0`, NR otherwise).

  Args:
    tag: pair key of the form '<train cohort>-><validation cohort>'.
    score_dict: classifier name -> [training scores, validation scores]; the
      training scores must line up with the MAX_ROW oversampled rows.
    best_cls_names: classifier names to plot.
  """
  parts = tag.split('->')
  model_name, valid_name = parts[0], parts[1]

  # Labels for the oversampled training rows.
  m_lb = np.array([f'M_{model_name}_NR'] * MAX_ROW)
  model_resp = np.array(datasets[BEST_NOR][data_labels.index(model_name)]['response'])
  m_lb[model_resp[over_idx_dict[model_name]] == 1.0] = f'M_{model_name}_R'

  # Labels for the validation rows (the cohort is used as-is, no oversampling).
  valid_resp = datasets[BEST_NOR][data_labels.index(valid_name)]['response']
  v_lb = np.array([f'V_{valid_name}_NR'] * len(valid_resp))
  v_lb[valid_resp == 1.0] = f'V_{valid_name}_R'

  for b in best_cls_names:
    scores = np.concatenate(score_dict[b])
    groups = np.concatenate([m_lb, v_lb])
    # Argument meaning is defined by figure.plot_box (not visible here);
    # presumably values, axis name, grouping labels, title.
    fe.plot_box(scores, 'y', groups, tag + ' + ' + b)

In [None]:
# Box plots for the decision trees fitted on MDA_MVAC and validated on AMC.
# NOTE(review): raises KeyError if this pair was not kept in `dt`.
draw_box_plots('MDA_MVAC->AMC', dt['MDA_MVAC->AMC'][0]['scores_tr_val'], dt['MDA_MVAC->AMC'][0]['best_classifiers'])

In [None]:
# Box plots for every kept pair in `logistic`.
for pair_key, outcome in logistic.items():
  fitted = outcome[0]
  draw_box_plots(pair_key, fitted['scores_tr_val'], fitted['best_classifiers'])

In [None]:
# For every kept pair, draw a clustered heatmap of each passing classifier's
# features on both cohorts of the pair, with rows labelled by response
# (R when `response == 1.0`, otherwise NR).
for pair_key, outcome in logistic.items():
  passing_names = list(outcome[1].index)
  for cohort in pair_key.split('->'):
    cohort_frame = datasets[BEST_NOR][data_labels.index(cohort)]
    response_labels = np.where(cohort_frame['response'] == 1.0, 'R', 'NR')

    for cls in passing_names:
      features = cohort_frame[final_columns[cls]]
      features.index = response_labels
      cl.plot_cluster_heatmap(features.T, title='+'.join([cohort, cls]), figsize=(10, 10))