<a href="https://colab.research.google.com/github/gowun/BladderCancer_AMC/blob/2-gowun/Notebooks/Data_Stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import auth
from google.colab import drive
import pandas as pd
import numpy as np

# Never truncate cell contents when displaying DataFrames.
# `None` is the supported "no limit" value; the legacy `-1` sentinel was
# deprecated in pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Mount point used for Google Drive inside the Colab runtime.
gdrive_mount = '/content/gdrive'

# Authenticate the Colab user first, then mount Drive at the mount point.
auth.authenticate_user()
drive.mount(gdrive_mount)

# Project folder on the mounted drive; the trailing slash is kept because
# file names are appended to it directly.
home_path = f'{gdrive_mount}/My Drive/BladderCancer_AMC/'

In [None]:
# Clone branch 2-gowun of the project repository into the working directory
# so that the BladderCancer_AMC package can be imported in the next cell.
!git clone -b 2-gowun https://github.com/gowun/BladderCancer_AMC.git

In [None]:
from BladderCancer_AMC.ModelingTools import utils as ut
from BladderCancer_AMC.ModelingTools import preprocessing as pp
from BladderCancer_AMC.ModelingTools import figure as fg

In [None]:
# Base names of the dataset pickles stored under home_path.
ds_names = ['mda_mvac', 'mda_ddmvac', 'meta_datasets', 'amc']
# Load '<name>.pkl' for every entry, keeping the same order as ds_names.
datasets4 = [ut.load_data(f'{home_path}{name}.pkl', 'pickle') for name in ds_names]

1. 모든 데이터셋에 존재하는 컬럼 (intersect_cols) 및 각 데이터 내 intersect_cols 비율 

In [None]:
#### Columns that appear in every one of the datasets
datasets4_cols = [set(df.columns) for df in datasets4]
# Intersect all column sets in a single call and sort once. This avoids
# re-sorting on every iteration and guarantees that intersect_cols is always a
# sorted list (it is later used as a DataFrame column selector), even when
# only one dataset is loaded.
intersect_cols = sorted(set.intersection(*datasets4_cols))
len(intersect_cols)

In [None]:
## Share of each dataset's columns that belongs to intersect_cols
n_common = len(intersect_cols)
for name, cols in zip(ds_names, datasets4_cols):
    r = round(n_common / len(cols) * 100.0, 2)
    print(f'{name}: {r}% from {len(cols)} Gene Symbols')

2. 주어진 각 Classifier의 유전자 중 intersect_cols에 포함된 유전자의 비율

In [None]:
## Load Classifiers.csv and transpose it; the rows of the result are
## processed one at a time (keyed by their index label) in the next cell.
clfs = ut.load_data(f'{home_path}Classifiers.csv', 'csv').transpose()

In [None]:
# Maps each row label of clfs to [n_kept, kept_ratio, kept_values], where
# kept_values are the row's non-NaN entries that also occur in intersect_cols.
clfs_dict = dict()
# Build the lookup set once: testing membership against the intersect_cols
# list would rescan the whole list for every single entry.
intersect_set = set(intersect_cols)
# NOTE(review): the last row of clfs is excluded on purpose by `iloc[:-1]`;
# the reason is not visible here -- TODO confirm.
for k_name, clf_row in clfs.iloc[:-1].iterrows():
    # Drop NaN cells (presumably padding for shorter classifiers).
    genes = clf_row.dropna().values
    kept = [g for g in genes if g in intersect_set]
    # A row without any value would otherwise raise ZeroDivisionError.
    ratio = len(kept) / len(genes) if len(genes) else 0.0
    clfs_dict[k_name] = [len(kept), ratio, kept]
    # Report only the rows that lost at least one entry.
    if len(kept) < len(genes):
        print(f'{k_name}: {clfs_dict[k_name][:2]}')

In [None]:
# Persist the per-classifier intersection summary as a pickle under home_path.
ut.save_data(clfs_dict, f'{home_path}intersect_classifiers.pkl', 'pickle')

In [None]:
# Table with one column per classifier, listing the retained entries
# (third item of each clfs_dict value), then saved as CSV.
kept_lists = [entry[2] for entry in clfs_dict.values()]
clfs_df = pd.DataFrame(kept_lists, index=list(clfs_dict)).T
ut.save_data(clfs_df, f'{home_path}intersect_classifiers.csv', 'csv')

3. 각 데이터셋의 기초 통계

In [None]:
%%time
# Restrict every dataset to the shared columns and keep the second item
# returned by pp.handle_repeated_columns (presumably the frame with repeated
# columns averaged -- TODO confirm against the helper).
mean_datasets4 = [pp.handle_repeated_columns(df[intersect_cols])[1] for df in datasets4]

In [None]:
# Persist the processed datasets as a single pickle under home_path.
ut.save_data(mean_datasets4, home_path + 'mean_datasets4.pkl', 'pickle')

In [None]:
%%time
# Write the describe() summary of each processed dataset to '<name>.csv';
# reset_index() turns the statistic names into a regular column.
for nm, frame in zip(ds_names, mean_datasets4):
    stats = frame.describe()
    ut.save_data(stats.reset_index(), f'{home_path}{nm}.csv', 'csv')

4. Optimal Normalization Algorithm 도출

In [None]:
# Names of the scaling methods to evaluate; they are passed to
# pp.scale_with_scalers and used as keys into its result.
methods = [
    'log2',
    'log2_minmax',
    'standard',
    'minmax',
    'max_abs',
    'robust',
    'power',
    'quantile',
    'rankgauss',
]

In [None]:
%%time
# Apply every method in `methods` to each processed dataset.
# The dataset position is printed as a simple progress marker.
scaled_datasets4 = []
for idx, frame in enumerate(mean_datasets4):
    print(idx)
    scaled = pp.scale_with_scalers(frame, methods)
    scaled_datasets4.append(scaled)

In [None]:
from itertools import combinations

# Every unordered pair of dataset positions: (0, 1), (0, 2), ...
orders = list(combinations(range(len(mean_datasets4)), 2))
orders

In [None]:
%%time
## For each scaling method, derive the set of columns whose distributions are
## similar across the datasets.
# cols_dict[m]        -> columns that survived every pairwise comparison for m
# remain_clfs_dict[m] -> per classifier: [kept entries, % kept of the entries
#                        found in intersect_cols, % kept of the original list]
cols_dict = dict()
remain_clfs_dict = dict()
intersect_set = set(intersect_cols)  # loop-invariant, so built only once
for m in methods:
    print(m)
    for o1, o2 in orders:
        if m in cols_dict:
            # Later pairs only re-test the columns that survived so far.
            cols = sorted(intersect_set.intersection(set(cols_dict[m])))
            cols_dict[m] = pp.compare_two_samples_and_draw_feasible_columns(scaled_datasets4[o1][m][cols], scaled_datasets4[o2][m][cols])
        else:
            # First pair for this method: compare on all available columns.
            cols_dict[m] = pp.compare_two_samples_and_draw_feasible_columns(scaled_datasets4[o1][m], scaled_datasets4[o2][m])
    print(len(cols_dict[m]), cols_dict[m])
    if len(cols_dict[m]) > 0:
        remain_clfs_dict[m] = dict()
        feasible = set(cols_dict[m])
        for key in clfs_dict.keys():
            rrr = sorted(set(clfs_dict[key][2]).intersection(feasible))
            n_in_intersect = clfs_dict[key][0]
            # A classifier with no entry in intersect_cols would divide by
            # zero here; report it as 0 % retained instead of crashing.
            rr = len(rrr) / n_in_intersect * 100.0 if n_in_intersect else 0.0
            # clfs_dict[key][1] is the fraction kept by the intersection step,
            # so r is the percentage relative to the original list.
            r = clfs_dict[key][1] * rr
            remain_clfs_dict[m][key] = [rrr, rr, r]
            print(f'{m}-{key}: {len(rrr)}({r} % from the original, {rr} % from the reduced) remaines')

In [None]:
%%time
# Keep only the methods for which every classifier retains 100 % of its
# entries, and collect each dataset's scaled frame restricted to that
# method's surviving columns.
remain_mth_datasets = dict()
for m, per_clf in remain_clfs_dict.items():
    n_fully_kept = sum(1 for entry in per_clf.values() if entry[1] == 100.0)
    if n_fully_kept != len(clfs_dict):
        continue
    print(m)
    remain_mth_datasets[m] = [scaled[m][cols_dict[m]] for scaled in scaled_datasets4]

In [None]:
# Persist remain_clfs_dict as a pickle under home_path.
# NOTE(review): the file name suggests scaled datasets, yet the object written
# is remain_clfs_dict, while remain_mth_datasets is not saved anywhere in the
# visible notebook -- confirm which object is intended.
ut.save_data(remain_clfs_dict, home_path + 'scaled_datasets_3mths.pkl', 'pickle')