In [0]:
!pip install shap
!pip install importlib
!pip install lifelines
!pip install plotly



In [0]:
from google.colab import auth
from google.colab import drive
import pandas as pd
import numpy as np
import scipy as sc

In [0]:
# Authenticate the Colab user, then mount Google Drive under /content/gdrive.
auth.authenticate_user()
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
# List the contents of the MD.KimSW folder on the mounted Drive.
!ls '/content/gdrive/My Drive/MD.KimSW'

'PD-L1 IHC-표 1.csv'


In [0]:
# Root of the mounted Drive; the CSV lives in the MD.KimSW sub-folder.
home_path = '/content/gdrive/My Drive/'
# Read the whole table, then keep only its first 100 rows.
df = pd.read_csv(home_path + 'MD.KimSW/PD-L1 IHC-표 1.csv').head(100)

In [0]:
# Sanity check: (rows, columns) of the loaded table.
df.shape

(100, 25)

In [0]:
# Columns whose name contains 'BX'; those containing 'CPSBX' go to *_org_cols
# and the remaining ones to *_cat_cols.
biopsy_cols = sorted(col for col in df.columns if 'BX' in col)
biopsy_org_cols = sorted(col for col in biopsy_cols if 'CPSBX' in col)
biopsy_cat_cols = sorted(set(biopsy_cols).difference(biopsy_org_cols))

# Same split for the columns whose name contains 'RC' / 'CPSRC'.
resaction_cols = sorted(col for col in df.columns if 'RC' in col)
resaction_org_cols = sorted(col for col in resaction_cols if 'CPSRC' in col)
resaction_cat_cols = sorted(set(resaction_cols).difference(resaction_org_cols))

In [0]:
# Cast the *_cat_cols columns to int, overwriting them in df.
df[biopsy_cat_cols + resaction_cat_cols] = df[biopsy_cat_cols + resaction_cat_cols].astype(int)

In [0]:
# Preview the first rows of the biopsy_cat_cols and resaction_cat_cols columns.
df[biopsy_cat_cols + resaction_cat_cols].head()

Unnamed: 0,22C3 ventanaBX,22C3DakoBX,SP263BX,22C3 ventanaRC,22C3DakoRC,SP263RC
0,0,0,0,0,0,0
1,2,0,3,0,0,2
2,1,0,2,1,0,3
3,0,0,2,0,0,2
4,0,1,2,0,1,2


In [0]:
import plotly.figure_factory as ff

# Distribution plot with one group per column of biopsy_org_cols
# (the frame is transposed so that each row passed in is one column's values).
biopsy_dist_fig = ff.create_distplot(
    hist_data=df[biopsy_org_cols].to_numpy().T,
    group_labels=biopsy_org_cols,
)
biopsy_dist_fig.show()

In [0]:
# Distribution plot with one group per column of resaction_org_cols
# (the frame is transposed so that each row passed in is one column's values).
resaction_dist_fig = ff.create_distplot(
    hist_data=df[resaction_org_cols].to_numpy().T,
    group_labels=resaction_org_cols,
)
resaction_dist_fig.show()

In [0]:
import plotly.graph_objects as go


def plot_stackbar(x_labels, y_labels, values, tt):
  """Show a stacked bar chart with one stack segment per entry of y_labels.

  Args:
    x_labels: labels of the bars along the x axis.
    y_labels: names of the stacked segments (one trace each).
    values: indexable by segment position; values[k] holds the heights of
      segment k for every bar and is also used as the bar text.
    tt: figure title.
  """
  traces = [
      go.Bar(
          name=label,
          x=x_labels,
          y=values[k],
          text=values[k],
          textposition='auto',
          width=[0.3] * len(x_labels),
      )
      for k, label in enumerate(y_labels)
  ]
  figure = go.Figure(data=traces)
  figure.update_layout(barmode='stack', autosize=False, title=tt)
  figure.show()

In [0]:
# Stack-segment names, one per category code 0..3 (in that order), and the
# names of the two bars.
nms = ['[0, 1) %', '[1, 5) %', '[5, 50) %', '[50, 100] %']
xs = ['Biopsy', 'Resection']

# Pair each biopsy column with the resection column at the same sorted position.
pair_cols = list(zip(biopsy_cat_cols, resaction_cat_cols))
# pair_cnts[pair][category] == [count in the biopsy column, count in the resection column]
pair_cnts = [
    [[(df[bx_col] == cat).sum(), (df[rc_col] == cat).sum()] for cat in [0, 1, 2, 3]]
    for bx_col, rc_col in pair_cols
]

for i, p in enumerate(pair_cols):
  # The title is the biopsy column name without its two-character suffix.
  plot_stackbar(xs, nms, pair_cnts[i], p[0][:-2])

In [0]:
#### Pairwise correlation and agreement metrics between columns

from sklearn.metrics import cohen_kappa_score

def compute_varied_correlations_numeric(df):
  """Pearson and Spearman correlation between each column and the next one.

  Columns are paired cyclically: column k is compared with column k + 1 and
  the last column with the first. A two-column frame therefore yields two
  rows ("a vs. b" and "b vs. a").

  Args:
    df: DataFrame of numeric columns. Column labels must be strings because
      they are joined into the row labels of the result.

  Returns:
    DataFrame indexed by "<column> vs. <next column>" with the columns
    'pearson_cor.', 'pearson_pvalue', 'spearman_cor.' and 'spearman_pvalue'.
  """
  # Import the submodule explicitly: a bare `import scipy` does not guarantee
  # that `scipy.stats` is available as an attribute.
  from scipy import stats

  columns = list(df.columns)
  records = []
  labels = []
  for pos, left in enumerate(columns):
    # Modulo wraps the last column around to the first one.
    right = columns[(pos + 1) % len(columns)]
    labels.append(' vs. '.join([left, right]))
    pearson = stats.pearsonr(df[left], df[right])
    spearman = stats.spearmanr(df[left], df[right])
    records.append({
        'pearson_cor.': pearson[0],
        'pearson_pvalue': pearson[1],
        'spearman_cor.': spearman[0],
        'spearman_pvalue': spearman[1]
    })
  return pd.DataFrame(records, index=labels)

def _agreement_ratio(agree, disagree):
  """Return agree / (agree + disagree), or NaN when both counts are zero."""
  total = agree + disagree
  return agree / total if total else float('nan')


def compute_pairwise_agreements(df, pos_neg_list):
  """Agreement statistics between each column and the next one.

  Columns are paired cyclically (the last column is compared with the first),
  so a two-column frame yields two rows. For a pair (x, y) the row contains:

    * 'Kappa': cohen_kappa_score(x, y)
    * 'OPA': fraction of rows where x == y
    * for every [first, second] entry of pos_neg_list, with <nm> built as
      "<first values>/<second values>":
        'PPA_<nm>': rows with x and y both in `first`, divided by the rows
                    with y in `first`
        'NPA_<nm>': rows with x and y both in `second`, divided by the rows
                    with x in `second`

  A ratio whose denominator is zero is reported as NaN instead of raising
  ZeroDivisionError.

  NOTE(review): PPA is conditioned on the second column (y) while NPA is
  conditioned on the first column (x). This asymmetry is kept as-is so that
  reported numbers do not change; confirm it is intended.

  Args:
    df: DataFrame of category codes. Column labels must be strings because
      they are joined into the row labels of the result.
    pos_neg_list: iterable of [first, second] pairs, each a collection of
      category values.

  Returns:
    DataFrame indexed by "<column> vs. <next column>" with the columns
    'Kappa', 'OPA' and one 'PPA_<nm>' / 'NPA_<nm>' pair per entry of
    pos_neg_list, in that order.
  """
  # Group names and output column order do not depend on the column pair.
  group_specs = []
  col_list = ['Kappa', 'OPA']
  for pn in pos_neg_list:
    nm = '/'.join([','.join(map(str, pn[0])), ','.join(map(str, pn[1]))])
    group_specs.append((pn[0], pn[1], nm))
    col_list += ['PPA_' + nm, 'NPA_' + nm]

  columns = list(df.columns)
  rows = []
  labels = []
  for pos, left in enumerate(columns):
    # Modulo wraps the last column around to the first one.
    right = columns[(pos + 1) % len(columns)]
    labels.append(' vs. '.join([left, right]))
    x = df[left]
    y = df[right]
    pairs = list(zip(x, y))

    row = {'Kappa': cohen_kappa_score(x, y)}
    matches = sum(1 for a, b in pairs if a == b)
    row['OPA'] = _agreement_ratio(matches, len(pairs) - matches)

    for first, second, nm in group_specs:
      both_first = sum(1 for a, b in pairs if a in first and b in first)
      only_y_first = sum(1 for a, b in pairs if a not in first and b in first)
      both_second = sum(1 for a, b in pairs if a in second and b in second)
      only_x_second = sum(1 for a, b in pairs if a in second and b not in second)
      row['PPA_' + nm] = _agreement_ratio(both_first, only_y_first)
      row['NPA_' + nm] = _agreement_ratio(both_second, only_x_second)
    rows.append(row)

  result = pd.DataFrame(rows, index=labels)
  # With no columns there are no rows and nothing to select.
  return result[col_list] if rows else result

In [0]:
# Three ways of splitting the category codes 0..3 into a [first, second] pair
# of groups; each pair yields one PPA/NPA column pair in the result.
set_list = [[[0], [1, 2, 3]], [[0, 1], [2, 3]], [[0, 1, 2], [3]]]
# Agreement between each biopsy column and the next one (the last wraps to the first).
compute_pairwise_agreements(df[biopsy_cat_cols], set_list)

Unnamed: 0,Kappa,OPA,"PPA_0/1,2,3","NPA_0/1,2,3","PPA_0,1/2,3","NPA_0,1/2,3","PPA_0,1,2/3","NPA_0,1,2/3"
22C3 ventanaBX vs. 22C3DakoBX,0.04,0.58,0.75,0.333333,0.892473,0.166667,0.979592,0.0
22C3DakoBX vs. SP263BX,0.117777,0.4,0.971429,0.964286,0.966102,0.714286,0.977778,0.0
SP263BX vs. 22C3 ventanaBX,0.185788,0.45,0.424658,0.353846,0.636364,0.219512,0.918367,0.2


In [0]:
# Correlate each biopsy column with the resection column at the same position.
cor_list = [
    compute_varied_correlations_numeric(df[[bx_col, resaction_org_cols[k]]])
    for k, bx_col in enumerate(biopsy_org_cols)
]
# Each two-column call returns the pair in both directions; keep the first
# row of every pair, however many pairs there are.
pd.concat(cor_list).iloc[::2]

Unnamed: 0,pearson_cor.,pearson_pvalue,spearman_cor.,spearman_pvalue
22C3 ventanaCPSBX vs. 22C3 ventanaCPSRC,0.923778,1.2146679999999999e-42,0.807476,3.4046689999999996e-24
22C3DakoCPSBX vs. 22C3DakoCPSRC,0.19666,0.04987032,0.71138,1.093822e-16
SP263CPSBX vs. SP263CPSRC,0.763621,2.511765e-20,0.946397,6.778185999999999e-50


In [0]:
# Agreement between each biopsy column and the resection column at the same position.
cor_list = [
    compute_pairwise_agreements(df[[bx_col, resaction_cat_cols[k]]], set_list)
    for k, bx_col in enumerate(biopsy_cat_cols)
]
# Each two-column call returns the pair in both directions; keep the first
# row of every pair, however many pairs there are.
pd.concat(cor_list).iloc[::2]

Unnamed: 0,Kappa,OPA,"PPA_0/1,2,3","NPA_0/1,2,3","PPA_0,1/2,3","NPA_0,1/2,3","PPA_0,1,2/3","NPA_0,1,2/3"
22C3 ventanaBX vs. 22C3 ventanaRC,0.719561,0.88,0.945946,0.851852,0.955056,0.666667,1.0,1.0
22C3DakoBX vs. 22C3DakoRC,0.544915,0.77,0.983333,0.964286,0.977011,0.714286,0.979592,0.0
SP263BX vs. SP263RC,0.780159,0.84,1.0,1.0,1.0,1.0,0.97561,0.8


In [0]:
def plot_scatter(df, n_cases=None):
  """Show a marker scatter plot with one trace per column of df.

  Args:
    df: DataFrame whose columns are plotted against the case number.
    n_cases: number of cases on the x axis, numbered 1..n_cases. Defaults to
      len(df) when omitted.
  """
  if n_cases is None:
    n_cases = len(df)
  case_numbers = list(range(1, n_cases + 1))

  fig = go.Figure()
  # One trace per column, all sharing the same 1-based case numbers.
  for c in df.columns:
    fig.add_trace(go.Scatter(x=case_numbers, y=df[c], mode='markers', name=c))

  fig.show()

In [0]:
# Order the cases by the row-wise total of the *_org_cols columns.
# `assign` works on a copy, so no column is written onto a slice of df, and
# `drop(columns=...)` avoids the positional `axis` argument that pandas 2.x
# no longer accepts.
ordered = (
    df[biopsy_org_cols + resaction_org_cols]
    .assign(**{'sum': lambda frame: frame.sum(axis=1)})
    .sort_values(by='sum')
    .drop(columns=['sum'])
)

In [0]:
# Use the actual number of rows instead of a hard-coded 100.
plot_scatter(ordered[biopsy_org_cols], len(ordered))

In [0]:
# Use the actual number of rows instead of a hard-coded 100.
plot_scatter(ordered[resaction_org_cols], len(ordered))