**Source Code Authors :**   
> *Abigail Swamidoss (Abigail.Swamidoss@gmail.com), Samhith Kethireddy (Kethireddy.samhith@gmail.com)*  

**Published in Research Article:**  
> [Computational Analysis of Routine Biopsies Improves Diagnosis and Prediction of Cardiac Allograft Vasculopathy](https://www.ahajournals.org/doi/10.1161/CIRCULATIONAHA.121.058459)  
> *by Eliot G. Peyster, Andrew Janowczyk, Abigail Swamidoss, Samhith Kethireddy, Michael D. Feldman and Kenneth B. Margulies*  
*Originally published 11 Apr 2022*  
  
> [Supplementary Material](https://www.ahajournals.org/action/downloadSupplement?doi=10.1161%2FCIRCULATIONAHA.121.058459&file=10.1161.circulationaha.121.058459_supplemental_materials.pdf)  

**Publisher:**  
> [American Heart Association (AHA) - Circulation](https://www.ahajournals.org/journal/circ)

#  Install, Setup gdrive and imports

In [None]:
from shapely.geometry import shape
from shapely.strtree import STRtree

import os
import numpy as np
import geojson
from shapely.ops import unary_union 
from shapely import wkt
import pandas as pd

import shapely.geometry

In [None]:
# Environment switch: when True the server data root is used for DATADIR,
# otherwise the local Windows path is used (see the DATADIR selection below).
IN_CWR_SERVER = False 
# Slide-set name; used as the sub-folder name under the json and output directories.
slideset = 'cav'

# Files


In [None]:
# Root data directory: POSIX-style path on the server, Windows path otherwise.
if IN_CWR_SERVER:
  DATADIR = "/somedrive/datacd31/"
else:
  DATADIR = "C:\\research\\cav\\datacd31\\"

###############################
# RUN THIS NOTEBOOK TWICE: ONCE WITH dilate = True AND ONCE WITH dilate = False
# (the output file name below depends on the flag, so the two runs do not
# overwrite each other)
###############################

# configs
dilate = True
#exp_nuc_4_qp = True
save_csv = True
HEALTHY = ['HC', 'US', 'MA']
DESEASED = ['DC']
DESEASED_YR1 = ['DY']
COHORTS = HEALTHY + DESEASED + DESEASED_YR1

# input files and folders
# The directories are later used by plain string concatenation
# (JASONDIR + fname, f"{outdir}{DAB_CSV}"), so they must end with a path
# separator. os.path.join(..., "") appends the separator of the platform the
# notebook runs on instead of a hard-coded backslash, which produced mixed
# separators when combined with the POSIX server path.
JASONDIR = os.path.join(DATADIR, "json", slideset, "")
DAB_CSV = "21cd31_dab.csv"

# output files
outdir = os.path.join(DATADIR, "output", slideset, "")
if dilate:
  DAB_NUC_CSV = "22cd31_dab_nuc_dil.csv"
else:
  DAB_NUC_CSV = "22cd31_dab_nuc_udl.csv"

#debug files
#DAB2_CSV = "22cd31_dab.csv"
#NUC2_CSV = "22cd31_nuc.csv"
#NUC_CSV = "22cd31_nuc.csv"

# Functions

## Functions for reading JSON files

In [None]:
#%%script echo skipping
#---------------------------------------------------
def readAnnoJsonFile(ann_fname):
  """Load an annotation GeoJSON file and split its geometries by class name.

  The loaded object is iterated as a sequence of feature-like mappings, each
  providing ``properties`` and ``geometry``.

  Parameters:
    ann_fname: path of the annotation GeoJSON file.

  Returns:
    A tuple ``(dab, hema, tissue)`` of lists of shapely geometries for the
    annotations whose classification name is "CD31DAB", "CD31HEMA" and
    "CD31Tissue" respectively. Annotations with any other classification, or
    with no classification at all, are skipped.
  """
  with open(ann_fname) as fh:
    annotationgeojson = geojson.load(fh)

  geoms_by_class = {"CD31DAB": [], "CD31HEMA": [], "CD31Tissue": []}
  for obj in annotationgeojson:
    # Unclassified annotations have no 'classification' entry; treat them as
    # "no match" instead of failing with a KeyError.
    classification = obj['properties'].get('classification') or {}
    bucket = geoms_by_class.get(classification.get('name'))
    if bucket is not None:
      bucket.append(shape(obj["geometry"]))

  return (geoms_by_class["CD31DAB"],
          geoms_by_class["CD31HEMA"],
          geoms_by_class["CD31Tissue"])

#---------------------------------------------------
def readNucJsonFile(nuc_fname):
  """Load nucleus detections from a GeoJSON file and prepare them for binning.

  Every nucleus shape and its centroid get two ad-hoc attributes:
    id    -- position of the nucleus in the file
    label -- 1 for classification name 'Positive', 0 for 'Negative',
             -1 for any other classification name, -2 when the object has
             no classification
  Every nucleus shape additionally gets the bin-membership flags listed
  below, all initialised to 0.

  Returns (nucobjects, nucshapes, nuccenters): the loaded GeoJSON objects,
  their shapely geometries and the centroids of those geometries.
  """
  with open(nuc_fname) as fh:
    nucobjects = geojson.load(fh)

  nucshapes = [shape(feature["geometry"]) for feature in nucobjects]
  nuccenters = [geom.centroid for geom in nucshapes]

  zeroed_flags = (
    'in_dab',
    'in_excl',
    'in_q1',  # 25%
    'in_q2',  # 50%
    'in_q3',  # 75%
    'in_q4',  # 100%
    'in_s1',  # within +/-0.5 SD
    'in_s2',  # within +/-1 SD
    'in_s3',  # within +/-1.5 SD
    'in_s4',  # within +/-2 SD
    'in_s5',  # within +/-3 SD.
    'in_s6',  # beyond +/-3 SD.
    'in_b1',  # Microvessels- area between 10 µm2 and 78.4 µm2
    'in_b2',  # Small pre-capillary arterioles (area 78.5–314 µm2)
    'in_b3',  # Medium veins and arterioles: area 315 um2 – 1000 um2
    'in_b4',  # Medium-large: 1000-2500 um2
    'in_b5',  # Large: >2500 um2
  )

  for idx, (feature, geom, center) in enumerate(zip(nucobjects, nucshapes, nuccenters)):
    props = feature.properties
    if 'classification' not in props:
      label = -2
    else:
      class_name = props["classification"]['name']
      if class_name == 'Positive':
        label = 1
      elif class_name == 'Negative':
        label = 0
      else:
        label = -1

    center.id = idx
    geom.id = idx
    center.label = label
    geom.label = label
    for flag in zeroed_flags:
      setattr(geom, flag, 0)

  return nucobjects, nucshapes, nuccenters
#---------------------------------------------------
#---------------------------------------------------


## Functions for binning

In [1]:
#%%script echo skipping

#---------------------------------------------------
def findNucsInDAB_qrt(in_df, fname, nuc_cent_list, nuc_geom_list):
  """Flag nuclei whose centroid lies inside DAB polygons, per quartile bin.

  Rows of ``in_df`` belonging to ``fname`` are used. All of their polygons
  (undilated) set ``in_dab``; the rows with ``dab_q1`` .. ``dab_q4`` > 0 set
  ``in_q1`` .. ``in_q4``. When the module-level ``dilate`` flag is true, the
  polygons of each quartile bin are buffered (by 2, 5, 10 and 15, in the
  polygons' coordinate units) and merged before the containment test.

  Written against the Shapely 1.x API: ``STRtree.query`` is expected to
  return the centroid geometries themselves, and those carry an ``id``
  attribute indexing into ``nuc_geom_list``.

  Parameters:
    in_df: DataFrame with columns 'fname', 'dab_poly' and 'dab_q1'..'dab_q4'.
    fname: file name whose rows are processed.
    nuc_cent_list: nucleus centroids (each with an ``id`` attribute).
    nuc_geom_list: nucleus shapes, flagged in place.

  Returns:
    ``(nuc_geom_list, [q1_area, q2_area, q3_area, q4_area])`` where each area
    is the area of the merged dilated region (dilate) or the sum of the
    polygon areas (no dilation).
  """
  # (DAB bin column, nucleus flag attribute, dilation distance)
  quartile_bins = (
    ('dab_q1', 'in_q1', 2),
    ('dab_q2', 'in_q2', 5),
    ('dab_q3', 'in_q3', 10),
    ('dab_q4', 'in_q4', 15),
  )

  per_file = in_df.loc[in_df['fname'] == fname]
  tree = STRtree(nuc_cent_list)

  def _flag_centroids_within(regions, nuc_flag):
    # The tree query is a bounding-box pre-filter; within() is the exact test.
    for region in regions:
      for cent in tree.query(region):
        if cent.within(region):
          setattr(nuc_geom_list[cent.id], nuc_flag, 1)

  # Any selected DAB polygon, never dilated.
  _flag_centroids_within(per_file['dab_poly'].to_numpy(), 'in_dab')

  areas = []
  for dab_col, nuc_flag, radius in quartile_bins:
    geoms = per_file.loc[per_file[dab_col] > 0, 'dab_poly'].to_numpy()
    if dilate:
      merged = unary_union([poly.buffer(radius) for poly in geoms])
      areas.append(merged.area)
      # unary_union yields a single Polygon when everything merges into one
      # piece; that is not iterable, so wrap it. Multi-part and empty results
      # expose their members through .geoms.
      if isinstance(merged, shapely.geometry.polygon.Polygon):
        regions = [merged]
      else:
        regions = list(merged.geoms)
    else:
      regions = geoms
      areas.append(sum([poly.area for poly in geoms]))
    _flag_centroids_within(regions, nuc_flag)

  return nuc_geom_list, areas
#---------------------------------------------------
def findNucsInDAB_std(in_df, fname, nuc_cent_list, nuc_geom_list):
  """Flag nuclei whose centroid lies inside DAB polygons, per SD (z-score) bin.

  For the rows of ``in_df`` belonging to ``fname``, the polygons with
  ``dab_s1`` .. ``dab_s6`` > 0 set ``in_s1`` .. ``in_s6`` on every nucleus
  whose centroid is within them. When the module-level ``dilate`` flag is
  true, the polygons of each bin are buffered (by 2, 5, 10, 15, 20 and 25, in
  the polygons' coordinate units) and merged before the containment test.

  Written against the Shapely 1.x API: ``STRtree.query`` is expected to
  return the centroid geometries themselves, and those carry an ``id``
  attribute indexing into ``nuc_geom_list``.

  Parameters:
    in_df: DataFrame with columns 'fname', 'dab_poly' and 'dab_s1'..'dab_s6'.
    fname: file name whose rows are processed.
    nuc_cent_list: nucleus centroids (each with an ``id`` attribute).
    nuc_geom_list: nucleus shapes, flagged in place.

  Returns:
    ``(nuc_geom_list, [s1_area, ..., s6_area])`` where each area is the area
    of the merged dilated region (dilate) or the sum of the polygon areas
    (no dilation).
  """
  # (DAB bin column, nucleus flag attribute, dilation distance)
  sd_bins = (
    ('dab_s1', 'in_s1', 2),
    ('dab_s2', 'in_s2', 5),
    ('dab_s3', 'in_s3', 10),
    ('dab_s4', 'in_s4', 15),
    ('dab_s5', 'in_s5', 20),
    ('dab_s6', 'in_s6', 25),
  )

  per_file = in_df.loc[in_df['fname'] == fname]
  tree = STRtree(nuc_cent_list)

  areas = []
  for dab_col, nuc_flag, radius in sd_bins:
    geoms = per_file.loc[per_file[dab_col] > 0, 'dab_poly'].to_numpy()
    if dilate:
      merged = unary_union([poly.buffer(radius) for poly in geoms])
      areas.append(merged.area)
      # A fully merged result is a single Polygon; multi-part and empty
      # results expose their members through .geoms (iterating the geometry
      # object itself is deprecated in Shapely 1.8).
      if isinstance(merged, shapely.geometry.polygon.Polygon):
        regions = [merged]
      else:
        regions = list(merged.geoms)
    else:
      regions = geoms
      areas.append(sum([poly.area for poly in geoms]))

    # The tree query is a bounding-box pre-filter; within() is the exact test.
    for region in regions:
      for cent in tree.query(region):
        if cent.within(region):
          setattr(nuc_geom_list[cent.id], nuc_flag, 1)

  return nuc_geom_list, areas
#---------------------------------------------------
def findNucsInDAB_excl(dab_geom_list, nuc_cent_list, nuc_geom_list):
  """Set ``in_excl`` on nuclei whose centroid lies within any given polygon.

  The polygons are used as passed in (no dilation). Centroids found by the
  spatial index are confirmed with an exact ``within`` test, and the ``id``
  attribute of each matching centroid selects the nucleus shape to flag.

  Returns ``nuc_geom_list`` (flagged in place).
  """
  centroid_index = STRtree(nuc_cent_list)

  for region in dab_geom_list:
    for candidate in centroid_index.query(region):
      if candidate.within(region):
        nuc_geom_list[candidate.id].in_excl = 1

  return nuc_geom_list
#---------------------------------------------------
def findNucsInDAB_bio(in_df, fname, nuc_cent_list, nuc_geom_list):
  """Flag nuclei whose centroid lies inside DAB polygons, per biological bin.

  For the rows of ``in_df`` belonging to ``fname``, the polygons with
  ``dab_b1`` .. ``dab_b5`` == 1 set ``in_b1`` .. ``in_b5`` on every nucleus
  whose centroid is within them. When the module-level ``dilate`` flag is
  true, the polygons of each bin are buffered (by 2, 5, 10, 15 and 20, in the
  polygons' coordinate units) and merged before the containment test.

  Written against the Shapely 1.x API: ``STRtree.query`` is expected to
  return the centroid geometries themselves, and those carry an ``id``
  attribute indexing into ``nuc_geom_list``.

  Parameters:
    in_df: DataFrame with columns 'fname', 'dab_poly' and 'dab_b1'..'dab_b5'.
    fname: file name whose rows are processed.
    nuc_cent_list: nucleus centroids (each with an ``id`` attribute).
    nuc_geom_list: nucleus shapes, flagged in place.

  Returns:
    ``(nuc_geom_list, [b1_area, ..., b5_area])`` where each area is the area
    of the merged dilated region (dilate) or the sum of the polygon areas
    (no dilation).
  """
  # (DAB bin column, nucleus flag attribute, dilation distance)
  bio_bins = (
    ('dab_b1', 'in_b1', 2),
    ('dab_b2', 'in_b2', 5),
    ('dab_b3', 'in_b3', 10),
    ('dab_b4', 'in_b4', 15),
    ('dab_b5', 'in_b5', 20),
  )

  per_file = in_df.loc[in_df['fname'] == fname]
  tree = STRtree(nuc_cent_list)

  areas = []
  for dab_col, nuc_flag, radius in bio_bins:
    geoms = per_file.loc[per_file[dab_col] == 1, 'dab_poly'].to_numpy()
    if dilate:
      merged = unary_union([poly.buffer(radius) for poly in geoms])  # dilated DAB polygons
      areas.append(merged.area)
      # A fully merged result is a single Polygon; multi-part and empty
      # results expose their members through .geoms (iterating the geometry
      # object itself is deprecated in Shapely 1.8).
      if isinstance(merged, shapely.geometry.polygon.Polygon):
        regions = [merged]
      else:
        regions = list(merged.geoms)
    else:
      regions = geoms
      areas.append(sum([poly.area for poly in geoms]))

    # The tree query is a bounding-box pre-filter; within() is the exact test.
    for region in regions:
      for cent in tree.query(region):
        if cent.within(region):
          setattr(nuc_geom_list[cent.id], nuc_flag, 1)

  return nuc_geom_list, areas
#---------------------------------------------------


# Main

## Load DAB CSV & Determine DAB BIN boundaries

In [None]:
dab_csv_path = f"{outdir}{DAB_CSV}.zip"
if not os.path.exists(dab_csv_path):
  # Stop here with a clear message; otherwise the first use of raw_dab_df
  # below fails with an unrelated-looking NameError.
  raise FileNotFoundError(f"DAB CSV not found: {dab_csv_path}")
print(f"reading file: {dab_csv_path}")
raw_dab_df = pd.read_csv(dab_csv_path)

# eliminate errors from QuPath: drop objects at or above the 99th percentile of area
error_margin_percentile = raw_dab_df.dab_area.quantile(0.99)
# .copy() so the columns added below go to an independent frame, not a slice
dab_df = raw_dab_df[raw_dab_df.dab_area < error_margin_percentile].copy()

# boundaries for Quartiles
# Taken from 'dab_area' explicitly (the column the bins below compare
# against) rather than from whichever numeric column describe() lists first.
dstats = dab_df.describe()
q25, q50, q75 = dab_df['dab_area'].quantile([0.25, 0.50, 0.75])
#boundaries_m1 = [0, q25, q50, q75]
#print(f'quartiles 25% = {q25}, 50% = {q50}, 75% = {q75}')

# calculate zscore
mean = dab_df['dab_area'].mean()
std = dab_df['dab_area'].std()
dab_df['dab_zscore'] = (dab_df['dab_area']-mean)/std

# mark the top 0.5% of dab areas (>= 99.5th percentile) as outliers
otl_percentile = dab_df.dab_area.quantile(.995)
dab_df["dab_ex"] = np.where(dab_df["dab_area"] < otl_percentile, 0, 1)

# quartile based bins
dab_df["dab_q1"] = np.where(dab_df["dab_area"] < q25 ,1,0)
dab_df["dab_q2"] = np.where((dab_df["dab_area"] >= q25) & (dab_df["dab_area"] < q50), 1, 0)
dab_df["dab_q3"] = np.where((dab_df["dab_area"] >= q50) & (dab_df["dab_area"] < q75), 1, 0)
dab_df["dab_q4"] = np.where(dab_df["dab_area"] >= q75, 1, 0)

# sd based bins
# A literal such as 0.50000000000000001 is exactly 0.5 as a float, so it
# cannot serve as an "just above 0.5" bound. The bins are therefore written as
# half-open intervals on |z|, which makes them mutually exclusive:
#   s1: |z| <= 0.5, s2: (0.5, 1], s3: (1, 1.5], s4: (1.5, 2], s5: (2, 3], s6: > 3
P050 = 0.5
P100 = 1.0
P150 = 1.5
P200 = 2.0
P300 = 3.0
abs_zscore = dab_df['dab_zscore'].abs()
dab_df["dab_s1"] = np.where(abs_zscore <= P050, 1, 0)
dab_df["dab_s2"] = np.where((abs_zscore > P050) & (abs_zscore <= P100), 1, 0)
dab_df["dab_s3"] = np.where((abs_zscore > P100) & (abs_zscore <= P150), 1, 0)
dab_df["dab_s4"] = np.where((abs_zscore > P150) & (abs_zscore <= P200), 1, 0)
dab_df["dab_s5"] = np.where((abs_zscore > P200) & (abs_zscore <= P300), 1, 0)
dab_df["dab_s6"] = np.where(abs_zscore > P300, 1, 0)

# biological  bins
# NOTE(review): B presumably holds the µm2 limits [10, 78.5, 315, 1000, 2500]
# converted to the units of dab_area - confirm the conversion factor.
# NOTE(review): B[0] is not applied, so dab_b1 has no lower bound - confirm
# that smaller objects are meant to be included.
B = [158, 1239, 4964, 15809, 39524] # m2 [10, 78.5, 315, 1000, 2500] 
dab_df["dab_b1"] = np.where(dab_df["dab_area"] < B[1], 1, 0)
dab_df["dab_b2"] = np.where((dab_df["dab_area"] >= B[1]) & (dab_df["dab_area"] < B[2]), 1, 0)
dab_df["dab_b3"] = np.where((dab_df["dab_area"] >= B[2]) & (dab_df["dab_area"] < B[3]), 1, 0)
dab_df["dab_b4"] = np.where((dab_df["dab_area"] >= B[3]) & (dab_df["dab_area"] < B[4]), 1, 0)
dab_df["dab_b5"] = np.where(dab_df["dab_area"] >= B[4], 1, 0)


#save_df = dab_df.drop(columns=['dab_poly'])
#print(f"saving: {outdir}{DAB2_CSV}.zip")
#save_df.to_csv(f"{outdir}{DAB2_CSV}.zip", index=False, compression=dict(method='zip', archive_name=f'{DAB2_CSV}'))

#save_df.to_csv(DAB2_CSV, index=False)


## Main Loop: It loops through DABs grouped by fname, identifies and marks nucs by various DAB bins (qrt, std, bio)

In [None]:
d_df = dab_df

# for testing start #########
#testfiles = ['CAV-DC-1', 'CAV-DC-2', 'CAV-DC-11', 'CAV-DC-12', 'CAV-DC-19','CAV-DY1-10', 'CAV-HC-18', 'CAV-DY1-2', 'CAV-MATCH-5', 'CAV-MATCH-5 ']
#testfiles = ['CAV2-DC-10_HS-17-0034099'] #'CAV2-DC-12_HS-18-0041740', 'CAV2-DC-29_HS-17-0019977', 'CAV2-HC-9_HS-15-0031776', 'CAV2-DY1-11_HS-13-0022491', 'CAV2-DY1-6_HS-16-0010050']
#d_df = dab_df.loc[dab_df['fname'].isin(testfiles)]
# for testing end #########

# Parse the WKT strings into geometries. Values that are no longer strings are
# left untouched so that re-running this cell does not fail.
d_df['dab_poly'] = d_df['dab_poly'].apply(
    lambda value: wkt.loads(value) if isinstance(value, str) else value)

# Per-nucleus flag attributes exported as columns, in output column order.
nuc_flag_columns = ['in_dab',
                    'in_q1', 'in_q2', 'in_q3', 'in_q4',
                    'in_s1', 'in_s2', 'in_s3', 'in_s4', 'in_s5', 'in_s6',
                    'in_b1', 'in_b2', 'in_b3', 'in_b4', 'in_b5',
                    'in_excl']

# Per-file frames are collected in lists and concatenated once after the loop
# (DataFrame.append is gone in pandas 2.0 and copies the whole frame each call).
nuc_frames = []
dil_frames = []
grouped = d_df.groupby(['cohort','fname'])
# iterate over each group
for (cohort, fname), group in grouped:
    ex_df = group.loc[(group['dab_ex'] == 1)] # excluded dabs: top 0.5% by area
    in_df = group.loc[(group['dab_ex'] == 0)] # selected dabs < 0.995 pcile
    ex_dab_geom_list = ex_df.dab_poly.to_numpy()

    _, nuc_geom_list, nuc_cent_list = readNucJsonFile(JASONDIR + fname +'_nuc.json')
    #print(f"{fname}: nuc.size={len(nuc_geom_list)}")

    ################# Quartile bins
    nuc_geom_list, dil_d_q = findNucsInDAB_qrt(in_df, fname, nuc_cent_list, nuc_geom_list)

    ################# SD bins
    nuc_geom_list, dil_d_s = findNucsInDAB_std(in_df, fname, nuc_cent_list, nuc_geom_list)

    ############## outliers
    nuc_geom_list = findNucsInDAB_excl(ex_dab_geom_list, nuc_cent_list, nuc_geom_list)

    ############## Biological
    nuc_geom_list, dil_d_b = findNucsInDAB_bio(in_df, fname, nuc_cent_list, nuc_geom_list)

    ############## one row per nucleus
    nuc_columns = {'cohort': cohort, 'fname': fname}
    for flag in nuc_flag_columns:
        nuc_columns[flag] = [getattr(nuc, flag) for nuc in nuc_geom_list]
    nuc_columns['nuc_area'] = [nuc.area for nuc in nuc_geom_list]
    nuc_frames.append(pd.DataFrame(nuc_columns))

    ############## one row per file: region area of every bin
    dil_columns = {'cohort': cohort, 'fname': fname}
    for prefix, bin_areas in (('q', dil_d_q), ('s', dil_d_s), ('b', dil_d_b)):
        for bin_no, bin_area in enumerate(bin_areas, start=1):
            dil_columns[f'dil_d_{prefix}{bin_no}_area'] = [bin_area]
    dil_frames.append(pd.DataFrame(dil_columns))

# pd.concat rejects an empty list, so fall back to an empty frame when no
# group was processed.
nuc_df = pd.concat(nuc_frames) if nuc_frames else pd.DataFrame()
dil_df = pd.concat(dil_frames) if dil_frames else pd.DataFrame()

    #break
    #end loop

#print(f"saving {outdir}{NUC2_CSV}.zip")
##nuc_df.to_csv(NUC2_CSV, index=False)
#nuc_df.to_csv(f"{outdir}{NUC2_CSV}.zip", index=False, compression=dict(method='zip', archive_name=f'{NUC2_CSV}'))

## Collect and save data

In [None]:

#print(f"saving {outdir}{NUC2_CSV}.zip")
##nuc_df.to_csv(NUC2_CSV, index=False)
#nuc_df.to_csv(f"{outdir}{NUC2_CSV}.zip", index=False, compression=dict(method='zip', archive_name=f'{NUC2_CSV}'))

# Builds one summary row per (cohort, fname) by outer-merging a series of
# per-file aggregates, then writes it as a zipped CSV.
group_keys = ['cohort', 'fname']
quartile_bin_names = ['q1', 'q2', 'q3', 'q4']
sd_bin_names = ['s1', 's2', 's3', 's4', 's5', 's6']
bio_bin_names = ['b1', 'b2', 'b3', 'b4', 'b5']
all_bin_names = quartile_bin_names + sd_bin_names + bio_bin_names


def _flagged_area_sum(frame, flag_col, area_col, out_col):
  """Sum area_col per (cohort, fname) over the rows of frame where flag_col == 1."""
  flagged = frame.loc[frame[flag_col] == 1]
  return flagged.groupby(group_keys)[area_col].sum().reset_index(name=out_col)


# The order of summary_parts determines the column order of the output file.
summary_parts = []

########### DAB
summary_parts.append(d_df.groupby(group_keys).size().reset_index(name="dab_count"))
summary_parts.append(_flagged_area_sum(d_df, 'dab_ex', 'dab_area', 'dab_ex_area'))
# total area and number of DABs in each bin
summary_parts.append(
    d_df.groupby(group_keys)[['dab_area', 'dab_ex'] + [f'dab_{b}' for b in all_bin_names]].sum())
# DAB area in each bin
for b in all_bin_names:
  summary_parts.append(_flagged_area_sum(d_df, f'dab_{b}', 'dab_area', f'dab_{b}_area'))

############  DILATED DAB
summary_parts.append(
    dil_df.groupby(group_keys)[[f'dil_d_{b}_area' for b in all_bin_names]].sum())

############  NUC
summary_parts.append(nuc_df.groupby(group_keys).size().reset_index(name="nuc_count"))
# number of nuclei in each bin and total nucleus area
nuc_flag_cols = ['in_dab'] + [f'in_{b}' for b in all_bin_names] + ['in_excl']
summary_parts.append(nuc_df.groupby(group_keys)[nuc_flag_cols + ['nuc_area']].sum())
# nucleus area in each bin (SD bins, excluded, biological bins, quartile bins)
for b in sd_bin_names + ['excl'] + bio_bin_names + quartile_bin_names:
  summary_parts.append(_flagged_area_sum(nuc_df, f'in_{b}', 'nuc_area', f'in_{b}_n_area'))

# Outer merges keep files that are missing from some of the aggregates
# (for example a file without any DAB in a given bin); those cells become NaN.
df = summary_parts[0]
for part in summary_parts[1:]:
  df = pd.merge(left=df, right=part, on=group_keys, how="outer")

# Honour the save_csv switch from the configuration cell.
if save_csv:
  print(f'saving {outdir}{DAB_NUC_CSV}.zip')
  #df.to_csv(DAB_NUC_CSV, index=False)
  df.to_csv(f"{outdir}{DAB_NUC_CSV}.zip", index=False, compression=dict(method='zip', archive_name=f'{DAB_NUC_CSV}'))

# Working Area