In [None]:
!pip install anndata
!pip install dask[array]>=2021.02.0
!pip install scanpy
!pip install libgl1-mesa-dev --user
!pip install squidpy --user
!pip install scimap --user

In [None]:
import os, sys, re, random, math, time, glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
from pprint import pprint
import yaml
import uuid

import anndata as ad
print(f"anndata=={ad.__version__}")
import scanpy as sc
import squidpy as sq
print(f"squidpy=={sq.__version__}")
import scimap as sm
print(f"scimap=={sm.__version__}")

from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import iqr

%matplotlib inline
# breakout by LN/Tumor(PrimarySite) (only want matched pairs)

In [None]:
# Directory that is searched for "*_QUANT.tsv" files.
qDir = r"./QUANT_with_preds"
# glob returns matches in arbitrary, filesystem-dependent order; sort them so the
# order in which files are loaded (and therefore the row order of the combined
# table) is the same on every run.
qFiles = sorted(glob.glob(os.path.join(qDir, "*_QUANT.tsv")))
print("Found "+str(len(qFiles))+" Quant Files")

In [None]:
## Reformat quant files

def _clean_header(name):
    """Return a column name with punctuation stripped and spaces turned into underscores."""
    # Replacements are applied in this order; the '-02_' rule runs last so it also
    # sees underscores produced by the space replacement.
    for old, new in ((':', ''), ('/', ''), ('^', ''), ('.', ''),
                     ('µ', 'u'), (' ', '_'), ('-02_', '_')):
        name = name.replace(old, new)
    return name


def _slide_from_roi(roi):
    """Derive the slide name from an ROI name.

    ROI names that contain '-' but not 'split' keep their first two
    '_'-separated tokens; every other ROI name keeps its first three.
    """
    nTokens = 2 if (("-" in roi) and ("split" not in roi)) else 3
    return '_'.join(roi.split('_')[0:nTokens])


start_time = time.time()
keepDFList = []
for qFile in qFiles:
    df = pd.read_csv(qFile, sep='\t', low_memory=False)
    # Files with fewer than 3 rows are skipped entirely.
    if df.shape[0] < 3:
        continue
    df.columns = [_clean_header(e) for e in df.columns.values.tolist()]
    # ROI is the part of 'Image' before ' - ', with the '.ome.tiff' suffix removed.
    df['ROI'] = [e.split(' - ')[0].replace('.ome.tiff', '') for e in df['Image'].tolist()]
    # One vectorised map instead of a per-row df.loc assignment loop, which is
    # very slow on large tables.
    df['Slide'] = df['ROI'].map(_slide_from_roi)

    # Mirror the Y coordinate within the file's own [min, max] range.
    top = np.min(df['Centroid_Y_um']) + np.max(df['Centroid_Y_um'])
    df['invertY'] = top - df['Centroid_Y_um']
    nClass = df['Class'].astype(str).nunique()
    print("{}  : {} x {}  <{} classes>".format(os.path.basename(qFile), df.shape[0], df.shape[1],nClass))
    keepDFList.append(df)
print("--- %s minutes ---" % ((time.time() - start_time)/60))

In [None]:
# Stack the per-file tables row-wise into one table, then display it.
allClassData = pd.concat(keepDFList, axis=0)
allClassData

In [None]:
# Drop every column whose name matches the regex below.
unwantedCols = allClassData.filter(regex='(_Variance|_Min|_Cytoplasm_)').columns
allClassData = allClassData.drop(columns=unwantedCols)

# Each step below rebinds allClassData to a new frame instead of mutating a frame
# derived by indexing with inplace=True, which can emit SettingWithCopyWarning.
print("Pre-DeDup Rows = {}".format(allClassData.shape[0])) # number of row
# A duplicate is a row sharing ROI and both centroid coordinates; the first one is kept.
allClassData = allClassData.drop_duplicates(subset=["ROI", "Centroid_X_um","Centroid_Y_um"], keep='first')
print("Post-DeDup Rows = {}".format(allClassData.shape[0])) # number of row
# Remove rows whose Class contains "ARTIFACT: "; rows with a missing Class are kept.
isArtifact = allClassData['Class'].str.contains("ARTIFACT: ", na=False)
allClassData = allClassData[~isArtifact]
print("No Artifacts Rows = {}".format(allClassData.shape[0])) # number of row
allClassData = allClassData.reset_index(drop=True)

# Give every remaining row a random identifier (differs on every run).
allClassData['uuid'] = [uuid.uuid4() for _ in range(len(allClassData.index))]

In [None]:
# Number of rows per ROI.
allClassData.loc[:, 'ROI'].value_counts()

In [None]:
# Number of rows per Slide.
allClassData.loc[:, 'Slide'].value_counts()

In [None]:
# Number of rows per Class label.
allClassData.loc[:, 'Class'].value_counts()

In [None]:
# Show any rows without a Slide value; an empty result means every row has one.
allClassData.loc[allClassData['Slide'].isna()]

In [None]:
# Tab-separated metadata table, joined to the cell table on its SampleName column.
mFile =r"./MetaData.txt"
meta = pd.read_csv(mFile, sep='\t')
# The left join keeps every row of allClassData even when its Slide has no metadata row.
# validate='many_to_one' makes pandas raise MergeError if a SampleName appears more
# than once in the metadata, which would otherwise silently duplicate cell rows.
# Re-running this cell on an already-merged table yields suffixed duplicate columns.
allClassData = allClassData.merge(
    meta,
    how='left',
    left_on='Slide',
    right_on='SampleName',
    validate='many_to_one',
)

In [None]:
# Row counts for every Class (rows) by Cohort (columns) combination.
pd.crosstab(index=allClassData['Class'], columns=allClassData['Cohort'])

In [None]:
# Pie chart of row counts per Cohort value (value_counts leaves out missing values).
data = allClassData['Cohort'].value_counts()
labels = data.index
pie, ax = plt.subplots(figsize=(4,4))  # `pie` is the Figure, `ax` its single Axes
ax.pie(
    x=data.values,
    labels=labels,
    explode=[0.05]*len(data),  # same small offset for every wedge
    autopct="%.1f%%",
    pctdistance=0.4,
    textprops={'fontsize': 14},
)
ax.set_title("Total Cells Per Clinical Endpoint", fontsize=12)
plt.show()

In [None]:
# Count rows per Cohort, including missing values, to confirm every row has a cohort.
allClassData.loc[:, "Cohort"].value_counts(dropna=False)

In [None]:
## ONVA, C, E = LN
## ONVB, D, F = Tumor
# Label each row from substrings of its Slide name: "lymph" is checked first,
# then "ps"; slides matching neither get the literal string 'NA'.
choices = ['Lymphnode', 'PrimarySite']
conditions = [allClassData['Slide'].str.contains(tag) for tag in ("lymph", "ps")]
allClassData['SiteLoc'] = np.select(
    condlist=conditions,
    choicelist=choices,
    default='NA',
)

# Tabulate Slide against the assigned SiteLoc to check the labelling.
pd.crosstab(index=allClassData['Slide'], columns=allClassData['SiteLoc'])

In [None]:
# Origin combines Cohort and SiteLoc as "<Cohort>-<SiteLoc>".
allClassData["Origin"] = (
    allClassData["Cohort"]
    + "-"
    + allClassData["SiteLoc"]
)

# Pie chart of row counts per Origin value (value_counts leaves out missing values).
data = allClassData['Origin'].value_counts()
labels = data.index
pie, ax = plt.subplots(figsize=(4,4))  # `pie` is the Figure, `ax` its single Axes
ax.pie(
    x=data.values,
    labels=labels,
    explode=[0.05]*len(data),  # same small offset for every wedge
    autopct="%.1f%%",
    pctdistance=0.4,
    textprops={'fontsize': 14},
)
ax.set_title("Total Cells Per Source", fontsize=12)
plt.show()

In [None]:
# Count rows per SiteLoc, including missing values, to confirm every row has one.
allClassData.loc[:, "SiteLoc"].value_counts(dropna=False)

In [None]:
# Write the combined table to CSV (without the index) for the next step.
# NOTE(review): SiteLoc may hold the literal string 'NA', which pandas.read_csv
# treats as missing by default — confirm how the next step reloads this file.
allClassData.to_csv(path_or_buf="./allClassData.csv", index=False)