In [None]:
# Install xgboost (imported further down as `xgb`) with pip's --user option.
!pip install xgboost --user

In [None]:
import os, sys, re, random, math, time, glob
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
from sklearn.cluster import MiniBatchKMeans
import seaborn as sns
from pprint import pprint
import yaml
import uuid

from scipy.stats import kurtosis
from scipy.stats import skew
from scipy.stats import iqr

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds the per-image quantification exports.
qDir = r"./QUANT_v4"
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the processing (and therefore row) order identical between runs.
qFiles = sorted(glob.glob(os.path.join(qDir, "*_QUANT.tsv")))
#pprint(qFiles)
print("Found "+str(len(qFiles))+" Quant Files")

In [None]:
# Read every quantification table, clean its column names, derive the ROI and
# Slide identifiers, and stack all tables into a single frame.
keepDFList = []
# Column-name substitutions, applied strictly in this order.
headerSubs = ((':', ''), ('/', ''), ('^', ''), ('.', ''),
              ('µ', 'u'), (' ', '_'), ('-02_', '_'))
for qFile in qFiles:
    fhNom = os.path.basename(qFile)
    # Skip tonsil files. `continue` is required here: a bare `next` merely
    # names the builtin and does nothing, so the file would still be loaded.
    if "tonsil" in fhNom:
        continue

    df = pd.read_csv(qFile, sep='\t', low_memory=False)
    print(fhNom)
    # Tables with fewer than 10 rows are ignored.
    if df.shape[0] < 10:
        continue

    header = df.columns.values.tolist()
    for old, new in headerSubs:
        header = [e.replace(old, new) for e in header]
    df.columns = header

    # ROI: text before ' - ' in the Image field, without the '.ome.tiff' suffix.
    df['ROI'] = [e.split(' - ')[0].replace('.ome.tiff', '') for e in df['Image'].tolist()]
    # Slide: the first three '_'-separated tokens of the ROI name.
    df['Slide'] = ['_'.join(e.split('_')[0:3]) for e in df['ROI'].tolist()]
    # Mirror the Y coordinate inside its own [min, max] range.
    top = np.min(df['Centroid_Y_um']) + np.max(df['Centroid_Y_um'])
    df['invertY'] = top - df['Centroid_Y_um']
    print("    Resulting Shape: {} x {}".format(df.shape[0], df.shape[1]))
    nClass = df['Class'].astype(str).nunique()
    print("    Unique Classifications: "+str(nClass) )
    keepDFList.append(df)

allClassData = pd.concat(keepDFList)
# Drop the variance / min / max / cytoplasm measurement columns.
dropCols = list(allClassData.filter(regex='(_Variance|_Min|_Max|_Cytoplasm_)'))
allClassData = allClassData[allClassData.columns.drop(dropCols)]
# One random UUID per row; used later as the merge key.
allClassData['uuid'] = [uuid.uuid4() for _ in range(len(allClassData.index))]

In [None]:
# Rename three of the raw `Class` labels to shorter names, then tabulate.
classRenames = {"CD68": 'Macs', "CD8/CD3": 'CD8 T', "CD4/CD3": 'CD4 T'}
for rawLabel, newLabel in classRenames.items():
    allClassData.loc[allClassData['Class'] == rawLabel, 'Class'] = newLabel
allClassData['Class'].value_counts()

In [None]:
# Number of rows in the combined table.
len(allClassData)

In [None]:
# Rows per Slide (index) and Class (columns).
pd.crosstab(index=allClassData['Slide'], columns=allClassData['Class'])

In [None]:
# Per-ROI distribution of every '_Mean' column (long format: one row per value).
df_batching = allClassData.filter(regex='(_Mean|ROI)',axis=1)
df_melted = pd.melt(df_batching, id_vars=["ROI"])

fig, ax1 = plt.subplots(figsize=(26,6))
sns.boxplot(x='ROI', y='value', data=df_melted, ax=ax1, showfliers = False).set(title='Total Protein Distribution')
# Rotate the category labels only after the plot exists; doing it on the
# still-empty axes acts on placeholder ticks rather than the ROI labels.
ax1.tick_params(axis='x', labelrotation=65)

In [None]:
# Name fragments of the columns to retain; they are OR-ed into one regex.
keepCols = ['Class','ROI','Slide','Centroid_','uuid',
            'NA1_Nucleus','NA2_Nucleus','SMA_Cell', 'CD19_Cell', 'CD20_Cell','Pan-Ker_Cell','CD11b_Cell', 'Vimentin_Membrane',
            'CD45_Membrane','CD4_Cell','E_Cadherin_Cell','CD68_Cell','CD8a_Cell','CD3_Cell','cd14_Cell','Nak-ATPase_Membrane',
            'Vista_Nucleus','CollagenI_Cell','CD45RO_Cell','_Length','_area_ratio']
lst = "({})".format('|'.join(keepCols))

# Work on an independent copy restricted to the matching columns.
phenotypingData = allClassData.filter(regex=lst, axis=1).copy(deep=True)
print(', '.join(phenotypingData.columns))

In [None]:
# Class2 starts as a copy of Class and receives the artifact labels.
phenotypingData["Class2"] = phenotypingData["Class"]

# StdDev/Mean ratio of the NA1 nucleus signal; both terms are shifted by
# |min| + 1 before dividing.
na1Std = phenotypingData['NA1_Nucleus_StdDev']
na1Mean = phenotypingData['NA1_Nucleus_Mean']
minSD = abs(na1Std.min()) + 1
minMn = abs(na1Mean.min()) + 1
phenotypingData['NA1_Nucleus_Ratio'] = (na1Std + minSD) / (na1Mean + minMn)

fig, ax = plt.subplots(figsize=(18,6))
sns.scatterplot(data = phenotypingData, x = "NA1_Nucleus_Mean", y = "NA1_Nucleus_StdDev",
                hue = "NA1_Nucleus_Ratio", palette = "coolwarm", ax=ax)

na1Ratio = phenotypingData['NA1_Nucleus_Ratio']
dapiSTDcutpoint1 = na1Ratio.quantile(0.9999)
dapiSTDcutpoint2 = na1Ratio.quantile(0.0001)
print(f"High Ratio: {dapiSTDcutpoint1}\nLow Ratio: {dapiSTDcutpoint2}")

## DAPI-guided variability likely indicating very poor segmentation
# Only the upper cut-point is applied; the lower one is printed for reference.
highRatio = na1Ratio > dapiSTDcutpoint1
phenotypingData.loc[highRatio, 'Class2'] = 'ARTIFACT: DNA1 RATIO'

In [None]:
# Same StdDev/Mean ratio check as for NA1, applied to the NA2 nucleus signal.
# Class2 is deliberately NOT re-initialised from Class here: doing so would
# erase the 'ARTIFACT: DNA1 RATIO' labels assigned by the NA1 check.
minSD = abs(np.min(phenotypingData['NA2_Nucleus_StdDev']))+1
minMn = abs(np.min(phenotypingData['NA2_Nucleus_Mean']))+1
phenotypingData['NA2_Nucleus_Ratio'] = (phenotypingData['NA2_Nucleus_StdDev']+minSD)/(phenotypingData['NA2_Nucleus_Mean']+minMn)

fig, ax = plt.subplots(figsize=(18,6))
sns.scatterplot(x = "NA2_Nucleus_Mean", y = "NA2_Nucleus_StdDev", data = phenotypingData,
                hue = "NA2_Nucleus_Ratio", palette = "coolwarm", ax=ax)

dapiSTDcutpoint1 = phenotypingData['NA2_Nucleus_Ratio'].quantile(0.9999)
dapiSTDcutpoint2 = phenotypingData['NA2_Nucleus_Ratio'].quantile(0.0001)
print(f"High Ratio: {dapiSTDcutpoint1}\nLow Ratio: {dapiSTDcutpoint2}")

## DAPI-guided variability likely indicating very poor segmentation
# Only the upper cut-point is applied; the lower one is printed for reference.
phenotypingData.loc[phenotypingData['NA2_Nucleus_Ratio'] > dapiSTDcutpoint1, 'Class2'] = 'ARTIFACT: DNA2 RATIO'

In [None]:
# Total signal per row: sum over every column whose name contains '_Mean'.
meanMarkers = [col for col in phenotypingData.columns if '_Mean' in col]
phenotypingData['SigSum'] = phenotypingData[meanMarkers].sum(axis=1)

sigSum = phenotypingData['SigSum']
SigSumcutpoint1 = sigSum.quantile(0.999)
SigSumcutpoint2 = sigSum.quantile(0.0001)
print(f"High SigSum: {SigSumcutpoint1}\nLow SigSum: {SigSumcutpoint2}")

# Rows above the high cut-point or below the low one share one artifact label.
outOfRange = (sigSum > SigSumcutpoint1) | (sigSum < SigSumcutpoint2)
phenotypingData.loc[outOfRange, 'Class2'] = 'ARTIFACT: SIGSUM'

In [None]:
# QC label (Class2, index) against the original label (Class, columns).
pd.crosstab(index=phenotypingData['Class2'], columns=phenotypingData['Class'])

In [None]:
# Absolute and percentage frequency of every Class2 label, side by side.
class2 = phenotypingData['Class2']
ct = class2.value_counts()
pt = class2.value_counts(normalize=True).mul(100).round(2).astype(str) + '%'
pd.concat({'counts': ct, '%': pt}, axis=1)

In [None]:
from scipy.stats import boxcox

# Remove every row whose Class2 label contains "ARTIFACT: " and renumber.
secondNormalized = phenotypingData.copy(deep=True)
notArtifact = ~secondNormalized['Class2'].str.contains("ARTIFACT: ", na=False)
secondNormalized = secondNormalized.loc[notArtifact].reset_index(drop=True)

# Positional row id, used to reshape long <-> wide around the transform.
secondNormalized['_id'] = secondNormalized.index

### Do Box Cox on batch
# All '_Median' columns are melted into one long vector, so a single
# transform is fitted over the pooled values.
medianLong = pd.melt(secondNormalized.filter(regex='(_Median|_id)', axis=1), id_vars=['_id'])
# box cox cannot handle values of zero
medianLong['value'] = medianLong['value'] + 1
nArr, mxLambda = boxcox(medianLong['value'].to_list())
medianLong['valueBC'] = nArr
medianWide = medianLong[["_id", "variable", "valueBC"]].pivot(
    index="_id", columns="variable", values='valueBC')

# Re-attach the transformed medians to the untouched columns, aligned on _id.
secondNormalized = secondNormalized.set_index('_id', drop=False)
df_a = secondNormalized[secondNormalized.columns.difference(medianWide.columns)]
dfBMSnorm = pd.concat([df_a, medianWide], axis=1)

In [None]:
# Display the table produced by the Box-Cox step.
dfBMSnorm

In [None]:
# Per-ROI distribution of the transformed '_Median' columns, coloured by Slide.
df_batching = dfBMSnorm.filter(regex='(_Median|ROI|Slide)',axis=1)
df_melted = pd.melt(df_batching, id_vars=["ROI","Slide"])
fig, ax1 = plt.subplots(figsize=(28,6))
sns.boxplot(x='ROI', y='value', hue="Slide", data=df_melted, ax=ax1, showfliers = False)
# Rotate the category labels only after the plot exists; doing it on the
# still-empty axes acts on placeholder ticks rather than the ROI labels.
ax1.tick_params(axis='x', labelrotation=45)

In [None]:
# For every '_Mean' column add a matching '_Ratio' column:
# (StdDev + 1) / (Mean + 1), with negative ratios floored at 0.
tmp = dfBMSnorm.filter(regex='(_Mean)',axis=1)

for clm in tmp.columns.values.tolist():
    sFl = clm.replace('_Mean','_StdDev')
    rto = clm.replace('_Mean','_Ratio')
    ratio = (dfBMSnorm[sFl]+1)/(dfBMSnorm[clm]+1)
    # Floor only this ratio column. Indexing the frame with the boolean mask
    # alone (`df[mask] = 0`) would overwrite every column of the matching
    # rows, including Class, ROI and uuid.
    dfBMSnorm[rto] = ratio.clip(lower=0)

dfBMSnorm

In [None]:
# Heatmap of a random 1% of rows over the '_Median' and '_Ratio' columns.
featureCols = dfBMSnorm.filter(regex='(_Median|_Ratio)', axis=1)
sub3 = featureCols.sample(frac=0.01)
sns.clustermap(data=sub3, yticklabels=False, cmap = "coolwarm", vmin= -2, vmax=5)

In [None]:
from sklearn.preprocessing import StandardScaler

# create a scaler object
scaler = StandardScaler()
# Columns to z-score; note the pattern also selects the `_id` column.
df_numerics_only = dfBMSnorm.filter(regex='(_Median|_Ratio|_id)',axis=1)
df_norm = pd.DataFrame(scaler.fit_transform(df_numerics_only), columns=df_numerics_only.columns)
df_a = dfBMSnorm[dfBMSnorm.columns.difference(df_numerics_only.columns)]
# df_norm has a fresh 0..n-1 index, so df_a is reset to join row by row.
dfStandardize = pd.concat([df_a.reset_index(drop=True), df_norm], axis=1)


# Per-ROI distribution of the standardized '_Median' columns, coloured by Slide.
df_batching = dfStandardize.filter(regex='(_Median|ROI|Slide)',axis=1)
df_melted = pd.melt(df_batching, id_vars=["ROI","Slide"])
fig, ax1 = plt.subplots(figsize=(18,6))
sns.boxplot(x='ROI', y='value', hue="Slide", data=df_melted, ax=ax1, showfliers = False)
# Rotate the category labels only after the plot exists; doing it on the
# still-empty axes acts on placeholder ticks rather than the ROI labels.
ax1.tick_params(axis='x', labelrotation=45)

In [None]:
# Heatmap of a random 1.5% of the standardized rows ('_Median' / '_Ratio').
featureCols = dfStandardize.filter(regex='(_Median|_Ratio)', axis=1)
sub3 = featureCols.sample(frac=0.015)
sns.clustermap(data=sub3, yticklabels=False, cmap = "coolwarm", vmin= -2, vmax=5)

In [None]:
# Training table: only the rows that have a non-missing `Class` value.
tierOnePredict = dfStandardize.copy(deep=True)
hasClass = tierOnePredict['Class'].notna()
tierOnePredict = tierOnePredict.loc[hasClass]
# tierOnePredict.shape
## split into predictor variables (X) and outcome variable (y)
idx = tierOnePredict.columns.get_loc("Class")
y = tierOnePredict.iloc[:, idx]
# Predictors: columns whose name ends in _Median, _Ratio or _Length_um.
X = tierOnePredict.filter(regex='(_Membrane|_Cell)*(_Median|_Ratio|_Length_um)$', axis=1)
pprint(y.value_counts())

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
# Number of boosting rounds used for every xgb.train call.
num_round = 380

# Encode the string class labels as consecutive integers for XGBoost.
le = preprocessing.LabelEncoder()
y_Encode = le.fit_transform(y)
(unique, counts) = np.unique(y_Encode, return_counts=True)

# Hold out a third of the labelled rows. The split is seeded so the
# accuracies reported for each model are reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(X, y_Encode, test_size=0.33, random_state=42)
dtrain = xgb.DMatrix(x_train, label=y_train)
dtest = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Grid search over tree depth and learning rate; train and test accuracy are
# recorded for every combination.
# specify parameters via map
# eta [default=0.3, alias: learning_rate]
depthField = [4, 6, 8, 12, 16, 19]
learnRates = [0.1, 0.25, 0.38, 0.5]
metricModel = []

# The true labels are the same for every model, so decode them once.
# (`int` replaces `np.int`, which was removed in NumPy 1.24.)
yLabelTrain = le.inverse_transform(np.asarray(y_train, dtype=int))
yLabelTest = le.inverse_transform(np.asarray(y_test, dtype=int))

for d in depthField:
    for lr in learnRates:
        param = {'max_depth':d, 'eta': lr, 'objective':'multi:softmax', 'n_jobs': 32,
                 'num_class': len(unique), 'eval_metric': 'mlogloss' }
        bst = xgb.train(param, dtrain, num_round)

        # predict() returns the class ids as floats; cast before decoding.
        predTrain = bst.predict(dtrain)
        GBCmpredTrain = le.inverse_transform(predTrain.astype(int))
        accuracyTrain = accuracy_score(yLabelTrain, GBCmpredTrain)

        preds = bst.predict(dtest)
        GBCmpred = le.inverse_transform(preds.astype(int))
        accuracy = accuracy_score(yLabelTest, GBCmpred)
        metricModel.append({'max_depth':d, 'eta':lr, 'Training': "%.2f%%" % (accuracyTrain * 100.0),
                          'Test': "%.2f%%" % (accuracy * 100.0), 'testf':accuracy })

xgboostParams = pd.DataFrame(metricModel)
xgboostParams

In [None]:
# Show the grid row(s) that reached the highest test accuracy.
mx = xgboostParams['testf'].max()
isBest = xgboostParams['testf'] == mx
rr = xgboostParams.loc[isBest]
print("Max Test Accuracy: %.2f%%" % (mx * 100.0) )
rr

In [None]:
# Train the model with the chosen hyper-parameters and report its accuracy.
param = {'max_depth':4, 'eta': 0.38, 'objective':'multi:softmax', 'n_jobs': 32,
                 'num_class': len(unique), 'eval_metric': 'mlogloss' }
bst = xgb.train(param, dtrain, num_round)

# make prediction
# predict() returns the class ids as floats; cast with the builtin `int`
# (`np.int` was removed in NumPy 1.24) before decoding to label strings.
predTrain = bst.predict(dtrain)
GBCmpredTrain = le.inverse_transform(predTrain.astype(int))
yLabelTrain = le.inverse_transform(np.asarray(y_train, dtype=int))
# evaluate predictions
accuracyTrain = accuracy_score(yLabelTrain, GBCmpredTrain)
print("Training Accuracy: %.2f%%" % (accuracyTrain * 100.0))
#print(pd.crosstab(GBCmpredTrain,yLabelTrain))
#print("\n")
# make prediction
preds = bst.predict(dtest)
GBCmpred = le.inverse_transform(preds.astype(int))
yLabelTest = le.inverse_transform(np.asarray(y_test, dtype=int))
# evaluate predictions
accuracy = accuracy_score(yLabelTest, GBCmpred)
print("Test Accuracy: %.2f%%" % (accuracy * 100.0))
# Confusion table: predicted label (rows) against true label (columns).
pd.crosstab(GBCmpred,yLabelTest)

In [None]:
# Bar chart of the 25 features with the largest 'gain' importance score.
feature_important = bst.get_score(importance_type='gain')
keys, values = list(feature_important.keys()), list(feature_important.values())

data = pd.DataFrame({"score": values}, index=keys)
data = data.sort_values(by="score", ascending=False)
data.nlargest(25, columns="score").plot(kind='barh', figsize = (9,12))

In [None]:
# Predict a class for every row, labelled or not, with the trained model.
Xall = dfStandardize[X.columns.to_list()]
xReal = xgb.DMatrix(Xall)

realPred = bst.predict(xReal) ## Exports labels of type Float
# Cast with the builtin `int`; `np.int` was removed in NumPy 1.24.
fullXGBPred = le.inverse_transform(realPred.astype(int))

dfStandardize['XGBoostPrediction'] = fullXGBPred.astype('str')

# Accuracy over all rows that have a Class value.
nonBlank = dfStandardize.loc[pd.notna(dfStandardize["Class"])]
accuracy = accuracy_score(nonBlank["Class"], nonBlank['XGBoostPrediction'])
print("Total Model Accuracy: %.2f%%" % (accuracy * 100.0))
pd.crosstab(nonBlank["Class"], nonBlank['XGBoostPrediction'])

In [None]:
# Pie chart showing the share of each predicted class.
data = pd.Series(fullXGBPred).value_counts()
labels = data.index
sliceOffsets = [0.05 for _ in range(len(data))]

pie, ax = plt.subplots(figsize=(11,10))
ax.pie(x=data.values, labels=labels, explode=sliceOffsets, autopct="%.1f%%", pctdistance=0.5)
plt.title("XGBoosting Classifier", fontsize=22);
plt.show()

In [None]:
# PredMatch: '-' when Class is missing, 'Y' when the prediction equals Class,
# otherwise 'N'. Conditions are evaluated in that order.
classMissing = pd.isna(dfStandardize["Class"]).to_numpy()
predAgrees = (dfStandardize["XGBoostPrediction"] == dfStandardize["Class"]).to_numpy()
dfStandardize["PredMatch"] = np.select([classMissing, predAgrees], ['-', 'Y'], default='N')
dfStandardize["PredMatch"].value_counts()

In [None]:
# Count of each PredMatch value (columns) for every Slide (index).
cscnts = pd.crosstab(index=dfStandardize['Slide'], columns=dfStandardize["PredMatch"])

In [None]:
# Slides with at least one mismatch, plus the mismatch fraction N / (N + Y)
# as a number (`pm`) and as a formatted percentage string.
cscnts = (
    cscnts.loc[cscnts['N'] > 0]
    .assign(pm=lambda t: t['N'] / (t['N'] + t['Y']))
    .assign(PecentMissed=lambda t: t['pm'].mul(100).round(2).astype(str) + '%')
)
cscnts.sort_values('pm', ascending=False)

In [None]:
# Spatial scatter of the predicted class for the first (up to) four ROIs.
fovs = dfStandardize['ROI'].unique().tolist()
#random.shuffle(fovs)
outCls = "XGBoostPrediction"
# Shared hue order so a class gets the same colour in every panel.
t1Order = dfStandardize[outCls].unique().tolist()

# One panel per ROI. Looping instead of indexing fovs[0..3] keeps the cell
# working when fewer than four ROIs exist, and titling with the ROI name
# avoids reading row 1 of a subset that may hold a single row.
previewFovs = fovs[:4]
if previewFovs:
    fig, axes = plt.subplots(1, len(previewFovs), figsize=(20,5), squeeze=False)
    for roi, ax in zip(previewFovs, axes[0]):
        fovCells = dfStandardize.loc[dfStandardize['ROI'] == roi]
        sns.scatterplot(data=fovCells, x='Centroid_X_um', y='Centroid_Y_um', hue=outCls, ax=ax,
                        palette="Dark2", hue_order=t1Order, legend=False).set(title=roi)
    plt.show()

In [None]:
# Helper column of ones, counted per (ROI, predicted class) group.
dfStandardize['cnt'] = 1
perRoiClass = dfStandardize.groupby(['ROI', 'XGBoostPrediction'])
perRoiClass[['cnt']].count()

In [None]:
### Put the skipped artifacts back in.
# Left-join the predictions onto the full table by uuid; rows that were
# removed as artifacts have no prediction after the join.
subCols = dfStandardize.loc[:,['XGBoostPrediction','uuid']]
mergeDF = pd.merge(phenotypingData, subCols, on="uuid", how="left")
#mergeDF.loc[mergeDF['XGBoostPrediction'].isna(), 'XGBoostPrediction'] = 'Artifact'
# Final label: the prediction where present, otherwise the Class2 label.
# Assigned in one step because `df[col].fillna(..., inplace=True)` operates on
# a selected Series and is not guaranteed to write back into the frame.
mergeDF['FinalClassify'] = mergeDF['XGBoostPrediction'].fillna(mergeDF['Class2'])

In [None]:
# QC label (Class2, index) against the final label (FinalClassify, columns).
pd.crosstab(index=mergeDF['Class2'], columns=mergeDF['FinalClassify'])

In [None]:
# Write one tab-separated file per ROI holding the id, coordinates and final label.
selectCols = ["uuid","Centroid_X_um","Centroid_Y_um","FinalClassify"]
outDir = r"./QUANT_with_preds"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(outDir, exist_ok=True)

for f in fovs:
    roiTbl = mergeDF.loc[mergeDF['ROI'] == f, selectCols]
    outFh = os.path.join(outDir,f+"_PRED.tsv")
    roiTbl.to_csv(outFh, sep="\t")
    print(outFh)