In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
# configure scanpy's figure defaults with a resolution of 100 dpi
sc.settings.set_figure_params(dpi=100)

### Gather Data

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, roc_curve, precision_recall_curve, f1_score, balanced_accuracy_score, accuracy_score
import pickle as pkl
# Read the TRB AnnData object from the outputs directory.
a_trb = sc.read_h5ad('../outs/adata.trb.h5ad')
# Unpickle the two external result objects, one file at a time.
# NOTE(review): pickle executes arbitrary code on load; only use files from a trusted source.
with open('../external_data/results.tcr.pkl', 'rb') as tcr_handle:
    results_tcr = pkl.load(tcr_handle)
with open('../external_data/results.ag.pkl', 'rb') as ag_handle:
    results_ag = pkl.load(ag_handle)

#### Fetal Donors (Suo and Dann et al. 2022)

In [None]:
from tqdm import tqdm
# minimum number of cells a donor:celltype pair needs in order to be kept
min_cells = 2
# restrict to the three T cell annotations of interest
clusters = ['CD8+T','TREG','CD4+T']
mask = results_tcr['SUO_SCIENCE2022_FETAL']['celltype_annotation'].isin(clusters)
data = results_tcr['SUO_SCIENCE2022_FETAL'].loc[mask, ['donor','celltype_annotation','TRB']].astype(str).copy()
# tag every cell as "<donor>:<celltype_annotation>"
data['tag'] = data[['donor','celltype_annotation']].astype(str).agg(':'.join, axis=1)
# filter the data more harshly because less assured of quality: keep only cells
# whose TRB is an observation name in a_trb. A boolean .loc filter is used instead
# of chained assignment (data['TRB'][...] = np.nan) followed by dropna, because the
# chained form does not modify `data` when pandas copy-on-write is active.
data = data.loc[data['TRB'].isin(a_trb.obs.index)]
# keep only tags that still have at least min_cells cells after the filter
counts = data['tag'].value_counts()
tags = counts.index[counts >= min_cells]
# compile the Xs: one row per tag, the mean of a_trb.X over that tag's TRBs
Xs = []
for tag in tqdm(tags):
    trbs = data.loc[data['tag'] == tag, 'TRB']
    # np.asarray(...).ravel() leaves a dense 1-D mean untouched and flattens the
    # 1 x n matrix that a sparse .X would return
    X_ = pd.Series(np.asarray(a_trb[trbs].X.mean(0)).ravel(), name=tag)
    Xs.append(X_)
og_trb_suo2022_X = pd.concat(Xs, axis=1).T

#### Adults with COVID-19 and Healthy Donors (Su, Yuan, and Chen et al. 2022)

In [None]:
# derive annotations: split 'batch_info' on ':' into batch / subbatch / sample
results_tcr['SU_CELL2022_COVID19'][['batch','subbatch','sample']] = \
results_tcr['SU_CELL2022_COVID19']['batch_info'].str.split(':', expand=True)
# tag every cell as "<sample>:<TcellType>"
data = results_tcr['SU_CELL2022_COVID19'][['sample','TcellType','TRB']].astype(str).copy()
data['tag'] = data[['sample','TcellType']].astype(str).agg(':'.join, axis=1)
# filter the data more harshly because less assured of quality: keep only cells
# whose TRB is an observation name in a_trb. A boolean .loc filter is used instead
# of chained assignment (data['TRB'][...] = np.nan) followed by dropna, because the
# chained form does not modify `data` when pandas copy-on-write is active.
data = data.loc[data['TRB'].isin(a_trb.obs.index)]
# keep only tags that still have at least min_cells cells after the filter
counts = data['tag'].value_counts()
tags = counts.index[counts >= min_cells]
# compile the Xs: one row per tag, the mean of a_trb.X over that tag's TRBs
Xs = []
for tag in tqdm(tags):
    trbs = data.loc[data['tag'] == tag, 'TRB']
    # np.asarray(...).ravel() leaves a dense 1-D mean untouched and flattens the
    # 1 x n matrix that a sparse .X would return
    X_ = pd.Series(np.asarray(a_trb[trbs].X.mean(0)).ravel(), name=tag)
    Xs.append(X_)
og_trb_su2022_X = pd.concat(Xs, axis=1).T

#### Pan-Cancer Types (Zheng, Qin, and Si et al. 2021)

In [None]:
# tag every cell as "<patient>:<TcellType>"
data = results_tcr['ZHENG_SCIENCE2021_PANCAN'][['patient','TcellType','TRB']].astype(str).copy()
data['tag'] = data[['patient','TcellType']].astype(str).agg(':'.join, axis=1)
# filter the data more harshly because less assured of quality: keep only cells
# whose TRB is an observation name in a_trb. A boolean .loc filter is used instead
# of chained assignment (data['TRB'][...] = np.nan) followed by dropna, because the
# chained form does not modify `data` when pandas copy-on-write is active.
data = data.loc[data['TRB'].isin(a_trb.obs.index)]
# keep only tags that still have at least min_cells cells after the filter
counts = data['tag'].value_counts()
tags = counts.index[counts >= min_cells]
# compile the Xs: one row per tag, the mean of a_trb.X over that tag's TRBs
Xs = []
for tag in tqdm(tags):
    trbs = data.loc[data['tag'] == tag, 'TRB']
    # np.asarray(...).ravel() leaves a dense 1-D mean untouched and flattens the
    # 1 x n matrix that a sparse .X would return
    X_ = pd.Series(np.asarray(a_trb[trbs].X.mean(0)).ravel(), name=tag)
    Xs.append(X_)
og_trb_zheng2021_X = pd.concat(Xs, axis=1).T

### Modeling with Random Forest 100 Trees

In [None]:
import warnings
# NOTE(review): this hides every warning category raised by the cells that run
# after it; consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

In [None]:
# define a function to interrogate the data
def interrogate_with_globals(n_estimators=100, n_splits=10):
    """Fit random forests on Dataset #1, score them on Dataset #2, then plot and report.

    The data are read from notebook globals rather than passed in:
    ``X1``/``y1`` (features and labels to train on), ``X2``/``y2`` (features
    and labels to predict on) and ``pred_on_all``. When ``pred_on_all`` is
    true every row of ``X2`` is scored in each split; otherwise the split's
    held-out positions (derived from ``X1``/``y1``) are applied to ``X2``/``y2``,
    which is only meaningful when both pairs refer to the same table.

    Parameters
    ----------
    n_estimators : int, default 100
        Number of trees passed to ``RandomForestClassifier``.
    n_splits : int, default 10
        Number of stratified shuffle splits (each holds out 1/4 of ``X1``).

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``auroc`` and ``auprc``.

    Side effects: creates matplotlib figures (per-metric box plots, ROC curves,
    precision-recall curves, mean predicted probability by ground truth), prints
    summary statistics and reseeds NumPy's global RNG with 0 before the jittered
    strip plots.
    """
    # per-split records are collected in lists and turned into frames once
    stat_rows = []
    probas, truths = [], []
    fprs, tprs, pres, recs = [], [], [], []
    # train utilizing random forest models in a stratified shuffled manner
    skf = StratifiedShuffleSplit(n_splits=n_splits, random_state=0, test_size=1/4)
    for idxs_train, idxs_test in skf.split(X1, y1):
        # fit the random forest model using the training rows of Dataset #1
        clf = RandomForestClassifier(random_state=0, n_estimators=n_estimators)
        clf = clf.fit(X1.iloc[idxs_train], y1.iloc[idxs_train])

        # predict on Dataset #2 correcting to all indices if requested
        if pred_on_all:
            idxs_test = range(X2.shape[0])
        # probability of the positive class as a flat 1-D array
        proba = clf.predict_proba(X2.iloc[idxs_test])[:, clf.classes_ == 1][:, 0]
        probas.append(pd.Series(proba, index=X2.index[idxs_test]))
        # retrieve the associated ground truth
        truth = y2.iloc[idxs_test]
        truths.append(truth.copy())

        # compute subsequent AUROC and AUPRC related metrics
        fpr, tpr, _ = roc_curve(truth, proba)
        pre, rec, _ = precision_recall_curve(truth, proba)
        fprs.append(fpr); tprs.append(tpr); pres.append(pre); recs.append(rec)
        stat_rows.append((auc(fpr, tpr), auc(rec, pre)))

    df_stat = pd.DataFrame(stat_rows, columns=['auroc','auprc'])
    # number of evaluated splits, used for the standard error of the mean
    n_runs = df_stat.shape[0]

    # distribution of each metric across splits
    for stat in df_stat.columns:
        fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
        sns.boxplot(y=df_stat[stat], linewidth=1.5, saturation=1, showfliers=False,
                    linecolor='dodgerblue', color='skyblue', ax=ax)
        sns.stripplot(y=df_stat[stat], linewidth=1.5, s=6, alpha=0.5, color='skyblue',
                      edgecolor='dodgerblue', ax=ax)
        ax.set_xlim(-0.75, 0.75); ax.set_ylabel(stat.upper())
        print(stat.upper(), df_stat[stat].mean(), df_stat[stat].std() / np.sqrt(n_runs)*1.96)

    # plot the FPR, TPR: one dashed curve per split plus the mean of the
    # curves interpolated onto a common 0..1 grid
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for fpr, tpr in zip(fprs, tprs):
        ax.plot(fpr, tpr, color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, fpr, tpr))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0, 1], color='lightgray', linestyle='dotted')
    ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')

    # plot the precision recall; the arrays are reversed so that recall is
    # increasing, as np.interp requires
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for pre, rec in zip(pres, recs):
        ax.plot(rec[::-1], pre[::-1], color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, rec[::-1], pre[::-1]))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    # NOTE(review): the dotted reference is drawn at 0.5 regardless of the
    # positive fraction of y2 -- confirm this is the intended chance level.
    ax.plot([0, 1], [0.5]*2, color='lightgray', linestyle='dotted')
    ax.set(xlabel='Recall', ylabel='Precision')

    # mean predicted probability of the true positives and true negatives per split
    result_rows = []
    for proba, truth in zip(probas, truths):
        result_rows.append(('+', proba[truth == 1].mean()))
        result_rows.append(('-', proba[truth == 0].mean()))
    result = pd.DataFrame(result_rows, columns=['Truth','PredProb'])

    # compare the average prediction probabilities
    fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
    sns.boxplot(x='Truth', y='PredProb', data=result, linewidth=1.5, saturation=1,
                showfliers=False, linecolor='dodgerblue', color='skyblue',
                order=['-', '+'], palette=['lightgray','skyblue'], ax=ax)
    # fix the jitter of the strip plots
    np.random.seed(0)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['dodgerblue'],
                  order=['+'], alpha=0.6, s=6, ax=ax)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['grey'],
                  order=['-'], alpha=0.6, s=6, ax=ax)
    ax.set_xlim(-1, 2); ax.set_ylabel('Prediction Probability'); ax.set_xlabel('Ground Truth')
    # restyle the first axes children by position (presumably the artists of
    # the '-' box -- verify against the installed seaborn version)
    ax.get_children()[0].set_hatch('//')
    ax.get_children()[0].set_edgecolor('grey')
    for idx in range(1, 6):
        ax.get_children()[idx].set_color('grey')

    # report statistics
    print('p-value for + vs. -:')
    print(ss.mannwhitneyu(result.loc[result['Truth'] == '+', 'PredProb'], result.loc[result['Truth'] == '-', 'PredProb']))
    print('average:')
    print(df_stat.mean(0))
    print('95% cis:')
    # use the actual number of splits instead of a hard-coded 10
    print(df_stat.std(0) / np.sqrt(n_runs) * 1.96)
    return df_stat

#### Fetal Donors...

In [None]:
# Fetal donors only: the same feature table supplies both the rows that are fit
# on and the rows that are scored, so each split keeps its own held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_suo2022_X.copy(), og_trb_suo2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal = interrogate_with_globals()

#### COVID-19 and Adult Healthy Donors...

In [None]:
# COVID-19 / healthy adult donors only: the same feature table supplies both the
# rows that are fit on and the rows that are scored, so each split keeps its own
# held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_su2022_X.copy(), og_trb_su2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_covid = interrogate_with_globals()

#### Pan-Cancer...

In [None]:
# Pan-cancer cohort only: the same feature table supplies both the rows that are
# fit on and the rows that are scored, so each split keeps its own held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_zheng2021_X.copy(), og_trb_zheng2021_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_tumor = interrogate_with_globals()

#### Fetal Donors... --> Pan-Cancer...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the pan-cancer table.
pred_on_all = True

# X1 is fit on (fetal), X2 is predicted on (pan-cancer)
X1, X2 = og_trb_suo2022_X.copy(), og_trb_zheng2021_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2tumor = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the
# COVID-19 / healthy adult table.
pred_on_all = True

# X1 is fit on (fetal), X2 is predicted on (COVID-19 / healthy adults)
X1, X2 = og_trb_suo2022_X.copy(), og_trb_su2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2covid = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the two adult
# cohorts (COVID-19 / healthy donors and pan-cancer) stacked together.
pred_on_all = True

# training side (fetal donors); labels are True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# prediction side, part A: COVID-19 and healthy donors; part B: pan-cancer types
X2A, X2B = og_trb_su2022_X.copy(), og_trb_zheng2021_X.copy()
y2A = pd.Series(X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
y2B = pd.Series(X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack the two parts row-wise (pd.concat defaults to axis=0)
X2 = pd.concat([X2A, X2B])
y2 = pd.concat([y2A, y2B])
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2adult = interrogate_with_globals()

#### Fetal Donors... <-- Pan-Cancer...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the pan-cancer table and
# score every row of the fetal table. The roles are assigned directly here
# instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side (pan-cancer)
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_tumor2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the COVID-19 / healthy
# adult table and score every row of the fetal table. The roles are assigned
# directly here instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side (COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_covid2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the two adult cohorts
# stacked together and score every row of the fetal table. The roles are
# assigned directly here instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side, part A: COVID-19 and healthy donors; part B: pan-cancer types
X2A, X2B = og_trb_su2022_X.copy(), og_trb_zheng2021_X.copy()
y2A = pd.Series(X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
y2B = pd.Series(X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack the two parts row-wise (pd.concat defaults to axis=0)
X1 = pd.concat([X2A, X2B])
y1 = pd.concat([y2A, y2B])
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_adult2fetal = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... --> Pan-Cancer...

In [None]:
# define the data to train on (COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; the labels are indexed by X1 itself rather than by
# X2A, a global left over from another cell that is undefined on a fresh kernel
# unless that cell has been run first
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# define the data to predict on (pan-cancer types)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# score every row of X2 in each split
pred_on_all = True
# run the evaluation on the globals set above
df_stat_covid2tumor = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... <-- Pan-Cancer...

In [None]:
# first side of the comparison (COVID-19 and healthy donors); after the swap
# below this becomes the data that is predicted on
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; the labels are indexed by X1 itself rather than by
# X2A, a global left over from another cell that is undefined on a fresh kernel
# unless that cell has been run first
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# second side of the comparison (pan-cancer types); after the swap below this
# becomes the data that is trained on
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# reverse the comparison: train on pan-cancer, predict on COVID-19 / healthy
X1, y1, X2, y2 = X2, y2, X1, y1

# score every row of X2 in each split
pred_on_all = True
# run the evaluation on the globals set above
df_stat_tumor2covid = interrogate_with_globals()

In [None]:
import pickle as pkl
# Gather the per-comparison statistics frames under short keys and pickle them.
df_stats = dict(
    adult2fetal=df_stat_adult2fetal,
    covid2fetal=df_stat_covid2fetal,
    covid2tumor=df_stat_covid2tumor,
    covid=df_stat_covid,
    tumor2covid=df_stat_tumor2covid,
    tumor=df_stat_tumor,
    tumor2fetal=df_stat_tumor2fetal,
    fetal2adult=df_stat_fetal2adult,
    fetal2covid=df_stat_fetal2covid,
    fetal2tumor=df_stat_fetal2tumor,
    fetal=df_stat_fetal,
)
with open('../outs/250421_cd4vscd8_randomforest100.pkl', 'wb') as out_handle:
    pkl.dump(df_stats, out_handle)

### Modeling with Random Forest 200 Trees

In [None]:
# define a function to interrogate the data
def interrogate_with_globals(n_estimators=200, n_splits=10):
    """Fit random forests on Dataset #1, score them on Dataset #2, then plot and report.

    The data are read from notebook globals rather than passed in:
    ``X1``/``y1`` (features and labels to train on), ``X2``/``y2`` (features
    and labels to predict on) and ``pred_on_all``. When ``pred_on_all`` is
    true every row of ``X2`` is scored in each split; otherwise the split's
    held-out positions (derived from ``X1``/``y1``) are applied to ``X2``/``y2``,
    which is only meaningful when both pairs refer to the same table.

    Parameters
    ----------
    n_estimators : int, default 200
        Number of trees passed to ``RandomForestClassifier``.
    n_splits : int, default 10
        Number of stratified shuffle splits (each holds out 1/4 of ``X1``).

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``auroc`` and ``auprc``.

    Side effects: creates matplotlib figures (per-metric box plots, ROC curves,
    precision-recall curves, mean predicted probability by ground truth), prints
    summary statistics and reseeds NumPy's global RNG with 0 before the jittered
    strip plots.
    """
    # per-split records are collected in lists and turned into frames once
    stat_rows = []
    probas, truths = [], []
    fprs, tprs, pres, recs = [], [], [], []
    # train utilizing random forest models in a stratified shuffled manner
    skf = StratifiedShuffleSplit(n_splits=n_splits, random_state=0, test_size=1/4)
    for idxs_train, idxs_test in skf.split(X1, y1):
        # fit the random forest model using the training rows of Dataset #1
        clf = RandomForestClassifier(random_state=0, n_estimators=n_estimators)
        clf = clf.fit(X1.iloc[idxs_train], y1.iloc[idxs_train])

        # predict on Dataset #2 correcting to all indices if requested
        if pred_on_all:
            idxs_test = range(X2.shape[0])
        # probability of the positive class as a flat 1-D array
        proba = clf.predict_proba(X2.iloc[idxs_test])[:, clf.classes_ == 1][:, 0]
        probas.append(pd.Series(proba, index=X2.index[idxs_test]))
        # retrieve the associated ground truth
        truth = y2.iloc[idxs_test]
        truths.append(truth.copy())

        # compute subsequent AUROC and AUPRC related metrics
        fpr, tpr, _ = roc_curve(truth, proba)
        pre, rec, _ = precision_recall_curve(truth, proba)
        fprs.append(fpr); tprs.append(tpr); pres.append(pre); recs.append(rec)
        stat_rows.append((auc(fpr, tpr), auc(rec, pre)))

    df_stat = pd.DataFrame(stat_rows, columns=['auroc','auprc'])
    # number of evaluated splits, used for the standard error of the mean
    n_runs = df_stat.shape[0]

    # distribution of each metric across splits
    for stat in df_stat.columns:
        fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
        sns.boxplot(y=df_stat[stat], linewidth=1.5, saturation=1, showfliers=False,
                    linecolor='dodgerblue', color='skyblue', ax=ax)
        sns.stripplot(y=df_stat[stat], linewidth=1.5, s=6, alpha=0.5, color='skyblue',
                      edgecolor='dodgerblue', ax=ax)
        ax.set_xlim(-0.75, 0.75); ax.set_ylabel(stat.upper())
        print(stat.upper(), df_stat[stat].mean(), df_stat[stat].std() / np.sqrt(n_runs)*1.96)

    # plot the FPR, TPR: one dashed curve per split plus the mean of the
    # curves interpolated onto a common 0..1 grid
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for fpr, tpr in zip(fprs, tprs):
        ax.plot(fpr, tpr, color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, fpr, tpr))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0, 1], color='lightgray', linestyle='dotted')
    ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')

    # plot the precision recall; the arrays are reversed so that recall is
    # increasing, as np.interp requires
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for pre, rec in zip(pres, recs):
        ax.plot(rec[::-1], pre[::-1], color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, rec[::-1], pre[::-1]))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    # NOTE(review): the dotted reference is drawn at 0.5 regardless of the
    # positive fraction of y2 -- confirm this is the intended chance level.
    ax.plot([0, 1], [0.5]*2, color='lightgray', linestyle='dotted')
    ax.set(xlabel='Recall', ylabel='Precision')

    # mean predicted probability of the true positives and true negatives per split
    result_rows = []
    for proba, truth in zip(probas, truths):
        result_rows.append(('+', proba[truth == 1].mean()))
        result_rows.append(('-', proba[truth == 0].mean()))
    result = pd.DataFrame(result_rows, columns=['Truth','PredProb'])

    # compare the average prediction probabilities
    fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
    sns.boxplot(x='Truth', y='PredProb', data=result, linewidth=1.5, saturation=1,
                showfliers=False, linecolor='dodgerblue', color='skyblue',
                order=['-', '+'], palette=['lightgray','skyblue'], ax=ax)
    # fix the jitter of the strip plots
    np.random.seed(0)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['dodgerblue'],
                  order=['+'], alpha=0.6, s=6, ax=ax)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['grey'],
                  order=['-'], alpha=0.6, s=6, ax=ax)
    ax.set_xlim(-1, 2); ax.set_ylabel('Prediction Probability'); ax.set_xlabel('Ground Truth')
    # restyle the first axes children by position (presumably the artists of
    # the '-' box -- verify against the installed seaborn version)
    ax.get_children()[0].set_hatch('//')
    ax.get_children()[0].set_edgecolor('grey')
    for idx in range(1, 6):
        ax.get_children()[idx].set_color('grey')

    # report statistics
    print('p-value for + vs. -:')
    print(ss.mannwhitneyu(result.loc[result['Truth'] == '+', 'PredProb'], result.loc[result['Truth'] == '-', 'PredProb']))
    print('average:')
    print(df_stat.mean(0))
    print('95% cis:')
    # use the actual number of splits instead of a hard-coded 10
    print(df_stat.std(0) / np.sqrt(n_runs) * 1.96)
    return df_stat

#### Fetal Donors...

In [None]:
# Fetal donors only: the same feature table supplies both the rows that are fit
# on and the rows that are scored, so each split keeps its own held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_suo2022_X.copy(), og_trb_suo2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal = interrogate_with_globals()

#### COVID-19 and Adult Healthy Donors...

In [None]:
# COVID-19 / healthy adult donors only: the same feature table supplies both the
# rows that are fit on and the rows that are scored, so each split keeps its own
# held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_su2022_X.copy(), og_trb_su2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_covid = interrogate_with_globals()

#### Pan-Cancer...

In [None]:
# Pan-cancer cohort only: the same feature table supplies both the rows that are
# fit on and the rows that are scored, so each split keeps its own held-out indices.
pred_on_all = False

# two independent copies of the table: X1 is fit on, X2 is predicted on
X1, X2 = og_trb_zheng2021_X.copy(), og_trb_zheng2021_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_tumor = interrogate_with_globals()

#### Fetal Donors... --> Pan-Cancer...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the pan-cancer table.
pred_on_all = True

# X1 is fit on (fetal), X2 is predicted on (pan-cancer)
X1, X2 = og_trb_suo2022_X.copy(), og_trb_zheng2021_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2tumor = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the
# COVID-19 / healthy adult table.
pred_on_all = True

# X1 is fit on (fetal), X2 is predicted on (COVID-19 / healthy adults)
X1, X2 = og_trb_suo2022_X.copy(), og_trb_su2022_X.copy()
# boolean labels: True where the row tag contains ':CD8'
y1, y2 = (pd.Series(tbl.index.str.contains(':CD8'), index=tbl.index) for tbl in (X1, X2))
# report row count, number of positives and positive fraction for each side
print(len(X1), y1.sum(), y1.mean())
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2covid = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# Cross-cohort transfer: fit on fetal donors, score every row of the two adult
# cohorts (COVID-19 / healthy donors and pan-cancer) stacked together.
pred_on_all = True

# training side (fetal donors); labels are True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# prediction side, part A: COVID-19 and healthy donors; part B: pan-cancer types
X2A, X2B = og_trb_su2022_X.copy(), og_trb_zheng2021_X.copy()
y2A = pd.Series(X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
y2B = pd.Series(X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack the two parts row-wise (pd.concat defaults to axis=0)
X2 = pd.concat([X2A, X2B])
y2 = pd.concat([y2A, y2B])
print(len(X2), y2.sum(), y2.mean())

# run the evaluation on the globals set above
df_stat_fetal2adult = interrogate_with_globals()

#### Fetal Donors... <-- Pan-Cancer...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the pan-cancer table and
# score every row of the fetal table. The roles are assigned directly here
# instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side (pan-cancer)
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_tumor2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the COVID-19 / healthy
# adult table and score every row of the fetal table. The roles are assigned
# directly here instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side (COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_covid2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# Cross-cohort transfer in the reverse direction: fit on the two adult cohorts
# stacked together and score every row of the fetal table. The roles are
# assigned directly here instead of being swapped afterwards.
pred_on_all = True

# prediction side (fetal donors); labels are True where the tag contains ':CD8'
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side, part A: COVID-19 and healthy donors; part B: pan-cancer types
X2A, X2B = og_trb_su2022_X.copy(), og_trb_zheng2021_X.copy()
y2A = pd.Series(X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
y2B = pd.Series(X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack the two parts row-wise (pd.concat defaults to axis=0)
X1 = pd.concat([X2A, X2B])
y1 = pd.concat([y2A, y2B])
print(len(X1), y1.sum(), y1.mean())

# run the evaluation on the globals set above
df_stat_adult2fetal = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... --> Pan-Cancer...

In [None]:
# define the data to train on (COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; the labels are indexed by X1 itself rather than by
# X2A, a global left over from another cell that is undefined on a fresh kernel
# unless that cell has been run first
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# define the data to predict on (pan-cancer types)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# score every row of X2 in each split
pred_on_all = True
# run the evaluation on the globals set above
df_stat_covid2tumor = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... <-- Pan-Cancer...

In [None]:
# first side of the comparison (COVID-19 and healthy donors); after the swap
# below this becomes the data that is predicted on
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; the labels are indexed by X1 itself rather than by
# X2A, a global left over from another cell that is undefined on a fresh kernel
# unless that cell has been run first
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# second side of the comparison (pan-cancer types); after the swap below this
# becomes the data that is trained on
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# reverse the comparison: train on pan-cancer, predict on COVID-19 / healthy
X1, y1, X2, y2 = X2, y2, X1, y1

# score every row of X2 in each split
pred_on_all = True
# run the evaluation on the globals set above
df_stat_tumor2covid = interrogate_with_globals()

In [None]:
import pickle as pkl
# Gather the per-comparison statistics frames under short keys and pickle them.
df_stats = dict(
    adult2fetal=df_stat_adult2fetal,
    covid2fetal=df_stat_covid2fetal,
    covid2tumor=df_stat_covid2tumor,
    covid=df_stat_covid,
    tumor2covid=df_stat_tumor2covid,
    tumor=df_stat_tumor,
    tumor2fetal=df_stat_tumor2fetal,
    fetal2adult=df_stat_fetal2adult,
    fetal2covid=df_stat_fetal2covid,
    fetal2tumor=df_stat_fetal2tumor,
    fetal=df_stat_fetal,
)
with open('../outs/250421_cd4vscd8_randomforest200.pkl', 'wb') as out_handle:
    pkl.dump(df_stats, out_handle)

### Modeling with Random Forest 500 Trees

In [None]:
# define a function to interrogate the data
def interrogate_with_globals():
    """Fit 500-tree random forests on (X1, y1) and evaluate them on (X2, y2).

    Reads the notebook globals ``X1``, ``y1`` (training features/labels),
    ``X2``, ``y2`` (evaluation features/labels) and ``pred_on_all``. Ten
    stratified shuffle splits with a 25% test fraction are drawn from
    ``X1``/``y1`` and one classifier is fit per split. When ``pred_on_all``
    is true every row of ``X2`` is scored; otherwise only the split's test
    positions are scored, which presumes ``X2`` shares the row order of
    ``X1`` (TODO confirm at the call sites).

    Side effects: draws the per-metric box plots, the ROC and
    precision-recall curves and the prediction-probability comparison,
    prints summary statistics, and reseeds NumPy's global RNG with 0.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``auroc`` and ``auprc``.
    """
    # create statistics tracking dataframe (one row per split)
    df_stat = pd.DataFrame(columns=['auroc','auprc'])
    # create tracking variables for downstream visualization and statistics
    # (probas_bin is collected but not consumed further inside this function)
    probas, probas_bin, truths = [], [], []
    fprs, tprs, pres, recs = [], [], [], []
    # train utilizing random forest models in a stratified shuffled manner
    skf = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=1/4)
    for idxs_train, idxs_test in skf.split(X1, y1):
        # instantiate the random forest model
        clf = RandomForestClassifier(random_state=0, n_estimators=500)
        # fit the random forest model using Dataset #1
        clf = clf.fit(X1.iloc[idxs_train], y1.iloc[idxs_train])

        # predict on Dataset #2 correcting to all indices if requested
        if pred_on_all:
            idxs_test = range(X2.shape[0])
        # derive the probabilities of the positive class (kept as a single column)
        proba = clf.predict_proba(X2.iloc[idxs_test])[:, clf.classes_ == 1]
        probas.append(pd.Series(proba[:, 0], index=X2.index[idxs_test]))
        # binarize into categorical predictions
        proba_bin = 1 * (proba >= 0.50)
        probas_bin.append(pd.Series(proba_bin[:, 0], index=X2.index[idxs_test]))
        # retrieve the associated ground truth
        truth = y2.iloc[idxs_test]
        truths.append(truth.copy())

        # compute subsequent AUROC and AUPRC related metrics
        fpr, tpr, _ = roc_curve(truth, proba)
        pre, rec, _ = precision_recall_curve(truth, proba)
        fprs.append(fpr); tprs.append(tpr); pres.append(pre); recs.append(rec)
        # save the relevant statistics
        df_stat.loc[df_stat.shape[0]] = auc(fpr, tpr), auc(rec, pre)

    # show the spread of each metric across splits and print mean and 1.96 * SEM
    for stat in df_stat.columns:
        fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
        sns.boxplot(y=df_stat[stat], linewidth=1.5, saturation=1, showfliers=False, linecolor='dodgerblue', color='skyblue')
        sns.stripplot(y=df_stat[stat], linewidth=1.5, s=6, alpha=0.5, color='skyblue', edgecolor='dodgerblue')
        ax.set_xlim(-0.75, 0.75); ax.set_ylabel(stat.upper())
        print(stat.upper(), df_stat[stat].mean(), df_stat[stat].std() / np.sqrt(df_stat.shape[0])*1.96)

    # plot the FPR, TPR (per-split curves dashed, interpolated mean curve solid)
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for fpr, tpr in zip(fprs, tprs):
        ax.plot(fpr, tpr, color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, fpr, tpr))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0, 1], color='lightgray', linestyle='dotted')
    ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')

    # plot the precision recall; arrays are reversed so recall increases for np.interp
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for pre, rec in zip(pres, recs):
        ax.plot(rec[::-1], pre[::-1], color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, rec[::-1], pre[::-1]))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0.5]*2, color='lightgray', linestyle='dotted')
    ax.set(xlabel='Recall', ylabel='Precision')

    # per split, average the predicted probability over true positives and true negatives
    result = pd.DataFrame(columns=['Truth','PredProb'])
    for proba, truth in zip(probas, truths):
        result.loc[result.shape[0]] = '+', proba[truth == 1].mean()
        result.loc[result.shape[0]] = '-', proba[truth == 0].mean()

    # compare the average prediction probabilities
    fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
    sns.boxplot(x='Truth', y='PredProb', data=result, linewidth=1.5, saturation=1,
                showfliers=False, linecolor='dodgerblue', color='skyblue',
                order=['-', '+'], palette=['lightgray','skyblue'])
    # fix the jitter of the strip plots
    np.random.seed(0)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['dodgerblue'], order=['+'], alpha=0.6, s=6)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['grey'], order=['-'], alpha=0.6, s=6)
    ax.set_xlim(-1, 2); ax.set_ylabel('Prediction Probability'); ax.set_xlabel('Ground Truth')
    # NOTE(review): restyles the first six artists on the axes, presumably the
    # '-' box and its lines; this depends on seaborn's artist order - confirm
    ax.get_children()[0].set_hatch('//')
    ax.get_children()[0].set_edgecolor('grey')
    for idx in range(1, 6):
        ax.get_children()[idx].set_color('grey')

    # report statistics
    print('p-value for + vs. -:')
    print(ss.mannwhitneyu(result.loc[result['Truth'] == '+', 'PredProb'], result.loc[result['Truth'] == '-', 'PredProb']))
    print('average:')
    print(df_stat.mean(0))
    print('95% cis:')
    # use the actual number of splits rather than a hard-coded 10
    print(df_stat.std(0) / np.sqrt(df_stat.shape[0]) * 1.96)
    return df_stat

#### Fetal Donors...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same fetal profiles
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_fetal = interrogate_with_globals()

#### COVID-19 and Adult Healthy Donors...

In [None]:
# training side: adult COVID-19 and healthy donor profiles, True where the tag contains ':CD8'
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same profiles
X2 = og_trb_su2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_covid = interrogate_with_globals()

#### Pan-Cancer...

In [None]:
# training side: pan-cancer profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same pan-cancer profiles
X2 = og_trb_zheng2021_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_tumor = interrogate_with_globals()

#### Fetal Donors... --> Pan-Cancer...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: pan-cancer profiles with the same labelling rule
X2 = og_trb_zheng2021_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2tumor = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: adult COVID-19 and healthy donor profiles, same labelling rule
X2 = og_trb_su2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2covid = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side, part A: adult COVID-19 and healthy donor profiles
X2A = og_trb_su2022_X.copy()
y2A = pd.Series(data=X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
# evaluation side, part B: pan-cancer profiles
X2B = og_trb_zheng2021_X.copy()
y2B = pd.Series(data=X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack both adult datasets row-wise into a single evaluation set
X2 = pd.concat([X2A, X2B])
y2 = pd.concat([y2A, y2B])
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2adult = interrogate_with_globals()

#### Fetal Donors... <-- Pan-Cancer...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side: pan-cancer profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_tumor2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side: adult COVID-19 and healthy donor profiles, True where the tag contains ':CD8'
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_covid2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side, part A: adult COVID-19 and healthy donor profiles
X2A = og_trb_su2022_X.copy()
y2A = pd.Series(data=X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
# training side, part B: pan-cancer profiles
X2B = og_trb_zheng2021_X.copy()
y2B = pd.Series(data=X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack both adult datasets row-wise into the training set
X1 = pd.concat([X2A, X2B])
y1 = pd.concat([y2A, y2B])
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_adult2fetal = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... --> Pan-Cancer...

In [None]:
# define the data to train on (adult COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; index by X1 itself so this cell does not rely on
# an X2A variable left behind by a different cell
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# define the data to predict on (pan-cancer)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# score every row of the prediction dataset in each split
pred_on_all = True
# train on X1 and evaluate on X2
df_stat_covid2tumor = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... <-- Pan-Cancer...

In [None]:
# load the adult COVID-19 and healthy donor data (becomes the prediction set after the swap below)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; index by X1 itself so this cell does not rely on
# an X2A variable left behind by a different cell
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# load the pan-cancer data (becomes the training set after the swap below)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# reverse the comparison: train on pan-cancer, predict on COVID-19 and healthy donors
X1, y1, X2, y2 = X2, y2, X1, y1

# score every row of the prediction dataset in each split
pred_on_all = True
# train on X1 and evaluate on X2
df_stat_tumor2covid = interrogate_with_globals()

In [None]:
import pickle as pkl
# gather each comparison's per-split statistics under its short key
# (insertion order is kept because downstream cells iterate over this dict)
df_stats = dict(
    adult2fetal=df_stat_adult2fetal,
    covid2fetal=df_stat_covid2fetal,
    covid2tumor=df_stat_covid2tumor,
    covid=df_stat_covid,
    tumor2covid=df_stat_tumor2covid,
    tumor=df_stat_tumor,
    tumor2fetal=df_stat_tumor2fetal,
    fetal2adult=df_stat_fetal2adult,
    fetal2covid=df_stat_fetal2covid,
    fetal2tumor=df_stat_fetal2tumor,
    fetal=df_stat_fetal,
)
# persist the collection so the summary figures can reload it
with open('../outs/250421_cd4vscd8_randomforest500.pkl', 'wb') as f:
    pkl.dump(df_stats, f)

### Modeling with LogisticRegression

In [None]:
from sklearn.linear_model import LogisticRegression
# define a function to interrogate the data
def interrogate_with_globals():
    """Fit default logistic regressions on (X1, y1) and evaluate them on (X2, y2).

    Reads the notebook globals ``X1``, ``y1`` (training features/labels),
    ``X2``, ``y2`` (evaluation features/labels) and ``pred_on_all``. Ten
    stratified shuffle splits with a 25% test fraction are drawn from
    ``X1``/``y1`` and one classifier is fit per split. When ``pred_on_all``
    is true every row of ``X2`` is scored; otherwise only the split's test
    positions are scored, which presumes ``X2`` shares the row order of
    ``X1`` (TODO confirm at the call sites).

    Side effects: draws the per-metric box plots, the ROC and
    precision-recall curves and the prediction-probability comparison,
    prints summary statistics, and reseeds NumPy's global RNG with 0.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns ``auroc`` and ``auprc``.
    """
    # create statistics tracking dataframe (one row per split)
    df_stat = pd.DataFrame(columns=['auroc','auprc'])
    # create tracking variables for downstream visualization and statistics
    # (probas_bin is collected but not consumed further inside this function)
    probas, probas_bin, truths = [], [], []
    fprs, tprs, pres, recs = [], [], [], []
    # train logistic regression models in a stratified shuffled manner
    skf = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=1/4)
    for idxs_train, idxs_test in skf.split(X1, y1):
        # instantiate the logistic regression model with its default settings
        clf = LogisticRegression()
        # fit the logistic regression model using Dataset #1
        clf = clf.fit(X1.iloc[idxs_train], y1.iloc[idxs_train])

        # predict on Dataset #2 correcting to all indices if requested
        if pred_on_all:
            idxs_test = range(X2.shape[0])
        # derive the probabilities of the positive class (kept as a single column)
        proba = clf.predict_proba(X2.iloc[idxs_test])[:, clf.classes_ == 1]
        probas.append(pd.Series(proba[:, 0], index=X2.index[idxs_test]))
        # binarize into categorical predictions
        proba_bin = 1 * (proba >= 0.50)
        probas_bin.append(pd.Series(proba_bin[:, 0], index=X2.index[idxs_test]))
        # retrieve the associated ground truth
        truth = y2.iloc[idxs_test]
        truths.append(truth.copy())

        # compute subsequent AUROC and AUPRC related metrics
        fpr, tpr, _ = roc_curve(truth, proba)
        pre, rec, _ = precision_recall_curve(truth, proba)
        fprs.append(fpr); tprs.append(tpr); pres.append(pre); recs.append(rec)
        # save the relevant statistics
        df_stat.loc[df_stat.shape[0]] = auc(fpr, tpr), auc(rec, pre)

    # show the spread of each metric across splits and print mean and 1.96 * SEM
    for stat in df_stat.columns:
        fig, ax = plt.subplots(figsize=[1, 4]); ax.grid(False)
        sns.boxplot(y=df_stat[stat], linewidth=1.5, saturation=1, showfliers=False, linecolor='dodgerblue', color='skyblue')
        sns.stripplot(y=df_stat[stat], linewidth=1.5, s=6, alpha=0.5, color='skyblue', edgecolor='dodgerblue')
        ax.set_xlim(-0.75, 0.75); ax.set_ylabel(stat.upper())
        print(stat.upper(), df_stat[stat].mean(), df_stat[stat].std() / np.sqrt(df_stat.shape[0])*1.96)

    # plot the FPR, TPR (per-split curves dashed, interpolated mean curve solid)
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for fpr, tpr in zip(fprs, tprs):
        ax.plot(fpr, tpr, color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, fpr, tpr))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0, 1], color='lightgray', linestyle='dotted')
    ax.set(xlabel='False Positive Rate', ylabel='True Positive Rate')

    # plot the precision recall; arrays are reversed so recall increases for np.interp
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    xl = np.arange(0, 1.01, 0.01); yls = []
    for pre, rec in zip(pres, recs):
        ax.plot(rec[::-1], pre[::-1], color='skyblue', linestyle='--', lw=1)
        yls.append(np.interp(xl, rec[::-1], pre[::-1]))
    yl = np.vstack(yls).mean(0)
    ax.plot(xl, yl, color='dodgerblue', lw=2)
    ax.plot([0, 1], [0.5]*2, color='lightgray', linestyle='dotted')
    ax.set(xlabel='Recall', ylabel='Precision')

    # per split, average the predicted probability over true positives and true negatives
    result = pd.DataFrame(columns=['Truth','PredProb'])
    for proba, truth in zip(probas, truths):
        result.loc[result.shape[0]] = '+', proba[truth == 1].mean()
        result.loc[result.shape[0]] = '-', proba[truth == 0].mean()

    # compare the average prediction probabilities
    fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
    sns.boxplot(x='Truth', y='PredProb', data=result, linewidth=1.5, saturation=1,
                showfliers=False, linecolor='dodgerblue', color='skyblue',
                order=['-', '+'], palette=['lightgray','skyblue'])
    # fix the jitter of the strip plots
    np.random.seed(0)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['dodgerblue'], order=['+'], alpha=0.6, s=6)
    sns.stripplot(x='Truth', y='PredProb', data=result, jitter=0.4, palette=['grey'], order=['-'], alpha=0.6, s=6)
    ax.set_xlim(-1, 2); ax.set_ylabel('Prediction Probability'); ax.set_xlabel('Ground Truth')
    # NOTE(review): restyles the first six artists on the axes, presumably the
    # '-' box and its lines; this depends on seaborn's artist order - confirm
    ax.get_children()[0].set_hatch('//')
    ax.get_children()[0].set_edgecolor('grey')
    for idx in range(1, 6):
        ax.get_children()[idx].set_color('grey')

    # report statistics
    print('p-value for + vs. -:')
    print(ss.mannwhitneyu(result.loc[result['Truth'] == '+', 'PredProb'], result.loc[result['Truth'] == '-', 'PredProb']))
    print('average:')
    print(df_stat.mean(0))
    print('95% cis:')
    # use the actual number of splits rather than a hard-coded 10
    print(df_stat.std(0) / np.sqrt(df_stat.shape[0]) * 1.96)
    return df_stat

#### Fetal Donors...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same fetal profiles
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_fetal = interrogate_with_globals()

#### COVID-19 and Adult Healthy Donors...

In [None]:
# training side: adult COVID-19 and healthy donor profiles, True where the tag contains ':CD8'
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same profiles
X2 = og_trb_su2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_covid = interrogate_with_globals()

#### Pan-Cancer...

In [None]:
# training side: pan-cancer profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: a separate copy of the same pan-cancer profiles
X2 = og_trb_zheng2021_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# within-dataset run: score only the held-out rows of each split
pred_on_all = False
df_stat_tumor = interrogate_with_globals()

#### Fetal Donors... --> Pan-Cancer...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: pan-cancer profiles with the same labelling rule
X2 = og_trb_zheng2021_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2tumor = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side: adult COVID-19 and healthy donor profiles, same labelling rule
X2 = og_trb_su2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2covid = interrogate_with_globals()

#### Fetal Donors... --> Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# training side: fetal profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_suo2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# evaluation side, part A: adult COVID-19 and healthy donor profiles
X2A = og_trb_su2022_X.copy()
y2A = pd.Series(data=X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
# evaluation side, part B: pan-cancer profiles
X2B = og_trb_zheng2021_X.copy()
y2B = pd.Series(data=X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack both adult datasets row-wise into a single evaluation set
X2 = pd.concat([X2A, X2B])
y2 = pd.concat([y2A, y2B])
print(len(X2), y2.sum(), y2.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_fetal2adult = interrogate_with_globals()

#### Fetal Donors... <-- Pan-Cancer...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side: pan-cancer profiles, labelled True where the tag contains ':CD8'
X1 = og_trb_zheng2021_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_tumor2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side: adult COVID-19 and healthy donor profiles, True where the tag contains ':CD8'
X1 = og_trb_su2022_X.copy()
y1 = pd.Series(data=X1.index.str.contains(':CD8'), index=X1.index)
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_covid2fetal = interrogate_with_globals()

#### Fetal Donors... <-- Adult COVID-19 and Healthy Donors... + Pan-Cancer...

In [None]:
# evaluation side: fetal profiles (its summary is printed first)
X2 = og_trb_suo2022_X.copy()
y2 = pd.Series(data=X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# training side, part A: adult COVID-19 and healthy donor profiles
X2A = og_trb_su2022_X.copy()
y2A = pd.Series(data=X2A.index.str.contains(':CD8'), index=X2A.index)
print(len(X2A), y2A.sum(), y2A.mean())
# training side, part B: pan-cancer profiles
X2B = og_trb_zheng2021_X.copy()
y2B = pd.Series(data=X2B.index.str.contains(':CD8'), index=X2B.index)
print(len(X2B), y2B.sum(), y2B.mean())
# stack both adult datasets row-wise into the training set
X1 = pd.concat([X2A, X2B])
y1 = pd.concat([y2A, y2B])
print(len(X1), y1.sum(), y1.mean())

# cross-dataset run: every split scores all rows of X2
pred_on_all = True
df_stat_adult2fetal = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... --> Pan-Cancer...

In [None]:
# define the data to train on (adult COVID-19 and healthy donors)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; index by X1 itself so this cell does not rely on
# an X2A variable left behind by a different cell
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# define the data to predict on (pan-cancer)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# score every row of the prediction dataset in each split
pred_on_all = True
# train on X1 and evaluate on X2
df_stat_covid2tumor = interrogate_with_globals()

#### Adult COVID-19 and Healthy Donors... <-- Pan-Cancer...

In [None]:
# load the adult COVID-19 and healthy donor data (becomes the prediction set after the swap below)
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells; index by X1 itself so this cell does not rely on
# an X2A variable left behind by a different cell
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
print(X1.shape[0], y1.sum(), y1.mean())

# load the pan-cancer data (becomes the training set after the swap below)
X2 = og_trb_zheng2021_X.copy()
# setup a mask for CD8+ cells
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# reverse the comparison: train on pan-cancer, predict on COVID-19 and healthy donors
X1, y1, X2, y2 = X2, y2, X1, y1

# score every row of the prediction dataset in each split
pred_on_all = True
# train on X1 and evaluate on X2
df_stat_tumor2covid = interrogate_with_globals()

In [None]:
import pickle as pkl
# gather each comparison's per-split statistics under its short key
# (insertion order is kept because downstream cells iterate over this dict)
df_stats = dict(
    adult2fetal=df_stat_adult2fetal,
    covid2fetal=df_stat_covid2fetal,
    covid2tumor=df_stat_covid2tumor,
    covid=df_stat_covid,
    tumor2covid=df_stat_tumor2covid,
    tumor=df_stat_tumor,
    tumor2fetal=df_stat_tumor2fetal,
    fetal2adult=df_stat_fetal2adult,
    fetal2covid=df_stat_fetal2covid,
    fetal2tumor=df_stat_fetal2tumor,
    fetal=df_stat_fetal,
)
# persist the collection so the summary figures can reload it
with open('../outs/250421_cd4vscd8_logisticregression.pkl', 'wb') as f:
    pkl.dump(df_stats, f)

### Integrate Data Together into a Single Visualization

In [None]:
# map each comparison key to the tick label shown on the summary plots;
# cross-dataset labels read "<training data> --> <evaluation data>"
x2l = {'adult2fetal':'Adult COVID-19+HD and Pan-Cancer --> Fetal',
       'covid2fetal':'Adult COVID-19+HD --> Fetal',
       # was a copy of the covid2fetal label, which made two ticks indistinguishable
       'covid2tumor':'Adult COVID-19+HD --> Adult Pan-Cancer',
       'covid':'Adult COVID-19+HD (w/ Self)',
       'tumor2covid':'Adult Pan-Cancer --> Adult COVID-19+HD',
       'tumor':'Adult Pan-Cancer (w/ Self)',
       'tumor2fetal':'Adult Pan-Cancer --> Fetal',
       'fetal2adult':'Fetal --> Adult COVID-19+HD and Pan-Cancer',
       'fetal2covid':'Fetal --> Adult COVID-19+HD',
       'fetal2tumor':'Fetal --> Adult Pan-Cancer',
       'fetal':'Fetal (w/ Self)'}

In [None]:
# define a plotting function
def visualize_on_globals():
    """Draw an ordered box/strip plot of one metric across all comparisons.

    Reads the notebook globals ``df_stats`` (comparison key -> per-split
    statistics frame), ``key`` (the column to plot), ``key_label`` (the y-axis
    label) and ``x2l`` (comparison key -> tick label). Comparisons are ordered
    by their mean value. Prints the Kruskal-Wallis test across comparisons.
    """
    # assemble the long-form plotting dataframe in one shot so 'y' is numeric
    # instead of growing an object-dtype frame row by row
    records = [(k, v) for k, vs in df_stats.items() for v in vs[key]]
    df_plot = pd.DataFrame(records, columns=['x','y'])
    df_plot['y'] = df_plot['y'].astype(float)

    # create the ordered box plots, drawing explicitly on the new axes
    fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
    order = df_plot.groupby('x')['y'].mean().sort_values().index
    sns.boxplot(x='x', y='y', data=df_plot, linewidth=1.5, linecolor='dodgerblue',
                color='skyblue', saturation=1, showfliers=False, order=order, ax=ax)
    sns.stripplot(x='x', y='y', data=df_plot, linewidth=1.5, edgecolor='dodgerblue',
                  color='skyblue', jitter=0.25, order=order, s=5, alpha=0.5, ax=ax)
    ax.tick_params(axis='x', labelrotation=90)
    # reference line at 0.5
    ax.axhline(0.5, color='grey', linestyle='--')
    # swap the short comparison keys for their display labels
    ax.set_xticklabels([x2l[x.get_text()] for x in ax.get_xticklabels()])
    ax.set(xlabel='Datasets Utilized', ylabel=key_label)
    # statistically test whether the comparisons share the same distribution
    print(ss.kruskal(*[df_plot.loc[df_plot['x'] == x, 'y'] for x in df_plot['x'].unique()]))

#### LR

In [None]:
# plot the AUROC column of the statistics currently held in df_stats
key = 'auroc'
key_label = 'AUROC'
visualize_on_globals()

In [None]:
# plot the AUPRC column of the statistics currently held in df_stats
key = 'auprc'
key_label = 'AUPRC'
visualize_on_globals()

#### RF 100

In [None]:
# read in the data: reload the saved randomforest100 statistics into df_stats
with open('../outs/250421_cd4vscd8_randomforest100.pkl', mode='rb') as f:
    df_stats = pkl.load(f)

In [None]:
# plot the AUROC column of the statistics currently held in df_stats
key = 'auroc'
key_label = 'AUROC'
visualize_on_globals()

In [None]:
# plot the AUPRC column of the statistics currently held in df_stats
key = 'auprc'
key_label = 'AUPRC'
visualize_on_globals()

#### RF 200

In [None]:
# read in the data: reload the saved randomforest200 statistics into df_stats
with open('../outs/250421_cd4vscd8_randomforest200.pkl', mode='rb') as f:
    df_stats = pkl.load(f)

In [None]:
# plot the AUROC column of the statistics currently held in df_stats
key = 'auroc'
key_label = 'AUROC'
visualize_on_globals()

In [None]:
# plot the AUPRC column of the statistics currently held in df_stats
key = 'auprc'
key_label = 'AUPRC'
visualize_on_globals()

#### RF 500

In [None]:
# read in the data: reload the saved randomforest500 statistics into df_stats
with open('../outs/250421_cd4vscd8_randomforest500.pkl', mode='rb') as f:
    df_stats = pkl.load(f)

In [None]:
# plot the AUROC column of the statistics currently held in df_stats
key = 'auroc'
key_label = 'AUROC'
visualize_on_globals()

In [None]:
# plot the AUPRC column of the statistics currently held in df_stats
key = 'auprc'
key_label = 'AUPRC'
visualize_on_globals()

#### Compilation into Circos Plot

In [None]:
# display labels per model key (mathtext subscripts for the forest sizes);
# this rebinds x2l, replacing any mapping previously stored under that name
x2l = {
    'lr': 'LR',
    'rf100': 'RF$_{100}$',
    'rf200': 'RF$_{200}$',
    'rf500': 'RF$_{500}$',
}

In [None]:
# one pickle of per-split statistics per model, keyed by the column name used downstream
stat_paths = {'lr':'../outs/250421_cd4vscd8_logisticregression.pkl',
              'rf100':'../outs/250421_cd4vscd8_randomforest100.pkl',
              'rf200':'../outs/250421_cd4vscd8_randomforest200.pkl',
              'rf500':'../outs/250421_cd4vscd8_randomforest500.pkl'}
# read each file and reduce every comparison to its mean AUROC across splits
mean_aurocs = {}
for model, path in stat_paths.items():
    # NOTE: pickle can execute arbitrary code on load; only read files produced by this project
    with open(path, 'rb') as f:
        df_stats = pkl.load(f)
    # build the series in one go with an explicit float dtype
    mean_aurocs[model] = pd.Series({k: vs['auroc'].mean() for k, vs in df_stats.items()}, dtype=float)
# keep the per-model series available under their original names
df_stat_lr = mean_aurocs['lr']
df_stat_rf100 = mean_aurocs['rf100']
df_stat_rf200 = mean_aurocs['rf200']
df_stat_rf500 = mean_aurocs['rf500']
# compile the data together (rows: comparisons, columns: models)
df_stat = pd.concat([df_stat_lr, df_stat_rf100, df_stat_rf200, df_stat_rf500], axis=1)
df_stat.columns = ['lr','rf100','rf200','rf500']

In [None]:
# map each comparison key to its [start, end] sector labels:
# first letter = dataset (a/c/t/f), 1 = link start, 2 = link end
y2ls = {'adult2fetal':['a1','f2'],
        'covid2fetal':['c1','f2'],
        'covid2tumor':['c1','t2'],
        'covid':['c1','c2'],
        'tumor2covid':['t1','c2'],
        'tumor':['t1','t2'],
        'tumor2fetal':['t1','f2'],
        'fetal2adult':['f1','a2'],
        'fetal2covid':['f1','c2'],
        'fetal2tumor':['f1','t2'],
        'fetal':['f1','f2']}
# create the columns directly from the string labels; pre-filling them with NaN
# and then writing strings into float columns is deprecated in recent pandas
df_stat['start'] = [y2ls[idx][0] for idx in df_stat.index]
df_stat['end'] = [y2ls[idx][1] for idx in df_stat.index]
# drop the comparisons that involve the combined adult dataset (a1 / a2);
# copy so later column assignments do not act on a view of the old frame
df_stat = df_stat.loc[~(df_stat[['start','end']].isin(['a1','a2'])).any(axis=1)].copy()

In [None]:
from pycirclize import Circos

# initialize Circos sectors; each sector has size 3 so that three links of
# width <= 1 (an AUROC) fit side by side
sectors = ['f1','c1','t1','t2','c2','f2']
circos = Circos({s:3 for s in sectors}, space=5)
# write sector names
for sector in circos.sectors:
    sector.text(sector.name, r=110, size=15)
# add links; k2b tracks the next free position inside every sector
k2b = {k:0 for k in sectors}
# link colour is determined by the end sector
e2c = {'f2':'skyblue','c2':'dodgerblue','t2':'navy'}
for start in ['f1','c1','t1']:
    for end in ['f2','c2','t2']:
        # retrieve the positions to start drawing from
        b1 = k2b[start]
        b2 = k2b[end]
        # retrieve the logistic-regression AUROC for this start/end pair; use .iloc[0]
        # because the index holds scenario names and `series[0]` would rely on the
        # deprecated positional fallback
        is_pair = (df_stat['start'] == start) & (df_stat['end'] == end)
        value = df_stat.loc[is_pair, 'lr'].iloc[0]
        circos.link((start, b1, b1+value), (end, b2, b2+value), direction=1, color=e2c[end], ec='k', lw=1.5)
        # update with the current positions
        k2b[start] += value
        k2b[end] += value

# save the figure
circos.savefig('07I3_circos.png', dpi=300)

In [None]:
# initialize Circos sectors; the link width is (AUROC - 0.5), i.e. at most 0.5,
# so a sector of size 1.5 holds three links
circos = Circos({s:1.5 for s in sectors}, space=5)
# write sector names
for sector in circos.sectors:
    sector.text(sector.name, r=110, size=15)
# add links; k2b tracks the next free position inside every sector
k2b = {k:0 for k in sectors}
# link colour is determined by the end sector
e2c = {'f2':'skyblue','c2':'dodgerblue','t2':'navy'}
for start in ['f1','c1','t1']:
    for end in ['f2','c2','t2']:
        # retrieve the positions to start drawing from
        b1 = k2b[start]
        b2 = k2b[end]
        # retrieve the logistic-regression AUROC for this pair and subtract 0.5;
        # .iloc[0] avoids the deprecated positional fallback of `series[0]` on a
        # string-labelled index
        is_pair = (df_stat['start'] == start) & (df_stat['end'] == end)
        value = df_stat.loc[is_pair, 'lr'].iloc[0] - 0.5
        circos.link((start, b1, b1+value), (end, b2, b2+value), direction=1, color=e2c[end], ec='k', lw=1.5)
        # update with the current positions
        k2b[start] += value
        k2b[end] += value

# save the figure
circos.savefig('07I3_circos_50.png', dpi=300)

In [None]:
# draw a legend-style Circos plot whose link widths are arbitrary placeholders
# initialize Circos sectors
sectors = ['f1','c1','t1','t2','c2','f2']
circos = Circos({s:1.5 for s in sectors}, space=5)
# write sector names
for sector in circos.sectors:
    sector.text(sector.name, r=110, size=15)
# add links; k2b tracks the next free position inside every sector
k2b = {k:0 for k in sectors}
# link colour is determined by the end sector
e2c = {'f2':'skyblue','c2':'dodgerblue','t2':'navy'}
# seed a local generator so that the saved legend is identical on every run
rng = np.random.default_rng(0)
for start in ['f1','c1','t1']:
    for end in ['f2','c2','t2']:
        # retrieve the positions to start drawing from
        b1 = k2b[start]
        b2 = k2b[end]
        # draw a placeholder width (three links of at most 0.5 fit the 1.5-wide sector)
        value = rng.choice([0, 0.25, 0.5])
        circos.link((start, b1, b1+value), (end, b2, b2+value), direction=1, color=e2c[end], ec='k', lw=1.5)
        # update with the current positions
        k2b[start] += value
        k2b[end] += value

# save the figure
circos.savefig('07I3_circos_legend.png', dpi=100)

### Identify CD4-like CD8+ T cells

In [None]:
from sklearn.linear_model import LogisticRegression
# define a function to interrogate the data
def interrogate_with_globals():
    """Fit logistic-regression classifiers on (X1, y1) and score them on (X2, y2).

    All inputs are read from the notebook globals rather than passed in:
    ``X1`` / ``y1`` (training features and labels), ``X2`` / ``y2`` (evaluation
    features and labels) and ``pred_on_all``. Ten stratified shuffle splits of
    (X1, y1) are drawn with a fixed random state; a fresh model is fitted on the
    training part of each split. When ``pred_on_all`` is truthy every model is
    evaluated on all rows of X2, otherwise on the rows of X2 at the split's test
    positions.

    Returns
    -------
    df_stat : pandas.DataFrame
        One row per split with the columns 'auroc', 'auprc', 'f1_score', 'balacc'.
    probas : list of pandas.Series
        Per split, the predicted probability of the class equal to 1, indexed by
        the labels of the evaluated X2 rows.
    """
    # per-split metric table
    df_stat = pd.DataFrame(columns=['auroc','auprc','f1_score','balacc'])
    # per-split accumulators; only `probas` is handed back to the caller
    probas, probas_bin, truths = [], [], []
    fprs, tprs, pres, recs = [], [], [], []
    # draw the stratified shuffle splits on the training data
    splitter = StratifiedShuffleSplit(n_splits=10, random_state=0, test_size=1/4)
    for idxs_train, idxs_test in splitter.split(X1, y1):
        # fit a fresh logistic regression on the training part of Dataset #1
        model = LogisticRegression().fit(X1.iloc[idxs_train], y1.iloc[idxs_train])

        # evaluate on every row of Dataset #2 if requested
        if pred_on_all:
            idxs_test = range(X2.shape[0])
        X_eval = X2.iloc[idxs_test]
        # keep only the probability column of the class equal to 1 (shape: n x 1)
        proba = model.predict_proba(X_eval)[:, model.classes_ == 1]
        probas.append(pd.Series(proba[:, 0], index=X_eval.index))
        # threshold at 0.50 to obtain hard 0/1 calls
        proba_bin = 1 * (proba >= 0.50)
        probas_bin.append(pd.Series(proba_bin[:, 0], index=X_eval.index))
        # matching ground truth for the evaluated rows
        truth = y2.iloc[idxs_test]
        truths.append(truth.copy())

        # ROC and precision-recall curves for this split
        fpr, tpr, _ = roc_curve(truth, proba)
        pre, rec, _ = precision_recall_curve(truth, proba)
        fprs.append(fpr)
        tprs.append(tpr)
        pres.append(pre)
        recs.append(rec)
        # append the summary metrics as the next row
        df_stat.loc[len(df_stat)] = (
            auc(fpr, tpr),
            auc(rec, pre),
            f1_score(truth, proba_bin, average='binary'),
            balanced_accuracy_score(truth, proba_bin),
        )

    return df_stat, probas

#### Derive CD4 vs. CD8 Scores on COVID-19 and Adult Healthy Donors...

In [None]:
# train and predict on the same COVID-19 / healthy-donor table
X1 = og_trb_su2022_X.copy()
# label a row as positive when its tag contains ':CD8'
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
# report: number of rows, number of CD8 rows, CD8 fraction
print(len(X1), y1.sum(), y1.mean())

# the evaluation data mirrors the training data
X2 = og_trb_su2022_X.copy()
y2 = pd.Series(X2.index.str.contains(':CD8'), index=X2.index)
print(len(X2), y2.sum(), y2.mean())

# score every row of X2 with each split's model, not only the held-out rows
pred_on_all = True
# run the split-wise training and prediction
df_stat_covid, probas_covid = interrogate_with_globals()

In [None]:
# average the per-split CD8 probabilities of every tag
proba_cd8 = pd.concat(probas_covid, axis=1).mean(axis=1)
fig, ax = plt.subplots(figsize=[4, 3])
ax.grid(False)
# one density per annotated cell type, selected through the tag suffix
for suffix, color, label in [('CD8', 'tab:green', 'CD8+'), ('CD4', 'tab:orange', 'CD4+')]:
    mask = proba_cd8.index.str.endswith(suffix)
    sns.kdeplot(proba_cd8[mask], color=color, lw=2, fill=True, alpha=0.8, bw_adjust=0.5, label=label)
ax.set(xlabel='Probability of CD8+')
# place the legend just outside the right edge of the axes
ax.legend(bbox_transform=ax.transAxes, bbox_to_anchor=(1.01, .5), frameon=False, loc='center left')

In [None]:
# retrieve clinical characteristics
# (note: `df_out` holds the *observations* table; the outcomes table is read into `df_obs` in a later cell)
df_out = pd.read_table('../../COVID_ISB_STORAGE/observations_2021-11-03_1042.tsv', index_col=0)
# split the CD8-annotated tags into low-confidence (<0.25) and high-confidence (>=0.75) groups
is_cd8_tag = proba_cd8.index.str.endswith('CD8')
samples_bad = proba_cd8[is_cd8_tag & (proba_cd8 < 0.25)]
samples_good = proba_cd8[is_cd8_tag & (proba_cd8 >= 0.75)]
# strip the trailing ':CD8' so that the index holds the sample id only
samples_bad.index = samples_bad.index.str.slice(0, -4)
samples_good.index = samples_good.index.str.slice(0, -4)
# extract the samples whose id ends in '-1'
s_bad = samples_bad[samples_bad.index.str.endswith('-1')]
s_good = samples_good[samples_good.index.str.endswith('-1')]

# get the tag, keeping only pairs that have at least min_cells cells
data = results_tcr['SU_CELL2022_COVID19'][['sample','TcellType','TRB']].astype(str).copy()
data['tag'] = data[['sample','TcellType']].astype(str).agg(':'.join, axis=1)
# keep only TRBs present in a_trb; filtering directly replaces the chained assignment
# `data['TRB'][mask] = np.nan` + dropna, which raises SettingWithCopyWarning and is a
# silent no-op under pandas copy-on-write
data = data.loc[data['TRB'].isin(a_trb.obs.index)].copy()
counts = data['tag'].value_counts(); tags = counts.index[counts >= min_cells]
# subset accordingly
counts = counts.loc[tags]
counts_cd8 = counts.loc[counts.index.str.endswith(':CD8')]
counts_cd4 = counts.loc[counts.index.str.endswith(':CD4')]

In [None]:
# relate the predicted CD8 probability of each CD8 tag to its (log10) cell count
purity = proba_cd8.loc[counts_cd8.index]
log_counts = np.log10(counts_cd8)
fig, ax = plt.subplots(figsize=[4, 4])
ax.grid(False)
ax.scatter(purity, log_counts, s=1, color='dodgerblue')
ax.set(xlabel='CD8 Purity Confidence\nPredicted Prob. of CD8+ as CD8',
       ylabel='log$_{10}$(Number of CD8+ T cells)')
# Pearson correlation between the two quantities (displayed as the cell output)
ss.pearsonr(purity, log_counts)

In [None]:
# load the outcomes table to look for clinical differences between the two groups
df_obs = pd.read_table('../../COVID_ISB_STORAGE/outcomes_2021-11-03_1042.tsv', index_col=0)
# keep the samples whose id ends in '-1'
s_bad = samples_bad[samples_bad.index.str.endswith('-1')]
s_good = samples_good[samples_good.index.str.endswith('-1')]
# drop the '-1' suffix and rebuild the zero-padded 'INCOV###' ids used to index the table
s_bad.index = 'INCOV' + s_bad.index.str[:-2].str.zfill(3)
s_good.index = 'INCOV' + s_good.index.str[:-2].str.zfill(3)
# group labels, purity scores and outcome values, all ordered bad-then-good
xs = ['bad'] * len(s_bad) + ['good'] * len(s_good)
xs_cont = pd.concat([s_bad, s_good], axis=0)
ys = df_obs.loc[[*s_bad.index, *s_good.index], 'days_in_hospital']
ys_aligned = ys.loc[xs_cont.index]
# save for baseline
xs1, xs_cont1, ys1 = xs, xs_cont, ys_aligned

# compare the days in hospital between the two groups
fig, ax = plt.subplots(figsize=[2, 4])
ax.grid(False)
sns.boxplot(x=xs, y=ys_aligned, linecolor='dodgerblue', color='skyblue', linewidth=1.5,
            saturation=1, order=['bad','good'])
ax.set_xlim(-0.75, 1.75)
# tick order follows `order`: 'bad' is the <25% group, 'good' the >=75% group
ax.set_xticklabels(['≥75%','<25%'][::-1])
ax.set(xlabel='CD8 Purity Conf.', ylabel='Days in the Hospital')
# two-sided rank test between the groups (displayed as the cell output)
is_bad = np.array(xs) == 'bad'
ss.mannwhitneyu(ys_aligned[is_bad].tolist(),
                ys_aligned[~is_bad].tolist())

In [None]:
# repeat with WOS based clinical grading, once per REDCap event (T1 and T2)
def _who_severity_by_purity(event_name, ylabel, **boxplot_kws):
    """Compare the WHO ordinal scale between low- and high-confidence CD8 samples.

    Reads `samples_bad`, `samples_good` and `df_out` from the notebook globals,
    draws a boxplot and prints a Mann-Whitney U test.

    Parameters
    ----------
    event_name : str
        Value of `incov_redcap_event_name` selecting the rows of `df_out`.
    ylabel : str
        Label of the y axis.
    **boxplot_kws
        Extra keyword arguments forwarded to `sns.boxplot`.

    Returns
    -------
    tuple
        (s_bad, s_good, xs, xs_cont, ys, fig, ax) where s_bad / s_good are the purity
        scores restricted to patients with a score for this event, xs the group labels,
        xs_cont the concatenated purity scores and ys the numeric WHO scale.
    """
    # keep the samples whose id ends in '-1' and rebuild the 'INCOV###' patient ids
    s_bad = samples_bad[samples_bad.index.str.endswith('-1')]
    s_good = samples_good[samples_good.index.str.endswith('-1')]
    s_bad.index = 'INCOV' + s_bad.index.str.slice(0, -2).str.zfill(3)
    s_good.index = 'INCOV' + s_good.index.str.slice(0, -2).str.zfill(3)

    # derive the relevant ys; the ambiguous grade '1 or 2' is scored as 1.5
    ys = df_out.loc[df_out['incov_redcap_event_name'] == event_name, 'who_ordinal_scale'].dropna()
    ys = ys.str.replace('1 or 2', '1.5').astype(float)
    # keep only the patients that have a score for this event
    s_bad = s_bad.loc[s_bad.index.isin(ys.index)]
    s_good = s_good.loc[s_good.index.isin(ys.index)]
    xs = ['bad']*len(s_bad)+['good']*len(s_good)
    xs_cont = pd.concat([s_bad, s_good], axis=0)
    ys_aligned = ys.loc[xs_cont.index]

    # compare the disease severity
    fig, ax = plt.subplots(figsize=[2, 4]); ax.grid(False)
    sns.boxplot(x=xs, y=ys_aligned, linecolor='dodgerblue', color='skyblue', linewidth=1.5,
                saturation=1, order=['bad','good'], ax=ax, **boxplot_kws)
    ax.set_xlim(-0.75, 1.75)
    # tick order follows `order`: 'bad' is the <25% group, 'good' the >=75% group
    ax.set_xticklabels(['≥75%','<25%'][::-1])
    ax.set(xlabel='CD8 Purity Conf.', ylabel=ylabel)
    is_bad = np.array(xs) == 'bad'
    print(ss.mannwhitneyu(ys_aligned[is_bad].tolist(), ys_aligned[~is_bad].tolist()))
    return s_bad, s_good, xs, xs_cont, ys, fig, ax

# T1
# NOTE(review): 'baseline_blood_dra_arm_1' looks truncated next to 'acute_blood_draw_arm_1';
# confirm that this is the exact event name stored in the observations table
s_bad, s_good, xs, xs_cont, ys, fig, ax = _who_severity_by_purity(
    'baseline_blood_dra_arm_1', 'COVID-19 Severity (T1)')

# T2
s_bad, s_good, xs, xs_cont, ys, fig, ax = _who_severity_by_purity(
    'acute_blood_draw_arm_1', 'COVID-19 Severity (T2)', medianprops=dict(color='blue'))
# keep the T2 groups under the names previously assigned by this cell
xs1, xs_cont1, ys1 = xs, xs_cont, ys.loc[xs_cont.index]

In [None]:
# compare against plasma protein profiles
df_pro = pd.read_excel('../../COVID_ISB_STORAGE/1-s2.0-S0092867422000721-mmc2.xlsx', sheet_name=1, index_col=1)
df_pro = df_pro.loc[~df_pro.index.isna()]
# harmonize the sample ids ('-T1' -> '-1')
df_pro.index = df_pro.index.str.replace('-T','-')
# derive a subset: samples ending in '-1', dropping the first five columns
df_pro_ = df_pro.loc[df_pro.index.str.endswith('-1')].iloc[:, 5:].copy().T
# average the columns that share the same prefix before the first '_'
df_pro_['gene'] = df_pro_.index.to_series().str.split('_', expand=True)[0]
df_pro_ = df_pro_.groupby('gene').mean()
# back to samples x genes, stripping the '-1' suffix from the sample ids
df_pro_ = df_pro_.T; df_pro_.index = df_pro_.index.str.slice(0, -2)
# derive the relevant xs
# NOTE(review): s_bad / s_good are inherited from the preceding WHO-severity cell, where they
# were last restricted to patients with an 'acute_blood_draw_arm_1' score — confirm this is intended
s_bad_, s_good_ = s_bad.loc[s_bad.index.isin(df_pro_.index)].copy(), s_good.loc[s_good.index.isin(df_pro_.index)].copy()
# aggregate the p-values and mean differences (bad - good), building the table once at the end
gene2stat = {}
for col in tqdm(df_pro_.columns):
    vals_bad = df_pro_.loc[s_bad_.index, col]
    vals_good = df_pro_.loc[s_good_.index, col]
    p = ss.mannwhitneyu(vals_bad, vals_good)[1]
    d = vals_bad.mean() - vals_good.mean()
    gene2stat[col] = (p, d)
df_stat = pd.DataFrame.from_dict(gene2stat, orient='index', columns=['pval','diff'])
# convert to fdrs; import the submodule explicitly so that `sm.stats.multitest` resolves
# without depending on the side effects of a shadowed `import statsmodels.api as sm`
import statsmodels.stats.multitest
import statsmodels as sm
df_stat['fdr'] = sm.stats.multitest.fdrcorrection(df_stat['pval'])[1]
# plot the useable scatter
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
ax.scatter(df_stat['diff'], -np.log10(df_stat['fdr']), s=1.5, color='dodgerblue')
ax.axvline(0, color='k'); ax.axhline(0, color='k')
ax.set(xlabel='Plasma Protein Abundance (T1)\nPredicted CD8+ <25% - ≥75%', ylabel='-log$_{10}$(False Discovery Rate)')
# save the data
df_stat.to_csv('../outs/250424_plasmaprotein.t1diff_cd8goodbad.csv')

In [None]:
# derive a subset (T2): samples ending in '-2', dropping the first five columns
df_pro_ = df_pro.loc[df_pro.index.str.endswith('-2')].iloc[:, 5:].copy().T
# average the columns that share the same prefix before the first '_'
df_pro_['gene'] = df_pro_.index.to_series().str.split('_', expand=True)[0]
df_pro_ = df_pro_.groupby('gene').mean()
# back to samples x genes, stripping the '-2' suffix from the sample ids
df_pro_ = df_pro_.T; df_pro_.index = df_pro_.index.str.slice(0, -2)
# derive the relevant xs
s_bad_, s_good_ = s_bad.loc[s_bad.index.isin(df_pro_.index)].copy(), s_good.loc[s_good.index.isin(df_pro_.index)].copy()
# aggregate the p-values and mean differences (bad - good), building the table once at the end
gene2stat = {}
for col in tqdm(df_pro_.columns):
    vals_bad = df_pro_.loc[s_bad_.index, col]
    vals_good = df_pro_.loc[s_good_.index, col]
    p = ss.mannwhitneyu(vals_bad, vals_good)[1]
    d = vals_bad.mean() - vals_good.mean()
    gene2stat[col] = (p, d)
df_stat = pd.DataFrame.from_dict(gene2stat, orient='index', columns=['pval','diff'])
# convert to fdrs; import the submodule explicitly so that `sm.stats.multitest` resolves
# without depending on the side effects of a shadowed `import statsmodels.api as sm`
import statsmodels.stats.multitest
import statsmodels as sm
df_stat['fdr'] = sm.stats.multitest.fdrcorrection(df_stat['pval'])[1]
# plot the useable scatter
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
ax.scatter(df_stat['diff'], -np.log10(df_stat['fdr']), s=1.5, color='dodgerblue')
ax.axvline(0, color='k'); ax.axhline(0, color='k')
ax.set(xlabel='Plasma Protein Abundance (T2)\nPredicted CD8+ <25% - ≥75%', ylabel='-log$_{10}$(False Discovery Rate)')
# save the data
df_stat.to_csv('../outs/250424_plasmaprotein.t2diff_cd8goodbad.csv')

#### Visualize Single Cell Scores

In [None]:
# define the data to train on
X1 = og_trb_su2022_X.copy()
# setup a mask for CD8+ cells
y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
# define the data to predict on: every unique TRB of the cells in `adata` that is present in a_trb
# NOTE(review): within this part of the notebook `adata` is first read in the *following* cell;
# confirm it is defined earlier, otherwise this cell fails on Restart & Run All
trbs = results_tcr['SU_CELL2022_COVID19'].loc[adata.obs.index, 'TRB'].dropna().unique()
trbs = a_trb.obs.index.intersection(trbs)
X2 = pd.DataFrame(a_trb[trbs].X.toarray(), index=trbs)
# individual TRBs carry no ground truth here, so y2 is a placeholder that only keeps the
# metric code inside interrogate_with_globals runnable; seed it so reruns are identical
# (the resulting df_stat_covid metrics are therefore not meaningful)
y2 = pd.Series(np.random.default_rng(0).integers(0, 2, size=X2.shape[0]), index=X2.index)
# define whether we are to predict on the complete data
pred_on_all = True
# perform predictions with all
df_stat_covid, probas_covid = interrogate_with_globals()

In [None]:
# grab the data
adata = sc.read_h5ad('../../COVID_ISB_STORAGE/upto_v16_P_GE_int_gex_cd8_t_cells.has_abtcr_chain.h5ad')
# derive the probabilities of being a CD8: average over splits per TRB, then map onto cells
proba_cd8_sctcr = pd.concat(probas_covid, axis=1).mean(1)
proba_cd8_sccell = results_tcr['SU_CELL2022_COVID19']['TRB'].map(proba_cd8_sctcr)
adata.obs['prob_cd8_from_tcr'] = proba_cd8_sccell
# z-score the probabilities
adata.obs['score'] = adata.obs['prob_cd8_from_tcr'].copy()
adata.obs['score'] -= adata.obs['score'].mean()
adata.obs['score'] /= adata.obs['score'].std()
# visualize the data; `errorbar` / `err_kws` replace the deprecated `ci`, `errcolor`
# and `errwidth` arguments of sns.barplot
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
sns.barplot(x='phenotype', y='score', data=adata.obs, color='skyblue',
            edgecolor='dodgerblue', linewidth=1.5, saturation=1, errorbar=('ci', 95), capsize=0.3,
            err_kws={'color': 'dodgerblue', 'linewidth': 1.5}, ax=ax)
ax.tick_params(axis='x', labelrotation=90); ax.set_xlim(-0.75, 5.75)
ax.set(xlabel='Phenotype', ylabel='Relative CD8 Purity Confidence')
ax.axhline(0, color='k')
# omnibus test across all phenotypes
phenotypes = adata.obs['phenotype'].unique()
print(ss.kruskal(*[adata.obs['score'][adata.obs['phenotype'] == pheno].dropna().tolist() for pheno in phenotypes]))
# print out the p-values of every phenotype against the cytotoxic reference
scores_cytotoxic = adata.obs.loc[adata.obs['phenotype'] == 'CD8_Cytotoxic', 'score'].dropna()
for pheno in phenotypes:
    if pheno == 'CD8_Cytotoxic': continue
    scores_pheno = adata.obs.loc[adata.obs['phenotype'] == pheno, 'score'].dropna()
    print(f'p vs. {pheno}', ss.mannwhitneyu(scores_cytotoxic, scores_pheno)[1])

In [None]:
# score with multiple objects
adata = sc.read_h5ad('../../COVID_ISB_STORAGE/upto_v16_P_GE_int_gex_cd8_t_cells.has_abtcr_chain.h5ad')
# attach the phenotype annotations to the TCR table (aligned on the cell index)
results_tcr['SU_CELL2022_COVID19']['phenotype_leiden'] = adata.obs[['phenotype','leiden']].astype(str).agg('_'.join, axis=1)
results_tcr['SU_CELL2022_COVID19']['phenotype'] = adata.obs['phenotype'].astype(str)

# define the data to predict on: every unique TRB of the cohort that is present in a_trb
trbs = results_tcr['SU_CELL2022_COVID19']['TRB'].dropna().unique()
trbs = a_trb.obs.index.intersection(trbs)
X2 = pd.DataFrame(a_trb[trbs].X.toarray(), index=trbs)
# individual TRBs carry no ground truth here, so y2 is a placeholder that only keeps the
# metric code inside interrogate_with_globals runnable; seed it so reruns are identical
y2 = pd.Series(np.random.default_rng(0).integers(0, 2, size=X2.shape[0]), index=X2.index)
print(X2.shape[0], y2.sum(), y2.mean())

# every model must score all rows of X2; set the flag here instead of inheriting it from an earlier cell
pred_on_all = True
# train on each cohort in turn (pan-cancer, covid+healthy, fetal) and predict on X2
probas_covid_from = {}
for source, X_train in [('pancan', og_trb_zheng2021_X),
                        ('covid', og_trb_su2022_X),
                        ('fetal', og_trb_suo2022_X)]:
    X1 = X_train.copy()
    y1 = pd.Series(X1.index.str.contains(':CD8'), index=X1.index)
    _, probas_covid_from[source] = interrogate_with_globals()
# expose the per-cohort probabilities under their original names
probas_covid_from_pancan = probas_covid_from['pancan']
probas_covid_from_covid = probas_covid_from['covid']
probas_covid_from_fetal = probas_covid_from['fetal']

In [None]:
# derive z-scored single cell scores once per training cohort (cancer, covid+hd, fetal)
source2probas = {
    'pancan': probas_covid_from_pancan,
    'covid': probas_covid_from_covid,
    'fetal': probas_covid_from_fetal,
}
source2scores = {}
for source, split_probas in source2probas.items():
    # average the per-split probabilities of every TRB
    t2s = pd.concat(split_probas, axis=1).mean(1)
    # attach the averaged probability to every cell with a TRB and a phenotype
    data = results_tcr['SU_CELL2022_COVID19'][['TRB','phenotype']].dropna()
    data['prob'] = data['TRB'].map(t2s)
    data = data.dropna()
    # center, then scale to unit standard deviation
    data['prob'] -= data['prob'].mean()
    data['prob'] /= data['prob'].std()
    source2scores[source] = data.copy()
probcd8_covid_from_pancan = source2scores['pancan']
probcd8_covid_from_covid = source2scores['covid']
probcd8_covid_from_fetal = source2scores['fetal']

In [None]:
from mpl_toolkits.mplot3d import Axes3D
# negated per-phenotype mean score for each training cohort; y and z follow the order of x
x = -probcd8_covid_from_pancan.groupby('phenotype')['prob'].mean()
y = -probcd8_covid_from_covid.groupby('phenotype')['prob'].mean().loc[x.index]
z = -probcd8_covid_from_fetal.groupby('phenotype')['prob'].mean().loc[x.index]
# create the figure and 3D axes
fig = plt.figure(figsize=(4, 4))
ax = fig.add_subplot(111, projection='3d')
# flag the phenotypes whose name contains 'Cytotoxic' ('r') versus all others ('b');
# the flags only select the groups, the plotted colours are set below
c = np.array(['r' if 'Cytotoxic' in idx else 'b' for idx in x.index])
is_cytotoxic = c == 'r'
# cytotoxic phenotypes as blue circles, the remaining ones as sky-blue squares
ax.scatter(x[is_cytotoxic], y[is_cytotoxic], z[is_cytotoxic], c='b', marker='o', edgecolor='k', s=1e2)
ax.scatter(x[~is_cytotoxic], y[~is_cytotoxic], z[~is_cytotoxic], c='skyblue', marker='s', edgecolor='k', s=1e2)
ax.set(xlabel='from Pancan', ylabel='from COVID', zlabel='from Fetal')