In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

In [2]:
def cloud(references, tests, k, normalize=False):
    """
    Score every test sample by its mean distance to its k nearest reference samples.

    references: pd.DataFrame
        Should represent a distance matrix where rows and columns are indexed by reference sample ids.
        Only read when ``normalize`` is True.
    tests: pd.DataFrame
        Should represent a distance matrix where rows are indexed by reference sample ids and columns are
        indexed by test sample ids
    k: int
        The number of k nearest neighbors to look at
    normalize: bool
        When False (the default) the raw mean k-nearest distance of each test sample is returned.
        When True each of those values is divided by the average of the same statistic computed over
        the columns of ``references``.

    Returns
    -------
    np.ndarray
        One value per column of ``tests``, in column order.
    """
    def knn_mean(column):
        # mean of the k smallest distances found in one column
        return column.nsmallest(k).mean()

    # per-test-sample mean distance to the k closest reference rows
    t_di = tests.apply(knn_mean).values
    if not normalize:
        return t_di

    # the same statistic for every reference sample, averaged into one scale factor
    r_di = references.apply(knn_mean).values
    d_bar = r_di.mean()
    return t_di / d_bar

In [None]:
def get_ref_train_test(distances, samples, ref_lam, anom_lam, s_r=1.0, random_state=None):
    """
    Randomly split a distance matrix into a reference block, a training block and a test block.

    distances: pd.DataFrame
        Square distance matrix; rows and columns are indexed by sample id.
    samples: pd.DataFrame
        Sample metadata; expected to share its index with ``distances`` because the boolean masks
        built from it are used to select rows of ``distances``.
    ref_lam: callable
        Takes ``samples`` and returns a boolean mask selecting the candidate reference samples.
    anom_lam: callable
        Takes ``samples`` and returns a boolean mask selecting the pool that train and test
        samples are drawn from (reference samples are always excluded from that pool).
    s_r: float
        Fraction of the candidate reference samples to keep. Defaults to 1.0 (keep all of them)
        so that callers which do not subsample the reference set can omit it.
    random_state: optional
        Forwarded to both ``DataFrame.sample`` calls; pass a seed to make the split reproducible.

    Returns
    -------
    tuple of pd.DataFrame
        ``(reference_distances, train_set_distances, test_set_distances)``. The first is
        reference x reference; the other two have reference ids as rows and the train / test
        sample ids as columns.
    """
    # get reference set: draw a fraction of the candidate rows, then keep only the
    # columns belonging to the drawn samples so the block is reference x reference
    reference_distances = distances.loc[ref_lam(samples)]
    n_ref = int(reference_distances.shape[0] * s_r)
    reference_distances = reference_distances.sample(n_ref, random_state=random_state)
    ref_ids = reference_distances.index.to_list()
    reference_distances = reference_distances[ref_ids]

    # pool of non-reference samples, described by their distances to the references
    pool_mask = anom_lam(samples) & ~samples.index.isin(ref_ids)
    pool_distances = distances.loc[pool_mask, ref_ids]

    # get train set: a random 80% of the pool
    n_train = int(pool_distances.shape[0] * .8)
    train_set_distances = pool_distances.sample(n_train, random_state=random_state)
    train_set_ids = train_set_distances.index.to_list()

    # get test set: whatever is left of the pool
    test_set_distances = pool_distances.loc[~pool_distances.index.isin(train_set_ids)]

    return (reference_distances, train_set_distances.T, test_set_distances.T)

In [None]:
# Distance matrix keyed by sample id on both axes. Every 0 entry is blanked to
# NaN (presumably to hide each sample's distance to itself -- note this also
# blanks any genuine zero distance between two different samples).
distances = pd.read_csv(
    "artifact_110218_021021-144748/BIOM/110218/distance-matrix.tsv",
    sep="\t",
    index_col=0,
).replace(0, np.nan)
distance_ids = distances.index.to_list()

# Sample metadata, restricted to and ordered like the rows of the distance matrix.
samples = pd.read_csv(
    "artifact_110218_021021-144748/BIOM/110218/sample.tsv",
    sep="\t",
    index_col=0,
).loc[distance_ids]

# get ref, train, test for m3 vs f4

In [None]:
# ref_dist, train_dist, test_dist = get_ref_train_test(
#     distances,
#     samples,
#     lambda df: (df.host == "M3"),
#     lambda df: (df.host == "M3") | (df.host == "F4")
    
# )
# train_ids = train_dist.columns.to_list()
# test_ids = test_dist.columns.to_list()

In [None]:
# Grid of ROC curves for M3 vs F4. Each row uses a different fraction s of the
# M3 samples as the reference set; each column uses a different neighbourhood
# size k, expressed as a fraction of the reference-set size.
i_s, j_s = (6,9)
fig, axs = plt.subplots(i_s,j_s)
sizes = [i * 0.1 for i in range(1, i_s+1)]
ks = [j * 0.05 for j in range(1, j_s+1)]
for i in range(1, i_s+1):
    for j in range(1, j_s+1):
        # a new random reference / train / test split is drawn for every panel
        ref_dist, train_dist, test_dist = get_ref_train_test(
            distances,
            samples,
            lambda df: (df.host == "M3"),
            lambda df: (df.host == "M3") | (df.host == "F4"),
            sizes[i-1]
        )
        train_ids = train_dist.columns.to_list()
        test_ids = test_dist.columns.to_list()
        # cloud() averages the k smallest distances, so k must be at least 1;
        # small reference sets would otherwise truncate to k == 0
        k = max(1, int(ks[j-1] * ref_dist.shape[0]))

        # fit on the training split: single feature, label is "host is M3"
        x = np.array(cloud(ref_dist, train_dist, k))
        x = x.reshape(-1, 1)
        y = (samples.loc[train_ids, "host"] == "M3").to_numpy()

        mdl = LogisticRegression()
        mdl = mdl.fit(x,y)

        # evaluate on the test split
        x = np.array(cloud(ref_dist, test_dist, k))
        x = x.reshape(-1, 1)
        y = (samples.loc[test_ids, "host"] == "M3").to_numpy()
        probs = mdl.predict_proba(x)
        preds = probs[:,1]
        fpr, tpr, thres = metrics.roc_curve(y, preds)
        roc_auc = metrics.auc(fpr, tpr)

        panel = axs[i-1, j-1]
        panel.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
        panel.legend(loc = 'lower right')
        panel.plot([0, 1], [0, 1],'r--')
        panel.set_xlim([0, 1])
        # the original set the x limits twice and never the y limits
        panel.set_ylim([0, 1])

        # progress: reference-set size and grid position
        print(ref_dist.shape[0], i, j)

# NOTE: ref_dist, train_dist, test_dist, train_ids and test_ids are left holding
# the split of the last panel; any later cell reading them sees that split.

for i in range(i_s):
    axs[i, 0].set_ylabel("s = %0.2f" % sizes[i])

for j in range(j_s):
    axs[0, j].set_title("k = %0.2f" % ks[j])

In [None]:
 for ax in axs.flat:
    # label_outer() is applied to every panel of the grid so that axis labels
    # and tick labels are only kept on the outer edge of the figure
    ax.label_outer()

# enlarge the figure (size given in inches) before writing it to disk
fig.set_size_inches(20,20)
fig.savefig('no_mean.png', bbox_inches='tight')

# AUC for m3 vs f4

In [None]:
# Fit: one feature per training sample (its cloud() score with k = 2),
# label is True when the sample's host is M3.
x = np.asarray(cloud(ref_dist, train_dist, 2)).reshape(-1, 1)
y = (samples.loc[train_ids, "host"] == "M3").to_numpy().reshape(-1)

mdl = LogisticRegression().fit(x, y)

# Evaluate: same feature on the test split, ROC on the predicted P(host is M3).
x = np.asarray(cloud(ref_dist, test_dist, 2)).reshape(-1, 1)
y = (samples.loc[test_ids, "host"] == "M3").to_numpy().reshape(-1)
probs = mdl.predict_proba(x)
preds = probs[:, 1]
fpr, tpr, thres = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve in blue, chance diagonal in dashed red, both axes clipped to [0, 1].
plt.title('ROC m3 vs f4')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

# ref, train, test m3 gut vs f4 gut

In [None]:
# Reference candidates: M3 gut samples. Train/test pool: every gut sample.
# get_ref_train_test() takes the reference fraction s_r as its fifth argument;
# the original call left it out. 1.0 keeps every candidate reference sample.
ref_dist, train_dist, test_dist = get_ref_train_test(
    distances,
    samples,
    lambda df: (df.host == "M3") & (df.env_package == "human-gut"),
    lambda df: (df["env_package"] == "human-gut"),
    s_r=1.0
)
train_ids = train_dist.columns.to_list()
test_ids = test_dist.columns.to_list()

# AUC for m3 gut vs f4 gut

In [None]:
# Fit: one feature per training sample (its cloud() score with k = 8),
# label is True when the sample's host is M3.
x = np.asarray(cloud(ref_dist, train_dist, 8)).reshape(-1, 1)
y = (samples.loc[train_ids, "host"] == "M3").to_numpy().reshape(-1)

mdl = LogisticRegression().fit(x, y)

# Evaluate: same feature on the test split, ROC on the predicted P(host is M3).
x = np.asarray(cloud(ref_dist, test_dist, 8)).reshape(-1, 1)
y = (samples.loc[test_ids, "host"] == "M3").to_numpy().reshape(-1)
probs = mdl.predict_proba(x)
preds = probs[:, 1]
fpr, tpr, thres = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve in blue, chance diagonal in dashed red, both axes clipped to [0, 1].
plt.title('ROC m3 gut vs f4 gut')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

# ref, train, test m3 skin vs f4 skin

In [None]:
# Reference candidates: M3 skin samples. Train/test pool: every skin sample.
# get_ref_train_test() takes the reference fraction s_r as its fifth argument;
# the original call left it out. 1.0 keeps every candidate reference sample.
ref_dist, train_dist, test_dist = get_ref_train_test(
    distances,
    samples,
    lambda df: (df.host == "M3") & (df.env_package == "human-skin"),
    lambda df: (df["env_package"] == "human-skin"),
    s_r=1.0
)
train_ids = train_dist.columns.to_list()
test_ids = test_dist.columns.to_list()

In [None]:
# Fit: one feature per training sample (its cloud() score with k = 2),
# label is True when the sample's host is M3.
x = np.asarray(cloud(ref_dist, train_dist, 2)).reshape(-1, 1)
y = (samples.loc[train_ids, "host"] == "M3").to_numpy().reshape(-1)

mdl = LogisticRegression().fit(x, y)

# Evaluate: same feature on the test split, ROC on the predicted P(host is M3).
x = np.asarray(cloud(ref_dist, test_dist, 2)).reshape(-1, 1)
y = (samples.loc[test_ids, "host"] == "M3").to_numpy().reshape(-1)
probs = mdl.predict_proba(x)
preds = probs[:, 1]
fpr, tpr, thres = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve in blue, chance diagonal in dashed red, both axes clipped to [0, 1].
plt.title('ROC m3 skin vs f4 skin')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

# ref, train, test m3 oral vs f4 oral

In [None]:
# Reference candidates: M3 oral samples. Train/test pool: every oral sample.
# get_ref_train_test() takes the reference fraction s_r as its fifth argument;
# the original call left it out. 1.0 keeps every candidate reference sample.
ref_dist, train_dist, test_dist = get_ref_train_test(
    distances,
    samples,
    lambda df: (df.host == "M3") & (df.env_package == "human-oral"),
    lambda df: (df["env_package"] == "human-oral"),
    s_r=1.0
)
train_ids = train_dist.columns.to_list()
test_ids = test_dist.columns.to_list()

In [None]:
# Fit: one feature per training sample (its cloud() score with k = 61),
# label is True when the sample's host is M3.
x = np.asarray(cloud(ref_dist, train_dist, 61)).reshape(-1, 1)
y = (samples.loc[train_ids, "host"] == "M3").to_numpy().reshape(-1)

mdl = LogisticRegression().fit(x, y)

# Evaluate: same feature on the test split, ROC on the predicted P(host is M3).
x = np.asarray(cloud(ref_dist, test_dist, 61)).reshape(-1, 1)
y = (samples.loc[test_ids, "host"] == "M3").to_numpy().reshape(-1)
probs = mdl.predict_proba(x)
preds = probs[:, 1]
fpr, tpr, thres = metrics.roc_curve(y, preds)
roc_auc = metrics.auc(fpr, tpr)

# ROC curve in blue, chance diagonal in dashed red, both axes clipped to [0, 1].
plt.title('ROC m3 oral vs f4 oral')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.show()

In [None]:
# Search over the neighbourhood size k for the current split and remember the
# k with the highest AUC.
# NOTE(review): the AUC is measured on test_dist, so best_k is tuned on the
# test split and best_auc is an optimistic estimate.

# labels do not depend on k, so they are built once
train_labels = (samples.loc[train_ids, "host"] == "M3").to_numpy()
test_labels = (samples.loc[test_ids, "host"] == "M3").to_numpy()

# k counts nearest *reference* samples (rows of train_dist / test_dist), so the
# largest meaningful k is the number of reference samples, not of train samples
n_refs = ref_dist.shape[0]

best_auc, best_k = (0,0)
for k in range(1, n_refs + 1):
    x = np.array(cloud(ref_dist, train_dist, k))
    x = x.reshape(-1, 1)
    y = train_labels

    mdl = LogisticRegression()
    mdl = mdl.fit(x,y)

    x = np.array(cloud(ref_dist, test_dist, k))
    x = x.reshape(-1, 1)
    y = test_labels
    probs = mdl.predict_proba(x)
    preds = probs[:,1]
    fpr, tpr, thres = metrics.roc_curve(y, preds)
    roc_auc = metrics.auc(fpr, tpr)

    # strict improvement only; a tie with the current best is not "better"
    improved = roc_auc > best_auc
    if improved:
        best_auc, best_k = roc_auc, k
    print(roc_auc, "better!" if improved else "", k, n_refs)