# PhageHostLearn - v2.2 (Klebsiella) - feature exploration

An AI-based phage-host interaction prediction framework with K-loci and receptor-binding proteins (RBPs) at its core. This particular version of PhageHostLearn targets phages of *Klebsiella pneumoniae*. This notebook is meant to be run after the PhageHostLearn_processing steps implemented in the accompanying Jupyter notebook.

In [None]:
# Load the three Valencia input tables from `valencia_dir`: the RBP
# embeddings, the loci embeddings and the phage-host interaction table.
# Only the interaction table uses its first CSV column as the row index.
RBP_embeddings = pd.read_csv(f'{valencia_dir}/RBP_embeddings.csv')
loci_embeddings = pd.read_csv(f'{valencia_dir}/loci_embeddings.csv')
interactions = pd.read_csv(
    f'{valencia_dir}/phage_host_interactionsValencia.csv',
    index_col=0,
)




#### TSNE

In [None]:
# Turn `frame` into plain NumPy feature matrices. The last four columns are
# excluded from every matrix (presumably metadata such as 'origin', 'row' and
# 'col', which later cells read from `frame` -- TODO confirm).
X_loci = np.asarray(frame.iloc[:, 0:1024])   # columns 0..1023
X_rbps = np.asarray(frame.iloc[:, 1024:-4])  # column 1024 up to the excluded tail
X = np.asarray(frame.iloc[:, :-4])           # both parts side by side

In [None]:
# colored by label
# Embed the full feature matrix X in 2-D with t-SNE and plot every row,
# using the marker for its origin (prophage '+', valencia '>') and the colour
# for its label (0 -> navy, 1 -> darkorange).
# NOTE: no random_state is passed to TSNE, so the layout changes between runs.
X_tsne = TSNE(n_components=2, learning_rate='auto', init='random').fit_transform(X)
plt.figure(figsize=(10,7))
colors = ["navy", "darkorange"]  # indexed by label value: colors[0], colors[1]

# Positional arrays, so the boolean masks below line up with the rows of
# X_tsne regardless of what index `frame` / `labels` carry.
origin = frame['origin'].to_numpy()
label_values = np.asarray(labels).ravel()

# One scatter call per (origin, label) group instead of one call per point:
# this creates four artists instead of N and lets each call carry its own
# legend entry. Groups are drawn in legend order.
groups = [
    ('prophage', 1, '+', 'pos+ prophage'),
    ('prophage', 0, '+', 'neg- prophage'),
    ('valencia', 1, '>', 'pos+ valencia'),
    ('valencia', 0, '>', 'neg- valencia'),
]
for source, target, marker, legend_label in groups:
    mask = (origin == source) & (label_values == target)
    plt.scatter(X_tsne[mask,0], X_tsne[mask,1], color=colors[target],
                marker=marker, label=legend_label)

plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("tSNE of dataset")
plt.xlabel('tSNE component 1')
plt.ylabel('tSNE component 2')
#plt.savefig(results_dir+'/features/all_embeddings_tSNE.png', dpi=400)

In [None]:
# colored by serotype
# 2-D t-SNE of the RBP feature columns only; points whose serotype is among
# the 20 most frequent in the serotype table get a rainbow colour, all other
# points are grey. Marker encodes origin (prophage '+', valencia '>').
from collections import Counter

X_tsne = TSNE(n_components=2, perplexity=61, learning_rate='auto', init='random').fit_transform(X_rbps)
seros = pd.read_csv(klebsiella_dir+'/klebsiella_genomes_031221_serotypes.csv')

# Count every serotype in a single pass (the previous list.count() per unique
# value was quadratic), then rank by (count, name) descending and keep 20.
sero_counter = Counter(seros['sero'])
unique_seros = list(sero_counter)
seros_count = list(sero_counter.values())
seros_top = [x for _,x in sorted(zip(seros_count,unique_seros), reverse=True)][:20]

plt.figure(figsize=(10,7))
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was removed in
# matplotlib 3.9.
sero_cmap = plt.get_cmap('rainbow')
crange = np.linspace(0.01, 0.99, len(seros_top))

# Positional origin per plotted row, and the serotype of each row looked up
# through frame['row'] (used as a label into the serotype table, as before).
origin = frame['origin'].to_numpy()
point_sero = seros['sero'].loc[frame['row'].to_numpy()].to_numpy()

# Boolean mask per top serotype, plus their union to find the "other" points.
sero_masks = {sero: point_sero == sero for sero in seros_top}
in_top = np.zeros(point_sero.shape, dtype=bool)
for sero_mask in sero_masks.values():
    in_top |= sero_mask

for source, marker in (('prophage', '+'), ('valencia', '>')):
    from_source = origin == source
    # Grey background points first so the coloured top serotypes stay visible.
    other = from_source & ~in_top
    plt.scatter(X_tsne[other,0], X_tsne[other,1], color='grey', marker=marker)
    for sero, shade in zip(seros_top, crange):
        mask = from_source & sero_masks[sero]
        if mask.any():
            plt.scatter(X_tsne[mask,0], X_tsne[mask,1], color=sero_cmap(shade), marker=marker)

plt.title("tSNE of dataset, colored by top20 serotypes")
plt.xlabel('tSNE component 1')
plt.ylabel('tSNE component 2')
plt.savefig(results_dir+'/features/rbp_embeddings_tSNE_serocolor.png', dpi=400)

In [None]:
# colored with Valencia RBPs
# 2-D PCA of the RBP feature columns. Prophage rows are drawn as grey '+';
# valencia rows are drawn as '>' with one rainbow colour per distinct value
# of frame['col'].
X_PCA = PCA(n_components=2).fit_transform(X_rbps)

# Positional arrays so the masks line up with the rows of X_PCA.
origin = frame['origin'].to_numpy()
rbp_ids = frame['col'].to_numpy()
is_valencia = origin == 'valencia'

# First-appearance order (instead of set order) keeps the colour assigned to
# each RBP stable from run to run.
unique_valencia_rbps = list(frame.loc[frame['origin']=='valencia', 'col'].unique())
# pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was removed in
# matplotlib 3.9.
rbp_cmap = plt.get_cmap('rainbow')
crange = np.linspace(0.01, 0.99, len(unique_valencia_rbps))

plt.figure(figsize=(10,7))
# Grey prophage background first, then one scatter call per Valencia RBP.
# The masks are derived from X_PCA's own rows, so this cell no longer needs
# X_tsne from the t-SNE cells.
is_prophage = origin == 'prophage'
plt.scatter(X_PCA[is_prophage,0], X_PCA[is_prophage,1], color='grey', marker='+')
for this_rbp, shade in zip(unique_valencia_rbps, crange):
    mask = is_valencia & (rbp_ids == this_rbp)
    plt.scatter(X_PCA[mask,0], X_PCA[mask,1], color=rbp_cmap(shade), marker='>')

# The projection is PCA, so title and axes are labelled accordingly.
plt.title("PCA of dataset, colored by Valencia RBP")
plt.xlabel('PCA component 1')
plt.ylabel('PCA component 2')
# Distinct file name: the previous commented-out path pointed at the serotype
# t-SNE figure and would have overwritten it when enabled.
#plt.savefig(results_dir+'/features/rbp_embeddings_PCA_valenciacolor.png', dpi=400)