In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.spatial import ConvexHull, convex_hull_plot_2d 

from scipy.stats import entropy
from Bio import SeqIO

## Prepare data:

In [2]:
# Natural sequences with taxonomy annotations.
# The first spreadsheet column is discarded right after reading; every other
# column is kept as-is.
merge_ds = pd.read_excel(
    './Taxonomy_and_convex_hull/data/20210908_natural_with_taxonomy.xls'
)
merge_ds = merge_ds.iloc[:, 1:]
merge_ds.head()

Unnamed: 0,seq,source,ShortName,DomainName,TranscriptID,Protein,DomainNo,ID,Species,Phylogeny,Sequences,Sequences_unaligned,MMD_z0,MMD_z1,MMD_z2,header_twist,norm_RE,orthologous_group,PhyGroup
0,0,jgi,Torde1,"TDEL_0F02380_domain_number[1],jgi",3708.0,Equilibrative nucleoside transporter protein,1,4950.0,Torulaspora delbrueckii,cellular organisms; Eukaryota; Opisthokonta; F...,VFLGVYKALYDYEPQTTEELAIREDELLYLLEKSDEWWTVKKDEPV...,TVFLGVYKALYDYEPQTTEELAIREDELLYLLEKSDVDEWWTVKKR...,-0.212743,0.560967,-0.190477,4837_jgi||3708||Equilibrative nucleoside trans...,0.148125,NOG257967_1,Ascomycota
1,1,jgi,Torde1,"TDEL_0F02380_domain_number[2],jgi",3708.0,Equilibrative nucleoside transporter protein,2,4950.0,Torulaspora delbrueckii,cellular organisms; Eukaryota; Opisthokonta; F...,PVINSVRALYDYDQNPDEELTFHENEEFDVYDDQDDWLLVQKTGAC...,APVINSVRALYDYDQAQNPDEELTFHENEEFDVYDDQDPDWLLVQQ...,-0.133021,1.238067,-0.2098,2997_jgi||3708||Equilibrative nucleoside trans...,0.076445,NOG257967_2,Ascomycota
2,2,jgi,Torde1,"TDEL_0F02380_domain_number[3],jgi",3708.0,Equilibrative nucleoside transporter protein,3,4950.0,Torulaspora delbrueckii,cellular organisms; Eukaryota; Opisthokonta; F...,KARKEGKVLYDFIAESQDELSVKQGQTVYILNDKKDWWMCELSGQK...,SKARKEGKVLYDFIAESQDELSVKQGQTVYILNDKKSRDWWMCELV...,-1.02586,0.236221,-0.425561,4599_jgi||3708||Equilibrative nucleoside trans...,-0.175593,NOG257967_3,Ascomycota
3,3,jgi,Torde1,"TDEL_0B05220_domain_number[1],jgi",1310.0,Cdc42-interacting protein CIP4,1,4950.0,Torulaspora delbrueckii,cellular organisms; Eukaryota; Opisthokonta; F...,TGSNKNKVLFAYSKQDADEVSISLRDLVALEAADGGWTKIKNTGET...,GTGSNKNKVLFAYSKQDADEVSISLRDLVALEAADTGSGWTKIKNL...,-0.778632,-0.1245,-0.325014,5418_jgi||1310||Cdc42-interacting protein CIP4...,0.11979,KOG3565_1,Ascomycota
4,4,jgi,Torde1,"TDEL_0B05220_domain_number[2],jgi",1310.0,Cdc42-interacting protein CIP4,2,4950.0,Torulaspora delbrueckii,cellular organisms; Eukaryota; Opisthokonta; F...,MPTRTMQAQYDYEAQGDDELSLTPNDVVNVIRGDDGWTYGE-NGEK...,NMPTRTMQAQYDYEAQGDDELSLTPNDVVNVIRGDDGSGWTYGELN...,-0.155759,-0.226678,-0.148836,4838_jgi||1310||Cdc42-interacting protein CIP4...,-0.255056,KOG3565_2,Ascomycota


In [3]:
# Join the two natural SH3 spreadsheets on their aligned-sequence columns
# (`Sequence_aligned` on the left, `Sequences` on the right).
# The merged frame should carry both the 'In Training Set' column and the
# 'MMD_z*' columns.
# NOTE(review): `nat_df` is not created by any cell visible above this one, and
# the recorded output of this cell is a NameError. It has to be loaded before
# this cell can run -- TODO confirm where it comes from.
# NOTE(review): 'Sequences_unaligned_x' is the name pandas gives the left-hand
# copy of a column present in both frames, so `nat_df` presumably also has a
# 'Sequences_unaligned' column -- verify.
full_nat_df = (
    nat_df
    .merge(merge_ds, left_on='Sequence_aligned', right_on='Sequences')
    .drop_duplicates(subset='Sequences_unaligned_x')
    .reset_index(drop=True)
)

# Keep only rows flagged as training-set members, one row per aligned sequence.
train_full_nat_df = (
    full_nat_df
    .loc[full_nat_df['In Training Set'] == 1]
    .drop_duplicates(subset='Sequence_aligned')
    .reset_index(drop=True)
)
train_full_nat_df.head()

NameError: name 'nat_df' is not defined

In [None]:
# Split the natural training sequences on the `norm_RE` column (r.e.) and pull
# out the three latent coordinates of each group as plain arrays.
is_functional = train_full_nat_df['norm_RE'] > 0.5
latent_columns = ['MMD_z0', 'MMD_z1', 'MMD_z2']

# === functional sequences: r.e. > 0.5 ===
func_train_full_nat_df = train_full_nat_df.loc[is_functional]
func_train_full_nat_z = func_train_full_nat_df.loc[:, latent_columns].values

# === nonfunctional sequences: every row that is NOT r.e. > 0.5 ===
# This is the complement of the mask above, so besides r.e. <= 0.5 it also
# keeps rows whose norm_RE is missing (NaN > 0.5 evaluates to False).
nonfunc_train_full_nat_df = train_full_nat_df.loc[~is_functional]
nonfunc_train_full_nat_z = nonfunc_train_full_nat_df.loc[:, latent_columns].values

func_train_full_nat_z.shape, nonfunc_train_full_nat_z.shape


## Plot Figure 3D:

Here, we plot all 5299 natural training sequences. The 162 functional sequences (r.e. > 0.5) are shown in pink, and the remaining 5137 embeddings are shown in blue.

In [None]:
# Figure-wide matplotlib text settings.
plt.rcParams.update({
    'font.size': 12,
    'xtick.labelsize': 12,
    'font.family': 'Arial',
})

# Marker / outline styling shared by both panels.
# NOTE(review): `dsize` and `bins` are not read anywhere in this cell; they are
# kept because other cells of the notebook may rely on them -- confirm before
# removing.
dsize = 1
size = 15
lw = .5
bins = np.linspace(-2.5, 2.5, 25)

fig, axs = plt.subplots(
    nrows=1, ncols=2, figsize=[6.0, 3], sharex="col", sharey="row", dpi=300,
)

# Left panel: latent column 0 on x; right panel: latent column 1 on x.
# Both panels put latent column 2 on y. In each panel the nonfunctional
# embeddings (blue) are drawn first and the functional ones (pink) on top.
for panel, x_col in zip(axs, (0, 1)):
    panel.scatter(
        nonfunc_train_full_nat_z[:, x_col], nonfunc_train_full_nat_z[:, 2],
        s=size, c='cornflowerblue', alpha=.5,
    )
    panel.scatter(
        func_train_full_nat_z[:, x_col], func_train_full_nat_z[:, 2],
        c='#f47c7c', linewidth=lw, s=size, alpha=.5,
    )

# Identical tick positions on both panels.
for panel in axs:
    panel.set_xticks([-2.5, 0, 2.5])
    panel.set_yticks([-2, 0, 2])

# The y-axis is shared across the row, so only the left panel gets a y label.
axs[0].set_xlabel('Dim 1')
axs[0].set_ylabel('Dim 3')
axs[1].set_xlabel('Dim 2')

plt.tight_layout(pad=.8)
# The target directory './figures' must already exist; savefig does not create it.
plt.savefig('./figures/Fig3D_natural_func_embeddings.svg', dpi=300)
plt.show()