## Randomize Sequence

In [12]:
import random
from collections import Counter
import pandas as pd
import numpy as np
from embedder import get_et5

In [4]:
def shuffle_by_nres(seq, zone_len=10, rng=None):
    """Shuffle a sequence locally, inside consecutive fixed-size windows.

    ``seq`` is cut into consecutive chunks of ``zone_len`` characters (the
    last chunk may be shorter) and the characters of each chunk are permuted
    independently, so every character stays inside its original window.
    A ``zone_len`` of at least ``len(seq)`` therefore shuffles the whole
    sequence, while ``zone_len=1`` returns it unchanged.

    Parameters
    ----------
    seq : str
        Sequence to shuffle.
    zone_len : int, default 10
        Window size, in characters. Must be >= 1.
    rng : random.Random, optional
        Source of randomness. When omitted, the module-level ``random``
        generator is used (seed it with ``random.seed`` for reproducibility).

    Returns
    -------
    str
        A string of the same length as ``seq`` with each window permuted.

    Raises
    ------
    ValueError
        If ``zone_len`` is smaller than 1. Previously a negative value
        silently produced an empty string.
    """
    if zone_len < 1:
        raise ValueError(f"zone_len must be a positive integer, got {zone_len!r}")

    shuffler = random if rng is None else rng

    pieces = []
    for start in range(0, len(seq), zone_len):
        window = list(seq[start:start + zone_len])
        shuffler.shuffle(window)  # in-place permutation of this window only
        pieces.append("".join(window))
    return "".join(pieces)

In [5]:
# Wild-type input sequences, written as one-letter residue strings.
# NOTE(review): `alfphasyn_seq` is presumably alpha-synuclein (the variable
# name and the CSV file name used further down suggest so) -- TODO confirm.
alfphasyn_seq = "MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKEGVVHGVATVAEKTKEQVTNVGGAVVTGVTAVAQKTVEGAGSIAAATGFVKKDQLGKNEEGAPQEGILEDMPVDPDNEAYEMPSEEGYQDYEPEA"

# Second sequence, stored under the 'mono' ids below; its identity is not
# stated anywhere in this notebook.
monomer_seq = 'MTKEQIQIIKDCVPILQKNGEDLTNEFYKIMFNDYPEVKPMFNMEKQISGEQPKALAMAILMAAKNIENLENMRSFVDKVAITHVNLGVKEEHYPIVGACLLKAIKNLLNPDEATLKAWEVAYGKIAKFYIDIEKKLYDK'

In [6]:
# Sequence id -> sequence. Starts with the two unshuffled (wild-type) entries;
# the shuffled variants are added to this same dict afterwards.
random_as_seqs = dict(as_wt=alfphasyn_seq, mono_wt=monomer_seq)

In [7]:
# Fixed seed so the shuffled variants can be regenerated exactly after a
# kernel restart (the shuffles draw from the global `random` generator).
SHUFFLE_SEED = 0
# Number of independent shuffles generated per window size and per sequence.
N_SHUFFLES_PER_ZONE = 1000
# Window sizes to shuffle within. The last value is far longer than either
# sequence, so it yields a single window, i.e. a whole-sequence shuffle; its
# digits end up in the ids and are renamed to 'full' when the table is built.
ZONE_LENGTHS = [2, 5, 10, 100, 1000_0000]

random.seed(SHUFFLE_SEED)
for s in ZONE_LENGTHS:
    for i in range(N_SHUFFLES_PER_ZONE):
        # Ids follow '<protein>_<zone length>_n_<replicate index>'.
        idx1 = f'as_{s}_n_{i}'
        random_as_seqs[idx1] = shuffle_by_nres(alfphasyn_seq, zone_len=s)
        idx2 = f'mono_{s}_n_{i}'
        random_as_seqs[idx2] = shuffle_by_nres(monomer_seq, zone_len=s)

In [8]:
# Tabulate the id -> sequence mapping, then rename the sentinel window size
# embedded in the ids (10000000) to the readable tag 'full'.
as_seqs = (
    pd.DataFrame(random_as_seqs.items(), columns=['sid', 'seq'])
    .assign(sid=lambda frame: frame['sid'].str.replace('10000000', 'full'))
)

In [53]:
#as_seqs.to_csv('tables/alphasynuclein_sfuffled_sequences.csv',index=False)

In [54]:
#as_seqs = pd.read_csv('tables/alphasynuclein_sfuffled_sequences.csv')

### Get Embeddings

In [15]:
# Embed every sequence in the table with the project helper `get_et5`.
# NOTE(review): presumably returns one per-residue embedding per input
# sequence, in input order -- TODO confirm against embedder.get_et5.
embeddings = get_et5(as_seqs.seq)

Starting at 09:57:01
.....................................................................................................100/140
....................................................................................................200/140
....................................................................................................300/140
....................................................................................................400/140
....................................................................................................500/140
....................................................................................................600/140
....................................................................................................700/140
....................................................................................................800/140
....................................................................................................900/140
......

In [20]:
import os
import time
from datetime import datetime

from bio_embeddings.embed import ProtTransT5XLU50Embedder
from bio_embeddings.project import umap_reduce
from bio_embeddings.visualize import render_scatter_plotly

# Directory holding the ProtT5-XL-U50 weights. The default is the original
# machine-specific location; set the ET5_WEIGHTS_DIR environment variable to
# run the notebook elsewhere without editing this cell.
ET5_WEIGHTS_DIR = os.environ.get(
    "ET5_WEIGHTS_DIR",
    "/data/franco/datasets/prot_embedding_weights/half_prottrans_t5_xl_u50/",
)

# Used below only for `reduce_per_protein` (per-residue -> per-protein vectors).
embedder = ProtTransT5XLU50Embedder(model_directory=ET5_WEIGHTS_DIR)

In [16]:
# Sanity check: 2 wild types + 5 zone lengths x 1000 shuffles x 2 sequences
# should give 10002 rows.
len(as_seqs)

10002

In [17]:
# Must equal len(as_seqs): the embeddings are attached to the table row by row.
len(embeddings)

10002

In [18]:
# Attach the embeddings as a new column, aligned positionally with the rows.
as_seqs = as_seqs.assign(embedding=embeddings)

In [21]:
# Collapse each embedding to a single per-protein vector with the embedder's
# own reduction, keeping the input order.
reduced_embeddings = list(map(embedder.reduce_per_protein, embeddings))

In [112]:
#import pickle
#with open('emb_10_1000_redeuced.pickle', 'wb')as f:
#    pickle.dump(reduced_embeddings, f)

In [22]:
# Build the plotting label as '<second id field><first id field>', e.g.
# 'as_2_n_0' -> '2as' and 'mono_wt' -> 'wtmono'. The id is split only once.
sid_fields = as_seqs['sid'].str.split('_')
as_seqs['label'] = sid_fields.str[1] + sid_fields.str[0]

In [23]:
# UMAP settings; `options` is reused by the later projection cell as well.
options = dict(
    min_dist=.1,
    spread=8,
    n_neighbors=50,
    metric='euclidean',
    n_components=2,
    random_state=10,
)

# Project the per-protein vectors to 2-D and pair each point with its label.
projected_embeddings = umap_reduce(reduced_embeddings, **options)
projected_embeddings_dataframe = pd.DataFrame(
    projected_embeddings, columns=["component_0", "component_1"]
).assign(label=list(as_seqs['label']))

  return 1.0 / (1.0 + a * x ** (2 * b))
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


UMAP(n_neighbors=50, random_state=10, spread=8, verbose=1)
Wed Oct 26 12:07:06 2022 Construct fuzzy simplicial set
Wed Oct 26 12:07:06 2022 Finding Nearest Neighbors
Wed Oct 26 12:07:06 2022 Building RP forest with 10 trees
Wed Oct 26 12:07:06 2022 NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	Stopping threshold met -- exiting after 3 iterations
Wed Oct 26 12:07:20 2022 Finished Nearest Neighbor Search
Wed Oct 26 12:07:23 2022 Construct embedding


Epochs completed: 100%| ███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ 200/200 [00:09]

Wed Oct 26 12:07:33 2022 Finished embedding





In [24]:
# Draw the 2-D projection as a Plotly scatter and show it via the iframe renderer.
# NOTE(review): by execution count (In [24]) this ran before the local
# render_scatter_plotly was defined (In [25]), so it presumably used the
# bio_embeddings.visualize version -- TODO confirm.
figure = render_scatter_plotly(projected_embeddings_dataframe)
figure.show(renderer='iframe')

In [26]:
# Same plot, re-rendered.
# NOTE(review): executed as In [26], i.e. presumably after the local
# render_scatter_plotly (In [25]) replaced the imported one -- TODO confirm.
figure = render_scatter_plotly(projected_embeddings_dataframe)
figure.show(renderer='iframe')

In [28]:
# Show the table (ids, sequences, embeddings, labels) before it is pickled.
as_seqs

Unnamed: 0,sid,seq,embedding,label
0,as_wt,MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKE...,"[[0.26118708, -0.047396153, -0.022661319, 0.01...",wtas
1,mono_wt,MTKEQIQIIKDCVPILQKNGEDLTNEFYKIMFNDYPEVKPMFNMEK...,"[[-0.07032994, -0.2915927, 0.28830814, 0.22719...",wtmono
2,as_2_n_0,DMFVKMLGSKKAGEVVAAAETKQKGVEAAAKGTKGEVLVYSGKTKE...,"[[0.10843041, -0.3450426, -0.21672495, 0.08743...",2as
3,mono_2_n_0,MTEKQIIQIKCDPVLIQKGNDETLENFYIKFMDNPYVEKPMFMNKE...,"[[-0.029350068, -0.171996, 0.26766634, 0.39051...",2mono
4,as_2_n_1,MDVFKMGLKSKAEGVVAAAEKTKQGVEAAAGKKTGEVLVYGSTKEK...,"[[0.13493991, 0.0050350963, -0.06172576, 0.019...",2as
...,...,...,...,...
9997,mono_full_n_997,DVKSAIIAPQFEDEEWMAIEYFMKIKRCGHNLKKEEYFMIEVINTT...,"[[-0.032008912, -0.30854803, -0.037422325, 0.3...",fullmono
9998,as_full_n_998,QETEPQKVKTKGEDTGEAEKEYGSVDLEGTEGEVVADASKVAKVLE...,"[[-0.027951447, -0.19980127, 0.039940525, 0.07...",fullas
9999,mono_full_n_998,EWKDKAYVKYHPENYLSGIKTQHKAIQFEKIQFNKAEMQGIGPLLK...,"[[-0.060568176, -0.25262168, 0.15308204, 0.051...",fullmono
10000,as_full_n_999,PEGNSVTQEKKTDGGYQQAVAAAAVGEHGETGLTGEDPVVEPVQAA...,"[[0.05588614, -0.09958536, 0.063227266, 0.1714...",fullas


In [27]:
import pickle

In [30]:
# Persist the whole table, including the 'embedding' column, to a pickle file
# in the current working directory.
# NOTE(review): pickle files should only be loaded back from trusted sources.
with open('random_as_mono.pickle', 'wb') as pf:
    pickle.dump(as_seqs, pf)

In [131]:
# NOTE(review): the execution count (In [131]) is out of sequence with the
# neighbouring cells and the code repeats the earlier plotting cells; this
# looks like a stale re-run -- consider removing it.
figure = render_scatter_plotly(projected_embeddings_dataframe)
figure.show(renderer='iframe')

In [73]:
# NOTE(review): the execution count (In [73]) is out of sequence with the
# neighbouring cells and the code repeats the earlier plotting cells; this
# looks like a stale re-run -- consider removing it.
figure = render_scatter_plotly(projected_embeddings_dataframe)
figure.show(renderer='iframe')

In [25]:
import plotly
import plotly.express as px

# Discrete colour sequence for the scatter plots, one entry per label.
# Kept as (r, g, b) triples and formatted into Plotly 'rgb(r,g,b)' strings.
_COLOR_TRIPLES = (
    (228, 26, 28),    # red
    (55, 126, 184),   # blue
    (152, 78, 163),   # purple
    (77, 175, 74),    # green
    (255, 127, 0),    # orange
    (255, 255, 51),   # yellow
    (166, 86, 40),    # brown
    (247, 129, 191),  # pink
    (153, 153, 153),  # grey
)
colores = [f'rgb({r},{g},{b})' for r, g, b in _COLOR_TRIPLES]

def render_scatter_plotly(embeddings_dataframe):
    """Build a 2-D Plotly scatter of projected embeddings.

    Note: this definition shadows the ``render_scatter_plotly`` imported from
    ``bio_embeddings.visualize`` earlier in the notebook.

    Parameters
    ----------
    embeddings_dataframe : pandas.DataFrame
        Must contain the columns ``component_0`` and ``component_1``. If a
        ``label`` column is present, points are coloured and given marker
        symbols per label (colours taken from the module-level ``colores``).
        The frame's index is used as the hover title.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure, with transparent backgrounds and the axis tick labels,
        spikes and titles hidden.
    """
    scatter_kwargs = dict(
        x='component_0',
        y='component_1',
        hover_name=embeddings_dataframe.index,
    )
    if 'label' in embeddings_dataframe.columns:
        scatter_kwargs.update(
            color='label',
            symbol='label',
            hover_data=["label"],
            opacity=.8,
            color_discrete_sequence=colores,
        )
    fig = px.scatter(embeddings_dataframe, **scatter_kwargs)

    # Remove axes ticks and labels as they are usually not informative.
    # These options must sit on layout.xaxis / layout.yaxis: the previous
    # version nested them under layout.scene, which only affects 3-D
    # subplots, so the 2-D axes were never actually hidden.
    hidden_axis = dict(showticklabels=False, showspikes=False, title="")
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=hidden_axis,
        yaxis=hidden_axis,
    )

    return fig


In [136]:
# Split the table by whether the id contains 'mono'; the mask is computed once.
is_mono = as_seqs['sid'].str.contains('mono')
as_df = as_seqs[~is_mono]
mono_df = as_seqs[is_mono]

In [143]:
# Repeat the reduction and UMAP projection on the non-'mono' rows only,
# reusing the same `options` as the full projection.
reduced_embeddings = list(map(embedder.reduce_per_protein, as_df['embedding']))

projected_embeddings = umap_reduce(reduced_embeddings, **options)
# Strip the 'as' tag from the labels (e.g. '2as' -> '2') before attaching them.
projected_embeddings_dataframe = pd.DataFrame(
    projected_embeddings, columns=["component_0", "component_1"]
).assign(label=list(as_df['label'].str.replace('as', '')))


divide by zero encountered in power



UMAP(n_neighbors=50, random_state=10, spread=8, verbose=1)
Wed Oct 26 08:01:19 2022 Construct fuzzy simplicial set
Wed Oct 26 08:01:19 2022 Finding Nearest Neighbors
Wed Oct 26 08:01:19 2022 Building RP forest with 9 trees
Wed Oct 26 08:01:19 2022 NN descent for 12 iterations
	 1  /  12
	 2  /  12
	 3  /  12
	Stopping threshold met -- exiting after 3 iterations
Wed Oct 26 08:01:20 2022 Finished Nearest Neighbor Search
Wed Oct 26 08:01:20 2022 Construct embedding


Epochs completed: 100%| ███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████ 500/500 [00:11]


Wed Oct 26 08:01:31 2022 Finished embedding


In [148]:
# Plot the projection rebuilt in the previous cell from the `as_df` rows only.
figure = render_scatter_plotly(projected_embeddings_dataframe)
figure.show(renderer='iframe')