In [2]:
## Verify that pytorch-geometric is correctly installed
import torch_geometric
import torch_sparse
from torch_geometric.nn import MessagePassing

In [3]:
import esm

import pickle
import numpy as np
import importlib
import pandas as pd
import time


In [4]:
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
# to get rid of random dropout
model= model.eval()

  "Regression weights not found, predicting contacts will not produce correct results."


In [5]:
fpath = '/n/groups/marks/users/david/esm_if/data/bio_all_rm_non_chain.cif' # .pdb format is also acceptable
coords, seqs = esm.inverse_folding.multichain_util.load_complex_coords(fpath, ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'])

start_t = time.time() # in seconds
ll_fullseq, ll_withcoord = esm.inverse_folding.multichain_util.score_sequence_in_complex(
    model, 
    alphabet,
    coords,
    'C',
    seqs['C']
)
end_t = time.time()

print(f'average log-likelihood on entire sequence: {ll_fullseq:.2f} (perplexity {np.exp(-ll_fullseq):.2f})')
print(f'average log-likelihood excluding missing coordinates: {ll_withcoord:.2f} (perplexity {np.exp(-ll_withcoord):.2f})')
time_taken = end_t - start_t
print('took {} seconds'.format(time_taken))
print('9000 variants will take {} hours'.format(9000*time_taken/60/60))



average log-likelihood on entire sequence: -1.23 (perplexity 3.42)
average log-likelihood excluding missing coordinates: -1.23 (perplexity 3.42)
took 9.082840204238892 seconds
9000 variants will take 22.70710051059723 hours


In [6]:
# setting device on GPU if available, else CPU
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

Using device: cuda



In [20]:
df_test = pd.read_csv('/n/groups/marks/users/david/esm_if/data/seq_to_score/'+ 'df_mut_chris_no_stop.csv')
df_test = df_test[:2]
df_test.to_csv('/n/groups/marks/users/david/esm_if/data/seq_to_score/df_test.csv')

In [21]:
df_test = pd.read_csv('/n/groups/marks/users/david/esm_if/data/seq_to_score/df_test.csv')
df_test

Unnamed: 0.1,Unnamed: 0,seq,conservation,inter,intra,fit,muts,full_mut,stop,muts_m1,mut_seq_chC
0,0,ALAL,-237.603375,-31.439403,-80.617306,-0.001478,ALAL,L58A:W59L:D60A:K63L,False,L59A:W60L:D61A:K64L,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
1,1,ALAE,-236.936851,-31.01177,-80.474616,0.015735,ALAE,L58A:W59L:D60A:K63E,False,L59A:W60L:D61A:K64E,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...


# to test the whole thing

In [9]:
df_to_score[:8]

Unnamed: 0,full_mut,E3,V5L,A66F,D55E,V75C,N99V,R100W,E87M,A66I,V5L/D55E,V5L/A66F,E2,stop,muts_m1,mut_seq_chC
0,D60A:K63A:E79A,0.658741,0.924719,0.92697,0.877962,0.783168,0.747125,1.048427,0.967851,0.930315,1.012619,0.955635,0.2165,False,D61A:K64A:E80A,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
1,D60A:K63A:E79C,0.532164,0.86122,0.938649,0.817779,0.719811,0.681587,0.930138,0.953372,0.891346,0.990992,0.921711,0.178514,False,D61A:K64A:E80C,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
2,D60A:K63A:E79D,0.458571,0.895409,0.931136,0.797239,0.559013,0.565601,0.984797,0.948527,0.915542,0.980943,0.992542,-0.037274,False,D61A:K64A:E80D,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
3,D60A:K63A:E79E,0.850977,0.955051,0.987775,0.961778,0.951983,0.890569,0.943179,0.961443,0.990926,0.988444,1.043746,0.105763,False,D61A:K64A:E80E,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
4,D60A:K63A:E79F,0.609473,0.89723,0.942582,0.84182,0.743454,0.739003,0.941937,0.939176,0.910273,1.019961,0.941611,0.140926,False,D61A:K64A:E80F,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
5,D60A:K63A:E79G,0.139403,0.68893,0.634135,0.41026,0.200918,0.131363,0.963007,0.733209,0.632592,0.923859,0.930863,0.068116,False,D61A:K64A:E80G,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
6,D60A:K63A:E79H,0.585224,0.883803,0.931893,0.89058,0.692569,0.695369,0.936409,0.947056,0.888949,0.997133,0.94896,0.211568,False,D61A:K64A:E80H,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...
7,D60A:K63A:E79I,0.785455,0.929819,0.956014,0.889507,0.870191,0.864626,0.953175,0.96864,0.954402,0.988256,0.98069,0.261156,False,D61A:K64A:E80I,ANVEKMSVAVTPQQAAVMREAVEAGEYATASEIVREAVRDWLAKRE...


In [11]:
# this isn't writing the output to file
import esm

import pickle
import numpy as np
import importlib
import pandas as pd
import time

import torch

models_dir = 'models'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = float(time.time())
print(device)


# batch score
datapath = '/n/groups/marks/users/david/esm_if/data/seq_to_score/'
#print(sys.argv)
#f_name = sys.argv[1]
f_name = 'df_mut_all_no_stop.csv'

pout = datapath + f_name.rstrip('.csv') + '_scores_test_ipy2.csv'
print('writing to {}'.format(pout))
df_to_score = pd.read_csv(datapath+ f_name)

df_to_score = df_to_score[:10]

print('loading model in')
# load model
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
# to get rid of random dropout
model= model.eval()

print('reading structure in')
# read structure in 
cifpath = '/n/groups/marks/users/david/esm_if/data/bio_all_rm_non_chain.cif' # .pdb format is also acceptable
coords, seqs = esm.inverse_folding.multichain_util.load_complex_coords(
    cifpath, 
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
)

with open(pout, 'w') as fout:
    print('starting to score')
    for n,r in df_to_score.iterrows():
        print('scoring one...')
        mut_str = r.muts_m1
        seq_to_score = r.mut_seq_chC
        
        start = time.time()
        print('start scoring')
        ll_fullseq, ll_withcoord = esm.inverse_folding.multichain_util.score_sequence_in_complex(
            model, 
            alphabet,
            coords,
            'C',
            seq_to_score
        )
        write_line = ','.join([mut_str, seq_to_score, str(ll_fullseq), str(ll_withcoord)]) + '\n'
        #print(write_line)
        fout.write(write_line)
        end = time.time()
        it_time = end- start
        print('one it of scoring took {} seconds, total hrs expected to complete:{}'.format(it_time, it_time * len(df_to_score)/60/60))
        

cuda
writing to /n/groups/marks/users/david/esm_if/data/seq_to_score/df_mut_all_no_stop_scores_test_ipy2.csv
loading model in
reading structure in
starting to score
scoring one...
start scoring
one it of scoring took 11.456486940383911 seconds, total hrs expected to complete:0.03182357483439975
scoring one...
start scoring
one it of scoring took 11.801580429077148 seconds, total hrs expected to complete:0.03278216785854764
scoring one...
start scoring
one it of scoring took 11.720196008682251 seconds, total hrs expected to complete:0.032556100024117365
scoring one...
start scoring
one it of scoring took 11.749638557434082 seconds, total hrs expected to complete:0.03263788488176134
scoring one...
start scoring
one it of scoring took 11.826601266860962 seconds, total hrs expected to complete:0.03285167018572489
scoring one...
start scoring
one it of scoring took 11.576789379119873 seconds, total hrs expected to complete:0.03215774827533298
scoring one...
start scoring
one it of scoring t

OSError: [Errno 116] Stale file handle

In [13]:
# open the file and keep open

import esm

import pickle
import numpy as np
import importlib
import pandas as pd
import time

import torch

models_dir = 'models'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = float(time.time())
print(device)


# batch score
datapath = '/n/groups/marks/users/david/esm_if/data/seq_to_score/'
#print(sys.argv)
#f_name = sys.argv[1]
f_name = 'df_mut_all_no_stop.csv'

pout = datapath + f_name.rstrip('.csv') + '_scores_test_ipy3.csv'
print('writing to {}'.format(pout))
df_to_score = pd.read_csv(datapath+ f_name)

df_to_score = df_to_score[:10]

print('loading model in')
# load model
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
# to get rid of random dropout
model= model.eval()

print('reading structure in')
# read structure in 
cifpath = '/n/groups/marks/users/david/esm_if/data/bio_all_rm_non_chain.cif' # .pdb format is also acceptable
coords, seqs = esm.inverse_folding.multichain_util.load_complex_coords(
    cifpath, 
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
)

list_write = []
print('starting to score')
for n,r in df_to_score.iterrows():
    print('scoring one...')
    mut_str = r.muts_m1
    seq_to_score = r.mut_seq_chC

    start = time.time()
    print('start scoring')
    ll_fullseq, ll_withcoord = esm.inverse_folding.multichain_util.score_sequence_in_complex(
        model, 
        alphabet,
        coords,
        'C',
        seq_to_score
    )
    write_line = ','.join([mut_str, seq_to_score, str(ll_fullseq), str(ll_withcoord)]) #+ '\n'
    list_write.append(write_line)
    end = time.time()
    it_time = end- start
    print('one it of scoring took {} seconds, total hrs expected to complete:{}'.format(it_time, it_time * len(df_to_score)/60/60))
    fout = open(pout, 'w')
    fout.write('\n'.join(list_write))
    fout.close()

cuda
writing to /n/groups/marks/users/david/esm_if/data/seq_to_score/df_mut_all_no_stop_scores_test_ipy3.csv
loading model in
reading structure in
starting to score
scoring one...
start scoring
one it of scoring took 11.53757381439209 seconds, total hrs expected to complete:0.03204881615108914
scoring one...
start scoring
one it of scoring took 12.005290508270264 seconds, total hrs expected to complete:0.033348029189639625
scoring one...
start scoring
one it of scoring took 11.570830821990967 seconds, total hrs expected to complete:0.03214119672775269
scoring one...
start scoring
one it of scoring took 11.875778436660767 seconds, total hrs expected to complete:0.032988273435168795
scoring one...
start scoring
one it of scoring took 11.592339038848877 seconds, total hrs expected to complete:0.03220094177458021
scoring one...
start scoring
one it of scoring took 11.812720537185669 seconds, total hrs expected to complete:0.032813112603293525
scoring one...
start scoring
one it of scoring 

In [1]:

import esm

import pickle
import numpy as np
import importlib
import pandas as pd
import time
import sys
import torch

models_dir = 'models'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model_id = float(time.time())
print(device)


# batch score
datapath = '/n/groups/marks/users/david/esm_if/data/seq_to_score/'
#print(sys.argv)
#f_name = sys.argv[1]
f_name = 'df_704_10x_exp_no_stop.csv'

pout = datapath + f_name.rstrip('.csv') + '_scores.csv'
print('writing to {}'.format(pout))
df_to_score = pd.read_csv(datapath+ f_name)

#df_to_score = df_to_score[:10]

print('loading model in')
# load model
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
# to get rid of random dropout
model= model.eval()

print('reading structure in')
# read structure in 
cifpath = '/n/groups/marks/users/david/esm_if/data/bio_all_rm_non_chain.cif' # .pdb format is also acceptable
coords, seqs = esm.inverse_folding.multichain_util.load_complex_coords(
    cifpath, 
    ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']
)


cuda
writing to /n/groups/marks/users/david/esm_if/data/seq_to_score/df_704_10x_exp_no_stop_scores.csv
loading model in


  "Regression weights not found, predicting contacts will not produce correct results."


reading structure in




In [4]:
list_write = []
muts_done = {}
# read in all the lines that have already been done
f_done = open(pout, 'r')
for l in f_done:
    list_write.append(l.rstrip('\n'))
    mut_m1 = l.split(',')[0]
    muts_done[mut_m1] = 1
#print(list_write)
f_done.close()

In [8]:

print('starting to score')
c = 0
for n,r in df_to_score.iterrows():
    mut_str = r.muts_m1
    
    if mut_str not in muts_done:
        print('scoring one...')
        seq_to_score = r.mut_seq_chC

        start = time.time()
        print('start scoring')
        ll_fullseq, ll_withcoord = esm.inverse_folding.multichain_util.score_sequence_in_complex(
            model, 
            alphabet,
            coords,
            'C',
            seq_to_score
        )
        write_line = ','.join([mut_str, seq_to_score, str(ll_fullseq), str(ll_withcoord)]) #+ '\n'
        list_write.append(write_line)
        end = time.time()
        it_time = end- start
        print('one it of scoring took {} seconds, total hrs expected to complete:{}'.format(it_time, it_time * (len(df_to_score)-c)/60/60))
        print('just scored {}'.format(mut_str))

        fout = open(pout, 'w')
        fout.write('\n'.join(list_write))
        fout.close()
    c+=1

starting to score
scoring one...
start scoring
one it of scoring took 12.747006893157959 seconds, total hrs expected to complete:19.768483190139136
just scored L48L:D52D:I53I:R55R:L56L:F74A:R78R:E80F:A81A:R82R
scoring one...
start scoring
one it of scoring took 12.08551025390625 seconds, total hrs expected to complete:18.73925506591797
just scored L48L:D52D:I53I:R55R:L56L:F74R:R78Q:E80S:A81I:R82L
scoring one...
start scoring
one it of scoring took 11.318896293640137 seconds, total hrs expected to complete:17.547433393001555
just scored L48L:D52D:I53I:R55R:L56L:F74L:R78S:E80S:A81Q:R82F
scoring one...
start scoring
one it of scoring took 11.229236602783203 seconds, total hrs expected to complete:17.405316734313963
just scored L48L:D52D:I53I:R55R:L56L:F74R:R78K:E80R:A81H:R82V
scoring one...
start scoring
one it of scoring took 11.435964822769165 seconds, total hrs expected to complete:17.71939216150178
just scored L48L:D52D:I53I:R55R:L56L:F74F:R78L:E80I:A81I:R82L
scoring one...
start scor

KeyboardInterrupt: 