In [1]:
import sys
import os
import torch
import time
import numpy as np
sys.argv = [''] #Remove this if it's not a jupyter notebook!
from running_ufold import *
import random
random.seed(123)

In [2]:
device = 'cpu'

In [3]:
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
ufold_path= os.path.join(ROOT_DIR, 'UFold_dependencies', 'models', 'ufold_train_alldata.pt')
UFoldFeatureExtractor = UFoldModel(device, ufold_path, eval_mode = True)



In [4]:
def random_sequence(length):
    seq = random.choices('CGTA', k=length)
    seq_str = ''
    for bp in seq:
        seq_str += bp
    return seq_str

### Run single input

In [5]:
seq_length = 600

In [6]:
seq = random_sequence(seq_length)
print(len(seq))

600


In [7]:
seq = 'GCCCCCAUCGUCUAACGGUUAGGACACCAGACUUUCAAUCUGACAACGAGAGUUCGACUCUCUCUGGGGGUA'

In [8]:
seq_name = f'seq_len:{seq_length}'
seq_embeddings, seq_lens, seq_ori, seq_name = get_seq_embeddings_seq_ori(seq, seq_name)
print(seq_embeddings.shape)
print(seq_lens.shape)
print(seq_ori.shape)
print(seq_name)
print('\n')
model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name)
print(model_features.shape)

torch.Size([1, 17, 80, 80])
torch.Size([1])
torch.Size([1, 80, 4])
('eq_len:600',)


torch.Size([1, 72, 72])


In [9]:
dot_string = from_matrix_to_dot(model_features)

In [10]:
%time
model_features_no_postprocess = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name, postprocessing = False)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 7.39 µs


In [11]:
dot_string = from_matrix_to_dot(model_features_no_postprocess)
set(dot_string) == set('(.)')

True

In [12]:
print(model_features_no_postprocess.min())
print(model_features_no_postprocess.mean())
print(model_features_no_postprocess.max())

tensor(-109.1484)
tensor(-24.2549)
tensor(32.1897)


In [13]:
one_hot = from_matrix_to_one_hot(model_features_no_postprocess, device = model_features_no_postprocess.device)
dot_torch = from_one_hot_to_dot(one_hot.squeeze())

In [14]:
dot_torch

'(((((((..((((........))))..((((.......)))).....(((((.......)))))))))))).'

In [15]:
dot_torch == dot_string

True

### Run single input with length > 600 bp

In [16]:
start_time = time.time()
seq = random_sequence(100)
print(len(seq))
seq_name = f'seq_len:{seq_length}'
seq_embeddings, seq_lens, seq_ori, seq_name = get_seq_embeddings_seq_ori(seq, seq_name)
print(seq_embeddings.shape)
print(seq_lens.shape)
print(seq_ori.shape)
print(seq_name)
print('\n')
model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name)
print(model_features.shape)
print(time.time()-start_time)

100
torch.Size([1, 17, 112, 112])
torch.Size([1])
torch.Size([1, 112, 4])
('eq_len:600',)


torch.Size([1, 100, 100])
0.2696959972381592


In [17]:
start_time = time.time()
seq = random_sequence(1000)
print(len(seq))
seq_name = f'seq_len:{seq_length}'
seq_embeddings, seq_lens, seq_ori, seq_name = get_seq_embeddings_seq_ori(seq, seq_name)
print(seq_embeddings.shape)
print(seq_lens.shape)
print(seq_ori.shape)
print(seq_name)
print('\n')
model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name)
print(model_features.shape)
print(time.time()-start_time)

1000
torch.Size([1, 17, 1008, 1008])
torch.Size([1])
torch.Size([1, 1008, 4])
('eq_len:600',)


torch.Size([1, 1000, 1000])
11.467055320739746


### Post-processing VS pre-processing comparison

As we can see below, both matrixes have the same dimension

In [18]:
print(seq_length)
print(model_features.shape)
print(model_features_no_postprocess.shape)

600
torch.Size([1, 1000, 1000])
torch.Size([1, 72, 72])


As we can see below, both matrixes are symmetric

In [19]:
#check if the matrix is symmetric
model_features_numpy = model_features.squeeze().cpu().numpy()
print(np.allclose(model_features_numpy, model_features_numpy.T, rtol=1e-5, atol=1e-5))

model_features_numpy_no_postprocess = model_features_no_postprocess.squeeze().cpu().numpy()
print(np.allclose(model_features_numpy_no_postprocess, model_features_numpy_no_postprocess.T, rtol=1e-5, atol=1e-5))

True
True


As we can see below, before the postprocessing values were larger.

In [20]:
print('With postprocessing:')
print(model_features_numpy.max())
print(model_features_numpy.min())
print('\n')
print('Without postprocessing:')
print(model_features_numpy_no_postprocess.max())
print(model_features_numpy_no_postprocess.min())

With postprocessing:
1.1868374
0.0


Without postprocessing:
32.18965
-109.14838


Without postprocessing values in the diagonal are not all zeros.

In [21]:
print('With postprocessing:')
print(np.allclose(np.diagonal(model_features_numpy), 0, rtol=1e-5, atol=1e-5))
print('\n')
print('Without postprocessing:')
print(np.allclose(np.diagonal(model_features_numpy_no_postprocess), 0, rtol=1e-5, atol=1e-5))

With postprocessing:
True


Without postprocessing:
False


In [22]:
dot_string = from_matrix_to_dot(model_features)

set(dot_string) == set('(.)')

True

In [23]:
print(dot_string)
print('(((((((..((((........))))..((((.......)))).....(((((.......)))))))))))).')

..((((................................(.............................................((((((((((...(......(.((((..(((((.)))))...(.(.).((...(..........((.........................(.(..(.(((((((............(()))).))).).)....((((((((((((((..(.(((((((..((((((...(((((..(((((((..((.((..)).).)))))......)......).))))..())))))..((..))))))).)..)))))))).....(((.....))).....................)...........(..((((....(..(...(((.(((...(((((((((((.......))))))))))..))).))).(()))))...).....(((((.((.(..((..(..((..(((((...)))))))...))..)........)))))........))..(((.(...(............())....).(.).)))...........(((())...).).......................))))(...(......)..((...((((..............((((.......(........((.....)..))...))))...))))......)))......(((((....))))).............))..........(((((.((......)).))))..))))))).....))))...).......)).......).)....)..........).)).)).((((...((((.(((((.)))..)))).................))))))))))............(((((...).((......(((((())))))..))).)...)))))............)))))...)))))....).......

In [24]:
print(model_features.min())
print(model_features.mean())
print(model_features.max())

tensor(0.)
tensor(0.0004)
tensor(1.1868)


### Vediamo se su un unico input con padding lui lavora bene. In tal caso uso le funzioni che conosco e faccio io il padding.

In [25]:
from ufold.data_generator import Dataset_Cut_concat_new as Dataset_FCN
from ufold.data_generator import Generator_multiple_inputs, Generator_single_input, RNASSDataGenerator
from torch.utils import data

def get_seq_embeddings_seq_ori(sequence, name):
    test_data = Generator_single_input(sequence, name)
    params = {'batch_size': 1,
      'shuffle': False,
      'num_workers': 1,
      'drop_last': False}
    test_set = Dataset_FCN(test_data)
    test_generator = data.DataLoader(test_set, **params) 
    for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(test_generator):
        assert batch == 0
    return seq_embeddings, seq_lens, seq_ori, seq_name

In [26]:
seq_length = 100
seq = random_sequence(seq_length)
seq_name = 'seq_len:{}'.format(seq_length)
test_data = Generator_single_input(seq, seq_name)

In [27]:
result_generator = test_data.get_one_sample(0)[0]
print(type(result_generator))
print(result_generator.shape)

<class 'numpy.ndarray'>
(600, 4)


In [28]:
print(result_generator[0])
print(result_generator[19])
print(result_generator[20])
print('\n')
print(result_generator[20:,:].sum() == 0)

[1. 0. 0. 0.]
[0. 0. 1. 0.]
[1. 0. 0. 0.]


False


In [29]:
### ANALIZZIAMO LA CLASSE RNASSDataGenerator
import _pickle as cPickle
import collections
RNA_SS_data = collections.namedtuple('RNA_SS_data','seq ss_label length name pairs')

with open(os.path.join(ROOT_DIR, 'UFold_dependencies','data', 'TS3.cPickle'), 'rb') as f:
            data = cPickle.load(f,encoding='iso-8859-1')

data_x = np.array([instance[0] for instance in data])
data_y = np.array([instance[1] for instance in data])
pairs = np.array([instance[-1] for instance in data])
seq_length = np.array([instance[2] for instance in data])
#seq = list(p.map(encoding2seq, self.data_x))
seq_max_len = len(data_x[0])
data_name = np.array([instance[3] for instance in data])

  pairs = np.array([instance[-1] for instance in data])


In [30]:
from ufold.data_generator import Dataset_Cut_concat_new_canonicle as Dataset_FCN
from ufold.data_generator import Dataset_Cut_concat_new_merge_multi as Dataset_FCN_merge
from torch.utils import data

device = 'cuda:0'
pickle_data = RNASSDataGenerator(os.path.join(ROOT_DIR, 'UFold_dependencies','data'), 'TS3.cPickle')
params = {'batch_size': 1,
  'shuffle': True,
  'num_workers': 1,
  'drop_last': False}

#train_merge = Dataset_FCN_merge([pickle_data])
train_merge = Dataset_FCN(pickle_data)
train_merge_generator = data.DataLoader(train_merge, **params)

  self.pairs = np.array([instance[-1] for instance in self.data])


In [31]:
import torch.nn.functional as F

def pad_image_batch(image_batch):
    max_height = max([img.shape[-2] for img in image_batch])
    max_width = max([img.shape[-1] for img in image_batch])

    image_batch = [
        # The needed padding is the difference between the
        # max width/height and the image's actual width/height.
        F.pad(img, [0, max_width - img.shape[-1], 0, max_height - img.shape[-2]])
        for img in image_batch
    ]
    return image_batch


"""
#test
a = torch.ones(1, 3, 20, 20)
b = torch.ones(1, 3, 54, 54)
c = torch.ones(1, 3, 10, 10)
image_batch = [a, b, c]
padded_image_batch =  pad_image_batch(image_batch)
print(padded_image_batch[0][0, :, 20:, 20:].sum() == 0)
print(padded_image_batch[2][0, :, 10:, 10:].sum() == 0)
"""

'\n#test\na = torch.ones(1, 3, 20, 20)\nb = torch.ones(1, 3, 54, 54)\nc = torch.ones(1, 3, 10, 10)\nimage_batch = [a, b, c]\npadded_image_batch =  pad_image_batch(image_batch)\nprint(padded_image_batch[0][0, :, 20:, 20:].sum() == 0)\nprint(padded_image_batch[2][0, :, 10:, 10:].sum() == 0)\n'

In [32]:
model_features_list = []
model_features_list_padded = []
seq_embeddings_list = []
seq_lens_list = []
seq_ori_list = []
seq_name_list = []

postprocessing = True


for contacts, seq_embeddings, matrix_reps, seq_lens, seq_ori, seq_name, nc_map, l_len in train_merge_generator:
    seq_embeddings_list.append(seq_embeddings)
    seq_lens_list.append(seq_lens)
    seq_ori_list.append(seq_ori)
    seq_name_list.append(seq_name)

for i in range(len(train_merge_generator)):
    seq_embeddings, seq_lens, seq_ori, seq_name = seq_embeddings_list[i], seq_lens_list[i], seq_ori_list[i], seq_name_list[i]
    model_features_list.append(UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name, postprocessing = postprocessing))

print('\n')

seq_embeddings_list_padded = pad_image_batch(seq_embeddings_list)
seq_ori_list_padded = pad_image_batch(seq_ori_list)
#seq_ori_list_padded = seq_ori_list

for i in range(len(train_merge_generator)):
    seq_embeddings, seq_lens, seq_ori, seq_name = seq_embeddings_list_padded[i], seq_lens_list[i], seq_ori_list_padded[i], seq_name_list[i]
    model_features_list_padded.append(UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens, seq_ori, seq_name, postprocessing = postprocessing))





In [33]:
# check if the result is the same
for i in range(len(train_merge_generator)):
    #print(seq_lens_list[i])
    #print(model_features_list[i].shape == model_features_list_padded[i].shape)
    #print(torch.allclose(model_features_list[i], model_features_list_padded[i]))
    print((model_features_list[i] - model_features_list_padded[i]).sum())
    #print(model_features_list[i].sum())

tensor(16.6720)
tensor(29.8296)
tensor(6.3232)
tensor(35.0341)
tensor(30.8638)
tensor(43.8264)
tensor(3.8842)
tensor(10.5110)
tensor(28.9487)
tensor(4.8851)
tensor(6.7665)
tensor(33.3618)
tensor(6.4274)
tensor(17.8265)
tensor(37.1164)
tensor(37.7431)
tensor(0.)
tensor(30.3365)


In [34]:
(model_features_list[i].squeeze() - model_features_list_padded[i].squeeze()).sum()

tensor(30.3365)

Purtroppo con il padding non lavora bene, è stato infatti trainato con batch size = 1 e niente padding sull'input.

### Vediamo se la funzione from_matrix_to_one_hot funziona anche su piu sequenze


In [35]:
from ufold.data_generator import Dataset_Cut_concat_new as Dataset_FCN
postprocessing = True
#seq_len = 120 #lunghezza delle sequenze nel batch
#seqs_length = np.random.randint(low = seq_len, high=seq_len+1, size=5)
#seqs = [random_sequence(seq_length) for seq_length in seqs_length]
seqs = ['UCCCGGUGAUUGGAGCGCUGUGGCACCACUCCUUCCCAUUCCGAACAGGAUAGUGAAAGGCAGCAGCGGGUA',
        'GCCCCCAUCGUCUAACGGUUAGGACACCAGACUUUCAAUCUGACAACGAGAGUUCGACUCUCUCUGGGGGUA',
        'GCACCGUUAGGACACCGACGACUAACGUAGGUUGGACACCGAACGGUUAGGACACCGACACCGUUAGGACAC',
        'CUUUCAAUCUGACAACGAGAGUUCGGUAGGUUGGACAUUCCGAACAGUCUCUGGGGGGAUACGGUUAACACG']
seq_names = ['seq{}'.format(int_id) for int_id in range(len(seqs))]
training_data = Generator_multiple_inputs(seqs, seq_names)
params = {'batch_size': 2,
  'shuffle': False,
  'num_workers': 1,
  'drop_last': False}
training_set = Dataset_FCN(training_data)
training_generator = data.DataLoader(training_set, **params) 

In [36]:
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(training_generator):
    model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens[0], seq_ori, seq_name, postprocessing = postprocessing)
    one_hot = from_matrix_to_one_hot(model_features, device = model_features.device)
    [print(from_one_hot_to_dot(one_hot[i])) for i in range(one_hot.shape[0])]

.((((.((.........((((..(..(((((((.............))))..)))....)))))))))))..
(((((((..((((........))))..((((.......)))).....(((((.......)))))))))))).
....((((((....(((((.()))))))..)))))...((.((((((..((...))...)))))).))....
..(((((((((.(((((((()))()()))))))))....).).....)))).....................


In [37]:
params = {'batch_size': 1,
  'shuffle': False,
  'num_workers': 1,
  'drop_last': False}
training_set = Dataset_FCN(training_data)
training_generator = data.DataLoader(training_set, **params) 
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(training_generator):
    model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens[0], seq_ori, seq_name, postprocessing = postprocessing)
    one_hot = from_matrix_to_one_hot(model_features, device = model_features.device)
    [print(from_one_hot_to_dot(one_hot[i])) for i in range(one_hot.shape[0])]
    #print(one_hot.shape)

.((((.((.........((((..(..(((((((.............))))..)))....)))))))))))..
(((((((..((((........))))..((((.......)))).....(((((.......)))))))))))).
....((((((....(((((.()))))))..)))))...((.((((((..((...))...)))))).))....
..(((((((((.(((((((()))()()))))))))....).).....)))).....................


### Vediamo se senza padding, ma con batch di uguale lunghezza lavora bene.

In [None]:
training_data = Generator_multiple_inputs(seqs, seq_names)
params = {'batch_size': 2,
  'shuffle': False,
  'num_workers': 1,
  'drop_last': False}
training_set = Dataset_FCN(training_data)
training_generator = data.DataLoader(training_set, **params) 

results_in_batch_32 = {}
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(training_generator):
    model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens[0], seq_ori, seq_name, postprocessing = postprocessing)
    for i in range(len(model_features)):
        name = seq_name[i]
        contact_matrix = model_features[i]
        one_hot = from_matrix_to_one_hot(model_features[i], device = device)
        dot_torch = from_one_hot_to_dot(one_hot)
        results_in_batch_32[name] = {'contact_matrix':model_features[i],
                            'seq_embeddings':seq_embeddings[i],
                            'dot_seq': from_matrix_to_dot(model_features[i]),
                            'dot_seq_torch':dot_torch
                           }

In [None]:
training_data = Generator_multiple_inputs(seqs, seq_names)
params = {'batch_size': 1,
  'shuffle': False,
  'num_workers': 1,
  'drop_last': False}
training_set = Dataset_FCN(training_data)
training_generator = data.DataLoader(training_set, **params) 

results_in_batch_1 = {}
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(training_generator):
    model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens[0], seq_ori, seq_name, postprocessing = postprocessing)
    for i in range(len(model_features)):
        name = seq_name[i]
        one_hot = from_matrix_to_one_hot(model_features[i], device = device)
        dot_torch = from_one_hot_to_dot(one_hot)
        results_in_batch_1[name] = {'contact_matrix':model_features[i],
                                    'seq_embeddings':seq_embeddings[i],
                                    'dot_seq': from_matrix_to_dot(model_features[i]),
                                    'dot_seq_torch':dot_torch
                                   }

In [None]:
#errors check
errors = []
for seq_name in seq_names:
    error = float((results_in_batch_1[seq_name]['contact_matrix'] - results_in_batch_32[seq_name]['contact_matrix']).sum())
    errors.append(error)
    #print(error)
    #print(torch.allclose(results_in_batch_1[seq_name]['contact_matrix'], results_in_batch_32[seq_name]['contact_matrix'], rtol = 0.1))
    assert (results_in_batch_1[seq_name]['dot_seq'] == results_in_batch_32[seq_name]['dot_seq'])
    assert (results_in_batch_1[seq_name]['dot_seq'] == results_in_batch_1[seq_name]['dot_seq_torch'])
    assert (results_in_batch_32[seq_name]['dot_seq'] == results_in_batch_32[seq_name]['dot_seq_torch'])

In [None]:
print(np.max(errors))
print(np.mean(errors))
print(np.min(errors)) 
print(np.median(errors)) 

In [None]:
seq_embeddings.shape

#### Vediamo se con batch size 1 puo lavorare velocemente con sequenze di dimensione diversa (cerco di fare cio che succede nel ufold_predict.py)

In [5]:
from ufold.data_generator import Dataset_Cut_concat_new as Dataset_FCN
from ufold.data_generator import MyRNASSDataGenerator_input

In [6]:
device = 'cuda:0'

In [7]:
test_data = MyRNASSDataGenerator_input(os.path.join(ROOT_DIR, 'UFold_dependencies', 'data'), 'chunk20')

params = {'batch_size': 1,
          'shuffle': False,
          'num_workers': 6,
          'drop_last': True}

test_set = Dataset_FCN(test_data)
test_generator = data.DataLoader(test_set, **params)

  self.data_x = np.array([self.one_hot_600(item) for item in self.seq])


In [None]:
UFoldFeatureExtractor = UFoldModel(device, ufold_path, eval_mode = True)



In [25]:
results_in_batch_1 = {}
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(test_generator):
    model_features = UFoldFeatureExtractor.contact_net_prediction(seq_embeddings, seq_lens[0], seq_ori, seq_name, postprocessing = True)
    name = seq_name[0]
    one_hot = from_matrix_to_one_hot(model_features.squeeze(), device = device)
    dot_torch = from_one_hot_to_dot(one_hot)
    results_in_batch_1[name] = {
                                    'dot_seq':dot_torch
                                   }
    if batch == 1:
        break

In [26]:
results_in_batch_1

{'ENSG00000166228_0': {'dot_seq': '.........(((.(..(....).......))(()).........................(((((((...((.((((.((..(..)..((((..(..(...((((....((.(.......(((.((((((((....)))))))))..))...(..............)).(((..............(.(((.......))).......)))........).....)))....(....).).......................)).((..((...(.....(((.(((..(....((.()(...(()))((.........(((((.(.))((.((((())..)...)))))).))..)..))))))(....(..........(((.((.)).(())..(((...((((...)))((((.()).......(((((...)).).((((....)..............)))))....)..((((.((((......))))))))))))))).......................)).............).).)).)....................(.).((((............))))...)..........(............(((((.(.....(.......((.)...)......)...)))).(..(........)......)).........)...)...((((..(.......))..)).))..........((..........))..))))...)).)...)))))).)..))...............)().)).....))......(......(...(................)......)...........).........(.)))).......)..).........)..........(..((((...(.).)...)....)))...............((.((((((((...(

In [30]:
pd.DataFrame.from_dict(results_in_batch_1, orient = 'index').to_csv('prova.csv', sep = '\t', header = False)