In [1]:
# Flag read by the setup cells below: when True they reset sys.argv to [''].
# Set it to False when this code is run as a plain script.
its_jupyter_notebook = True

# Try individual modules 

In [2]:
import sys
import os
import torch
import numpy as np
import pickle
import time
if its_jupyter_notebook:
    # Presumably keeps argparse-based project code from seeing the kernel's own
    # command-line arguments. Remove this if it's not a jupyter notebook!
    sys.argv = [''] #Remove this if it's not a jupyter notebook!
# Make the parent of the current working directory importable (the path is relative
# to the working directory, not to this file).
sys.path.insert(0, '..')

# ROOT_DIR is the parent of the current working directory; every path below hangs off it.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
original_files_dir = os.path.join(ROOT_DIR, 'dataset', 'original_files')
processed_files_dir = os.path.join(ROOT_DIR, 'dataset', 'processed_files')
ufold_dir = os.path.join(ROOT_DIR, 'UFold_dependencies')
# Checkpoint file handed to UFoldModel.
ufold_path= os.path.join(ufold_dir, 'models', 'ufold_train_alldata.pt')
# Directory handed to BertModel.from_pretrained.
bert_pretrained_dir = os.path.join(ROOT_DIR, 'dataset', 'pre_trained_DNABERT', '6-new-12w-0')

# Also put the UFold folder itself on the import path, ahead of everything else.
sys.path.insert(0, ufold_dir)
from UFold_dependencies.running_ufold import *
from UFold_dependencies.ufold.data_generator import Dataset_Cut_concat_new as Dataset_FCN
from UFold_dependencies.ufold.data_generator import Generator_multiple_inputs
from transformers import BertModel, BertConfig, DNATokenizer

In [3]:
# Prefer the first CUDA device, but fall back to the CPU so the notebook still
# runs on machines without a usable GPU instead of failing at model load time.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

### UFold

In [4]:
# Build the UFold wrapper from the checkpoint at `ufold_path` on `device`.
# eval_mode=True presumably puts the network in inference mode -- TODO confirm in running_ufold.
UFoldFeatureExtractor = UFoldModel(device, ufold_path, eval_mode = True)



In [5]:
# Toy inputs for the UFold smoke test: four RNA sequences and one synthetic
# name per sequence ('seq0', 'seq1', ...).
postprocessing = True
seqs = [
    'UCCCGGUGAUUGGAGCGCUGGUGAAAGGCAGCUUCCGAACAGGAUAGUGAAAGGCAGCAGCGGGUAAGCGGGUA',
    'GCCCCCAUCGUCUAACGGUUAGGACACCAGACUUUCAAUCUGAUUCCGAACAGGAUAGUGAAAGGCAGCAGCGG',
    'GCACCGUUAGGACACCGACGACUAACGUAGGUUGGACACCGAACAGUGAAAGGCAGCAGGGUUCCGAACAGGAU',
    'CUUUCAAUCUGACAACGAGAGUUCGGUAGGUUGGACAUUCCGAACAUUCCGAACAGGAUAGUGAAAGGCAGCAG',
]
seq_names = list(map('seq{}'.format, range(len(seqs))))
# Length of the first sequence, shown as a quick sanity check.
print(len(seqs[0]))

74


In [6]:
# Wrap the toy sequences in UFold's dataset classes and serve them as one batch.
# `data` is not imported explicitly in this notebook; it presumably arrives through
# the star import from running_ufold -- TODO confirm.
training_data = Generator_multiple_inputs(seqs, seq_names)
params = dict(
    batch_size=len(seqs),  # every sequence goes into the same batch
    shuffle=False,
    num_workers=1,
    drop_last=False,
)
training_set = Dataset_FCN(training_data)
training_generator = data.DataLoader(training_set, **params)

# Per-sequence results keyed by sequence name.
results_in_batch_32 = {}
for batch, (seq_embeddings, seq_lens, seq_ori, seq_name) in enumerate(training_generator):
    # NOTE(review): only the first entry of seq_lens is forwarded, which looks like it
    # assumes every sequence in the batch has the same length -- confirm.
    model_features = UFoldFeatureExtractor.contact_net_prediction(
        seq_embeddings,
        seq_lens[0],
        seq_ori,
        seq_name,
        postprocessing = postprocessing,
    )
    for i in range(len(model_features)):
        name = seq_name[i]
        contact_matrix = model_features[i]
        one_hot = from_matrix_to_one_hot(contact_matrix, device = device)
        dot_torch = from_one_hot_to_dot(one_hot)
        results_in_batch_32[name] = {
            'contact_matrix': contact_matrix,
            'seq_embeddings': seq_embeddings[i],
            'dot_seq': from_matrix_to_dot(contact_matrix),
            'dot_seq_torch': dot_torch,
        }

In [7]:
# Shape of the last UFold input batch left over from the loop (recorded run: 4 x 17 x 80 x 80).
seq_embeddings.shape

torch.Size([4, 17, 80, 80])

In [8]:
# Shape of the last batch of UFold predictions (recorded run: 4 x 74 x 74 for four 74-character sequences).
model_features.shape

torch.Size([4, 74, 74])

### BERT

In [19]:
# The model config is fetched from a URL in the DNABERT GitHub repository (so this
# cell needs network access), while the weights are read from `bert_pretrained_dir`.
config = BertConfig.from_pretrained('https://raw.githubusercontent.com/jerryji1993/DNABERT/master/src/transformers/dnabert-config/bert-config-6/config.json')
# 'dna6' presumably selects the 6-mer vocabulary, matching build_kmers(k = 6) below -- TODO confirm.
tokenizer = DNATokenizer.from_pretrained('dna6')
model = BertModel.from_pretrained(bert_pretrained_dir, config=config)

<class 'transformers.tokenization_dna.DNATokenizer'>


In [20]:
def build_kmers(sequence: str, k: int = 6) -> str:
    """
    Split a sequence into its overlapping k-mers and join them with single spaces.

    e.g. input:  "AATCTAGCA", a string of length 9
        output: "AATCTA ATCTAG TCTAGC CTAGCA" a string of 4 6-mers divided by a space char

    Args:
        sequence: the string to split.
        k: window length; consecutive windows start one character apart.

    Returns:
        The len(sequence) - k + 1 windows in order, separated by one space and
        with no trailing space. When the sequence is shorter than k there are
        no windows and the empty string is returned.
    """
    n_kmers = len(sequence) - k + 1
    # str.join assembles the result in one pass and never produces a trailing
    # separator, so there is nothing to strip afterwards.
    return ' '.join(sequence[start:start + k] for start in range(n_kmers))

In [21]:
# Rewrite each RNA sequence in the DNA alphabet (U -> T) before splitting it into 6-mers.
kmer_seqs = [build_kmers(seq.replace("U", "T" ), k = 6) for seq in seqs]
model_inputs = [
    tokenizer.encode_plus(kmers, add_special_tokens=True, max_length=512)["input_ids"]
    for kmers in kmer_seqs
]
# torch.tensor needs rows of equal length, so this only works while every
# sequence yields the same number of tokens (no padding is requested above).
model_inputs = torch.tensor(model_inputs, dtype=torch.long)

In [22]:
# One row of token ids per sequence (recorded run: 4 x 71).
model_inputs.shape

torch.Size([4, 71])

In [23]:
# Pure feature extraction: run the forward pass without recording an autograd
# graph, which would otherwise keep the intermediate activations alive.
with torch.no_grad():
    output = model(model_inputs)
# Keep output[0] but drop the first and last position along dim 1 -- presumably the
# special tokens added by add_special_tokens=True during tokenization.
embedded_seqs = output[0][:, 1:-1,:]
#numpy_embedded_seqs = embedded_seqs.detach().numpy()

In [37]:
# NOTE(review): seq_ori is a leftover of the UFold batch loop and this cell's execution
# count (37) is out of order; it only works if that loop already ran in the same kernel.
seq_ori.shape

torch.Size([4, 80, 4])

In [24]:
# Model output with the first and last positions removed (recorded run: 4 x 69 x 768).
embedded_seqs.shape

torch.Size([4, 69, 768])

# Try the model 

In [4]:
### The dataloader must yield this:
# ([
#    [rna1_ss, rna1_bert], 
#    [rna2_ss, rna2_bert]
#  ], {'there_is_interaction':0, ..})

### In collate_fn this is turned into the following (NT stands for nested tensor):
# ([
#    [rna1_ss, NT(rna1_bert, mask1)], 
#    [rna2_ss, NT(rna2_bert, mask2)]
#  ], {'there_is_interaction':0, ..})


# rna1_ss has shape (b, M, 3)

# rna2_ss has shape (b, N, 3)

# rna1_bert has shape (b, M - 3, 768)

# rna2_bert has shape (b, N - 3, 768)

# so M and N can be recovered as (rna1_bert.shape[1] + 3) and (rna2_bert.shape[1] + 3)

In [4]:
import sys
import os
import torch
import numpy as np
import random
import torch
from torch.utils import data
import argparse
import time
if its_jupyter_notebook:
    # parse_args() further down reads sys.argv; resetting it keeps the kernel's own
    # command-line arguments away from the parser. Remove this if it's not a jupyter notebook!
    sys.argv = [''] #Remove this if it's not a jupyter notebook!
# Make the parent of the current working directory importable (the path is relative
# to the working directory, not to this file).
sys.path.insert(0, '..')

# ROOT_DIR is the parent of the current working directory; every path below hangs off it.
ROOT_DIR = os.path.dirname(os.path.abspath('.'))
ufold_dir = os.path.join(ROOT_DIR, 'UFold_dependencies')
# Checkpoint file and DNABERT directory handed to build_model.
ufold_path= os.path.join(ufold_dir, 'models', 'ufold_train_alldata.pt')
bert_pretrained_dir = os.path.join(ROOT_DIR, 'dataset', 'pre_trained_DNABERT', '6-new-12w-0')
# Also put the UFold folder itself on the import path, ahead of everything else.
sys.path.insert(0, ufold_dir)
from train_binary_cl import get_args_parser
from models.binary_classifier import build as build_model
from util.misc import NestedTensor
from UFold_dependencies.ufold.data_generator import Dataset_Cut_concat_new as Dataset_FCN
from UFold_dependencies.ufold.data_generator import Generator_multiple_inputs
from transformers import DNATokenizer
from UFold_dependencies.running_ufold import UFoldModel, from_matrix_to_one_hot


# Reuse the argument definitions of train_binary_cl. parse_args() reads sys.argv, which
# is reset to [''] in notebook mode, so no command-line options are passed in.
parser = argparse.ArgumentParser('Example script', parents=[get_args_parser()])
args = parser.parse_args()

<class 'transformers.tokenization_dna.DNATokenizer'>


In [5]:
# Prefer the first CUDA device, but fall back to the CPU so the model can still be
# built and inspected on machines without a usable GPU.
args.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(args.device)
# build_model returns the network together with its loss and post-processing objects.
model, criterion, postprocessors = build_model(args, bert_pretrained_dir, ufold_path)
model = model.to(device)

In [6]:
# Total element count over the parameters that have requires_grad set;
# parameters with requires_grad=False are left out of the sum.
n_parameters = sum(
    map(
        lambda param: param.numel(),
        filter(lambda param: param.requires_grad, model.parameters()),
    )
)
print('number of params:', n_parameters)

number of params: 1459700


In [7]:
import torch
from torch.utils.data import DataLoader
import util.misc as utils


# Settings for the training DataLoader built further down.
batch_size = 128  # batch size handed to BatchSampler
num_workers = 2  # worker count handed to DataLoader

In [24]:
from dataset.data import (
    RNADataset,
    ROOT_DIR,
    EasyPosAugment,
    RegionSpecNegAugment,
    InteractionSelectionPolicy,
    EasyNegAugment,
    HardPosAugment,
    HardNegAugment,
    plot_sample,
    plot_sample2,
    seed_everything,
)

# Augmentation settings handed to the policies below. The dict values look like sampling
# weights for each multiplier / window -- TODO confirm against dataset.data.
# NOTE(review): the positive values sum to 1.2 while the negative ones sum to 1.0; check
# whether the policies normalise them or whether a different value was intended.
pos_width_multipliers = {1: 0.1, 1.2: 0.1, 1.6: 1.0}
pos_height_multipliers = {1: 0.1, 1.2: 0.1, 1.6: 1.0}
neg_width_windows = {(50, 80): 0.1, (80, 120): 0.4, (120, 200): 0.3, (200, 251): 0.2}
neg_height_windows = {(50, 80): 0.1, (80, 120): 0.4, (120, 200): 0.3, (200, 251): 0.2}

# NOTE(review): not referenced anywhere in the visible cells.
_SUBSET_SIZE: int = 100
# One entry per augmentation policy; the width and height settings are identical for each policy.
policies = [
    EasyPosAugment(
        per_sample=10,
        interaction_selection=InteractionSelectionPolicy.LARGEST,
        width_multipliers=pos_width_multipliers,
        height_multipliers=pos_height_multipliers,
    ),
    EasyNegAugment(
        per_sample=10,
        width_windows=neg_width_windows,
        height_windows=neg_height_windows,
    ),
    HardPosAugment(
        per_sample=0.5,
        interaction_selection=InteractionSelectionPolicy.RANDOM_ONE,
        min_width_overlap=0.3,
        min_height_overlap=0.3,
        width_multipliers=pos_width_multipliers,
        height_multipliers=pos_height_multipliers,
    ),
    HardNegAugment(
        per_sample=0.9,
        width_windows=neg_width_windows,
        height_windows=neg_height_windows,
    ),
]
# NOTE(review): processed_files_dir is only assigned in the first setup cell of this
# notebook, not in the setup cell of this section, so that earlier cell must have run.
dataset = RNADataset(
    gene_info_path=os.path.join(processed_files_dir, "df_cdna.csv"),
    interactions_path=os.path.join(processed_files_dir,"df_annotation_files_cleaned.csv"), # alternative input: subset_valentino.csv
    dot_bracket_path=os.path.join(processed_files_dir,"dot_bracket.txt"),
    df_genes_path = os.path.join(processed_files_dir,"df_genes.csv"),
    augment_policies=policies
)

# Random order for training. No seed is set in this cell (seed_everything is imported but
# not called here), so the batch order presumably differs between runs.
sampler_train = torch.utils.data.RandomSampler(dataset)
# NOTE(review): sampler_val is created but not used in the visible cells.
sampler_val = torch.utils.data.SequentialSampler(dataset)
batch_sampler_train = torch.utils.data.BatchSampler(sampler_train, batch_size, drop_last=False)
data_loader_train = DataLoader(dataset, batch_sampler=batch_sampler_train,
                               collate_fn=utils.collate_fn, num_workers=num_workers)

In [25]:
from tqdm.notebook import tqdm

In [26]:
# Number of batches in one pass over the training loader (recorded run: 656).
len(data_loader_train)

656

In [None]:
# Smoke test of the data pipeline: pull every batch from the loader once without
# touching the model. The per-batch logic is disabled by wrapping it in a string
# literal, so `continue` is the only statement in the loop body that does anything.
for res in tqdm(data_loader_train):
    """
    [rna1, rna2], target = res
    rna1[0] = rna1[0].to(device)
    rna2[0] = rna2[0].to(device)
    rna1[1].tensors = rna1[1].tensors.to(device)
    rna2[1].tensors = rna2[1].tensors.to(device)
    rna1[1].mask = rna1[1].mask.to(device)
    rna2[1].mask = rna2[1].mask.to(device)
    #print(rna1[0].shape[-1], rna2[0].shape[-1])
    #model(rna1, rna2)
    #print('\n')
    targets_array = np.array([t['is_interacting'] for t in target])
    #print(np.unique(targets_array, return_counts = True))
    #if _ == 3:
    #    break
    """
    continue