In [4]:
import numpy as np
import time
import re
import copy
import torch

  from .autonotebook import tqdm as notebook_tqdm


# Masking seqeunces

Generate a file with seqeunces, masked seqeunces and masked letter.

In [5]:
with open('sequences_t=127.dat', 'r') as f:
    lines = f.readlines()
seqs = list([list(line)[:-1] for line in lines])


In [6]:
amino_acids = ["A","R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", 
                    "F", "P", "S", "T", "W", "Y", "V", "[MASK]"]  
# dict_aminoacids = {aa: idx for idx, aa in enumerate(amino_acids)}
dict_aminoacids = {aa: idx for idx, aa in enumerate(amino_acids)}

def add_mask(sequences, mask_token="[MASK]", mask_probability=0.15):
    sequences_masked = copy.deepcopy(sequences)
    masked_positions = []

    for s in sequences_masked:
        positions = []
        for i in range(len(s)):
            if torch.rand(1).item() < mask_probability:
                positions.append((i, dict_aminoacids[s[i]]))
                s[i] = mask_token
        masked_positions.append(positions)

    return sequences_masked, masked_positions

def convert_to_indices(sequences, dict_aminoacids):
    sequences_indices = []
    for s in sequences:
        indices = [dict_aminoacids[aa] for aa in s]
        sequences_indices.append(indices)
    return sequences_indices

In [7]:
%%time
sequences_masked, masked_positions = add_mask(seqs)
sequences_masked_number = convert_to_indices(sequences_masked, dict_aminoacids)
sequences_number = convert_to_indices(seqs, dict_aminoacids)

CPU times: user 7.36 s, sys: 55.5 ms, total: 7.42 s
Wall time: 7.42 s


In [8]:
print(f"Mean number of masked amioacids: {np.mean([len(pos) for pos in masked_positions]) }, std: {np.std([len(pos) for pos in masked_positions])}")

Mean number of masked amioacids: 12.974226374176329, std: 3.283209939629775


In [None]:
def save_to_file(sequences_masked_number, sequences_number, masked_positions, filename="SequencesMasked.txt"):
    with open(filename, "w") as file:
        file.write("Sequences Masked Number\tSequences Number\tmasked_positions\n")
        for smn, sn, mp in zip(sequences_masked_number, sequences_number, masked_positions):
            file.write(f"{smn}\t{sn}\t{mp}\n")

save_to_file(sequences_masked_number, sequences_number, masked_positions

In [75]:
import ast

def load_seq_mask(filename="SequencesMasked.txt", n=None):
    sequences_masked_number = []
    sequences_number = []
    masked_positions = []

    with open(filename, "r") as file:
        next(file)  # Salta l'intestazione
        for i, line in enumerate(file):
            if n is not None and i >= n:
                break
            smn, sn, mp = line.strip().split("\t")
            # converto la stringa in una lista di numeri
            sequences_masked_number.append(ast.literal_eval(smn))  
            sequences_number.append(ast.literal_eval(sn))  
            masked_positions.append(ast.literal_eval(mp))  

    return sequences_masked_number, sequences_number, masked_positions