# SNP

In [1]:
import pysam
import array
import numpy as np
import pandas as pd

In [2]:
input_bam = "first_chrom.bam"
target_reads = "./first_chrom_target_snp.bam"
untarget_reads = "./first_chrom_untarget_snp.bam"
output_bam = "./first_chrom_premod_snp.bam"
delete_bed = "./snp_delete.bed"
snp_file = "./snps.csv"
n_proc = 4
probability = None

In [3]:
# Читаем файл с мутациями
snips = pd.read_csv(snp_file)

# Создаем Bed-файл с интервалами
with pysam.AlignmentFile(input_bam, "rb") as samfile_input, open(delete_bed, "a") as bed:
    chroms = list(set(samfile_input.references) & set(snips["chromosome"]))
    if len(chroms) == 0:
        # Поменять на raise
        print("Target chromosomes are absent in reference\nPlease, check correctness of your csv file or names of contigs")
    else:
        snips_true = snips.query("chromosome in @chroms")
        for snip_index in snips_true.index:
            chr_name, start = snips_true.loc[snip_index, "chromosome"], snips_true.loc[snip_index, "position"]
            stop = start + 1
            bed.write(f"{chr_name}\t{start}\t{stop}\n")

Самтулс

In [4]:
def nucl_changer(read, number_list, position, nucl, quality, comp, probability=None):
    # По умолчанию создаем гетерозиготную мутацию (вероятность равна 0.05) 
    if (type(comp) == str) and (("YC", 1) in read.get_tags()):
        return read
    
    if type(probability) is not np.float64:
        probability = 0.5
        
    if np.random.choice([0,1], size=1, p=[1 - probability, probability]) == 1:
        ind = number_list.index(position - 1)
        indq, length, rc = ind, 0, read.cigartuples
        for cigar_block in rc:
            if cigar_block[0] in [0, 7, 8]:
                length += cigar_block[1]
                if indq < length:
                    break
            elif cigar_block[0] in [1, 4]:
                length += cigar_block[1]
                indq += cigar_block[1]
#             elif cigar_block[0] == 2:
#                 length += cigar_block[1]
#                 indq -= cigar_block[1]

        if indq < 0:
            print("Warning", read.query_name)
        
        # Мутируем!            
        read.query_sequence = read.query_sequence[:indq] + nucl + read.query_sequence[(indq + 1):]
        if read.query_qualities is not None:
            read.query_qualities[indq] = quality
        if type(comp) == str:  # для компаундов
            read.set_tag("YC", 1)
            
    return read

In [5]:
def snp_maker(read, number_list, snips_true, probability):
    for snip_index in snips_true.index:
        chr_name, position, nucl, quality, comp = snips_true.loc[snip_index, "chromosome"], snips_true.loc[snip_index, "position"], snips_true.loc[snip_index, "nucleotide"], snips_true.loc[snip_index, "quality"], snips_true.loc[snip_index, "compaund"]
        probability = snips_true.loc[snip_index, "probability"]
        comp_counter = []
        if (type(comp) == str):
            if comp not in comp_counter:
                comp_counter.append(comp)
                probability = 0.5
            else:
                probability = 1.0
        if (position - 1) in number_list:
            snp_read = nucl_changer(read, number_list, position, nucl, quality, comp, probability)
    return snp_read

In [6]:
# Ready
snips = pd.read_csv(snp_file)

with pysam.AlignmentFile(target_reads, "rb") as samfile_input, pysam.AlignmentFile(output_bam, "wb", template=samfile_input) as samfile_output:
    chroms = list(set(samfile_input.references) & set(snips["chromosome"]))
    if len(chroms) == 0:
        # Поменять на raise
        print("There are no chromosomes in changed file, which needs to be changed")
    else:
        snips_true = snips.query("chromosome in @chroms")
        reads = samfile_input.fetch()
        for read in reads:
            number_list = read.get_reference_positions()
            read_returned = snp_maker(read, number_list, snips_true, probability)
            samfile_output.write(read_returned)

## CNV

In [1]:
import array
import pysam
import numpy as np
import pandas as pd

In [2]:
input_bam = "first_chrom.bam"
target_reads = "first_chrom_target_cnv.bam"
delete_bed = "cnv_delete.bed"
output_bam = "first_chrom_premod_cnv.bam"
cnv_file = "cnv_fs3.csv"
target_fasta = "./GRCh38_full_analysis_set_plus_decoy_hla.fa"

In [3]:
# Создаем промежуточный bed, чтобы разнести целевой и нецелевой участки 
# (догадываюсь, что у тебя это реализовано иначе, но тем не менее запишу свой вариант)

cnvs = pd.read_csv(cnv_file)

with pysam.AlignmentFile(input_bam, "rb") as samfile_input, open(delete_bed, "w") as bed:
    chroms = list(set(samfile_input.references) & set(cnvs["chromosome"]))
    if len(chroms) == 0:
        # Поменять на raise
        print("Target chromosomes are absent in reference\nPlease, check correctness of your csv file or names of contigs")
    else:
        for chr_name in chroms:  # Проходимся по каждой хромосоме, которая есть и в выравнивании, и в csv
            chrom_subset = cnvs.query("chromosome==@chr_name")
            for cnv_num in chrom_subset.index:
                # Сделать более гибким 1200 (задать, как переменную)
                start, stop = chrom_subset.loc[cnv_num, "position_start"], chrom_subset.loc[cnv_num, "position_finish"]
                bed.write(f"{chr_name}\t{start}\t{stop + 1}\n")

Очень важное замечание: координаты делеций/инсерций:
- Старт -- первый нуклеотид, которого нет / который удвоен
- Стоп -- последний нуклеотид, которого нет / который удвоен

In [4]:
# Считает покрытие в однонуклеотидной позиции
def read_cov_one_nucl(samfile, chr_name, nucl):
    cov_iter = samfile.pileup(chr_name, nucl, nucl + 1, truncate=True)
    for position in cov_iter:
        cov = position.nsegments
    return(cov)


# Сохраняет кусок в +/- размер рида оснований от краев делеции
## Переписать без дублирования кода
## Сделать вменяемо с единицами
def read_del_tails(samfile, chr_name, start, stop, fasta_ref, read_length=250):   # Добавить длину рида в инпут на старте или вычислять из бама
    const = read_length * 4  # В нашем случае - 1000 нуклеотидов
    with pysam.FastaFile(target_fasta) as fasta_ref:
        region_left = fasta_ref.fetch("chr19", start - const - 1, start - 1) # Отнимаем 1, поскольку нужно не включать первый нуклеотид делеции, формат 1-based
        region_right = fasta_ref.fetch("chr19", stop, stop + const) # Аналогичная логика
    return region_left, region_right
        
# Клепаем сплит-риды
def create_split_reads_lb(samfile_input, region_left, region_right, counter, chr_name, start, stop, read_length=250):  
    mapq_probs = [1/80 for i in range(20,60)] + [0.5] # Эмпирическое наблюдение разброса качества рида
    # Рандомно выбираем длину софтклипов (10-200 с каждого края)
    min_soft = int(read_length / 25)
    max_soft = int(read_length * 4 / 5)
    right_num = np.random.choice(range(min_soft, max_soft), size=1)[0]
    left_num = 250 - right_num
    
    # Создаем сплит-рид и добавляем его свойства
    new_split_read = pysam.AlignedSegment(header=samfile_input.header) # Надо сделать нормальный хедер
    new_split_read.query_name = f"SRRread_split_l_{counter}"  #? Это норм?
    new_split_read.query_sequence=f"{region_left[-left_num:]}{region_right[:right_num]}"
    new_split_read.reference_name = chr_name
    if chr_name[3:] == "X":
        new_split_read.reference_id = 22
    elif chr_name[3:] == "Y":
        new_split_read.reference_id = 23
    elif type(chr_name[3:]) == int:
        new_split_read.reference_id = int(chr_name[3:]) - 1
    new_split_read.flag = np.random.choice([163, 99, 147, 83], size=1)[0]  #?  Допинфа от Полины и Кати
    new_split_read.mapping_quality = np.random.choice(a=range(20,61), size=1, p=mapq_probs)[0]
    new_split_read.reference_start = start - 1 - left_num # Потому что BAM 0-based
    new_split_read.next_reference_id = new_split_read.reference_id
    new_split_read.cigartuples = [(0, left_num), (4, right_num)]
    #new_split_read.cigartuples = [(0, 250)]
    new_split_read.query_qualities = array.array('B', np.random.choice(range(18,36), size=read_length).tolist())
    new_split_read.tags = [("NM", 1),
          ("RG", "L1")]  #?   Допинфа от Полины и Кати
    
    # Создаем парный ему рид
    
    pair_new_split_read = pysam.AlignedSegment(header=samfile_input.header) # Надо сделать нормальный хедер
    pair_new_split_read.query_name = f"SRRread_split_l_{counter}"  #? Это норм?
    pair_new_split_read.reference_name = chr_name
    pair_new_split_read.reference_id = new_split_read.reference_id

    pair_new_split_read.mapping_quality = new_split_read.mapping_quality
    pair_new_split_read.cigartuples = [(0, 250)]
    pair_new_split_read.tags = [("NM", 1),
          ("RG", "L1")]  #?   Допинфа от Полины и Кати
    pair_new_split_read.next_reference_id = new_split_read.reference_id

    
    # Если первый рид направлен направо, то:
    if new_split_read.flag in [99, 163]:
        next_start_coeff = np.random.choice(range(2,502), size = 1)[0] # Случайным образом отберем расстояние от второй границы делеции до старта парного рида
        new_split_read.next_reference_start = stop + next_start_coeff
        new_split_read.template_length = pair_new_split_read.next_reference_start - new_split_read.reference_start + 151
        pair_new_split_read.query_sequence=f"{region_right[next_start_coeff:next_start_coeff+250]}"
        if new_split_read.flag == 99:
            pair_new_split_read.flag = 147
        elif new_split_read.flag == 163:
            pair_new_split_read.flag = 83
           
        
    # А если налево, то:    
    elif new_split_read.flag in [83, 147]:
        next_start_coeff = np.random.choice(range(-750, -left_num -160), size = 1)[0] # Считаем, что риды могут перекрываться не более, чем на 90 нуклеотидов
        new_split_read.next_reference_start = start + next_start_coeff - 1
        new_split_read.template_length = - (new_split_read.reference_start - new_split_read.next_reference_start) - 151
        pair_new_split_read.query_sequence = f"{region_left[next_start_coeff:next_start_coeff+250]}"
        if new_split_read.flag == 147:
            pair_new_split_read.flag = 99
        elif new_split_read.flag == 83:
            pair_new_split_read.flag = 163
            
    pair_new_split_read.reference_start = new_split_read.next_reference_start
    pair_new_split_read.query_qualities = array.array('B', np.random.choice(range(18,36), size=read_length).tolist())
    pair_new_split_read.next_reference_start = new_split_read.reference_start     
    pair_new_split_read.template_length = - new_split_read.template_length 
    print(f"first_read: {new_split_read},\
          second_read: {pair_new_split_read}")
    
    return new_split_read, pair_new_split_read

def create_split_reads_rb(samfile_input, region_left, region_right, counter, chr_name, start, stop, read_length=250):  
    mapq_probs = [1/80 for i in range(20,60)] + [0.5] # Эмпирическое наблюдение разброса качества рида
    min_soft = int(read_length / 25)
    max_soft = int(read_length * 4 / 5)
    left_num = np.random.choice(range(min_soft, max_soft), size=1)[0]
    right_num = 250 - left_num
    
    # Создаем сплит-рид и добавляем его свойства
    new_split_read = pysam.AlignedSegment(header=samfile_input.header) # Надо сделать нормальный хедер
    new_split_read.query_name = f"SRRread_split_r_{counter}"  #? Это норм?
    new_split_read.query_sequence=f"{region_left[-left_num:]}{region_right[:right_num]}"
    new_split_read.reference_name = chr_name
    if chr_name[3:] == "X":
        new_split_read.reference_id = 22
    elif chr_name[3:] == "Y":
        new_split_read.reference_id = 23
    elif type(chr_name[3:]) == int:
        new_split_read.reference_id = int(chr_name[3:]) - 1
    new_split_read.flag = np.random.choice([163, 99, 147, 83], size=1)[0]  #?  Допинфа от Полины и Кати
    new_split_read.mapping_quality = np.random.choice(a=range(20,61), size=1, p=mapq_probs)[0]
    new_split_read.reference_start = stop# - left_num # Потому что BAM 0-based
    new_split_read.next_reference_id = new_split_read.reference_id
    new_split_read.cigartuples = [(4, left_num), (0, right_num)]
    #new_split_read.cigartuples = [(0, 250)]
    new_split_read.query_qualities = array.array('B', np.random.choice(range(18,36), size=read_length).tolist())
    new_split_read.tags = (("NM", 1),
          ("RG", "L1"))  #?   Допинфа от Полины и Кати
    
    # Создаем парный ему рид
    
    pair_new_split_read = pysam.AlignedSegment(header=samfile_input.header) # Надо сделать нормальный хедер
    pair_new_split_read.query_name = f"SRRread_split_r_{counter}"  #? Это норм?
    pair_new_split_read.reference_name = chr_name
    pair_new_split_read.reference_id = new_split_read.reference_id

    pair_new_split_read.mapping_quality = new_split_read.mapping_quality
    pair_new_split_read.cigartuples = [(0, 250)]
    pair_new_split_read.tags = (("NM", 1),
          ("RG", "L1"))  #?   Допинфа от Полины и Кати
    pair_new_split_read.next_reference_id = new_split_read.reference_id

    
    # Если первый рид направлен направо, то:
    if new_split_read.flag in [99, 163]:
        next_start_coeff = np.random.choice(range(right_num - 90, right_num + 500), size = 1)[0] # Случайным образом отберем расстояние от второй границы делеции до старта парного рида
        new_split_read.next_reference_start = stop + next_start_coeff
        new_split_read.template_length = pair_new_split_read.next_reference_start - new_split_read.reference_start + 151
        pair_new_split_read.query_sequence=f"{region_right[next_start_coeff:next_start_coeff+250]}"
        if new_split_read.flag == 99:
            pair_new_split_read.flag = 147
        elif new_split_read.flag == 163:
            pair_new_split_read.flag = 83
           
        
    # А если налево, то:    
    elif new_split_read.flag in [83, 147]:
        next_start_coeff = np.random.choice(range(-750, -left_num -160), size = 1)[0] # Считаем, что риды могут перекрываться не более, чем на 90 нуклеотидов
        new_split_read.next_reference_start = start + next_start_coeff - 1
        new_split_read.template_length = - (new_split_read.reference_start - new_split_read.next_reference_start) - 151
        pair_new_split_read.query_sequence = f"{region_left[next_start_coeff:next_start_coeff+250]}"
        if new_split_read.flag == 147:
            pair_new_split_read.flag = 99
        elif new_split_read.flag == 83:
            pair_new_split_read.flag = 163
            
    pair_new_split_read.reference_start = new_split_read.next_reference_start
    pair_new_split_read.query_qualities = array.array('B', np.random.choice(range(18,36), size=read_length).tolist())
    pair_new_split_read.next_reference_start = new_split_read.reference_start     
    pair_new_split_read.template_length = - new_split_read.template_length 
    print(f"first_read: {new_split_read},\
          second_read: {pair_new_split_read}")
    return new_split_read, pair_new_split_read


In [5]:
def decreasing_coverage(samfile_input, samfile_output, chr_name, start, stop, probability):
    reads = samfile_input.fetch(chr_name, start, stop)
    for read in reads:
        if (read.reference_start > start) or (read.reference_start + 250 < stop):
            if np.random.choice([0,1], size=1, p=[1 - probability, probability]) == 0:
                samfile_output.write(read)
        else:
            samfile_output.write(read)

In [10]:
def increasing_coverage(samfile_input, samfile_output, chr_name, start, stop, probability):
    reads = samfile_input.fetch(chr_name, start, stop)
    for read in reads:
        if (read.reference_start > start) or (read.reference_start + 250 < stop):
            if np.random.choice([0,1], size=1, p=[1 - probability, probability]) == 1:
                samfile_output.write(read)
        samfile_output.write(read)

In [11]:
def duplication(samfile_input, samfile_output, chr_name, start, stop, probability):
    increasing_coverage(samfile_input, samfile_output, chr_name, start, stop, probability)
#     print(samfile_input.count_coverage(chr_name, start, stop, read_callback="nofilter"))
    
    
def deletion(samfile_input, samfile_output, chr_name, start, stop, probability, fasta_ref, lp_number=3):  # Добавить переменную гомо-гетеро для делеции и lp_number для количества "парочек"
    
    # Computing coverage, number of split reads and sequence of reference around deletion
    coverage = (int(sum(map(lambda x: read_cov_one_nucl(samfile_input, chr_name, x), [start, stop]))/2))
    split_num = int(coverage/3.5)
    region_left, region_right = read_del_tails(samfile_input, chr_name, start, stop, fasta_ref) # Вычисляем контекст референса до и после делеции
    print(split_num)
        
    # Creating split reads
    counter = 0
    for split_read in range(split_num):
        try:
            counter += 1
            new_read1_lb, new_read2_lb = create_split_reads_lb(samfile_input, region_left, region_right, counter, chr_name, start, stop, probability)
            samfile_output.write(new_read1_lb)
            samfile_output.write(new_read2_lb)
            new_read1_rb, new_read2_rb = create_split_reads_rb(samfile_input, region_left, region_right, counter, chr_name, start, stop, probability)
            samfile_output.write(new_read1_rb)
            samfile_output.write(new_read2_rb)
        except ValueError:
            print("Zdes' byl error")
            continue

    
    # Decreasing coverage
    decreasing_coverage(samfile_input, samfile_output, chr_name, start, stop, probability)  

In [13]:
cnvs = pd.read_csv(cnv_file)

with pysam.AlignmentFile(target_reads, "rb") as samfile_input, pysam.AlignmentFile(output_bam, "wb", template=samfile_input) as samfile_output, pysam.FastaFile(target_fasta) as fasta_ref:
    chroms = list(set(samfile_input.references) & set(cnvs["chromosome"]))
    true_cnvs = cnvs.query("chromosome in @chroms")
    for cnv_index in true_cnvs.index:
        cnv_type, chr_name = true_cnvs.loc[cnv_index, "type"], true_cnvs.loc[cnv_index, "chromosome"]
        start, stop = true_cnvs.loc[cnv_index, "position_start"], true_cnvs.loc[cnv_index, "position_finish"]
        probability = true_cnvs.loc[cnv_index, "probability"]
        if type(probability) is not np.float64:
            probability = 0.5
        if cnv_type == "del":
            deletion(samfile_input, samfile_output, chr_name, start, stop, probability, fasta_ref)
        elif cnv_type == "dup":
            duplication(samfile_input, samfile_output, chr_name, start, stop, probability)

12
first_read: SRRread_split_l_1	99	#18	29320191	60	134M116S	#18	29330435	-29320040	ACAGCTGTAGACTCACAGTTCTCTTTTTATTGTAAAGATGATTTCCCTGAAGTATTTTTAAGCTGGAAAGATTTTCTACAAGCTGGTGAAAAATCGCTCACTGCAGGAGATCCTATGCATTGGCAGAAATCCAGGGCACCTCGGCCCTCTGCTGGCCTCACACCCCCTCCAGGTGCCTGCCCATCATACAGTCCAGCCTGTCTTTGAGGCTAAGAGAAGATGATTGTCATCTCCAGCTGCCCCACAAATC	array('B', [34, 23, 23, 34, 34, 34, 27, 26, 27, 22, 28, 18, 21, 31, 19, 24, 32, 29, 30, 29, 27, 19, 23, 18, 32, 24, 21, 30, 24, 19, 22, 34, 20, 19, 28, 34, 33, 34, 30, 32, 28, 32, 23, 32, 23, 34, 32, 28, 35, 19, 34, 20, 18, 25, 33, 19, 32, 33, 29, 29, 35, 32, 23, 29, 19, 23, 23, 26, 25, 28, 18, 35, 33, 25, 28, 31, 34, 22, 35, 18, 25, 25, 33, 20, 22, 27, 25, 21, 23, 30, 28, 30, 34, 32, 30, 24, 24, 22, 35, 31, 26, 35, 24, 33, 18, 22, 20, 18, 21, 32, 23, 34, 26, 19, 21, 29, 26, 23, 33, 21, 33, 34, 25, 32, 32, 25, 24, 23, 26, 24, 29, 31, 22, 24, 26, 18, 31, 35, 24, 22, 23, 28, 28, 20, 29, 31, 29, 20, 33, 25, 24, 28, 32, 19, 20, 30, 28, 27, 23, 31, 29, 29, 22, 2

## Frameshift and another microevents

In [2]:
import array
import pysam
import numpy as np
import pandas as pd

In [3]:
input_bam = "second_nano.bam"
target_reads = "second_nano_target_fs.bam"
delete_bed = "fs_delete.bed"
output_bam = "second_nano_premod_fs.bam"
fs_file = "fs4.csv"

In [24]:
fs = pd.read_csv(fs_file)

with pysam.AlignmentFile(input_bam, "rb") as samfile_input, open(delete_bed, "w") as bed:
    chroms = list(set(samfile_input.references) & set(fs["chromosome"]))
    if len(chroms) == 0:
        # Поменять на raise
        print("Target chromosomes are absent in reference\nPlease, check correctness of your csv file or names of contigs")
    else:
        fs_true = fs.query("chromosome in @chroms")
        for fs_local in fs_true.index:
            chr_name = fs_true.loc[fs_local, "chromosome"]
            if pd.isna(fs_true.loc[fs_local, "position_finish"]):
                start, stop = fs_true.loc[fs_local, "position_start"], fs_true.loc[fs_local, "position_start"] + 1
            else:
                start, stop = fs_true.loc[fs_local, "position_start"], int(fs_true.loc[fs_local, "position_finish"] + 1)
            bed.write(f"{chr_name}\t{start}\t{stop}\n")    

Что-то на самтулсном

In [1]:
def cigar_del(read, ind_list):
    print(f"del_ind_list: {ind_list}\n\
            read_name: {read.query_name}")
    cigar_before = read.cigartuples        # Настраиваем строку cigar
    cigar_after, cur_num, flag = [], 0, "before_del"
    for cigartuple in cigar_before:
        if cigartuple[0] in [1, 2, 4, 5]:   # Делеции, инсерции и клипы не учитываются в референсных позициях
            cigar_after.append(cigartuple)
            
        elif cigartuple[0] == 0:
            if flag == "before_del":
                if cur_num + cigartuple[1] < ind_list[0]:
                    cigar_after.append(cigartuple)
                    cur_num += cigartuple[1]
                elif cur_num + cigartuple[1] == ind_list[0]:
                    cigar_after.append(cigartuple)
                    cur_num += cigartuple[1]
                    cigar_after.append((2, len(ind_list)))
                    #cur_num += len(ind_list)
                    flag = "into_del"
                    
                elif cur_num + cigartuple[1] > ind_list[0]:
                    if cur_num + cigartuple[1] - 1 <= ind_list[-1]:
                        cigar_after.append((0, ind_list[0] - cur_num))
                        cigar_after.append((2, len(ind_list)))
                        cur_num += (ind_list[0] - cur_num)
                        #cur_num += len(ind_list)
                        flag = "after_del"
                    elif cur_num + cigartuple[1] - 1 > ind_list[-1]:
                        cigar_after.append((0, ind_list[0] - cur_num))
                        cigar_after.append((2, len(ind_list)))
                        cigar_after.append((0, cigartuple[1] - (ind_list[0] - cur_num) - len(ind_list)))
                        cur_num += cigartuple[1]
                        flag = "after_del"
                        
            elif flag == "into_del":
                if cur_num + cigartuple[1] - 1 <= ind_list[-1]:
                    cur_num += cigartuple[1]
                elif cur_num + cigartuple[1] - 1 > ind_list[-1]:
                    cigar_after.append((0, cur_num + cigartuple[1] - 1 - ind_list[-1]))
                    cur_num += cigartuple[1]
                    
            elif flag == "after_del":
                cigar_after.append(cigartuple)

    return cigar_after

In [10]:
def cigar_ins(read, ins_len, start_index):
#     if start_index is None:
#         start_index = stop_index - 2
    cigar_before = read.cigartuples
    cigar_after, cur_num, flag = [], 0, "before_ins"
    
    for cigartuple in cigar_before:
        tuplength = cigartuple[1]
        tupletype = cigartuple[0]
        
        if flag == "before_ins":
            if tupletype != 2:
                if cur_num + tuplength < start_index:
                    cigar_after.append(cigartuple)
                    cur_num += tuplength
                    
                elif cur_num + tuplength == start_index:
                    cigar_after.append(cigartuple)
                    cur_num += tuplength
                    if cur_num + ins_len < 250:
                        cigar_after.append((1, ins_len))
                        cur_num += ins_len
                    else:
                        new_ins_len = 250 - cur_num
                        cigar_after.append((1, new_ins_len))
                        cur_num += new_ins_len
                        break
                    flag = "after_ins"
                    
                elif cur_num + tuplength > start_index:
                    if cur_num + tuplength + ins_len >= 250:
                        cigar_after.append((tupletype, start_index - cur_num))
                        cur_num += start_index - cur_num
                        if cur_num + ins_len <= 250:
                            cigar_after.append((1, ins_len))
                            cur_num += ins_len
                            cigar_after.append((tupletype, 250 - cur_num))
                            break
                        else:
                            cigar_after.append((1, 250 - cur_num))
                            break
                        
                    elif cur_num + tuplength + ins_len < 250:
                        cigar_after.append((tupletype, start_index - cur_num))
                        cigar_after.append((1, ins_len))
                        cigar_after.append((tupletype, tuplength - (start_index - cur_num)))
                        cur_num += tuplength
                        cur_num += ins_len
                        flag = "after_ins"
            else:
                cigar_after.append(cigartuple)
                    
        elif flag == "after_ins":
            if tupletype != 2:
                if cur_num + tuplength <= 250:
                    cigar_after.append(cigartuple)
                    cur_num += tuplength
                else:
                    cigar_after.append((tupletype, 250 - cur_num))
                    break
            else:
                cigar_after.append(cigartuple)

    return cigar_after

In [14]:
def microdeletion(read, number_list, chr_name, start, stop, probability):
    print(start, stop)
    if np.random.choice([0,1], size=1, p=[1 - probability, probability]) == 1:
        ind_list = []
        for nucl in range(start, stop): # проходимся по нуклеотидам
            if (nucl - 1) in number_list:
                ind_list.append(number_list.index(nucl - 1)) # 0-based позиция нуклеотида в риде
        if len(ind_list) == 0:
            return(read)
        
        read.cigartuples = cigar_del(read, ind_list)

        if ind_list[-1] < read.query_length - 1:  # Если последний нуклеотид из делетированных не на конце рида
            read.query_sequence = read.query_sequence[:ind_list[0]] + read.query_sequence[(ind_list[-1]+1):]              
        else:
            read.query_sequence = read.query_sequence[:ind_list[0]]
#     print(f"del: {read.query_name}\n\
#             len: {len(read.query_sequence)}\n\
#             cigar: {sum([i[1] for i in read.cigartuples])}, {read.cigartuples}\n\
#             start: {read.reference_start}")
    return read
    
    
def microinsertion(read, number_list, chr_name, start, stop, seq, probability):
    if np.random.choice([0,1], size=1, p=[1 - probability, probability]) == 1:
        ins_len = len(seq)
        if (start - 1 > number_list[0]) and (start < number_list[-1]):
            start_index, stop_index = number_list.index(start ), number_list.index(start)
            read.query_sequence = read.query_sequence[:start_index] + seq + read.query_sequence[stop_index:]
            read.cigartuples = cigar_ins(read, ins_len, start_index)
        elif (start - 1 > number_list[0]):
            start_index = number_list.index(start)
            read.query_sequence = read.query_sequence[:start_index] + seq
            read.cigartuples = cigar_ins(read, ins_len, start_index)
        if len(read.query_sequence) > 250:
            read.query_sequence = read.query_sequence[:250]
    print(f"ins: {read.query_name}\n\
        len: {len(read.query_sequence)}\n\
        cigar: {sum([i[1] for i in read.cigartuples])}, {read.cigartuples}\n\
        start: {read.reference_start},\n\
        flag: {read.flag}")
    return read

In [6]:
def microchanger(read, number_list, fs_true):  # Вместо mode в перспективе будет колонка в csv для каждого варианта
    for fs_local in fs_true.index:
        chr_name, fs_type, start, seq = fs_true.loc[fs_local, "chromosome"], fs_true.loc[fs_local, "type"], fs_true.loc[fs_local, "position_start"], fs_true.loc[fs_local, "sequence"]
        probability = fs_true.loc[fs_local, "probability"]
        if type(probability) is not np.float64:
            probability = 0.5
        if pd.isna(fs_true.loc[fs_local, "position_finish"]):
            stop = fs_true.loc[fs_local, "position_start"] + 1
        else:
            stop = int(fs_true.loc[fs_local, "position_finish"] + 1)
            
        if (start in number_list) or ((stop - 1) in number_list): # Так как размер делеций предполагается меньше размера рида, если часть делеции находится на риде, то либо начало, либо конец обязательно попадут на рид
            if fs_type == "del":
                read = microdeletion(read, number_list, chr_name, start, stop, probability)
            elif fs_type == "ins":
                read = microinsertion(read, number_list, chr_name, start, stop, seq, probability)
    return read

In [15]:
fs = pd.read_csv(fs_file)

with pysam.AlignmentFile(target_reads, "rb") as samfile_input, pysam.AlignmentFile(output_bam, "wb", template=samfile_input) as samfile_output:
    chroms = list(set(samfile_input.references) & set(fs["chromosome"]))
    fs_true = fs.query("chromosome in @chroms")
    reads = samfile_input.fetch()
    for read in reads:
        number_list = read.get_reference_positions()
        read_returned = microchanger(read, number_list, fs_true)
        samfile_output.write(read_returned)

ins: SRR1295433.151980473
        len: 250
        cigar: 254, [(0, 198), (2, 4), (0, 46), (1, 4), (0, 2)]
        start: 29353402,
        flag: 163
ins: SRR1295554.160759353
        len: 250
        cigar: 254, [(0, 197), (2, 4), (0, 53)]
        start: 29353403,
        flag: 163
ins: SRR1295554.160759367
        len: 250
        cigar: 254, [(0, 196), (2, 4), (0, 46), (1, 4), (0, 4)]
        start: 29353404,
        flag: 163
ins: SRR1295433.151980466
        len: 250
        cigar: 250, [(0, 250)]
        start: 29353408,
        flag: 147
ins: SRR1295433.151980467
        len: 250
        cigar: 250, [(0, 238), (1, 4), (0, 8)]
        start: 29353412,
        flag: 83
ins: SRR1295554.160759350
        len: 250
        cigar: 250, [(0, 250)]
        start: 29353415,
        flag: 147
ins: SRR1295433.151980478
        len: 250
        cigar: 250, [(0, 250)]
        start: 29353419,
        flag: 99
ins: SRR1295433.151980468
        len: 250
        cigar: 254, [(0, 179), (2, 4), (0

ins: SRR1295554.160761321
        len: 250
        cigar: 250, [(0, 250)]
        start: 29387978,
        flag: 113
29388020 29388024
ins: SRR1295433.151982349
        len: 250
        cigar: 250, [(0, 250)]
        start: 29387987,
        flag: 147
29388020 29388024
ins: SRR1295433.151982350
        len: 250
        cigar: 250, [(0, 250)]
        start: 29387989,
        flag: 147
29388020 29388024
ins: SRR1295433.151982359
        len: 250
        cigar: 250, [(0, 250)]
        start: 29388000,
        flag: 99
29388020 29388024
del_ind_list: [19, 20, 21, 22]
            read_name: SRR1295433.151982359
29388020 29388024
del_ind_list: [15, 16, 17, 18]
            read_name: SRR1295554.160761322
29388020 29388024
29388020 29388024
29388020 29388024
ins: SRR1295554.160761732
        len: 250
        cigar: 250, [(0, 193), (1, 20), (0, 37)]
        start: 29396182,
        flag: 83
ins: SRR1295554.160761733
        len: 250
        cigar: 255, [(0, 191), (1, 20), (0, 25), (1, 5), (0, 1

In [21]:
fs

Unnamed: 0,chromosome,type,position_start,position_finish,sequence,probability
0,chr19,del,29379425,,,0.5
1,chr19,del,29379416,29379418.0,,0.5
2,chr19,del,29379435,29379439.0,,0.5
3,chr19,del,29376000,29376005.0,,0.5
4,chr19,del,29365400,29365403.0,,0.5
5,chr19,ins,29364000,,GTG,0.5
6,chr19,ins,29364020,,GTTGTG,0.5
7,chr19,ins,29363991,,ATTA,0.5
8,chr19,ins,29396400,,AGTCC,0.5
9,chr19,ins,29353650,,TTTT,0.5
