# Create a 1 MB query sequence that contains 500 local matches

# Set error rate

In [45]:
# Error rate of the sampled local matches, encoded the way it appears in file names.
# Available settings: "0" (0%), "0025" (2.5%), "005" (5%), "0075" (7.5%), "010" (10%).
error_rate = "010"

# 1 MB sequence that the local matches will be inserted into.
query_file = "one_line.fasta"

# FASTQ file holding the local matches sampled at the chosen error rate.
match_file = "local_matches_{}.fastq".format(error_rate)

# FASTA file that the query with the inserted matches is written to.
query_with_insertions_file = "query_with_insertions_{}.fasta".format(error_rate)

## Read in local matches and 1MB query sequence

In [46]:
import sys
import os
from Bio import SeqIO
import pandas as pd

def check_file_exists(fn):
    """Raise if the path *fn* does not exist.

    Args:
        fn: Path to check.

    Raises:
        SystemError: If ``os.path.exists(fn)`` is false. The message names the
            offending path, because this helper is called for several input
            files and the bare message did not say which one was missing.
    """
    if not os.path.exists(fn):
        # SystemError is kept (rather than FileNotFoundError) so that any
        # existing handler for this helper continues to work.
        raise SystemError("Error: File does not exist: {}\n".format(fn))

In [47]:
# Fail early if the FASTQ file with the sampled local matches is missing.
check_file_exists(match_file)

# Materialise all FASTQ records in a list so they can be indexed, counted and shuffled later.
local_matches = list(SeqIO.parse(match_file, "fastq"))
# Bare expression: displays the first match's sequence as the notebook cell output.
local_matches[0].seq

Seq('GGTGGCTTAAAAGGAAGAACAGGCACCTGATAGCGAGGGGAGCAGCCTGT', SingleLetterAlphabet())

In [48]:
# Fail early if the FASTA file with the query sequence is missing.
check_file_exists(query_file)

# Load every FASTA record; exactly one sequence is expected as the insertion target.
query = list(SeqIO.parse(query_file, "fasta"))

# Explicit raises instead of `assert`, so the validation is not stripped under
# `python -O`, and an empty file gets an accurate message instead of the
# misleading "more than one sequence".
if not query:
    raise ValueError("Query contains no sequences.")
if len(query) > 1:
    raise ValueError("Query contains more than one sequence, pick which one to insert to.")

## Assign random location to each local match

In [49]:
# Length of the first (and only expected) query sequence.
query_len = len(query[0].seq)
# Bare expression: shows the length as the notebook cell output.
query_len

1048576

In [50]:
import random

# Fixed seed so the same positions are drawn on every run.
random.seed(42)

# One distinct index into the query per local match, in ascending order.
ran_ind_list = sorted(random.sample(range(query_len), len(local_matches)))

# Bare expression: preview the first five positions as the cell output.
ran_ind_list[0:5]

[851, 1199, 3546, 4520, 6805]

In [51]:
# Randomise the order of the matches so that their order along the query does
# not follow their order in the input file.
random.shuffle(local_matches)

# Give the idx-th shuffled match the idx-th position. The integer position is
# stored in the record's `description` attribute.
for idx, record in enumerate(local_matches):
    record.description = ran_ind_list[idx]

# Bare expression: preview the first five records as the cell output.
local_matches[0:5]

[SeqRecord(seq=Seq('GAGACCTCTGGTAGTATGAAGAGGTTCTTATGACACAAGACTGGTCGACGGACT...CGA', SingleLetterAlphabet()), id='l100-109', name='l100-109', description=851, dbxrefs=[]),
 SeqRecord(seq=Seq('GCGGGATCTTAGGACCTCGGTTAAATTATTTTCGCTAGATCACCCAAGCGTTCT...TAG', SingleLetterAlphabet()), id='l150-101', name='l150-101', description=1199, dbxrefs=[]),
 SeqRecord(seq=Seq('AAGGGGCCTCAGGTTCTGTTACTTGTTAAATATGGGCGGGAAGTGCTGCAGTTT...ACG', SingleLetterAlphabet()), id='l200-124', name='l200-124', description=3546, dbxrefs=[]),
 SeqRecord(seq=Seq('AGTAGCATTTGAAGCTGGGCACACACTGGCGGGTAGGAGAACGATCGCTGTTGA...TAG', SingleLetterAlphabet()), id='l150-97', name='l150-97', description=4520, dbxrefs=[]),
 SeqRecord(seq=Seq('AGATCCTCCAGTGAGCGTCTCAAACCCTCAATTGAGACCTGTCTAAGAAGGATT...AAA', SingleLetterAlphabet()), id='l200-43', name='l200-43', description=6805, dbxrefs=[])]

## Insert local matches into query at random positions

In [52]:
# Sequence of the first query record, bound to its own name.
original_query = query[0].seq

In [53]:
# Splice every local match into the query at the position stored in its
# `description` attribute. Positions index the ORIGINAL query and are expected
# in ascending order (the position list is sorted before it is assigned).
#
# The pieces are collected in a list and joined once, instead of re-slicing and
# re-concatenating the whole ~1 MB sequence for every match. The last match no
# longer needs a separate code path, and an empty `local_matches` leaves the
# query unchanged instead of failing on `local_matches[0]`.
original_text = str(original_query)
pieces = []
insertion_length = 0  # total number of inserted characters
cursor = 0            # start of the part of the original query not yet copied

for match in local_matches:
    position = match.description
    pieces.append(original_text[cursor:position])  # untouched stretch before this match
    pieces.append(str(match.seq))
    insertion_length += len(match.seq)
    cursor = position

pieces.append(original_text[cursor:])  # remainder after the last insertion

# Rebuild with the original's own type so later code sees the same kind of
# sequence object as before.
query_with_insertions = type(original_query)("".join(pieces))

In [54]:
# Sanity check: the result must be exactly as long as the original query plus
# all inserted characters.
assert len(query_with_insertions) == len(original_query) + insertion_length, "Wrong length of sequence inserted."

In [55]:
# Write the result as a single-record FASTA file: the header line ">1" followed
# by the whole sequence on one line (no trailing newline is written).
# The `with` block closes the file on exit, so the former explicit `f.close()`
# was redundant and has been removed.
with open(query_with_insertions_file, 'w') as f:
    f.write('>1\n')
    f.write(str(query_with_insertions))