# Create a 1MB query sequence that contains 500 local matches

# Set error rate

In [162]:
# Error-rate tag; it is embedded in the input and output file names below.
# Enable exactly one of the following lines.
error_rate = "0"    #   0%
#error_rate = "0025" # 2.5%
#error_rate = "005"  #   5%
#error_rate = "0075" # 7.5%
#error_rate = "010"  #  10%

# ------- INPUT -------
# FASTQ file of local matches sampled at the chosen error rate
match_file = "local_matches_{}.fastq".format(error_rate)

# FASTA file holding the 1MB sequence that the local matches are inserted into
query_file = "one_line.fasta"

# ------- OUTPUT -------
# FASTA file that receives the query sequence with insertions
query_with_insertions_file = "query_with_insertions_{}.fasta".format(error_rate)
# TSV file that receives the ground truth
ground_truth_file = "ground_truth_{}.tsv".format(error_rate)

## Read in local matches and 1MB query sequence

In [163]:
import sys
import os
from Bio import SeqIO
import pandas as pd

def check_file_exists(fn):
    """Raise an error if the path ``fn`` does not exist.

    Args:
        fn: Path to check, as accepted by ``os.path.exists``.

    Raises:
        SystemError: If ``fn`` does not exist. The message names the
            missing path so the failing input can be identified.
    """
    if not os.path.exists(fn):
        # SystemError is kept (rather than FileNotFoundError) so that any
        # existing handler for this function keeps working.
        raise SystemError("Error: File does not exist: {}\n".format(fn))

In [164]:
# Fail early if the FASTQ file with the local matches is missing.
check_file_exists(match_file)

# Materialise every FASTQ record so the matches can be counted, shuffled and indexed.
match_records = SeqIO.parse(match_file, "fastq")
local_matches = list(match_records)
# Show the first sequence as a quick sanity check of the parsed input.
local_matches[0].seq

Seq('CCAATGGATATTGGCTCGACACCACAGCGTCGATTCTTTCGTTAACCGCT', SingleLetterAlphabet())

In [165]:
# Fail early if the FASTA file with the query sequence is missing.
check_file_exists(query_file)

query = list(SeqIO.parse(query_file, "fasta"))
# Exactly one record is required because all insertions go into query[0].
# Explicit raises are used instead of `assert`, which is removed under
# `python -O`; an empty file also gets its own accurate message.
if not query:
    raise ValueError("Query file contains no sequences: " + query_file)
if len(query) > 1:
    raise ValueError("Query contains more than one sequence, pick which one to insert to.")

## Assign random location to each local match

In [166]:
# Length of the single query sequence; insertion positions are drawn from this range.
query_seq = query[0].seq
query_len = len(query_seq)
query_len

1048576

In [167]:
import random

# Fixed seed so the sampled insertion positions are reproducible between runs.
random.seed(42)
# One distinct 0-based query position per local match, in ascending order.
ran_ind_list = sorted(random.sample(range(query_len), len(local_matches)))
ran_ind_list[:5]

[851, 1199, 3546, 4520, 6805]

In [168]:
# shuffle local matches so that they would appear in the query in a random order
random.shuffle(local_matches) 

for i in range(len(local_matches)):
    local_matches[i].description = ran_ind_list[i]

local_matches[0:5]

[SeqRecord(seq=Seq('GAGTCCGAAATGTACATAGAATTTATGTAATTCTAGTTCGTTTGGGCGATTGCA...TCG', SingleLetterAlphabet()), id='l100-109', name='l100-109', description=851, dbxrefs=[]),
 SeqRecord(seq=Seq('TGATTGATCCGATTCACACCATATAGTAGGGCTGGTAACACCCCTCTCGAGTGT...GGC', SingleLetterAlphabet()), id='l150-101', name='l150-101', description=1199, dbxrefs=[]),
 SeqRecord(seq=Seq('CAGCACCGGCGCGGGCAAGAAAATCCATAGGGAGAATCAGCAATTAGTTCGACT...CCG', SingleLetterAlphabet()), id='l200-124', name='l200-124', description=3546, dbxrefs=[]),
 SeqRecord(seq=Seq('CGATGTGTGATCCTTAAACTTCGGGCAACTTACGCTGTCGCCCATCAGACAGTC...CTA', SingleLetterAlphabet()), id='l150-97', name='l150-97', description=4520, dbxrefs=[]),
 SeqRecord(seq=Seq('CCGCATTCTGGTGACACAGACCGCCTGAATGCCATCCCTGCGTCTCCAAATGAA...GAT', SingleLetterAlphabet()), id='l200-43', name='l200-43', description=6805, dbxrefs=[])]

## Insert local matches into query at random positions

In [169]:
# Sequence of the single query record, before any insertion.
original_query = query[0].seq

# Ground-truth columns; the insertion loop appends one entry to each per local match.
id_list, position_list, length_list = [], [], []

In [170]:
# Total number of bases inserted so far; shifts original positions into
# coordinates of the growing output sequence.
insertion_length = 0
# Start with the untouched part of the query that precedes the first insertion.
query_with_insertions = original_query[0:local_matches[0].description]

last_index = len(local_matches) - 1
for i, match in enumerate(local_matches):
    # Position of this insertion in the ORIGINAL query (set when positions were assigned).
    origin = match.description
    # Position of this insertion in the query WITH insertions.
    insertion_position = origin + insertion_length

    # gather ground truth
    id_list.append(match.name)
    position_list.append(insertion_position)
    length_list.append(len(match.seq))

    if i == last_index:
        # edge case: after the last local match, append the rest of the query
        postfix = original_query[origin:]
    else:
        # original bases between this insertion point and the next one
        postfix = original_query[origin:local_matches[i + 1].description]

    # Everything built so far, then the local match, then the next original stretch.
    query_with_insertions = query_with_insertions[0:insertion_position] + match.seq + postfix
    insertion_length += len(match.seq)

In [171]:
# Sanity check: the output must be longer than the original query by exactly
# the total length of all inserted local matches.
added_length = len(query_with_insertions) - len(original_query)
assert added_length == insertion_length,"Wrong length of sequence inserted."

In [175]:
# Write the query with insertions as a single FASTA record with the header ">1".
# The with-statement closes the file when the block exits, so no explicit
# close() call is needed afterwards.
with open(query_with_insertions_file, 'w') as f:
    f.write('>1\n')
    f.write(str(query_with_insertions))

## Create ground truth file

In [176]:
# One row per inserted local match: record name, 0-based start position in
# the query with insertions, and length of the inserted sequence.
ground_truth = dict(
    id=id_list,
    position=position_list,
    length=length_list,
)

ground_truth_df = pd.DataFrame(ground_truth)
# Tab-separated output without the DataFrame's row index.
ground_truth_df.to_csv(ground_truth_file, sep='\t', index=False)
ground_truth_df.head()

Unnamed: 0,id,position,length
0,l100-109,851,100
1,l150-101,1299,150
2,l200-124,3796,200
3,l150-97,4970,150
4,l200-43,7405,200
