## Create segment files

In [52]:
import pandas as pd
import sys
import os
from Bio import SeqIO

#------------ INPUT ------------ 
# Replicate number. Every path below is derived from it, so changing `rep`
# cannot leave the inputs and outputs pointing at different replicates.
rep = 0
bins = 4
ref_file = f"../100kb/ref_rep{rep}.fasta"
seg_file = f"../100kb/split/ref_seg_rep{rep}.txt"

#------------ OUTPUT ------------
# Prefix of the per-segment FASTA files (the bin id and ".fasta" are appended).
outfile_prefix = f"../100kb/split/rep{rep}_seg"

In [53]:
# Segment table: tab-separated, no header row in the file, so the column
# names are attached after loading.
segment_columns = ["bin_id", "ref_id", "start", "length"]
segments = pd.read_csv(seg_file, sep="\t", header=None)
segments.columns = segment_columns

# Read every FASTA record of the reference file into memory.
with open(ref_file) as ref_handle:
    reference_segments = list(SeqIO.parse(ref_handle, "fasta"))

In [54]:
# Write one FASTA file per row of `segments`, named
# <outfile_prefix><bin_id>.fasta, with a single header line followed by the
# sequence slice [start, start + length).
#
# assuming a single reference sequence: every slice is taken from the first
# record of the reference file; `ref_id` is only echoed into the header.
reference_seq = reference_segments[0].seq
for segment in segments.itertuples(index=False):
    start = segment.start
    end = start + segment.length
    header = (
        f">{segment.bin_id}\tref={segment.ref_id}"
        f",start={start},length={segment.length}\n"
    )
    # The `with` statement closes the file; no explicit close() is needed.
    with open(f"{outfile_prefix}{segment.bin_id}.fasta", "w") as f:
        f.write(header)
        f.write(str(reference_seq[start:end]))

## Distribute queries

In [14]:
import numpy as np
import pandas as pd
from Bio import SeqIO

#------------ INPUT ------------ 
bins = 64
rep = 0
er = 0.025

# Input paths are built from `rep` and `er`, so the files that are read always
# belong to the same replicate / error rate as the files that are written.
query_file = f"../100kb/queries/rep{rep}_e{er}.fastq"
search_file = f"../100kb/search/rep{rep}_e{er}.out"
#------------ OUTPUT ------------ 
output_prefix = f"../100kb/queries/rep{rep}"

In [20]:
# Turn the search output into a long table with one (read_id, bin_id) row per
# hit. Each line of `search_file` is parsed as two tab-separated fields:
# a read header and a comma-separated list of bin ids.
#
# Everything is read as text so the .str accessors below work even when a
# column happens to contain only digits.
matches = pd.read_csv(search_file, sep='\t', header=None, dtype=str)
matches.columns = ["read_id", "matches"]
# Keep only the part of the header before the first space. Taking element 0
# of the split works for headers with zero, one or several spaces.
matches["read_id"] = matches["read_id"].str.split(' ', n=1).str[0]
# One list of bin-id strings per read, then one row per list element.
matches["bin_id"] = matches["matches"].str.split(',')
matches = matches.drop(columns="matches").explode("bin_id")
# A trailing comma (or a read without hits) leaves blank/NaN entries behind.
matches = matches.replace(r'^\s*$', np.nan, regex=True).dropna() # drop empty rows
matches = matches.reset_index(drop=True)
matches["bin_id"] = pd.to_numeric(matches["bin_id"])
matches

Unnamed: 0,read_id,bin_id
0,0,8
1,1,8
2,1,52
3,1,12
4,2,51
5,2,1
6,4,30
7,5,62
8,6,49
9,7,28


In [21]:
# For every bin, write a FASTQ file containing the queries whose name appears
# in `matches` for that bin. A file is created for each bin even when no
# query matched it, in which case the file is left empty.
queries = list(SeqIO.parse(query_file, "fastq"))
for bin_id in range(bins):
    # Build the set of matching read ids once per bin so each membership test
    # is a hash lookup instead of rebuilding and scanning a list per query.
    bin_read_ids = set(matches.loc[matches["bin_id"] == bin_id, "read_id"])
    seg_out_file = f"{output_prefix}_seg{bin_id}_e{er}.fastq"
    with open(seg_out_file, "w") as output_handle:
        for query in queries:
            if query.name in bin_read_ids:
                SeqIO.write(query, output_handle, "fastq")

In [24]:
import subprocess
# Delete every empty regular file below the queries directory (find recurses
# into subdirectories). The command is given as an argument list, so it is
# executed directly without going through a shell.
remove_empty_files = ["find", ".", "-type", "f", "-empty", "-delete"]
subprocess.call(remove_empty_files, cwd='../100kb/queries')

0