In [35]:
from Bio import SeqIO
import numpy as np
from PIL import Image
from skimage.filters import gaussian
import pandas as pd

from scipy import spatial
from cmath import inf

In [38]:
def get_pos(record):
    """Pull the tile number, x/y position and sequence out of one read record.

    Only the first space-delimited token of ``record.description`` is used.
    That token is split on ``:`` and its fields 4, 5 and 6 (0-based) are
    converted to integers.
    NOTE(review): this layout looks like an Illumina-style read header -
    confirm against the sequencer output.

    Args:
        record: Object exposing ``description`` (str) and ``seq`` attributes.

    Returns:
        tuple: ``(tile_num, x_pos, y_pos, seq)`` where the first three items
        are ints and ``seq`` is ``str(record.seq)``.
    """
    # Split the header once and reuse the pieces for all three fields.
    header_fields = record.description.split(" ")[0].split(":")
    tile_num = int(header_fields[4])
    x_pos = int(header_fields[5])
    y_pos = int(header_fields[6])
    return tile_num, x_pos, y_pos, str(record.seq)


def generate_img(coordinates, op_path, r, blurred, sigma, img_size=30000):
    """Draw every point as a filled square and save the canvas as an image.

    The coordinates are shifted so that the smallest x and the smallest y
    land on index 0. Each point is then painted as a square of side
    ``2 * r - 1`` centred on it, clipped to the canvas. The first coordinate
    column indexes canvas rows and the second indexes canvas columns.

    Args:
        coordinates (numpy array): (N, 2) integer array of point positions.
        op_path (str): Destination path handed to ``PIL.Image.save``.
        r (int): Half-size of each square; ``r <= 0`` paints nothing.
        blurred (bool): When true, apply ``skimage.filters.gaussian`` to the
            canvas before saving.
        sigma: Standard deviation forwarded to the Gaussian filter.
        img_size (int): Side length of the square canvas in pixels. Defaults
            to 30000, the size that used to be hard-coded.

    Returns:
        None. The image is written to ``op_path`` in mode "L" with a dpi of
        300.
    """
    intensity = 1024 * 2
    rows = coordinates[:, 0] - np.min(coordinates[:, 0])
    cols = coordinates[:, 1] - np.min(coordinates[:, 1])
    img = np.zeros(shape=(img_size, img_size))

    # The guard keeps a non-positive r from producing a slice whose negative
    # end would wrap around the canvas.
    if r > 0:
        for row, col in zip(rows, cols):
            # One slice assignment covers the whole square; clamping the
            # bounds clips squares that overlap the canvas border.
            img[
                max(0, row - r + 1) : min(img_size, row + r),
                max(0, col - r + 1) : min(img_size, col + r),
            ] = intensity

    if blurred:
        img = gaussian(img, sigma=sigma)
    im = Image.fromarray(img)
    new_im = im.convert("L")
    new_im.save(op_path, dpi=(300.0, 300.0))


def process_seq_info(fastq_path, tile):
    """Collect the positions and sequences of all reads on one tile.

    Every record in the FASTQ file is parsed with ``get_pos``; records whose
    tile number equals ``tile`` are kept.

    Args:
        fastq_path (str): Path of the FASTQ file to read.
        tile (int): Tile number to keep.

    Returns:
        tuple: ``(coordinates, seq_set)``. ``coordinates`` is an (N, 2) array
        holding ``max(x) - x`` and ``max(y) - y`` for each kept read, and
        ``seq_set`` is an array of the N matching sequences in the same order.

    Raises:
        ValueError: If no read in the file belongs to ``tile``.
    """
    x_coordinate = []
    y_coordinate = []
    seq_set = []
    for record in SeqIO.parse(fastq_path, "fastq"):
        tile_num, x_pos, y_pos, seq = get_pos(record)
        if tile_num == tile:
            x_coordinate.append(x_pos)
            y_coordinate.append(y_pos)
            seq_set.append(seq)

    if not seq_set:
        # Without this check max() below fails with an unhelpful message.
        raise ValueError(
            "no reads found for tile {} in {}".format(tile, fastq_path)
        )

    x_values = np.array(x_coordinate)
    y_values = np.array(y_coordinate)
    # Both axes are flipped (max - value) rather than shifted (value - min),
    # so the read with the largest x / y ends up at coordinate 0.
    coordinates = np.column_stack(
        (x_values.max() - x_values, y_values.max() - y_values)
    )
    return coordinates, np.array(seq_set)


def count_nearest_pts(src, dst, radius):
    """Find the nearest ``src`` point for each ``dst`` point, rejecting ties.

    Args:
        src (numpy array): (N, 2) shape array. The kd-tree is built on this.
        dst (numpy array): (M, 2) shape array. For each point in this array,
            the nearest neighbor in ``src`` is looked up.
        radius (int): The maximum searching radius.

    Returns:
        res, idx: ``res[i]`` is the distance between ``dst[i]`` and its
        nearest neighbor. It is ``inf`` when there is no neighbor within
        ``radius`` and also when another ``dst`` point resolved to the same
        ``src`` point (ambiguous match). ``idx[i]`` is the index of the
        neighbor in ``src``; it equals ``len(src)`` when no neighbor was
        found.
    """
    tree = spatial.KDTree(src)
    res, idx = tree.query(dst, k=1, distance_upper_bound=radius)
    # Count how many dst points landed on each src index in a single pass
    # instead of rescanning idx once per element.
    hits_per_src = np.bincount(idx)
    res[hits_per_src[idx] > 1] = np.inf
    return res, idx


def tell_me_sequence(source_coord, target_coord, seq_set, radius=20):
    """Return the sequence of the source point nearest to each target point.

    Args:
        source_coord (numpy array): (N, 2) coordinates; the kd-tree is built
            on these.
        target_coord (numpy array): (M, 2) coordinates to look up.
        seq_set (numpy array): N sequences, indexed the same way as
            ``source_coord``.
        radius (int): Maximum search distance. Defaults to 20, the value that
            used to be hard-coded.

    Returns:
        numpy array: M sequences, one per target. A target with no source
        point within ``radius`` gets an empty string. Several targets may
        resolve to the same source point; each of them still receives that
        point's sequence.
    """
    seq_set = np.asarray(seq_set)
    tree = spatial.KDTree(source_coord)
    _, idx = tree.query(target_coord, k=1, distance_upper_bound=radius)
    # KDTree reports a missing neighbor as index tree.n, which is one past the
    # end of seq_set, so those entries must not be used for indexing.
    found = idx < tree.n
    sequences = np.full(np.shape(idx), "", dtype=seq_set.dtype)
    sequences[found] = seq_set[idx[found]]
    return sequences


In [33]:
# Load the reads of tile 1101 and show their flipped coordinates and sequences.
fastq_path = "/Users/qinhanhou/Desktop/DeindlLab/0729Poly/0824_NewData/data/0824.fastq"
coordinates, seq_set = process_seq_info(fastq_path, tile=1101)
print(coordinates)
print(seq_set)

# Render each read as a 5x5 square (r=3) and blur the canvas with sigma=4.
generate_img(
    coordinates,
    op_path="/Users/qinhanhou/Desktop/DeindlLab/0729Poly/0824_NewData/data/test.png",
    r=3,
    blurred=True,
    sigma=4,
)


[[14017 27611]
 [13951 27609]
 [14457 27605]
 ...
 [16800   166]
 [16820   136]
 [10822     0]]
['CGTCAACCATACCAGCAGAGGAAGCATCAGCACCAGCACGCTCCCAAGCAT'
 'TTTTATCGAAGCGCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGC'
 'GTAATTATACTCATCGCGAATATCCTTAAGAGGGCGTTCAGCAGCCAGCTT' ...
 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
 'CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC'
 'CTTCCCCCTTTTTTTTTTTTTTTTTNNNNNNNNTTTTTTTTNNNNNNNNTT']


In [39]:
# Look up the sequences of the reads closest to two hand-picked positions.
query_points = np.array([[14014, 27612], [13964, 27623]])
print(tell_me_sequence(coordinates, query_points, seq_set))

['CGTCAACCATACCAGCAGAGGAAGCATCAGCACCAGCACGCTCCCAAGCAT'
 'TTTTATCGAAGCGCGCATAAATTTGAGCAGATTTGTCGTCACAGGTTGCGC']
