In [1]:
!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa

--2025-02-08 20:36:57--  http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/phix.fa
Resolving d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)... 13.227.44.207, 13.227.44.33, 13.227.44.91, ...
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.227.44.207|:80... connected.
200 OKequest sent, awaiting response... 
Length: 5528 (5.4K) [application/octet-stream]
Saving to: ‘phix.fa’


2025-02-08 20:36:58 (48.4 MB/s) - ‘phix.fa’ saved [5528/5528]



In [3]:
def read_genome(filename):
    genome = ''
    with open(filename, 'r') as f:
        for line in f:
            if not line[0] == '>':
                genome += line.rstrip() # strip out any whitespace, newline,...
    return genome

In [4]:
genome = read_genome("phix.fa")

In [6]:
def naive(whole_genome : str, reference : str) -> list:
    """
    Naive exact matching
    @param whole_genome the whole genome
    @param reference the short sequence to match
    @return the list of matching offsets
    """
    matching_offsets = []
    outer_loop_len = len(whole_genome) - len(reference) + 1
    for i in range(outer_loop_len):
        matched = True
        for j in range(len(reference)):
            if whole_genome[i + j] != reference[j]:
                matched = False
                break
        if matched:
            matching_offsets.append(i)
    return matching_offsets

In [7]:
test_genome = "ATCGAAAATTTCGATCGAAAATTTCAAAAAAA"
test_reference = "AAA"
test_matched = naive(test_genome, test_reference)
test_matched

[4, 5, 17, 18, 25, 26, 27, 28, 29]

In [10]:
# Matching real reads
!wget http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.first1000.fastq

--2025-02-08 21:01:48--  http://d28rh4a8wq0iu5.cloudfront.net/ads1/data/ERR266411_1.first1000.fastq
13.227.44.91, 13.227.44.207, 13.227.44.144, ...a8wq0iu5.cloudfront.net)... 
Connecting to d28rh4a8wq0iu5.cloudfront.net (d28rh4a8wq0iu5.cloudfront.net)|13.227.44.91|:80... connected.
200 OKequest sent, awaiting response... 
Length: 254384 (248K) [audio/mpeg]
Saving to: ‘ERR266411_1.first1000.fastq’


2025-02-08 21:01:49 (6.32 MB/s) - ‘ERR266411_1.first1000.fastq’ saved [254384/254384]



In [11]:
def read_fastq(filename):
    sequences = []
    qualities = []
    with open(filename, "r") as fh:
        while True:
            fh.readline()
            seq = fh.readline().rstrip()
            fh.readline()
            Q_encoded = fh.readline().rstrip()
            if len(seq) == 0:
                break
            sequences.append(seq)
            qualities.append(Q_encoded)
    return sequences, qualities

In [12]:
seqs, _ = read_fastq("ERR266411_1.first1000.fastq")

In [17]:
num_matched = 0
n = 0
for sequence_fastq in seqs:
    # Note that the accuracy decrease as we move to the end of the sequence, so if we only take the first prefix of 30 bases, matched proportion is gonna be high!
    # sequence_fastq = sequence_fastq[:30]
    if len(naive(genome, sequence_fastq)) > 0:
        num_matched += 1
    n += 1
print("Percentage of matched = {:.3f}".format(float(num_matched)/n))

Percentage of matched = 0.007


In [19]:
# Also, note that the sequencer may sometimes switch to the coding strand so the interchanging between the double strands
# may lead to the little success if we only restrict the matching process to the template strand

In [38]:
# This function's implementation here is different from that in the "Longest_Common_Prefix" module. 
# This also involves the reverse of the complementary strand because DNA-Polymerase III can only read from 5' to 3'.
# Applying this to the slide analogy, after the template strand is chopped up, its subsequences have to be flipped before stuck on the board.
# So we have to flip the complementary strand once again to compensate for that flip-flop.
def complement_strand(template_strand):
    complimentary_key = {'A':'T', 'T':'A', 'C':'G', 'G':'C', 'N':'N'}
    try:
        complement = "".join([complimentary_key[base] for base in reversed(template_strand)])
        return complement
    except:
        print("The template strand should only contain A, T, C, and G")

In [39]:
num_matched_ = 0
n_ = 0
for sequence_fastq in seqs:
    # Note that the accuracy decrease as we move to the end of the sequence, so if we only take the first prefix of 30 bases, matched proportion is gonna be high!
    sequence_fastq = sequence_fastq[:30]
    matching_set = naive(genome, sequence_fastq)
    matching_set.extend(naive(genome, complement_strand(sequence_fastq)))
    if len(matching_set) > 0:
        num_matched_ += 1
    n_ += 1
print("Percentage of matched = {:.3f}".format(float(num_matched_)/n_))

Percentage of matched = 0.932
