In [1]:
import numpy 
from Bio import SeqIO
import gzip

In [2]:
def read_fastq(file_path):
    """
    Load every read from a gzip-compressed FASTQ file.

    - `file_path`: path of the FASTQ file; it is always opened with
      `gzip.open` in text mode.

    Returns a list holding one plain-string sequence per FASTQ record,
    in the order the records appear in the file.
    """
    with gzip.open(file_path, "rt") as fastq_stream:
        return [str(read.seq) for read in SeqIO.parse(fastq_stream, "fastq")]


def align_and_score(sequence, barcode, end_length):
    """
    Placeholder for the barcode alignment/scoring step (not implemented yet).

    - `sequence`: the read to search; only its tail is sliced out.
    - `barcode`: the barcode that should eventually be aligned to that tail.
    - `end_length`: number of trailing characters of `sequence` to keep.

    Always returns None for now: the tail is sliced, but no alignment is
    performed and no score is computed.
    """
    # Tail of the read. Note that `end_length == 0` yields the whole
    # sequence, because `sequence[-0:]` is the same as `sequence[0:]`.
    sequence_end = sequence[-end_length:]

    # TODO: align `barcode` against `sequence_end` and return the score.
    return None

In [3]:
from Bio import pairwise2
from Bio.SubsMat import MatrixInfo



In [4]:
# Example read used by the alignment demo cells in this notebook.
# NOTE(review): the trailing "barcode at the end" remark may be outdated —
# Barcode03 (GAGTCTTGTGTCCCAGTTACCAGG) occurs verbatim near the START of this
# read, and the local-alignment cell only looks at the first 100 bases.
sequence = "TGTTGTGTAAGGTGTACCTCGTTCAGTTACGTATTGCTGGTTAAGAGTCTTGTGTCCCAGTTACCAGGCAGCACCACCCCCAAGGGGTTATGCTAGTTATTGCTCAGCGGTGGCAGCAGCCAACTCAGCTTCCTTTCGGGCTTTGTTCGCGGCCGGATCTCAGTGGTGGTGGTGGTGGTGCTCGAGCCAGTCATTCTCCTTAGTATAAGGGTGTGACCAGATGGCAACTACTGAAGCACGACTTGAGCCAAGCGTTGTACATCCCTTCGATGTCTTCCGGAGAGCCACCTTTCTTAGCCAAAAATGGCTTGATAGTGGCGGTGATAGGATAGATATTTGCGATAAGATAACGAAGTGGGATATGGGGCACGGTGCGTACTCCGTCGACCCCTTTCTTTGAACGGTGATGACGAAGCCCAACTTCGTACTGGTAGTCTAACCATTCACGGTTATAGTCGCGCCAGGTAGTGTCCAGAATCCAGGCTCCAAAGCGAGCGCGTACACGTTCCAGGTATTCCTTAATAGGCTCTCCTGTATCCGGATTGGAGAAGTAATAAATCAAATGCTCATTAAATGTTAGCAACCATCCCACAAGTCAAGGATCTCATCAACTTGATCTTTCAGAACGTCACGCGCCTTTTTCAAGTACATTACGTCCTTTTCACCTAACATGGCAGTCTTCTTCAGAAGGTCAAACTCAAGGTCCGTGATGGGTGACTTCTGAGCACGCCCATAATCATCTCCGGGATGTCCGAGGGAGTCATCTATATCTCCTTCTTAAAGTTAAACAAAATTATTTCTAGAGGGGAATTGTTATCCGCTCACAATTCCCCTATAGTGAGTCGTATTAATTTCGGGATGGAATTTCTAGAGTCCTGAAGTAAGTGTGGGTGAGCAATACATAG"  # Example with barcode at the end

# Candidate barcodes to test against the read, keyed by barcode name.
# Each barcode listed here is 24 nucleotides long.
barcodes = {"Barcode01" : "AAGAAAGTTGTCGGTGTCTTTGTG",
            "Barcode02" : "TCGATTCCGTTTGTAGTCGTCTGT",
            "Barcode03" : "GAGTCTTGTGTCCCAGTTACCAGG"
            }

# Alignment settings passed to the alignment helpers.
# Of these keys, only "open_gap1" and "extend_gap1" are read by code visible
# in this notebook (the global-alignment helper); the local-alignment helper
# accepts `config` but uses hard-coded gap penalties and window size instead.
# NOTE(review): the recorded output of the `config` display cell shows
# positive gap values (40 / 160), so this cell was presumably edited after
# that output was captured — re-run the notebook top to bottom to confirm.
config = {
    "start_gap1": 40,
    "end_gap1": 40,
    "open_gap1": 0,
    "extend_gap1": -40,
    "start_gap2": 40,
    "end_gap2": 40,
    "open_gap2": -160,
    "extend_gap2": -160,
    "min_score_barcode_front": 60.0,
    "front_window_size": 150,
    "rear_window_size": 150,
}

In [5]:
def scoring_matrix():
    """
    Build the nucleotide substitution scores used for barcode alignment.

    Returns a fresh dict mapping every ordered pair of symbols drawn from
    A, C, G, T, N to an integer score: identical bases score positively,
    differing bases score strongly negatively, and any pair that involves
    N scores 0.
    """
    alphabet = "ACGTN"
    # Row-major table: row = first symbol of the pair, column = second symbol.
    score_rows = (
        (96, -316, -192, -369, 0),    # A vs A, C, G, T, N
        (-316, 100, -352, -295, 0),   # C vs ...
        (-192, -352, 98, -329, 0),    # G vs ...
        (-369, -295, -329, 100, 0),   # T vs ...
        (0, 0, 0, 0, 0),              # N vs ...
    )
    return {
        (first, second): value
        for first, row in zip(alphabet, score_rows)
        for second, value in zip(alphabet, row)
    }

In [8]:
def get_theoretical_max_score(barcode, scoring_matrix):
    """
    Score a barcode as if it were aligned perfectly against itself.

    - `barcode`: the barcode sequence; 'N' positions contribute nothing.
    - `scoring_matrix`: dict keyed by (symbol, symbol) tuples.

    Returns the sum of the self-match scores of every non-'N' position.
    This is a per-base sum rather than `len(barcode)` times one match
    score, so matrices with different match scores per base are handled.
    Raises KeyError if a non-'N' symbol has no self-match entry.
    """
    return sum(
        scoring_matrix[(base, base)]
        for base in barcode
        if base != 'N'
    )

def get_best_scored_segment(seq1, seq2, config, top_k=20):
    """
    Locally align `seq2` against the start of `seq1` and pick the best hit.

    - `seq1`: the read; only its first 100 characters are aligned.
    - `seq2`: the barcode to look for.
    - `config`: accepted for signature compatibility but not read here;
      gap open/extend penalties are fixed at -40 / -40.
    - `top_k`: how many of the alignments returned by `localds` to examine.

    Each examined alignment is re-scored over its `begin`..`end` columns using
    substitution scores only (pairs absent from the matrix, such as gap
    columns, count as 0). The alignment with the highest re-score is returned
    as the 5-tuple produced by `pairwise2`; None is returned when no examined
    alignment re-scores above 0.
    """
    matrix = scoring_matrix()
    read_window = seq1[:100]

    # Smith-Waterman local alignment, truncated to the first `top_k` results.
    candidates = pairwise2.align.localds(read_window, seq2, matrix, -40, -40)[:top_k]

    winner, winner_score = None, 0
    for candidate in candidates:
        column_a, column_b, _score, start, stop = candidate

        # Substitution-only score of the aligned region.
        rescored = sum(
            matrix.get(pair, 0)
            for pair in zip(column_a[start:stop], column_b[start:stop])
        )

        # Strict comparison: on ties the earliest candidate is kept.
        if rescored > winner_score:
            winner, winner_score = candidate, rescored

    return winner

# Demo: locally align every barcode against the example read and report,
# per barcode, the theoretical maximum score followed by the chosen alignment.
for barcode_name, barcode_seq in barcodes.items():
    sm = scoring_matrix()

    # Score the barcode would reach when matched perfectly against itself.
    theoretical_max_score = get_theoretical_max_score(barcode_seq, sm)
    print(theoretical_max_score)

    alignment = get_best_scored_segment(sequence, barcode_seq, config)
    if not alignment:
        # Nothing was selected for this barcode, so there is nothing to show.
        continue

    aligned_seq1, aligned_seq2, score, begin, end = alignment
    # Alignment score expressed as a percentage of the theoretical maximum.
    norm_score = (score / theoretical_max_score) * 100
    print("Aligned Sequences:", aligned_seq1, aligned_seq2, sep="\n")
    print("Alignment Score:", norm_score)
    print("Alignment starts at position", begin, "and ends at position", end)


2364
Aligned Sequences:
TGTTGTGTAAGGTGTACCTCGTTCAGTTAC--GTAT-TGCTGGTTAAGAGTCTTGTGTCCCAGTTACCAGGCAGCACCACCCCCAAGGGGTTATGCTAGTTAT
-----------------------AAG--A-AAGT-TGT-C-GG-T----GTCTT-TGTG--------------------------------------------
Alignment Score: 54.145516074450086
Alignment starts at position 24 and ends at position 58
2380
Aligned Sequences:
TGTTGTGTAAGGTGTACCTCG-TTCAGTTACGTATTGCTG-GTTAAGA-GTCTTGTGTCCCAGTTACCAGGCAGCACCACCCCCAAGGGGTTATGCTAGTTAT
------------------TCGATTC-----CGT-TTG-T-AG-T----CGTC-TGT-----------------------------------------------
Alignment Score: 59.15966386554622
Alignment starts at position 18 and ends at position 56
2370
Aligned Sequences:
TGTTGTGTAAGGTGTACCTCGTTCAGTTACGTATTGCTGGTTAAGAGTCTTGTGTCCCAGTTACCAGGCAGCACCACCCCCAAGGGGTTATGCTAGTTAT
--------------------------------------------GAGTCTTGTGTCCCAGTTACCAGG--------------------------------
Alignment Score: 100.0
Alignment starts at position 44 and ends at position 68


In [82]:
def get_best_scored_segment_global(seq1, seq2, config, top_k=5):
    """
    Globally align `seq2` against the whole of `seq1` and pick the best hit.

    - `seq1`: the read (used in full, no windowing).
    - `seq2`: the barcode to align.
    - `config`: dict providing the gap penalties under the keys
      "open_gap1" and "extend_gap1".
    - `top_k`: how many of the alignments returned by `globalds` to examine.

    Each examined alignment is re-scored over all of its columns using
    substitution scores only (pairs absent from the matrix, such as gap
    columns, count as 0). The alignment with the highest re-score is returned
    as the 5-tuple produced by `pairwise2`; None is returned when no examined
    alignment re-scores above 0.
    """
    matrix = scoring_matrix()

    # Needleman-Wunsch global alignment, truncated to the first `top_k` results.
    candidates = pairwise2.align.globalds(
        seq1, seq2, matrix, config["open_gap1"], config["extend_gap1"]
    )[:top_k]

    winner, winner_score = None, 0
    for candidate in candidates:
        column_a, column_b, _score, _begin, _end = candidate

        # Substitution-only score across the entire alignment.
        rescored = sum(matrix.get(pair, 0) for pair in zip(column_a, column_b))

        # Strict comparison: on ties the earliest candidate is kept.
        if rescored > winner_score:
            winner, winner_score = candidate, rescored

    return winner


# Demo: globally align every barcode against the example read and print the
# chosen alignment together with its raw (un-normalised) alignment score.
for barcode_name, barcode_seq in barcodes.items():
    alignment = get_best_scored_segment_global(sequence, barcode_seq, config)
    if not alignment:
        # Nothing was selected for this barcode, so there is nothing to show.
        continue

    aligned_seq1, aligned_seq2, score, begin, end = alignment
    print("Aligned Sequences:", aligned_seq1, aligned_seq2, sep="\n")
    print("Alignment Score:", score)
    print("Alignment starts at position", begin, "and ends at position", end)


Aligned Sequences:
TGTGTGCATGTACTCGTTCAGTTACGTATTGCTGGACGAAGAACTCAAGTCAAAGGCATCTCAATCCCGCGAAATTAATACGACTCACTATAGGGGAATTGTCAGCGGATAACAATTCCCCTCTAAATAATTTTGTTTAACTTT
-----------------------A---A------GA--AAG---T----T-----G--------T--CG-G-------T--G--T--C----------TT-T-----G-------------------------TG---------
Alignment Score: -2436.0
Alignment starts at position 0 and ends at position 144
Aligned Sequences:
TGTGTGCATGTACTCGTTCAGTTACGTATTGCTGGACGAAGAACTCAAGTCAAAGGCATCTCAATCCCGCGAAATTAATACGACTCACTATAGGGGAATTGTCAGCGGATAACAATTCCCCTCTAAATAATTTTGTTTAACTTT
-------------------------------------------------TC----G-A--T---T--C-CG---TT--T--G--------T------A--GT---C-G-------------TC----------TG--------T
Alignment Score: -2420.0
Alignment starts at position 0 and ends at position 144
Aligned Sequences:
TGTGTGCATGTACTCGTTCAGTTACGTATTGCTGGACGAAGAACTCAAGTCAAAGGCATCTCAATCCCGCGAAATTAATACGACTCACTATAGGGGAATTGTCAGCGGATAACAATTCCCCTCTAAATAATTTTGTTTAACTTT
---------------G---AG-T-C----T--T-------G---T---GT------

In [53]:
# Display the current alignment configuration.
# NOTE(review): the recorded output of this cell does not match the values in
# the cell that defines `config` (e.g. open_gap1 is 0 there, 40 here), and the
# execution counts are out of order — restart the kernel and run all cells.
config

{'start_gap1': 40,
 'end_gap1': 40,
 'open_gap1': 40,
 'extend_gap1': 40,
 'start_gap2': 40,
 'end_gap2': 40,
 'open_gap2': 160,
 'extend_gap2': 160,
 'min_score_barcode_front': 60.0,
 'front_window_size': 150,
 'rear_window_size': 150}