In [1]:
from Bio import SeqIO
from Bio import pairwise2
import numpy as np
import glob
import os
import pandas as pd

def align_ab1(input_file, reference_file, read_length=750,
              start_pattern="ATGCCG", end_pattern="CCACTGA"):
    """
    Align the leading part of an AB1 read to a reference and call variants.

    The first ``read_length`` bases of the AB1 sequence are globally aligned
    to the reference. Both aligned strings are then trimmed to the window that
    begins at ``start_pattern`` and ends with ``end_pattern`` (both searched in
    the aligned reference), and every column where the read differs from the
    reference is reported as ``<ref><position><read>``.

    Columns are skipped when the reference base is ``N`` or when the read has
    a gap (``-``). Positions are 1-based and counted along the reference from
    the first base of ``start_pattern`` (offset by the alignment's reported
    begin index, which is what the original code used as the counter origin).

    Failures (unreadable file, no alignment, anchor pattern missing) are
    reported with ``print`` and do not raise, so a batch run keeps going.

    :param input_file: Path to the input AB1 file
    :param reference_file: Path to the reference sequence file in FASTA format
    :param read_length: Number of leading AB1 bases to align (default 750)
    :param start_pattern: Motif in the reference marking the start of the window
    :param end_pattern: Motif in the reference marking the end of the window
    :return: ``(file_name, variants)`` where ``file_name`` is the base name of
        ``input_file`` and ``variants`` is an underscore-joined string of calls
        (empty string if none), or ``None`` if the read could not be processed
    """
    file_name = os.path.basename(input_file)
    # Bound up front so the final return is always valid, even after an error.
    variants = None

    try:
        # Read the AB1 file
        record = SeqIO.read(input_file, "abi")
        ab1_seq = record.seq[:read_length]

        # Read the reference file
        reference = SeqIO.read(reference_file, "fasta")
        ref_seq = reference.seq

        # Global alignment: match +2, mismatch -1, gap open -1000, gap extend -1.
        alignments = pairwise2.align.globalms(ref_seq, ab1_seq, 2, -1, -1000, -1)
        if not alignments:
            print("No alignment found.")
            return file_name, None

        # Take the best alignment
        alignment = alignments[0]
        aligned_ref, aligned_seq, start = alignment[0], alignment[1], alignment[3]

        # Locate the analysis window in the aligned reference. find() returns -1
        # when a motif is absent; using that as a slice bound would silently
        # produce a wrong window, so bail out instead.
        pattern_start = aligned_ref.find(start_pattern)
        if pattern_start == -1:
            print(f"Start pattern {start_pattern} not found in aligned reference for {file_name}.")
            return file_name, None

        # Search for the end motif only at or after the start motif.
        end_hit = aligned_ref.find(end_pattern, pattern_start)
        if end_hit == -1:
            print(f"End pattern {end_pattern} not found in aligned reference for {file_name}.")
            return file_name, None
        pattern_end = end_hit + len(end_pattern)

        # Trim the aligned sequences
        aligned_seq = aligned_seq[pattern_start:pattern_end]
        aligned_ref = aligned_ref[pattern_start:pattern_end]

        # Find the variants
        calls = []
        ref_pos = start
        for ref_char, seq_char in zip(aligned_ref, aligned_seq):
            if seq_char != ref_char and ref_char != 'N' and seq_char != '-':
                calls.append(f"{ref_char}{ref_pos + 1}{seq_char}")
            # Only reference bases advance the coordinate; gap columns do not.
            if ref_char != '-':
                ref_pos += 1

        variants = "_".join(calls)

    except Exception as e:
        # Best-effort batch behaviour: report the problem and return None.
        print(f"An error occurred: {str(e)}")
        variants = None

    return file_name, variants


In [20]:
reference_file = "/home/emre/tam-lqv.fasta"
rows = ["A", "B", "C", "D", "E", "F", "G", "H"]

# Collect one (file name, variant string) entry per AB1 trace, row by row.
reference_dict = {"Well" : [], "Variant" : []}
for row in rows:

    # glob returns matches in arbitrary order; sort them so the resulting table
    # (and any later positional access such as inputs[3]) is the same on every run.
    inputs = sorted(glob.glob(f"/home/emre/Sanger_sequences/Row_{row}_trace*/*.ab1"))

    for input_file in inputs:
        file_name, variants = align_ab1(input_file, reference_file)
        # The AB1 file name is used as the well label.
        well_name = file_name
        reference_dict["Well"].append(well_name)
        reference_dict["Variant"].append(variants)


# One row per trace: columns "Well" and "Variant".
df = pd.DataFrame(reference_dict)
#df.to_csv(f"/home/emre/Sanger_sequences/Row_{row}_trace-Oct24-11-23-39/variants.csv", index=False)


In [22]:
# Write the variant table to a single CSV file, without the DataFrame index.
variants_csv_path = "/home/emre/Sanger_sequences/variants.csv"
df.to_csv(variants_csv_path, index=False)


In [4]:
# Step-by-step rerun of the align_ab1 pipeline on a single trace, for inspection.
# NOTE(review): depends on `inputs` and `reference_file` left in the kernel by the
# batch cell; `inputs` holds the matches of the last row processed there.

# Read the AB1 file
record = SeqIO.read(inputs[3], "abi")
ab1_seq = record.seq[:750]

# Read the reference file
reference = SeqIO.read(reference_file, "fasta")
ref_seq = reference.seq

# Align the AB1 sequence to the reference
# (gap extension is -0.5 here, whereas align_ab1 uses -1)
alignments = pairwise2.align.globalms(ref_seq, ab1_seq, 2, -1, -1000, -0.5)
if not alignments:
    # Stop with a clear message; indexing the empty list below would otherwise
    # fail with an unrelated IndexError.
    raise ValueError("No alignment found.")

# Take the best alignment
alignment = alignments[0]
aligned_ref, aligned_seq, start, end = alignment[0], alignment[1], alignment[3], alignment[4]

# Find pattern "ATGCCG" in aligned_ref 
pattern = "ATGCCG"
pattern_start = aligned_ref.find(pattern)
print(pattern_start)
if pattern_start == -1:
    # find() returns -1 when the motif is absent; slicing from -1 would trim
    # to the wrong window without any warning.
    raise ValueError(f"Pattern {pattern} not found in aligned reference.")
# Unlike align_ab1, the window here extends len(ref_seq) columns past the start motif.
pattern_end = pattern_start + len(ref_seq)

# Trim the aligned sequences
aligned_seq = aligned_seq[pattern_start:pattern_end]
aligned_ref = aligned_ref[pattern_start:pattern_end]

# Find the variants
variants = []
ref_pos = start
for ref_char, seq_char in zip(aligned_ref, aligned_seq):
    # Report a difference unless the reference base is N or the read has a gap.
    if seq_char != ref_char and ref_char != 'N' and seq_char != '-':
        variant = f"{ref_char}{ref_pos + 1}{seq_char}"
        variants.append(variant)
    # Only reference bases advance the coordinate; gap columns do not.
    if ref_char != '-':
        ref_pos += 1

57


In [5]:
# Sanity check: index of the start motif in the (already trimmed) aligned reference.
aligned_ref.find("ATGCCG")

0

In [6]:
# Toy example: locally align a short query X against a longer sequence Y.
X, Y = 'GCAGCGTC', 'AGTGCGGCAGTGTCCTTAG'
# Global variant kept for comparison:
#alignments = pairwise2.align.globalms(X, Y, 2, -1, -1000, -1)
# Local alignment: match +2, mismatch -1, gap open -100, gap extend -2.
alignments = pairwise2.align.localms(Y, X, 2, -1, -100, -2)
if not alignments:
    # Stop with a clear message; indexing the empty list below would otherwise
    # fail with an unrelated IndexError.
    raise ValueError("No alignment found.")

# Take the best alignment
alignment = alignments[0]
aligned_ref, aligned_seq, start, end = alignment[0], alignment[1], alignment[3], alignment[4]

In [7]:
# Inspect the first element of the best alignment tuple (the aligned form of the
# first sequence passed to the aligner).
alignment[0]

'AGTGCGGCAGTGTCCTTAG'

## Global vs. local vs. semi-global alignment

In [22]:
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
from Bio.Seq import Seq

# Two toy sequences: a longer target and a short query.
seq1 = Seq("AGTGCGGCAGTGTCCTTAG")
seq2 = Seq("GCAGCGTC")

# A plain global alignment can be produced the same way for comparison:
#     pairwise2.align.globalxx(seq1, seq2)

# Local alignment with the "xx" scorer: identical characters score 1,
# and there are no mismatch or gap penalties.
local_alignments = pairwise2.align.localxx(seq1, seq2)

# Semi-global alignment: the global aligner with penalize_end_gaps=False, i.e.
# gaps at the ends of the alignment are not counted. The "xx" scorer has no gap
# penalties to begin with, so the flag mainly matters with a penalised scorer
# such as globalms.
semi_global_alignments = pairwise2.align.globalxx(seq1, seq2, one_alignment_only=False, penalize_end_gaps=False)

# Print every alignment of each mode under its own heading.
reports = (
    ("\nLocal alignments:", local_alignments),
    ("\nSemi-global (free-end gaps) alignments:", semi_global_alignments),
)
for heading, alignment_group in reports:
    print(heading)
    for alignment in alignment_group:
        print(format_alignment(*alignment))



Local alignments:
7 GCAGT-GTCC
  ||||  || |
1 GCAG-CGT-C
  Score=7

6 GGCAGT-GTCC
  | |||  || |
1 G-CAG-CGT-C
  Score=7

4 GCGGCAGT-GTCC
  |   |||  || |
1 G---CAG-CGT-C
  Score=7

2 GTGCGGCAGT-GTCC
  |     |||  || |
1 G-----CAG-CGT-C
  Score=7

4 GCGGCAGT-GTCC
  ||   ||  || |
1 GC---AG-CGT-C
  Score=7

2 GTGCGGCAGT-GTCC
  |  |   ||  || |
1 G--C---AG-CGT-C
  Score=7

7 GCAGTGTCC
  ||||.|| |
1 GCAGCGT-C
  Score=7

6 GGCAGTGTCC
  | |||.|| |
1 G-CAGCGT-C
  Score=7

4 GCGGCAGTGTCC
  |   |||.|| |
1 G---CAGCGT-C
  Score=7

2 GTGCGGCAGTGTCC
  |     |||.|| |
1 G-----CAGCGT-C
  Score=7

4 GCGGCAGTGTCC
  ||   ||.|| |
1 GC---AGCGT-C
  Score=7

2 GTGCGGCAGTGTCC
  |  |   ||.|| |
1 G--C---AGCGT-C
  Score=7

4 GCG-GCAGTGTCC
  ||  ||   || |
1 GC-AGC---GT-C
  Score=7

2 GTGCG-GCAGTGTCC
  |  |  ||   || |
1 G--C-AGC---GT-C
  Score=7

4 GCGGCAGTGTCC
  ||.||   || |
1 GCAGC---GT-C
  Score=7

2 GTGCGGCAGTGTCC
  |  |.||   || |
1 G--CAGC---GT-C
  Score=7

4 GC-GGCAGTGTCC
  || | |   || |
1 GCAG-C---GT-C
  Score