## Bioinformatics Libraries

### Biopython

#### Example A: Parsing FASTA files & computing GC content
- Purpose: Load sequence records from a FASTA file and report the GC content of each record as a percentage.

In [None]:
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

# Walk every record of the example FASTA file and report its GC percentage.
fasta_records = SeqIO.parse("../data/fasta_example.fasta", "fasta")
for record in fasta_records:
    # gc_fraction returns a value in [0, 1]; scale it to a percentage
    gc_content = 100 * gc_fraction(record.seq)
    print("{}: {:.2f}% GC".format(record.id, gc_content))


In [None]:
# Print the name and bases of every read overlapping a region of a BAM file.
# NOTE: per the pysam documentation, region-based fetch() needs an index for
# the BAM file (e.g. created with `samtools index`), and the start/stop
# arguments are 0-based, half-open coordinates.
import pysam

# Open the file in a `with` block so the handle is closed once the loop
# finishes, even if reading a record raises.
with pysam.AlignmentFile("../data/bam_example.bam", "rb") as bamfile:
    for read in bamfile.fetch("chr1", 1000, 2000):
        print(read.query_name, read.query_sequence)

In [None]:

import vcfpy

# Print chromosome, position, reference allele and alternate alleles for
# every record of the example VCF file.
# The reader is used as a context manager so the underlying file is closed
# as soon as iteration ends (the original left it open).
with vcfpy.Reader.from_path("../data/vcf_example.vcf") as reader:
    for record in reader:
        # record.ALT holds allele objects; .value is the allele string
        alt_alleles = [alt.value for alt in record.ALT]
        print(record.CHROM, record.POS, record.REF, alt_alleles)


### Basic Sequence Handling
Biopython provides powerful objects for working with biological sequences. Here, we import modules from both Biopython and scikit-bio to create, manipulate, and align DNA, RNA, and protein sequences.

In [None]:
from Bio.Seq import Seq
from Bio.SeqUtils import MeltingTemp as mt
from Bio.Data import CodonTable
from skbio import DNA
from skbio.alignment import global_pairwise_align_nucleotide

print("### Biopython Sequence Basics")
# Example DNA sequence shared by all of the transformations below
seq = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")

# (label, derived sequence) pairs, printed in this order
sequence_views = (
    ("Original:", seq),
    ("Complement:", seq.complement()),
    ("Reverse:", seq[::-1]),
    ("Reverse Complement:", seq.reverse_complement()),
    ("Transcription:", seq.transcribe()),
    ("Translation:", seq.translate()),
)
for label, view in sequence_views:
    print(label, view)

print("\n### scikit-bio Sequence Analysis")
sequence = DNA("ATGCGTAGCTAG")
print("GC content:", sequence.gc_content())

# Report k-mer counts for single nucleotides first, then for triplets
kmer_reports = (
    (1, "Nucleotide frequencies (1-mers):"),
    (3, "3-mers:"),
)
for k, label in kmer_reports:
    print(label, sequence.kmer_frequencies(k=k))

print("\n### Biopython Codon Table")
# Look up the standard genetic code by name and list its start/stop codons
standard_table = CodonTable.unambiguous_dna_by_name["Standard"]
print(f"Start codons: {standard_table.start_codons}")
print(f"Stop codons: {standard_table.stop_codons}")

print("\n### Biopython Motif Search")
motif = "TATA"
seq2 = Seq("TATAAGCGTATAAATAGCGGTATA")

# Slide a motif-sized window over the sequence and record every start index
# whose window equals the motif (overlapping matches are included).
positions = []
for start in range(len(seq2)):
    window = seq2[start:start + len(motif)]
    if window == motif:
        positions.append(start)
print(f"Motif '{motif}' found at positions: {positions}")

print("\n### Biopython Melting Temperature")
dna_seq = "ATCGATCGATCG"
# Estimate the melting temperature of the oligo with Biopython's Tm_Wallace
wallace_tm = mt.Tm_Wallace(dna_seq)
print("Melting temperature (Wallace rule):", wallace_tm)

print("\n### scikit-bio Sequence Alignment")
# Two short DNA sequences that differ at a single position
seq1, seq2 = DNA("ACGTA"), DNA("ACCTA")

# The call returns three values; only the alignment and its score are used
alignment, score, _ = global_pairwise_align_nucleotide(seq1, seq2)
print(f"Alignment score: {score}")
print(alignment)

### scikit-bio

#### Example: Pairwise Sequence Alignment
- Purpose: Align two sequences to identify similarities.

In [None]:

from skbio import DNA, read
from skbio.alignment import local_pairwise_align_ssw

# Load every record of the FASTA file as a scikit-bio DNA sequence
fasta_records = read("../data/fasta_example.fasta", format="fasta", constructor=DNA)
sequences = list(fasta_records)

# The first two records form the pair to align
seq1, seq2 = sequences[0], sequences[1]

# Local pairwise alignment; the third returned value is not needed here
alignment, score, _ = local_pairwise_align_ssw(seq1, seq2)

# Report the score followed by the aligned sequences
print(f"Alignment Score: {score}")
print(alignment)


### vcfpy

#### Example: Parsing and filtering VCF files
- Purpose: Handle and filter genomic variant data.

In [None]:

import vcfpy

# Print every variant whose QUAL exceeds 40 as "CHROM:POS REF->ALT QUAL:q".
# The reader is used as a context manager so the underlying file is closed
# as soon as iteration ends (the original left it open).
with vcfpy.Reader.from_path("../data/vcf_example.vcf") as reader:
    for record in reader:
        # Records without a QUAL value (None) cannot be compared; skip them
        if record.QUAL is None or record.QUAL <= 40:
            continue
        # record.ALT holds allele objects; join their strings with commas
        alt_alleles = ",".join(alt.value for alt in record.ALT)
        print(f"{record.CHROM}:{record.POS} {record.REF}->{alt_alleles} QUAL:{record.QUAL}")
