1. Install Biopython Package

In [4]:
# Install Biopython if you haven't already
!pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.2/3.2 MB 37.8 MB/s eta 0:00:00
Installing collected packages: biopython
Successfully installed biopython-1.86


2. Import Libraries and Configure Entrez

In [5]:
import os

from Bio import Entrez, SeqIO

# Always tell NCBI who you are. Set the ENTREZ_EMAIL environment variable to your
# actual email address; the placeholder below is only used when that variable is
# unset, so a personal address never has to be saved inside the notebook.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "your.email@example.com")

3. Retrieve Genome Sequence Data via NCBI (similar to entrez_fetch)

In [6]:
accession_id = "NC_001477"

# Download the record as FASTA text. The with-block closes the network handle
# even when read() raises, so a failed download cannot leave it open.
with Entrez.efetch(db="nuccore", id=accession_id, rettype="fasta", retmode="text") as handle:
    fasta_record = handle.read()

print(f"Retrieved FASTA record for {accession_id}:\n{fasta_record[:200]}...") # Print first 200 chars

Retrieved FASTA record for NC_001477:
>NC_001477.1 Dengue virus 1, complete genome
AGTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACAGT
TTTTTATTAGAGAGCAGATCTCTGATGAACAACCAACGGAAAAAGACGGGTCGACCGTCTTTCAATATGC
TGAAACGCGCGAG...


4. Parse and Display Sequence Attributes (similar to xmlToList and accessing R list elements)

In [7]:
from io import StringIO
import re

# SeqIO.read expects a file-like object, so wrap the downloaded text in an
# in-memory buffer before parsing the single FASTA entry.
fasta_buffer = StringIO(fasta_record)
record = SeqIO.read(fasta_buffer, "fasta")

# Summarise the parsed record: identifier, header line, size and a short preview
sequence_preview = record.seq[:100]
sequence_size = len(record.seq)
print(f"Accession: {record.id}")
print(f"Definition: {record.description}")
print(f"Length: {sequence_size}")
print(f"Sequence (first 100 bases): {sequence_preview}...")

# Look for a bracketed "[organism]" tag in the header; fall back to a notice
# when the header carries no such tag.
match = re.search(r'\[(.*?)\]', record.description)
if match:
    organism = match.group(1)
else:
    organism = "N/A (organism not found in simple FASTA description)"
print(f"Organism: {organism}")

Accession: NC_001477.1
Definition: NC_001477.1 Dengue virus 1, complete genome
Length: 10735
Sequence (first 100 bases): AGTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACAGTTTTTTATTAGAGAGCAGATCTCTGATGAAC...
Organism: N/A (organism not found in simple FASTA description)


### 5. Save FASTA record to a file and then read it back (similar to `read.fasta`)

In [8]:
from Bio import SeqIO

# Step 1: persist the downloaded FASTA text to disk
file_name = "den1.fasta"
with open(file_name, "w") as output_handle:
    output_handle.write(fasta_record)
print(f"FASTA record saved to '{file_name}'")

# Step 2: parse the file back into a SeqRecord and pull out its sequence
dengueseq_fromfile = SeqIO.read(file_name, "fasta")
dengueseq_fromfile_seq = dengueseq_fromfile.seq

# Step 3: report what was loaded, first the record, then the sequence it holds
record_type = type(dengueseq_fromfile)
print(f"\nType of object read from file: {record_type}")
print(f"Identifier (name) of the sequence: {dengueseq_fromfile.id}")

sequence_type = type(dengueseq_fromfile_seq)
leading_bases = dengueseq_fromfile_seq[:100]
print(f"\nType of the sequence object: {sequence_type}")
print(f"First 100 bases of the sequence: {leading_bases}...")
print(f"Length of the sequence: {len(dengueseq_fromfile_seq)}")

FASTA record saved to 'den1.fasta'

Type of object read from file: <class 'Bio.SeqRecord.SeqRecord'>
Identifier (name) of the sequence: NC_001477.1

Type of the sequence object: <class 'Bio.Seq.Seq'>
First 100 bases of the sequence: AGTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACAGTTTTTTATTAGAGAGCAGATCTCTGATGAAC...
Length of the sequence: 10735


6. Calculating Sequence Statistics (Length, Base Composition, GC Content)

In [9]:
import collections
from Bio.SeqUtils import gc_fraction

# Total number of bases in the genome
sequence_length = len(dengueseq_fromfile_seq)
print(f"Length of the sequence: {sequence_length}")

# Tally every symbol case-insensitively, then report the four canonical bases
uppercase_sequence = str(dengueseq_fromfile_seq).upper()
base_counts = collections.Counter(uppercase_sequence)
print("\nBase composition:")
for base in "ATCG":
    print(f"   {base}: {base_counts[base]}")

# gc_fraction returns a value between 0 and 1; scale it to a percentage
gc_share = gc_fraction(dengueseq_fromfile_seq)
gc_content_percentage = gc_share * 100
print(f"\nGC content: {gc_content_percentage:.3f}%")

Length of the sequence: 10735

Base composition:
   A: 3426
   T: 2299
   C: 2240
   G: 2770

GC content: 46.670%


7. Frequencies of DNA Words (k-mers)

In [10]:
import collections

def count_kmers(sequence_obj, k):
    """Count overlapping k-mer (DNA word) frequencies in a sequence.

    Args:
        sequence_obj: A Bio.Seq.Seq object, or anything whose str() is the
            sequence text. Counting is case-insensitive (text is uppercased).
        k: Word length; must be at least 1.

    Returns:
        A collections.Counter mapping each uppercase k-mer to the number of
        (possibly overlapping) times it occurs. Empty when k exceeds the
        sequence length.

    Raises:
        ValueError: If k is smaller than 1. Such values would otherwise yield
            meaningless counts of empty or mixed-length slices.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    sequence_str = str(sequence_obj).upper()
    # One window per start position; the range is empty when k > len(sequence).
    window_starts = range(len(sequence_str) - k + 1)
    return collections.Counter(sequence_str[start:start + k] for start in window_starts)

# Build the frequency tables for words of length 1, 2 and 3
dengueseq_table_1bp = count_kmers(dengueseq_fromfile_seq, 1)  # mononucleotides
dengue_table_2bp = count_kmers(dengueseq_fromfile_seq, 2)     # dinucleotides
dengue_table_3bp = count_kmers(dengueseq_fromfile_seq, 3)     # trinucleotides

# Show the full 1-mer and 2-mer tables
print(f"\n1-mer counts (dengueseq_table_1bp):\n{dengueseq_table_1bp}")
print(f"\n2-mer counts (dengue_table_2bp):\n{dengue_table_2bp}")

# The 3-mer table is long, so only its first ten entries are shown
first_ten_3mers = dict(list(dengue_table_3bp.items())[:10])
print(f"\n3-mer counts (dengue_table_3bp) - first 10 for brevity:\n{first_ten_3mers}...")

# Look up individual words in each table
count_c = dengueseq_table_1bp['C']
count_tt = dengue_table_2bp['TT']
count_atg = dengue_table_3bp['ATG']
print(f"\n1-mer count for 'C': {count_c}")
print(f"2-mer count for 'TT': {count_tt}")
print(f"3-mer count for 'ATG': {count_atg}")



1-mer counts (dengueseq_table_1bp):
Counter({'A': 3426, 'G': 2770, 'T': 2299, 'C': 2240})

2-mer counts (dengue_table_2bp):
Counter({'AA': 1108, 'GA': 976, 'CA': 901, 'AG': 890, 'TG': 832, 'GG': 787, 'AC': 720, 'AT': 708, 'CT': 555, 'TT': 529, 'CC': 523, 'GT': 507, 'GC': 500, 'TC': 497, 'TA': 440, 'CG': 261})

3-mer counts (dengue_table_3bp) - first 10 for brevity:
{'AGT': 135, 'GTT': 136, 'TTG': 162, 'TGT': 160, 'TTA': 95, 'TAG': 133, 'GTC': 106, 'TCT': 134, 'CTA': 131, 'TAC': 88}...

1-mer count for 'C': 2240
2-mer count for 'TT': 529
3-mer count for 'ATG': 292


8. Finding Specific DNA Words in a Sequence (similar to matchPattern)

In [11]:
import re

# Convert the Bio.Seq.Seq object to a string (equivalent to c2s() in R)
dengueseq_string = str(dengueseq_fromfile_seq).upper() # Convert to uppercase for consistent matching
print(f"First 100 characters of the sequence string:\n{dengueseq_string[:100]}...")

# Define the DNA word to search for (matched literally, not as a regex)
pattern = "ATG"

# Find all occurrences of the word, including overlapping ones.
# A plain re.finditer(pattern, ...) skips overlapping hits (e.g. it finds "AA"
# only twice in "AAAA"), which would disagree with the k-mer counts. Wrapping the
# word in a zero-width lookahead reports every start position instead, and
# re.escape keeps any regex metacharacters in the word from being interpreted.
# Add 1 to the match start to make positions 1-indexed.
word_regex = re.compile(f"(?={re.escape(pattern)})")
matches = [hit.start() + 1 for hit in word_regex.finditer(dengueseq_string)]

print(f"\nOccurrences of '{pattern}' found at positions (1-indexed):\n{matches}")
print(f"Total number of '{pattern}' occurrences: {len(matches)}")

First 100 characters of the sequence string:
AGTTGTTAGTCTACGTGGACCGACAAGAACAGTTTCGAATCGGAAGCTTGCTTAACGTAGTTCTAACAGTTTTTTATTAGAGAGCAGATCTCTGATGAAC...

Occurrences of 'ATG' found at positions (1-indexed):
[95, 137, 224, 236, 298, 318, 365, 377, 404, 413, 470, 533, 551, 568, 581, 592, 621, 642, 655, 666, 673, 767, 861, 911, 929, 935, 999, 1014, 1034, 1153, 1242, 1329, 1392, 1407, 1515, 1520, 1535, 1549, 1567, 1665, 1712, 1786, 1799, 1823, 1830, 1835, 1884, 1923, 1930, 1956, 1977, 2135, 2168, 2191, 2265, 2297, 2326, 2363, 2384, 2408, 2428, 2461, 2487, 2566, 2618, 2637, 2664, 2669, 2726, 2744, 2799, 2845, 2866, 2892, 2920, 2969, 3003, 3011, 3085, 3112, 3120, 3140, 3159, 3256, 3282, 3345, 3349, 3416, 3464, 3514, 3527, 3542, 3553, 3566, 3572, 3608, 3622, 3627, 3643, 3650, 3677, 3683, 3707, 3722, 3731, 3849, 3860, 3869, 3872, 3910, 3963, 3967, 3977, 3983, 4015, 4045, 4075, 4091, 4143, 4154, 4200, 4203, 4238, 4249, 4254, 4368, 4371, 4379, 4392, 4404, 4463, 4531, 4575, 4578, 4664, 4694, 4755, 48