# Q1. Write a Biopython code snippet to read a DNA sequence from a FASTA file and print its length.

In [None]:
from Bio import SeqIO
fasta_file = 'sequence.fasta'
sequence = SeqIO.read(fasta_file, "fasta")
length = len(sequence)
print(f'Sequence length: {length}')

# Q2. Write a Biopython code snippet to reverse complement a DNA sequence.

In [3]:
from Bio.Seq import Seq
sequence = Seq("ATGCCAGCTACCT")
reversed_seq = sequence.reverse_complement()
print(f'Reversed complement: {reversed_seq}')

Reversed complement: AGGTAGCTGGCAT


# Q3. Write a Biopython code snippet to calculate the GC content of a DNA sequence.

In [4]:
from Bio.SeqUtils import GC
sequence = "ATGCCAGCTACCT"
gc_content = GC(sequence)
print(f'GC content: {gc_content}')

GC content: 53.84615384615385


# Q4. Write a Biopython code snippet to perform a BLAST search against the NCBI database using a query sequence.

In [7]:
from Bio.Blast import NCBIWWW
sequence = "ATGCCAGCTACCT"
result_handle = NCBIWWW.qblast("blastn", "nt", sequence)

# Q5. Write a Biopython code snippet to parse a PDB file and extract the amino acid sequence.

In [None]:
from Bio.PDB import PDBParser
pdb_file = "protein.pdb"
parser = PDBParser()
structure = parser.get_structure("protein", pdb_file)
model = structure[0]
chain = model["A"]
amino_acids = [residue.get_resname() for residue in chain]

# Q6. Write a Biopython code snippet to calculate the molecular weight of a protein sequence.

In [10]:
from Bio.SeqUtils.ProtParam import ProteinAnalysis
protein = ProteinAnalysis("MATVDEERPKHHEQ")
mol_wt = protein.molecular_weight()
print(f'Molecular weight: {mol_wt}')

Molecular weight: 1706.8345000000002


# Q7. Write a Biopython code snippet to retrieve the DNA sequence of a gene from the GenBank database using its accession number.

In [13]:
from Bio import Entrez, SeqIO
accession_number = "NM_123456"
Entrez.email = "debnathk1997@gmail.com"
handle = Entrez.efetch(db="nucleotide", id=accession_number, rettype="fasta")
record = SeqIO.read(handle, "fasta")
record.seq

Seq('ATGGCTTTTTCATCACCGTCTGACTTCAAGAGATACCATGTCTTTTCGAGTTTC...TGA')

# Q8. Write a Biopython code snippet to perform a multiple sequence alignment (MSA) using the ClustalW algorithm.

In [None]:
from Bio.Align.Applications import ClustalwCommandline

input_file = "sequence.fasta"
output_file = "alignment.aln"
clustalw_cline = ClustalwCommandline("clustalw2", infile=input_file, outfile=output_file)
clustalw_cline()

# Q9. Write a Biopython code snippet to parse a phylogenetic tree from a Newick format file and visualize it.

In [None]:
from Bio import Phylo
tree_file  = "file.newick"
tree = Phylo.read(tree_file, "newick")
Phylo.draw(tree)

# Q10. Write a Biopython code snippet to retrieve a protein sequence from the NCBI database using its GI number.

In [None]:
from Bio import Entrez, SeqIO
gi_number = "123456"
Entrez.email = "debnathk1997@gmail.com"
handle = Entrez.efetch(db="protein", id=gi_number, rettype="fasta")
record = SeqIO.read(handle, "fasta")
sequence = record.seq

# Q11. Write a Python function that takes a DNA sequence as input and returns the reverse complement sequence using Biopython.

In [1]:
from Bio.Seq import Seq

def reverse_complement(seq):
    rev_seq = seq.reverse_complement()
    return rev_seq

if __name__ == "__main__":
    seq = Seq("ATTCCAGCTAAGTTCAAG")
    rev_seq = reverse_complement(seq)
    print(f'DNA: {seq}')
    print(f'Reverse Complement: {rev_seq}')

DNA: ATTCCAGCTAAGTTCAAG
Reverse Complement: CTTGAACTTAGCTGGAAT


# Q12. Write a Python function that reads a FASTA file containing multiple DNA sequences, calculates the GC content for each sequence, and returns the sequence with the highest GC content.

In [18]:
from Bio import SeqIO
from Bio.SeqUtils import GC

def highest_GC(fasta_file):
   max_gc_seq = ""
   max_gc = 0.0

   for record in SeqIO.parse(fasta_file, "fasta"):
    gc_seq = GC(record.seq)
    if gc_seq > max_gc:
       max_gc = gc_seq
       max_gc_seq = record
    
    return max_gc_seq, max_gc

if __name__ == "__main__":
    fasta_file = "ls_orchid.fasta"
    max_gc_seq, max_gc = highest_GC(fasta_file)
    print(f"The sequence with highest GC content is: {max_gc_seq.seq}, with GC content: {max_gc}%")

The sequence with highest GC content is: CGTAACAAGGTTTCCGTAGGTGAACCTGCGGAAGGATCATTGATGAGACCGTGGAATAAACGATCGAGTGAATCCGGAGGACCGGTGTACTCAGCTCACCGGGGGCATTGCTCCCGTGGTGACCCTGATTTGTTGTTGGGCCGCCTCGGGAGCGTCCATGGCGGGTTTGAACCTCTAGCCCGGCGCAGTTTGGGCGCCAAGCCATATGAAAGCATCACCGGCGAATGGCATTGTCTTCCCCAAAACCCGGAGCGGCGGCGTGCTGTCGCGTGCCCAATGAATTTTGATGACTCTCGCAAACGGGAATCTTGGCTCTTTGCATCGGATGGAAGGACGCAGCGAAATGCGATAAGTGGTGTGAATTGCAAGATCCCGTGAACCATCGAGTCTTTTGAACGCAAGTTGCGCCCGAGGCCATCAGGCTAAGGGCACGCCTGCTTGGGCGTCGCGCTTCGTCTCTCTCCTGCCAATGCTTGCCCGGCATACAGCCAGGCCGGCGTGGTGCGGATGTGAAAGATTGGCCCCTTGTGCCTAGGTGCGGCGGGTCCAAGAGCTGGTGTTTTGATGGCCCGGAACCCGGCAAGAGGTGGACGGATGCTGGCAGCAGCTGCCGTGCGAATCCCCCATGTTGTCGTGCTTGTCGGACAGGCAGGAGAACCCTTCCGAACCCCAATGGAGGGCGGTTGACCGCCATTCGGATGTGACCCCAGGTCAGGCGGGGGCACCCGCTGAGTTTACGC, with GC content: 59.5945945945946%


# Q13. Write a Python function that performs a BLAST search for a given DNA sequence against the NCBI nucleotide database using Biopython. The function should return a list of hit descriptions.


In [27]:
from Bio.Blast import NCBIWWW
from Bio import SearchIO, SeqIO

def dna_blast(dna_seq):

    result_handle = NCBIWWW.qblast("blastn", "nt", dna_seq)
    hit_desc = SearchIO.read(result_handle, "blast-xml")

    return hit_desc

if __name__ == "__main__":
    fasta_file = [rec for rec in SeqIO.parse("ls_orchid.fasta", "fasta")]
    dna = fasta_file[0]
    dna_seq = dna.seq
    hit_desc = dna_blast(dna_seq)
    print(hit_desc)

Program: blastn (2.14.1+)
  Query: No (740)
         definition line
 Target: nt
   Hits: ----  -----  ----------------------------------------------------------
            #  # HSP  ID + description
         ----  -----  ----------------------------------------------------------
            0      1  gi|2765658|emb|Z78533.1|  C.irapeanum 5.8S rRNA gene an...
            1      1  gi|751868575|gb|KJ939539.1|  Cypripedium irapeanum vouc...
            2      1  gi|347626748|emb|FR720328.1|  Cypripedium irapeanum ITS...
            3      1  gi|347626747|emb|FR720327.1|  Cypripedium molle ITS1, 5...
            4      1  gi|751868574|gb|KJ939538.1|  Cypripedium dickinsonianum...
            5      1  gi|402502938|gb|JQ660885.1|  Cypripedium dickinsonianum...
            6      1  gi|2765653|emb|Z78528.1|  C.plectrochilum 5.8S rRNA gen...
            7      1  gi|340742827|gb|JN018077.1|  Cypripedium plectrochilum ...
            8      1  gi|350999033|gb|JF796899.1|  Cypripedium arietin

# Q14. Write a function to perform global alignment of two DNA sequences.

In [52]:
from Bio import pairwise2

def align_sequences(seq1, seq2):

    alns = pairwise2.align.globalxx(seq1, seq2)
    for aln in alns:
        print(pairwise2.format_alignment(*aln))


if __name__ == "__main__":
    align_sequences("ATGACACGTACGTACGTACGTACGTACGTACGTATG", "CAGACACGTACGTACGTTTGAACGTACGTACGTACA")

-ATGACACGTACGTACGT---A-CGTACGTACGTACGTATG
 | |||||||||||||||   | |||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31

-ATGACACGTACGTACG-T--A-CGTACGTACGTACGTATG
 | |||||||||||||| |  | |||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31

-ATGACACGTACGTACG--T-A-CGTACGTACGTACGTATG
 | ||||||||||||||  | | |||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31

-ATGACACGTACGTACGT----ACGTACGTACGTACGTATG
 | |||||||||||||||    ||||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31

-ATGACACGTACGTACG-T---ACGTACGTACGTACGTATG
 | |||||||||||||| |   ||||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31

-ATGACACGTACGTACG--T--ACGTACGTACGTACGTATG
 | ||||||||||||||  |  ||||||||||||||  |  
CA-GACACGTACGTACGTTTGAACGTACGTACGTAC--A--
  Score=31



# Q15. Write a Python function that reads a GenBank file and extracts the coding sequence (CDS).


In [44]:
from Bio import SeqIO

def extract_cds(genbank_file):

    with open(genbank_file, "r") as f:
        genbank_record = SeqIO.read(f, "genbank")

    cds_seqs = ""
    for feature in genbank_record.features:
        if feature.type == "CDS":
            cds_seqs = feature.extract(genbank_record.seq)

    return cds_seqs

if __name__ == "__main__":
    genbank_file = "sequence.gb"
    cds_seqs = extract_cds(genbank_file)
    print(cds_seqs)

ATGTCTCAAGGTAATCTTTATATTTTATCTGCACCAAGTGGCGCAGGAAAATCTTCATTAATTTCAGCGTTATTGGCATCAGATAGCTCAACTCAAAAAATGGTTTCTGTGTCACATACGACCCGTGCCCCACGCCCGGGTGAAGTTGAAGGCGTACACTATTATTTTGTATCAAAAGAAGAGTTTGAATCACTCATTGAGCAAGATTTATTTCTAGAATATGCCAAAGTTTTTGGTGGCAATTATTATGGAACCTCTTTACCTGCGATTGAGGAAAATTTAGCAAAAGGCATTGATGTATTTTTAGATATTGATTGGCAGGGCGCCCAACAAATCCGTAAAAAAGTGCCTTCAGTAAAAAGCATTTTTATTTTACCGCCTTCATTGCCTGAATTAGAGCGTCGTTTAATTGGTCGAGGGCAAGATAGTGAAGAGGTTATCGCTGAACGAATGTCAAAAGCGATGAGTGAAATTTCGCATTATGACGAATATGATTATGTCATTGTGAATGATGATTTTGAGAAAACATTAAAAGATTTACAAAGTATTTTGCAATCGGAACGCTTAACTAAAGATTATCAACAAAAACAAAATGCAATGTTAATTCAACAGCTACTAGCAAAATAG
