# Python Bioinformatics Notebook

In [None]:

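# To be run in a terminal, not in the notebook:
# conda create -n bioinfo python=3.11
# conda activate bioinfo
# conda install -c conda-forge -c bioconda biopython scikit-bio numpy pandas matplotlib seaborn plotly pysam jupyter
# pip install PyVCF  # if this fails to build with a recent setuptools, the PyVCF3 fork provides the same `vcf` module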


## Check environment

In [None]:
# Import every third-party package used later in this notebook so that a
# missing dependency surfaces here instead of halfway through the tutorial.
import Bio
import skbio
import numpy
import pandas
import matplotlib
import vcf
import pysam      # used by the SAM/BAM cells
import seaborn    # used by the variant quality histogram
import plotly     # used by the interactive volcano plot

print("All Python packages successfully installed.")

## Section 1: Bioinformatics Data Formats

In bioinformatics, data comes in specialized file formats designed to handle biological sequences, annotations, genomic alignments, variant information, and gene expression data. Here are the most important formats:



- **FASTA Format**: Stores nucleotide or protein sequences

In [None]:
>gene1_human
ATGGCGTACGCTAGCTAGCTA
>gene2_mouse
ATGCTAGCTAGCTAGTGACTG

In [None]:
# Print the identifier and sequence of every record in the FASTA file.
from Bio import SeqIO

fasta_records = SeqIO.parse("../data/fasta_example.fasta", "fasta")
for record in fasta_records:
    print(f"{record.id} {record.seq}")


- **FASTQ Format**: Stores sequencing reads and their quality scores from sequencing machines.


In [None]:
@SEQ_ID_1
GATCTGACTGACTG
+
IIIIIIIIIIIIIH
@SEQ_ID_2
ATCGATCGTAGCTA
+
IIIIIIIHHHHHHG

In [None]:

# Show each FASTQ read together with its per-base "phred_quality" values.
from Bio import SeqIO

fastq_path = "../data/fastq_example.fastq"
for record in SeqIO.parse(fastq_path, "fastq"):
    phred_scores = record.letter_annotations["phred_quality"]
    print(record.id, record.seq, phred_scores)


- **GenBank Format**: Stores sequences and their annotations, such as gene locations and organism information.

In [None]:
LOCUS       SCU49845     25 bp    DNA             PLN       21-JUN-1999
DEFINITION  Example GenBank entry.
ACCESSION   SCU49845
VERSION     SCU49845.1
KEYWORDS    .
SOURCE      Artificial Sequence
  ORGANISM  Artificial Sequence
            .
FEATURES             Location/Qualifiers
     gene            1..10
                     /gene="example_gene"
ORIGIN
        1 atggcgtaaa tagctagcta ctagc
//

In [None]:

# Read the single GenBank record, then list its annotations and features.
from Bio import SeqIO

genbank_path = "../data/gb_example.gb"
record = SeqIO.read(genbank_path, "genbank")
print(record.annotations)

# One output line per feature: its type followed by its location.
for feature in record.features:
    print(feature.type, feature.location)


- **GFF/GTF/BED Formats**: Define genomic feature locations, like gene start and end positions.

In [None]:
chr1    1000    5000    Gene1
chr2    7000    9000    Gene2
chr3    10000   11000   Gene2

In [None]:
import pandas as pd

# The file has no header row, so the column names are supplied explicitly.
bed_columns = ["chrom", "start", "end", "name"]
bed = pd.read_csv(
    "../data/bed_example.bed",
    sep="\t",
    header=None,
    names=bed_columns,
)
print(bed)


- **SAM/BAM Formats**: Contain alignments of sequencing reads to a reference genome, including mapping quality scores and alignment details (CIGAR strings).

In [None]:
@SQ SN:chr1 LN:10000
seq1    0   chr1    1000    255 10M *   0   0   ACGTAGCTAG  *
seq2    0   chr1    1020    255 10M *   0   0   ACGTAGCTAC  *

In [None]:
# The SAM text shown above must first be converted to a coordinate-sorted,
# indexed BAM file (e.g. with samtools): fetching a region requires an index.
import pysam

# Path follows the "<format>_example.<ext>" naming used by the other data
# files in this notebook. The context manager closes the file afterwards.
with pysam.AlignmentFile("../data/bam_example.bam", "rb") as bamfile:
    # Print the name and bases of every read overlapping the chr1 window.
    for read in bamfile.fetch("chr1", 1000, 2000):
        print(read.query_name, read.query_sequence)

- **VCF Format**: Records genomic variants with reference, alternate alleles, and quality scores.

In [None]:
##fileformat=VCFv4.2
##source=ExampleSource
#CHROM POS     ID  REF ALT QUAL FILTER INFO
chr1   10176   .   A   AC  50   PASS   DP=20
chr1   10352   .   T   TA  60   PASS   DP=25
chr1   10616   .   C   G   40   q10    DP=10

In [None]:

import vcf

# Open the VCF inside a context manager so the file handle is closed once
# all records have been read (the bare open() call leaked the handle).
with open("../data/vcf_example.vcf") as vcf_handle:
    vcf_reader = vcf.Reader(vcf_handle)
    # Print chromosome, position, reference allele and alternate alleles.
    for record in vcf_reader:
        print(record.CHROM, record.POS, record.REF, record.ALT)

- **Gene Expression Format**: Stores expression values in matrix format (RNA-seq counts).

In [None]:
gene    sample1 sample2 sample3
gene1   100     150     200
gene2   300     250     400
gene3   500     450     600

In [None]:
import pandas as pd

# Columns are separated by runs of whitespace. The regex is written as a raw
# string because "\s" is not a valid escape sequence in a normal Python string
# and triggers an invalid-escape warning.
expr = pd.read_csv("../data/geneexpression_example.txt", sep=r"\s+", index_col=0)
print(expr.head())

# Column-wise mean, i.e. one value per sample.
print("Mean expression per sample:")
print(expr.mean())


## Section 2: Bioinformatics Libraries

### Biopython

#### Example A: Parsing FASTA files & computing GC content
- Purpose: Load sequence data from a FASTA file and compute the GC content of each sequence.

In [None]:
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

# Report the GC content of every FASTA record as a percentage.
fasta_records = SeqIO.parse("../data/fasta_example.fasta", "fasta")
for record in fasta_records:
    # gc_fraction's result is scaled by 100 to express it as a percentage.
    gc_content = 100 * gc_fraction(record.seq)
    print(f"{record.id}: {gc_content:.2f}% GC")


In [None]:
# Requires a coordinate-sorted, indexed BAM file (a SAM file can be converted
# with e.g. samtools): fetching a region requires an index.
import pysam

# The context manager closes the BAM file once the reads have been printed.
with pysam.AlignmentFile("../data/bam_example.bam", "rb") as bamfile:
    # Print the name and bases of every read overlapping the chr1 window.
    for read in bamfile.fetch("chr1", 1000, 2000):
        print(read.query_name, read.query_sequence)

In [None]:

import vcf

# Read the VCF through a context manager so the file handle is released
# after iteration instead of being left open.
with open("../data/vcf_example.vcf") as vcf_handle:
    vcf_reader = vcf.Reader(vcf_handle)
    # Print chromosome, position, reference allele and alternate alleles.
    for record in vcf_reader:
        print(record.CHROM, record.POS, record.REF, record.ALT)

#### Example B: Sequence translation and complement
- Purpose: Find Complement, Reverse Complement and Translated Protein

In [None]:

from Bio.Seq import Seq

sequence = Seq("ATGGCGTACGCTAGCTAGCTA")

# Each entry pairs an output label with the corresponding view of the sequence.
views = (
    ("Original:", sequence),
    ("Complement:", sequence.complement()),
    ("Reverse Complement:", sequence.reverse_complement()),
    ("Translated Protein:", sequence.translate()),
)
for label, value in views:
    print(label, value)


### scikit-bio

#### Example: Pairwise Sequence Alignment
- Purpose: Align two sequences to identify similarities.

In [None]:

from skbio import DNA
from skbio.alignment import local_pairwise_align_ssw

# local_pairwise_align_ssw only accepts scikit-bio sequence objects (DNA, RNA
# or Protein); plain Python strings are rejected, so wrap them in DNA.
# NOTE(review): newer scikit-bio releases may deprecate the SSW aligner —
# confirm against the installed version.
seq1 = DNA("AGTACACTGGT")
seq2 = DNA("AGTAGACTGGT")

# The third return value (start/end positions) is not used here.
alignment, score, _ = local_pairwise_align_ssw(seq1, seq2)
print("Alignment Score:", score)
print(alignment)


### PyVCF

#### Example: Parsing and filtering VCF files
- Purpose: Handle and filter genomic variant data.

In [None]:

import vcf

# Print only the variants whose quality score is above 40.
# The context manager closes the file handle after iteration.
with open("../data/vcf_example.vcf", "r") as vcf_handle:
    vcf_reader = vcf.Reader(vcf_handle)
    for record in vcf_reader:
        # QUAL is None when the VCF column holds "." (missing); comparing
        # None with an int raises TypeError, so skip such records.
        if record.QUAL is not None and record.QUAL > 40:
            print(f"{record.CHROM}:{record.POS} {record.REF}->{record.ALT} QUAL:{record.QUAL}")


## Section 3: Practical Tutorials

### Tutorial 1: Sequence Analysis and Alignment (Biopython & scikit-bio)

#### Step 1: Parse sequences (Biopython)

In [None]:
from Bio import SeqIO

# Load all FASTA records into memory so they can be indexed by position.
fasta_path = "../data/fasta_example.fasta"
sequences = [fasta_record for fasta_record in SeqIO.parse(fasta_path, "fasta")]

# Keep the sequences of the first two records for the steps that follow.
seq1 = sequences[0].seq
seq2 = sequences[1].seq

print(f"Seq1: {seq1}\nSeq2: {seq2}")

#### Step 2: Perform Alignment (scikit-bio)

In [None]:
from skbio import DNA
from skbio.alignment import local_pairwise_align_ssw

# seq1 / seq2 come from the Biopython parsing step and are Biopython Seq
# objects. The scikit-bio aligner only accepts its own DNA/RNA/Protein types,
# so convert via str() without overwriting the original variables.
alignment, score, _ = local_pairwise_align_ssw(DNA(str(seq1)), DNA(str(seq2)))

print("Alignment Score:", score)
print(alignment)

#### Step 3: Visualization (Simple Matplotlib Example)

In [None]:
import matplotlib.pyplot as plt

# The alignment step yields a single score for the sequence pair; the aligned
# sequences inside `alignment` have no per-sequence `.score` attribute, so the
# pair's overall `score` from the previous step is what gets plotted.
labels = ["Seq1 vs Seq2"]
scores = [score]

plt.bar(labels, scores)
plt.ylabel('Alignment Score')
plt.title('Pairwise Alignment Scores')
plt.show()

## Section 4: Visualization and Interactive Analysis

### Example A: Sequence Alignment Visualization (matplotlib)
Purpose: Visualize simple alignment scores as a bar plot.

In [None]:
import matplotlib.pyplot as plt

# Example data: one illustrative score per pairwise comparison.
alignment_scores = {
    'Seq1 vs Seq2': 90,
    'Seq1 vs Seq3': 75,
    'Seq2 vs Seq3': 85,
}
labels = list(alignment_scores)
scores = list(alignment_scores.values())

# Draw one bar per comparison.
plt.bar(labels, scores)
plt.ylabel('Alignment Score')
plt.title('Sequence Alignment Scores')
plt.show()

### Example B: Variant Quality Distribution (seaborn)
Purpose: Visualize variant quality scores from a VCF file.


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import vcf

# Collect the quality score of every variant. The context manager closes the
# file handle once the records have been read.
with open('../data/vcf_example.vcf') as vcf_handle:
    vcf_reader = vcf.Reader(vcf_handle)
    # QUAL is None for records whose quality column is "." (missing); those
    # cannot be placed on a numeric histogram, so leave them out.
    quals = [record.QUAL for record in vcf_reader if record.QUAL is not None]

sns.histplot(quals, kde=True)
plt.xlabel('Variant Quality Score')
plt.title('Distribution of Variant Quality Scores')
plt.show()

### Example C: Interactive Volcano Plot (plotly)
Purpose: Interactively visualize RNA-seq results.


In [None]:
import numpy as np  # needed for log10; the original cell used `np` without importing it
import plotly.express as px
import pandas as pd

# Toy differential-expression results: one row per gene.
df = pd.DataFrame({
    'gene': ['gene1', 'gene2', 'gene3', 'gene4', 'gene5'],
    'log2FoldChange': [2.0, -1.5, 0.8, -2.1, 1.2],
    'pvalue': [0.01, 0.03, 0.2, 0.001, 0.05]
})

# Derive the plotted quantities as named columns (without modifying `df`) so
# the axis and legend can be labelled explicitly.
volcano = df.assign(
    neg_log10_pvalue=-np.log10(df['pvalue']),
    significant=df['pvalue'] < 0.05,
)

fig = px.scatter(volcano,
                 x='log2FoldChange',
                 y='neg_log10_pvalue',
                 hover_name='gene',
                 color='significant',
                 labels={'neg_log10_pvalue': '-log10(p-value)',
                         'significant': 'p < 0.05'},
                 title='Interactive Volcano Plot')

fig.show()