# 01_ATBI
## Modern File Formats


### Metadata

In [None]:
# Packages
# If error in pyvcf, execute the following:
!pip install "setuptools<58" --upgrade
!pip install pandas numpy scipy biopython pysam htseq dendropy scikit-learn seaborn sequana-fastqc matplotlib==3.2.2 seqlogo pyvcf IPython

In [5]:
# Imports
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
# Directory information
## if the paths below cannot be found -> View > Table of contents > folder icon (Files) -> Mount Drive
# base_dir ends with "/", so sub-paths are appended without a leading slash.
base_dir = "/content/drive/MyDrive/ATBI_2024/ATBI_1/"
data_dir = f"{base_dir}data/"

In [7]:
# Filenames
# Bare file names only; vcf_file is later appended to data_dir to form a full path.
fasta, fastq, sam = "ABL_GENE.fa", "single_end.fastq", "sam_file.sam"
vcf_file = "ALL.chrY.genome_strip_hq.20101123.svs.low_coverage.genotypes.vcf"

In [8]:
# Other
# Hard-coded nucleotide string (only the letters A, C, G, T) kept as a reference sequence.
# NOTE(review): presumably a stretch of the ABL gene matching ABL_GENE.fa — confirm.
ref = "CACGGACATCACCATGAAGCACAAGCTGGGCGGGGGCCAGTACGGGGAGGTGTACGAGGGCGTGTGGAAGAAATACAGCCTGACGGTGGCCGTGAAGACCTTGAAGGAGGACACCATGGAGGTGGAAGAGTTCTTGAAAGAAGCTGCAGTCATGAAAGAGATCAAACACCCTAACCTGGTGCAGCTCCTTGGGGTCTGCACCCGGGAGCCCCCGTTCTATATCATCACTGAGTTCATGACCTACGGGAACCTCCTGGACTACCTGAGGGAGTGCAACCGGCAGGAGGTGAACGCCGTGGTGCTGCTGTACATGGCCACTCAGATCTCGTCAGCCATGGAGTACCTGGAGAAGAAAAACTTCATCCACAGAGATCTTGCTGCCCGAAACTGCCTGGTAGGGGAGAACCACTTGGTGAAGGTAGCTGATTTTGGCCTGAGCAGGTTGATGACAGGGGACACCTACACAGCCCATGCTGGAGCCAAGTTCCCCATCAAATGGACTGCACCC"

### 1. FASTA

In [None]:
from Bio import SeqIO, Seq

# read FASTA file


In [None]:
# store as DNA


In [15]:
# Reminder: show the protein biosynthesis figure stored under <base_dir>/images/
from IPython.display import Image, display

# base_dir already ends with "/", so os.path.join is used instead of
# base_dir + "/images/..." (which produced a doubled slash).
# filename= tells IPython explicitly that this is a path to read from disk
# instead of leaving it to guess from the string.
display(Image(filename=os.path.join(base_dir, "images", "ProteinBiosynthesis.png")))

In [None]:
# transcribe into RNA


In [None]:
# translate into amino acid sequence


In [None]:
# Reminder: Genetic Code
# Relies on Image/display (imported in the earlier "Reminder" cell) and on os.
# base_dir already ends with "/", so os.path.join is used instead of
# base_dir + "/images/..." (which produced a doubled slash); filename= makes
# it explicit that the argument is a path to read from disk.
display(Image(filename=os.path.join(base_dir, "images", "GeneticCode.jpg")))

### 2. FASTQ

In [None]:
# read FASTQ file



In [None]:
# print sequence information


In [None]:
# Quality analysis: FastQC in Python using Sequana-FastQC
from sequana import FastQC, PacbioSubreads



In [None]:
# collect sequences
# starts out empty; the following cell reads seqs[0] and seqs[1], so it must be filled first
seqs = list()


In [None]:
# sequence alignment (simple example)
# the first two collected sequences form the pair used for this example
s1, s2 = seqs[0], seqs[1]




In [None]:
# translation and sequence alignment


In [None]:
### FastQC

### 3. SAM/BAM

In [None]:
import pysam

# read SAM file


In [None]:
# store SAM file in pandas dataframe
# Column labels for the SAM table: the first eleven entries are short field
# names, the remaining eight are written as "<TAG>: <description>".
columns = [
  "QNAME",
  "FLAG",
  "RNAME",
  "POS",
  "MAPQ",
  "CIGAR",
  "RNEXT",
  "PNEXT",
  "TLEN",
  "SEQ",
  "QUAL",
  "AS: alignment score",
  "XN: number of ambiguous bases",
  "XM: number of mismatches",
  "XO: number of gap opens",
  "XG: number of gap extensions",
  "NM: edit distance",
  "MD: string representation of the mismatched reference bases",
  "YT: UU indicates the read was not part of a pair",
]


In [None]:
# print SAM dataframe
# A bare expression on the last line of a cell is shown as the cell's output.
# NOTE(review): df_sam is not defined in the visible cells — presumably it is
# meant to be built in the cell above from `columns`; confirm before running.
df_sam


In [None]:
# File conversion: SAM to BAM



In [None]:
from sequana import BAM

# plot BAM information


In [None]:
# Point Mutation Analysis with Position Weight Matrix
from collections import Counter


In [None]:
# add colnames to pwm


In [None]:
### IGV: Integrative Genomics Viewer

### 4. VCF

In [None]:
# using PyVCF (the `pyvcf` package, imported as `vcf`)
import vcf

# read VCF file


In [None]:
# collect records


In [None]:
# Print the main fields of the first PyVCF record only (the loop stops after
# one iteration; nothing is printed if `records` is empty).
for record in records:
  print(f"Record:\t{record}")
  print(f"Chromosome:\t{record.CHROM}")
  # POS is the position of the variant, not the reference sequence
  # (the label was previously a copy of the REF line below).
  print(f"Position:\t{record.POS}")
  print(f"Reference Sequence:\t{record.REF}")
  print(f"Deletion?\t{record.is_deletion}")
  print(f"InDel?\t{record.is_indel}")
  print(f"Is SNP?\t{record.is_snp}")
  print(f"Alternate allele frequency:\t{record.aaf}")

  break


In [None]:
# using pysam
# Open the same VCF file with pysam; data_dir already ends with "/".
v2 = pysam.VariantFile(data_dir + vcf_file)

# collect records
# fetch() without arguments iterates over every record; materialise them in a list
records2 = list(v2.fetch())

In [None]:
# Print the main fields of the first pysam record only (the loop stops after
# one iteration; nothing is printed if `records2` is empty).
for record in records2:
  print(f"Record:\t{record}")
  # pysam names the attribute `chrom`; the previous `crom` raised AttributeError.
  print(f"Chromosome:\t{record.chrom}")
  # pos is the position of the variant, not the reference sequence.
  print(f"Position:\t{record.pos}")
  break

In [None]:
# collect alternate allele frequencies


In [None]:
# plot aafs


In [None]:
### Galaxy Tools

### 5. PDB

In [None]:
from Bio import PDB

In [None]:
# download PDB files


In [None]:
# parse PDB files


Meta information about PDB files

In [None]:
# 4HHB


In [None]:
# 1MBN
# Print every header field of pdb2 as "<field>:\t<value>".
# A plain loop replaces the list comprehension, which was used only for its
# side effects and left a list of None values as the cell's output.
for field in pdb2.header:
  print(field + ":\t" + str(pdb2.header[field]))

In [None]:
# 1EMY
# Print every header field of pdb3 as "<field>:\t<value>".
# A plain loop replaces the list comprehension, which was used only for its
# side effects and left a list of None values as the cell's output.
for field in pdb3.header:
  print(field + ":\t" + str(pdb3.header[field]))

In [None]:
# Structures
# Map each PDB id to its parsed structure. "1EMY" must point at pdb3
# (it previously pointed at pdb2, so 1MBN was reported twice).
pdbs = {"4HHB" : pdb1,
        "1MBN" : pdb2,
        "1EMY" : pdb3}

# For every chain, print how many residues and atoms it contains.
# Iterating a structure yields its models, and iterating a model yields its chains.
for key, structure in pdbs.items():
  for model in structure:
    for chain in model:
      n_residues = len(list(chain.get_residues()))
      n_atoms = len(list(chain.get_atoms()))
      print(key + ":\tChain " + str(chain.id) + "\tNumber of residues: " + str(n_residues) + "\tNumber of Atoms: " + str(n_atoms))
    # blank line between models
    print()


In [None]:
# Residues
# Show the first five residues of chain A (first model) of 4HHB together with
# their atoms.
for i, res in enumerate(pdbs["4HHB"][0]["A"].get_residues()):
  if i > 4:
    break

  # Print the residue being iterated. The previous lookup chain[i+1] assumed
  # residue ids start at 1 without gaps, and could raise KeyError or show a
  # different residue than the one whose atoms are listed below.
  print(res)
  for atom in res:
    print(atom, atom.serial_number, atom.element)


In [None]:
# FASTA Sequence
# a)
# Lookup table from three-letter residue names to one-letter codes.
d3to1 = {
  'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q',
  'LYS': 'K', 'ILE': 'I', 'PRO': 'P', 'THR': 'T',
  'PHE': 'F', 'ASN': 'N', 'GLY': 'G', 'HIS': 'H',
  'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 'ALA': 'A',
  'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
}

# Build and print a one-letter sequence per chain; residues whose name is not
# in d3to1 are left out of the sequence.
for key, structure in pdbs.items():
  for model in structure:
    for chain in model:
      seq = "".join(
        d3to1[residue.get_resname()]
        for residue in chain.get_residues()
        if residue.get_resname() in d3to1
      )
      print(key + ":\tChain " + str(chain.id) + " " + seq)
    # blank line between models
    print()


In [None]:
### pymol