# Amplicon 4 Analysis
This notebook takes FASTQ files and uses `dms_tools2` to calculate the frequency of codons at mutagenized sites in the Zika virus NS5 protein.

FASTQ files for each cell population are specified in `samplesheet.csv`.

## Notebook setup
Imports

In [72]:
import dms_tools2
import gzip
import numpy as np
import os
import pandas as pd
import plotnine as p9
import regex
from Bio import SeqIO
from dms_tools2.ipython_utils import showPDF

# Report "<name>==<version>" for every object in the notebook namespace that
# exposes a truthy `__version__` attribute (in practice, the imported packages).
# Approach taken from
# https://stackoverflow.com/questions/40428931/package-for-listing-version-of-packages-used-in-a-jupyter-notebook
print(
    *(
        f'{obj.__name__}=={obj.__version__}'
        # Iterate over a snapshot so the namespace cannot change mid-loop.
        for obj in list(globals().values())
        if getattr(obj, '__version__', None)
    ),
    sep='\n',
)

dms_tools2==2.6.8
numpy==1.19.1
pandas==0.25.3
plotnine==0.6.0
regex==2.5.83


Input data

In [82]:
# Directory holding the FASTQ files. File names from the sample sheet are
# appended directly to this string, so the trailing slash is required.
sequencing_dir = (
    '/shared/ngs/illumina/bloom_lab/'
    '211223_M03100_0703_000000000-K5NB5/'
    'Unaligned/Project_bloom_lab/'
)

# Input files kept under data/: the sample sheet, the reference FASTA and the
# alignment specs text file.
samplesheet, reference_file, alignspecs_file = (
    f'data/{file_name}'
    for file_name in ('samplesheet.csv', 'ns5.fasta', 'alignspecs.txt')
)

Output data

In [48]:
# Root directory for the files this notebook generates (kept with its trailing slash).
output_dir = 'output/'

Analysis parameters

Notebook aesthetics

In [3]:
# Five hex colours available to the plotting cells.
CBPALETTE_RICH = [
    '#648FFF',
    '#FFB000',
    '#DC267F',
    '#785EF0',
    '#FE6100',
]

# Make plotnine's classic theme the default for every plot in the notebook.
p9.theme_set(p9.theme_classic())

## Load data
Load samples

In [22]:
# Read the sample sheet (one row per FASTQ file).
samples = pd.read_csv(samplesheet)

# Build a "<source>-<read>" identifier and place it in the first column.
sample_names = samples['source'] + '-' + samples['read'].map(str)
samples.insert(0, 'name', sample_names)

# Full path to each FASTQ: the sequencing directory prefix plus the file name.
samples['file_long'] = sequencing_dir + samples['file']

display(samples)

Unnamed: 0,name,source,stage,sorted,read,file,file_long
0,plasmid-1,plasmid,,,1,BR_WT_Plasmid_S1_R1_001.fastq.gz,/shared/ngs/illumina/bloom_lab/211223_M03100_0...
1,plasmid-2,plasmid,,,2,BR_WT_Plasmid_S1_R2_001.fastq.gz,/shared/ngs/illumina/bloom_lab/211223_M03100_0...


Load reference sequence

In [23]:
# Parse the reference FASTA into a record and keep its sequence separately.
reference = SeqIO.read(reference_file, format='fasta')
ref_seq = reference.seq

# Show the record id followed by the full nucleotide sequence.
print(
    f'The reference sequence name is: {reference.id}',
    f'The nucleotide sequence is:\n{ref_seq}',
    sep='\n',
)

The reference sequence name is: NS5
The nucleotide sequence is:
GGAGGTGGGACGGGAGAGACTCTGGGAGAGAAGTGGAAAGCTCGTCTGAATCAGATGTCGGCCCTGGAGTTCTACTCTTATAAAAAGTCAGGTATCACTGAAGTGTGTAGAGAGGAGGCTCGCCGTGCCCTCAAGGATGGAGTGGCCACAGGAGGACATGCCGTATCCCGGGGAAGTGCAAAGCTCAGATGGTTGGTGGAGAGAGGATATCTGCAGCCCTATGGGAAGGTTGTTGACCTCGGATGTGGCAGAGGGGGCTGGAGCTATTATGCCGCCACCATCCGCAAAGTGCAGGAGGTGAGAGGATACACAAAGGGAGGTCCCGGTCATGAAGAACCCATGCTGGTGCAAAGCTATGGGTGGAACATAGTTCGTCTCAAGAGTGGAGTGGACGTCTTCCACATGGCGGCTGAGCCGTGTGACACTCTGCTGTGTGACATAGGTGAGTCATCATCTAGTCCTGAAGTGGAAGAGACACGAACACTCAGAGTGCTCTCTATGGTGGGGGACTGGCTTGAAAAAAGACCAGGGGCCTTCTGTATAAAGGTGCTGTGCCCATACACCAGCACTATGATGGAAACCATGGAGCGACTGCAACGTAGGCATGGGGGAGGATTAGTCAGAGTGCCATTGTCTCGCAACTCCACACATGAGATGTACTGGGTCTCTGGGGCAAAGAGCAACATCATAAAAAGTGTGTCCACCACAAGTCAGCTCCTCCTGGGACGCATGGATGGCCCCAGGAGGCCAGTGAAATATGAGGAGGATGTGAACCTCGGCTCGGGTACACGAGCTGTGGCAAGCTGTGCTGAGGCTCCTAACATGAAAATCATCGGCAGGCGCATTGAGAGAATCCGCAATGAACATGCAGAAACATGGTTTCTTGATGAAAACCACCCATACAGGACATGGGCCTACCATGGGAGCTACGAAGCC

Read in alignment specs:

In [25]:
# Load the alignment specs string from the file named in the input-data cell.
# The previous name, `subamplicon_alignspecs`, is never assigned in the visible
# notebook, so this cell raised NameError on a fresh kernel.
with open(alignspecs_file, 'r') as specs_handle:
    # Collapse every run of whitespace (newlines included) to a single space:
    # a one-line file yields the same string as before, while specs written on
    # separate lines stay separated instead of being fused together.
    alignspecs = ' '.join(specs_handle.read().split())
alignspecs

'1951,2285,33,32'

## Count codon frequencies
Use `dms_tools2` to count codon frequencies in sequencing data

Setup:

In [71]:
# Directory that receives the codon-count files.
# Pass the components separately so os.path.join inserts the separator itself;
# the earlier single-argument call only worked because `output_dir` happens to
# end with a slash.
codon_counts_dir = os.path.join(output_dir, 'codon_counts')
os.makedirs(codon_counts_dir, exist_ok=True)

# Build the batch file for dms2_batch_bcsubamp: one row per sample with the
# columns `name` and `R1`. Only read-1 rows are listed.
# NOTE(review): presumably dms_tools2 locates the matching R2 file from the R1
# name - confirm against the dms2_batch_bcsubamp documentation.
batchfile = 'data/batchfile.csv'
is_read1 = samples['read'] == 1
batchfile_df = (
    samples.loc[is_read1, ['name', 'file_long']]
    .rename(columns={'file_long': 'R1'})
)
display(batchfile_df)

print(f'Writing batchfile to {batchfile}')
batchfile_df.to_csv(batchfile, index=False)
print('Done.')


Unnamed: 0,name,R1
0,plasmid-1,/shared/ngs/illumina/bloom_lab/211223_M03100_0...


Writing batchfile to data/batchfile.csv
Done.


Command:

In [83]:
ncpus = 16

R1_trim = 200
R2_trim = 200

print('Running dms_tools2')
log = ! dms2_batch_bcsubamp \
        --batchfile {batchfile} \
        --refseq {reference_file} \
        --alignspecs {alignspecs_file} \
        --outdir {codon_counts_dir} \
        --summaryprefix summary \
        --R1trim {R1_trim} \
        --R2trim {R2_trim} \
        --ncpus {ncpus} \

samples['codon_counts_file'] = codon_counts_dir + '/' + samples['name'] + '_codoncounts.csv'

# check that expected codon counts files created
# assert all(map(os.path.isfile, samples.codon_counts_file)), '\n'.join(log)

print(f"Processed sequencing data to create codon counts files in {codon_counts_dir}")

Running dms_tools2
Processed sequencing data to create codon counts files in output/codon_counts
