In [284]:
import time
import pandas as pd
import numpy as np
import os
import boto3
s3 = boto3.client('s3')
from Bio import Entrez
from Bio import SeqIO
from Bio import SeqRecord
from Bio import Seq
import re
Entrez.email = "lucy.li@czbiohub.org"
api_key = "1a6a75bc7f8a5a3088510eb4f1b35eefa009"

# Download reference sequences

In [18]:
genuses = ["Culiseta", "Culex", "Aedes"]
taxids = Entrez.read(Entrez.esearch(term=' OR '.join(["("+x+"[All Names] AND Genus[Rank])" for x in genuses]), db="taxonomy"))["IdList"]
genus_taxids = dict(zip(genuses, taxids))


In [19]:
coi_ids = {}
for x in genus_taxids:
    search_term = '[Organism:exp] AND ("mitochondrion"[title] OR "mitochondrial"[title] OR "COI"[title]) AND (1000:30000[Sequence Length]) NOT (predicted[title])'
    coi_ids[x] = Entrez.read(Entrez.esearch(term="txid"+genus_taxids[x]+search_term, db="nucleotide", retmax=1000, api_key=api_key))["IdList"]
    time.sleep(0.1)




In [None]:
seqs = {}
for x in coi_ids:
    seqs[x] = Entrez.efetch(db="nucleotide", id=coi_ids[x], rettype="fasta", retmode="text").read()


In [None]:
with open('mito.fasta', 'w') as f:
    f.write(''.join(list(seqs.values())))



In [None]:
%%bash
bowtie2-build mito.fasta mito

# bowtie reads to reference and assemble aligned reads

In [20]:
metadata = pd.read_csv("../../data/metadata/CMS001_CMS002_MergedAnnotations.csv")

In [None]:
file_sizes = metadata["read1"].apply(lambda x: s3.head_object(Bucket='czb-seqbot', Key='/'.join(x.split("/")[3:]))["ContentLength"])




In [None]:
list(metadata[file_sizes==file_sizes.min()].read1)

In [None]:
metadata["read1"].str.split("/").apply(lambda x: x[4]).unique()

In [None]:
def get_command (filename):
    fn = os.path.basename(filename)
    basefn = fn.replace(".fastq.gz", "")
    fastqfn = fn.replace("fastq.gz", "fastq")
    bamfn = fn.replace("fastq.gz", "bam")
    command = "aws s3 cp "+filename+" .; "
    command += "bowtie2 -U "+fn+" -x mito | samtools view -bS | samtools sort > "+bamfn+"; "
    command += "samtools index "+bamfn+"; "
    command += "samtools fastq "+bamfn+" -F 4 > "+fastqfn+"; "
    command += "gzip "+fastqfn+"; "
    command += "megahit -r "+fn+" -o "+basefn+"; "
    command += "mv "+basefn+"/final.contigs.fa "+basefn+".fasta; "
    command += "rm -rf "+fn+" "+bamfn+" "+bamfn+".bai "+basefn
    return(command)
    
    
    
    

In [None]:
command_str = '\n'.join(list(metadata["read1"].apply(get_command)))
with open ("mito_commands", "w") as f:
    f.write(command_str)




# Compare genomes

In [None]:
%%bash
for x in $(ls CMS*.fasta); do sample=${x/_R1_001.fasta/}; perl -pi -e "s/^>/>${sample}__/g" $x; done
cat CMS*.fasta > all_mito_contigs.fasta
makeblastdb -in all_mito_contigs.fasta -input_type fasta -dbtype nucl -out all_mito
run_blast () {
  prefix="${1%.*}"
  blastn -query $1 -db all_mito -out ${prefix}_blast.txt -outfmt '6 qseqid sseqid evalue qcovs qstart qend sstart send pident qseq sseq' -task dc-megablast
}
export -f run_blast
ls CMS*.fasta | parallel -j 128 run_blast {}
mkdir alignments




In [21]:
blast_colnames = ['qseqid', 'sseqid', 'evalue', 'qcovs', 'qstart', 'qend', 'sstart', 'send', 'pident', 'qseq', 'sseq']

In [151]:
blast_filenames = [x for x in os.listdir() if x.endswith("blast.txt")]

In [152]:
blast_results = pd.concat([pd.read_csv(x, sep="\t", header=None, names=blast_colnames) for x in blast_filenames])
blast_results = blast_results.assign(qsample=blast_results["qseqid"].str.split("__").apply(lambda x: x[0]),
                                     ssample=blast_results["sseqid"].str.split("__").apply(lambda x: x[0]),
                                     qlen=(blast_results["qend"]-blast_results["qstart"]))

In [153]:
num_unique = blast_results.groupby("qseqid")["ssample"].nunique()

In [154]:
filtered_blast_results = blast_results[blast_results["qseqid"].isin(num_unique[num_unique==num_unique.max()].index)].groupby("qseqid", as_index=False).apply(lambda x: x.sort_values(by="qlen").groupby("ssample", as_index=False).first())

In [246]:
filtered_blast_metrics = filtered_blast_results.groupby("qseqid").apply(lambda x: pd.DataFrame({"num_long":[(x["qlen"]>=x["qlen"].max()*0.8).sum()], "mean_qlen":[x["qlen"].mean()]}))

In [251]:
subset_criterion1 = filtered_blast_metrics["mean_qlen"]>filtered_blast_metrics["mean_qlen"].max()*0.3
subset_criterion2 = filtered_blast_metrics["num_long"]>20
qseqid_subset = filtered_blast_metrics[subset_criterion1 & subset_criterion2].reset_index()["qseqid"].tolist()






In [252]:
def create_seq_record (blast_row):
    sequence = Seq.Seq(blast_row["sseq"].replace("-", ""))
    id_name = blast_row["ssample"]
    if blast_row["sstart"]>blast_row["send"]:
        output_seq = SeqRecord.SeqRecord(seq=sequence).reverse_complement(id=id_name, description="")
    else:
        output_seq = SeqRecord.SeqRecord(seq=sequence, id=id_name, description="")
    return (output_seq)

def get_best_blast_hit (df, outfn, qlen_threshold=0.8):
    selected_blast_results = df[df["qlen"]>df["qlen"].max()*qlen_threshold]
    selected_sequences = selected_blast_results.apply(create_seq_record, axis=1).tolist()
    SeqIO.write(selected_sequences, outfn, "fasta")
    return ([selected_blast_results, selected_sequences])


In [244]:
alignments = {}
for qseqid_x in qseqid_subset:
    alignments[qseqid_x] = get_best_blast_hit(filtered_blast_results[filtered_blast_results["qseqid"]==qseqid_x],
                                              outfn=os.path.join("alignments", qseqid_x+".fasta"),
                                              qlen_threshold=0.8)
    print (qseqid_x+".fasta written to file")

In [None]:
%%bash
ls alignments/*.fasta | parallel muscle -in {} -out {.}.aln
mkdir trees

In [324]:
aln_fn = [x for x in os.listdir("alignments") if x.endswith(".aln")]
aligned_sequences = [list(SeqIO.parse(os.path.join("alignments", x), "fasta")) for x in aln_fn]

In [325]:
num_gap_chunks = [np.mean([re.sub("\-+", "-", str(x.seq)).count('-')/len(x.seq) for x in aligned_seq]) for aligned_seq in aligned_sequences]

In [326]:
num_gaps = [np.mean([x.seq.count('-')/len(x.seq) for x in aligned_seq]) for aligned_seq in aligned_sequences]

In [327]:
alignment_qual = pd.DataFrame({"filename":aln_fn, "seqs":aligned_sequences, "num_gap_chunks":num_gap_chunks, "num_gaps":num_gaps})
alignment_qual = alignment_qual.assign(rank=alignment_qual["num_gap_chunks"].rank()+alignment_qual["num_gaps"].rank())

In [335]:
alignment_qual.sort_values(by="rank").head(n=20)

Unnamed: 0,filename,seqs,num_gap_chunks,num_gaps,rank
341,CMS_002_20b_Rb_S132_L004__k141_857.aln,"[(-, A, C, G, C, T, C, A, A, G, A, T, C, G, -,...",0.01765,0.117949,81.0
219,CMS_039_RNA_A_S9__k141_1912.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.01661,0.1264,88.0
119,CMS_040_RNA_A_S21__k141_1635.aln,"[(-, -, -, -, -, -, -, -, -, -, C, T, C, A, T,...",0.0179,0.115679,96.0
194,CMS_007_RNA_A_S12__k141_3208.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.017839,0.119364,97.0
202,CMS_004_RNA_A_S2__k141_3446.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.017781,0.12113,97.0
346,CMS_002_26e_Rb_S150_L004__k141_2016.aln,"[(-, -, C, A, G, G, C, T, A, A, G, G, T, T, A,...",0.018025,0.102701,104.0
337,CMS_022_RNA_A_S6__k141_1252.aln,"[(C, T, C, A, T, T, T, A, T, T, A, T, C, G, T,...",0.017942,0.11622,104.0
215,CMS_033_RNA_A_S8__k141_3882.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.017692,0.123334,106.0
243,CMS_001_17_S6_L001__k141_930.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.017892,0.120317,107.0
82,CMS_002_23a_Rb_S138_L004__k141_2824.aln,"[(-, -, -, -, -, -, -, -, -, -, -, -, -, -, -,...",0.018031,0.115942,112.0


In [332]:
# command to make phylogenetic trees
"parallel iqtree -nt AUTO -s {} -pre trees/{/.} ::: "+' '.join(alignment_qual.sort_values(by="rank").head(n=20)["filename"].apply(lambda x: os.path.join("alignments", x)).tolist())

'parallel iqtree -nt AUTO -s {} -pre trees/{/.} ::: alignments/CMS_002_20b_Rb_S132_L004__k141_857.aln alignments/CMS_039_RNA_A_S9__k141_1912.aln alignments/CMS_040_RNA_A_S21__k141_1635.aln alignments/CMS_007_RNA_A_S12__k141_3208.aln alignments/CMS_004_RNA_A_S2__k141_3446.aln alignments/CMS_002_26e_Rb_S150_L004__k141_2016.aln alignments/CMS_022_RNA_A_S6__k141_1252.aln alignments/CMS_033_RNA_A_S8__k141_3882.aln alignments/CMS_001_17_S6_L001__k141_930.aln alignments/CMS_002_23a_Rb_S138_L004__k141_2824.aln alignments/CMS_002_25a_Rb_S140_L004__k141_230.aln alignments/CMS_028_RNA_A_S17__k141_2290.aln alignments/CMS_002_36a_Rb_S170_L004__k141_4533.aln alignments/CMS_002_47h_Rb_S1_L004__k141_3353.aln alignments/CMS_002_42a_Rb_S177_L004__k141_3099.aln alignments/CMS_045_RNA_A_S2__k141_1567.aln alignments/CMS_002_40a_Rb_S174_L004__k141_1081.aln alignments/CMS_035_RNA_A_S20__k141_4750.aln alignments/CMS_002_29b_Rb_S160_L004__k141_3408.aln alignments/CMS_002_29e_Rb_S164_L004__k141_3073.aln'

In [None]:
%%bash
ls alignments/*.aln | parallel iqtree -nt AUTO -s {} -pre trees/{/.}