This notebook attempts to determine whether multiple sequences for the same sample are substantially different by plotting a series of snipit views.

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import re

In [2]:
# File names used by the notebook; the later cells change into base_dir and
# refer to these files by bare name.
fasta_filt_fn = "current.fasta"      # same name the seqkit grep commands below write to
fasta_fn = "allsequences.fasta"      # same name the seqkit grep commands below read from
seqdata_fn = "allsequencedata.csv"   # summary table, read below with a tab separator

# Folder holding all of the above ("~" is expanded to the user's home directory).
base_dir = os.path.expanduser("~/Dropbox/Zambia_nCoV/Data/Sequencing_Data")

In [3]:
#Read in the sequence summary table (the file is tab-separated despite its .csv name)
seqsum_df = pd.read_csv(os.path.join(base_dir,seqdata_fn),sep="\t")
#Add new column with a dash (makes sorting easier later on); "-" marks
#sequences for which no unique-mutation comparison has been recorded yet
seqsum_df['unique_mutations'] = "-"
#Flag every record whose SeqRun mentions C29; na=False treats a missing SeqRun
#as "no match" instead of producing a NaN mask that cannot be used for indexing
in_c29 = seqsum_df['SeqRun'].str.contains("C29", na=False)
#Get the list of SampleIDs in C29, dropping every NTC entry. Filtering (rather
#than list.remove) also works when NTC is absent or appears more than once.
duplist = [sample_id for sample_id in seqsum_df.loc[in_c29, 'SampleID'].tolist() if sample_id != 'NTC']

In [4]:
#Make base_dir the working directory so that the cells below can read/write
#files and run shell commands using bare file names (e.g. allsequences.fasta)
os.chdir(base_dir)

# View outputs with snipit

In [343]:
#Cycle through each sample to collect all of its sequences, align them and
#save a snipit plot named <sample>.png
#NOTE(review): `sl` is not defined in any cell shown in this notebook (this
#cell's execution count is also out of sequence). It is presumably a list of
#sample-ID strings such as duplist - define it before running on a fresh kernel.
for s in sl:
    #Remove any plot left over from a previous sample so that a failed run
    #cannot be renamed as if it belonged to this sample
    if os.path.exists("snp_plot.png"):
        os.remove("snp_plot.png")
    #pull out the SeqIDs of all records whose SampleID contains s
    #(literal substring match; records with a missing SampleID never match)
    dupes = seqsum_df.loc[seqsum_df['SampleID'].str.contains(s, regex=False, na=False), 'SeqID']
    #output list to csv
    dupes.to_csv("seqkit", index = False, header = False)
    pipeline = (
        #Grep the list to get a fasta file
        "seqkit grep -n -f seqkit allsequences.fasta -o current.fasta",
        #Make an alignment
        "mafft current.fasta > current.aln.fasta",
        #Make a snipit plot
        "snipit current.aln.fasta",
    )
    for cmd in pipeline:
        #Stop at the first failing step: continuing would process stale
        #intermediate files that belong to the previous sample
        if os.system(cmd) != 0:
            print("Command failed for sample %s: %s" % (s, cmd))
            break
    else:
        #Rename plot (os.replace avoids building a shell command from the sample ID)
        if os.path.exists("snp_plot.png"):
            os.replace("snp_plot.png", s + ".png")
        else:
            print("No snp_plot.png was produced for sample %s" % s)

# Look for mutations in nextclade output

In [8]:
def _split_mutations(cell):
    """Return the non-empty comma-separated tokens of one 'mutations' cell.

    Anything that is not a string (e.g. a missing value) yields an empty list.
    """
    if not isinstance(cell, str):
        return []
    return [token for token in cell.split(",") if token]


mut_list = []
uniques = []
#Cycle through each sample and, for every sequence of that sample, record the
#mutations that none of the sample's other sequences carry
for dupes in duplist:
    #pull out duplicates and important fields; .copy() so that the column added
    #below goes onto an independent frame rather than a slice of seqsum_df
    duplicates_df = seqsum_df.loc[
        seqsum_df['SampleID'] == dupes,
        ['SampleID','SeqID','deletions','insertions','substitutions']
    ].copy()
    #A sample with fewer than two sequences has nothing to be compared against;
    #leave its "-" placeholder untouched
    if len(duplicates_df) < 2:
        continue
    #combine the mutation columns into one. Missing values are treated as empty
    #strings: NaN + "," + text would otherwise be NaN and drop every mutation.
    #NOTE(review): 'insertions' is selected but not part of the comparison,
    #as in the original code - confirm that this is intended.
    duplicates_df['mutations'] = duplicates_df['deletions'].fillna("") + "," + duplicates_df['substitutions'].fillna("")
    #Make a list of all the SeqIDs
    SeqID_list = duplicates_df['SeqID'].tolist()

    #Cycle through duplicates
    for seqid in SeqID_list:
        is_target = duplicates_df['SeqID'] == seqid
        #Make a list of all the mutations in the target seq
        target_list = [
            mut
            for cell in duplicates_df.loc[is_target, 'mutations']
            for mut in _split_mutations(cell)
        ]
        #Make a set of the mutations found in ANY of the other runs (every
        #other row is used, not just the first one)
        others_list = {
            mut
            for cell in duplicates_df.loc[~is_target, 'mutations']
            for mut in _split_mutations(cell)
        }
        #Only keep mutations that are not found in the others
        mut_list = [mut for mut in target_list if mut not in others_list]

        #convert list into a string; an empty string means the sequence was
        #compared and has no unique mutations (the later cell skips "" tokens)
        uniques = ",".join(mut_list)
        #Add unique mutations back into the main df
        seqsum_df.loc[seqsum_df['SeqID']== seqid, 'unique_mutations'] = uniques
        #Reset mut_list
        mut_list = []
        uniques = []

# View with seqkit

In [11]:
count = 0
#Get a unique list of all sample IDs with unique mutations
SampleIDs = list(set(seqsum_df['SampleID'][seqsum_df['unique_mutations']!="-"].to_list()))
#Crete data frame with all sample info
for SampleID in SampleIDs:
    print("SampleID: %s" % SampleID)
    #Extract all SeqIDs associated with that sample
    samples_df = seqsum_df[['SampleID','SeqID','unique_mutations']][seqsum_df['SampleID']==SampleID]
    #Make a list of all SeqIDs
    seqids = samples_df['SeqID']
    #output list to csv for making a fasta
    seqids.to_csv("seqkit", index = False, header = False)
    #Grep the list to get a fasta file
    cmd = "seqkit grep -n -f seqkit allsequences.fasta -o current.fasta"
    os.system(cmd)
    #Make an alignment
    cmd = "mafft current.fasta > current.aln.fasta"
    os.system(cmd)
    for seq in seqids:
        muts = samples_df['unique_mutations'][samples_df['SeqID'] == seq].str.split(",",expand = True).iloc[0].tolist()
        for mut in muts:
            if mut != "":
                #Remove any letters from the mutation
                print("   %s" % mut)
                mut = re.sub("A|C|G|T", "", mut)
                if "-" in mut:
                    #Take the start of the range
                    start = mut.split(sep="-")[0]
                    finish = mut.split(sep="-")[1]
                else:
                    start = str(int(mut) - 0)
                    finish = str(int(mut) + 0)
                cmd = "seqkit subseq -r " + start + ":" + finish + " current.aln.fasta"
                !{cmd}
        print("")
    print("")
    count += 1
    if count > 0:
        break


SampleID: 97968
   C7124T
>97968
t
>97968_P33
n
   A11332G
>97968
g
>97968_P33
n

   28274
>97968
g
>97968_P33
g
   C6402T
>97968
n
>97968_P33
t


